In [1]:
from typing import Dict, Union
from loguru import logger
import json
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config as BotoConfig
import pymongo
from urllib.parse import quote_plus
from typing import Dict


def get_secret(secret_name: str, region_name: str = "us-east-1") -> Union[Dict[str, str], str]:
    """Fetch a secret from AWS Secrets Manager.

    Args:
        secret_name: Name (or ARN) of the secret, passed as ``SecretId``.
        region_name: AWS region of the Secrets Manager endpoint. Defaults to
            ``"us-east-1"``, the region this function previously hard-coded.

    Returns:
        The result of ``json.loads`` on the secret payload when it is valid
        JSON, otherwise the payload as a plain string.

    Raises:
        botocore.exceptions.ClientError: Propagated unchanged when the
            Secrets Manager call fails.
        KeyError: If the response carries neither ``SecretString`` nor
            ``SecretBinary``.
        UnicodeDecodeError: If a binary secret is not valid UTF-8.
    """
    logger.info(f"Retrieving secret {secret_name}")
    boto_config = BotoConfig(
        connect_timeout=10,
        retries={
            "max_attempts": 3,
            "mode": "standard",
        },
    )
    client = boto3.session.Session().client(
        service_name="secretsmanager",
        config=boto_config,
        region_name=region_name,
    )
    # No try/except here: catching ClientError only to re-raise it added nothing.
    response = client.get_secret_value(SecretId=secret_name)
    secret_string = response.get("SecretString")
    if secret_string is None:
        # Secrets stored as binary arrive under "SecretBinary" instead of
        # "SecretString". NOTE(review): assumes such payloads are UTF-8 text.
        secret_string = response["SecretBinary"].decode("utf-8")
    try:
        return json.loads(secret_string)
    except json.JSONDecodeError:
        # Not JSON: hand back the raw string (e.g. a bare API key).
        return secret_string

# Secret name for the frontend server's MongoDB user (kept for reference):
# secret_name = "dev/tai_backend/mongodb/user"
# Secret name for the TAI service's DocumentDB read/write user:
secret_name = "dev/tai_service/document_DB/read_write_user_password"
credentials = get_secret(secret_name)
# get_secret returns a plain string when the secret is not JSON; fail with a
# readable message instead of an obscure indexing error on the lines below.
if not isinstance(credentials, dict):
    raise TypeError(
        f"Secret {secret_name} must be a JSON object with 'username' and 'password' keys"
    )
# Percent-escape both parts so special characters cannot break the URI.
user_name = quote_plus(credentials["username"])
password = quote_plus(credentials["password"])
# Frontend server URI (credentials redacted -- never commit them in plain text):
# db_uri = f"mongodb+srv://<user>:<password>@cluster0.cx5zd.mongodb.net/?retryWrites=true&w=majority"
# TAI service URI:
db_uri = f"mongodb://{user_name}:{password}@tai-service-645860363137.us-east-1.docdb-elastic.amazonaws.com:27017/?tls=true&retryWrites=false"
# escape the url
# db_uri = urllib.parse.quote_plus(db_uri)

# Create a MongoDB client for the Amazon DocumentDB endpoint above
# (TLS enabled, retryable writes disabled, as set in the URI).
client = pymongo.MongoClient(db_uri)
# Print all databases visible to this user.
print(client.list_database_names())

# Database to inspect.
db = client.class_resources
# db = client.TAI_frontend
collection_list = db.list_collection_names()
print(collection_list)

# Per-collection results, keyed by collection name.
document_counts = {}
indexes = {}
index_sizes = {}
sum_of_indexes = {}
size_of_objects = {}  # left empty: the avgObjSize lookup below is disabled

for collection_name in collection_list:
    col = db[collection_name]
    document_counts[collection_name] = col.estimated_document_count()
    indexes[collection_name] = col.index_information()
    # collStats reports each index's size in bytes under "indexSizes".
    index_size = db.command('collStats', collection_name)['indexSizes']
    # Convert each index size from bytes to GB.
    index_sizes[collection_name] = {
        index_name: str(size_bytes / 1024 / 1024 / 1024) + " GB"
        for index_name, size_bytes in index_size.items()
    }
    # Total size of all indexes of this collection, in GB.
    sum_of_indexes[collection_name] = str(sum(index_size.values()) / 1024 / 1024 / 1024) + " GB"
    # get average size of objects
    # size_of_objects[collection_name] = db.command('collStats', collection_name)['avgObjSize']

print(f"Indexes: {indexes}")
print(f"Estimated document counts: {document_counts}")
print(f"Index sizes: {index_sizes}")
print(f"Sum of indexes: {sum_of_indexes}")
print(f"Size of objects: {size_of_objects}")


[32m2023-09-01 08:09:20.283[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m13[0m - [1mRetrieving secret dev/tai_service/document_DB/read_write_user_password[0m
[32m2023-09-01 08:09:20.336[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m28[0m - [1mdev/tai_service/document_DB/read_write_user_password[0m


['class_resources']
['class_resource_chunk', 'class_resource']
Indexes: {'class_resource_chunk': {'class_id_1': {'v': 4, 'key': [('class_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, 'resource_id_1': {'v': 4, 'key': [('resource_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, 'chunk_id_1': {'v': 4, 'key': [('chunk_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, '_id_': {'v': 4, 'key': [('_id', 1)], 'ns': 'class_resources.class_resource_chunk'}}, 'class_resource': {'class_id_1': {'v': 4, 'key': [('class_id', 1)], 'ns': 'class_resources.class_resource'}, 'resource_id_1': {'v': 4, 'key': [('resource_id', 1)], 'ns': 'class_resources.class_resource'}, '_id_': {'v': 4, 'key': [('_id', 1)], 'ns': 'class_resources.class_resource'}}}
Estimated document counts: {'class_resource_chunk': 2673, 'class_resource': 53}
Index sizes: {'class_resource_chunk': {'resource_id_1': '0.0018157958984375 GB', 'chunk_id_1': '0.0013885498046875 GB', 'class_id_1': '0.00295257568359375 GB

In [10]:

# Print every document of every collection in the current `db`, then empty
# each collection.
#
# WARNING: this cell is destructive. With the flag below left at True, running
# it (including via "Run All") deletes ALL documents in ALL collections of `db`.
# Set it to False to inspect the data without deleting anything.
# NOTE(review): the printed documents can contain personal data and access
# tokens -- clear this cell's output before sharing the notebook.
CLEAR_COLLECTIONS_AFTER_DUMP = True

collection_list = db.list_collection_names()
for col_name in collection_list:
    print(col_name)
    col = db[col_name]
    for document in col.find():
        print(document)
    # if col_name == "sessions":
    if CLEAR_COLLECTIONS_AFTER_DUMP:
        # An empty filter matches, and therefore deletes, every document.
        col.delete_many({})




sessions
users
{'_id': ObjectId('64ef6bc6884ecc230625a521'), 'name': 'Jacob Petterle', 'email': 'jacobpetterle@gmail.com', 'image': 'https://lh3.googleusercontent.com/a/AAcHTteEtR-U9WBweFlTfO5rN2BYDnUd3wIMZdk91BXIJXFPOOk=s96-c', 'emailVerified': None, 'bookmarks': [], 'chats': [], 'classes': [], 'id': 'b73b1ba3-dd1e-4fb4-9923-9c4416a2e24e', 'role': 'professor'}
{'_id': ObjectId('64ef6c50997355efbfd51192'), 'name': 'Dan Baker CSU', 'email': 'baker@rams.colostate.edu', 'image': 'https://lh3.googleusercontent.com/a/AAcHTtdAGg0J71GAUTG6_gQVdRUNLKeLMg0R4lzBiCdT0mV1QA=s96-c', 'emailVerified': None, 'bookmarks': [], 'chats': [], 'classes': [], 'id': '63128fcc-a076-454a-ae63-f4d762de6ec9', 'role': 'student'}
chats
classes
accounts
{'_id': ObjectId('64ef6bc6884ecc230625a522'), 'provider': 'google', 'type': 'oauth', 'providerAccountId': '116167054129833127128', 'access_token': 'ya29.a0AfB_byCJdjVEVP1ZNfieJ1bJS41ea0EiF7DKgD38yI_U7XwlgLYpHJlrZb-lRDHNKBW4fHaGVlh6Y4R_vCBixVAFd-JGk4t2PrESrranN66lVH3h

In [8]:
import pinecone

# Fetch the Pinecone API key from Secrets Manager and initialise the client.
api_key = get_secret("dev/tai_service/pinecone_db/api_key")

pinecone.init(api_key=api_key, environment="us-east-1-aws")
# Kept under its own name so it does not overwrite the `indexes` dict built
# by the DocumentDB cell.
pinecone_index_names = pinecone.list_indexes()
print(pinecone_index_names)
for index_name in pinecone_index_names:
    print(pinecone.describe_index(index_name))
    # Separate names for the index name (str) and the index handle, instead of
    # rebinding the loop variable to a different type mid-iteration.
    index: pinecone.Index = pinecone.Index(index_name)
    # Query the stats once and reuse the result for printing and namespaces.
    index_stats = index.describe_index_stats()
    print(index_stats)
    namespaces = index_stats["namespaces"]
    # for namespace in namespaces:
    #     index.delete(delete_all=True, namespace=namespace)

[32m2023-08-30 16:28:04.707[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m13[0m - [1mRetrieving secret dev/tai_service/pinecone_db/api_key[0m
[32m2023-08-30 16:28:04.775[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m28[0m - [1mdev/tai_service/pinecone_db/api_key[0m


['keyword-search-demo', 'tai-index']
IndexDescription(name='keyword-search-demo', metric='dotproduct', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='s1.x1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 19}},
 'total_vector_count': 19}
IndexDescription(name='tai-index', metric='dotproduct', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='s1.x1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'5d2f1056-c806-4bc0-ba8d-b9bfa68c48a3': {'vector_count': 141},
                '7e0c3c65-4f66-49dc-9103-257b85ffb733': {'vector_count': 19},
                '83179c1d-8aa0-4d56-9b09-256e81a7d951': {'vector_count': 19},
                '83179c1d-8aa0-4d56-9b09-256e81a7d95d': {'vector_count': 2}},
 'total_vector_count': 181}


In [10]:
# Print every chunk document stored in class_resources.class_resource_chunk
# for the class id below.
class_id = '2c56f557-8aba-49d7-89ba-f296b41e87b7'
db = client["class_resources"]
col = db["class_resource_chunk"]
# Read the whole cursor into a list before iterating, so the (disabled)
# per-document delete below would not run against a live cursor.
class_chunks = list(col.find({"class_id": class_id}))
for doc in class_chunks:
    print(doc)
    # col.delete_one({"_id": doc["_id"]})
# ids = ['a5975bb7-1778-49fe-aa52-bfed14b8ca80', 'da2104ad-b456-480a-a1b3-5eeca40076aa', '7d9d4d68-e4f9-4efb-ad90-4669690895ca', 'ed3930ce-3fb5-429e-baa7-63aca1be4636', '7b118407-20e6-4057-a984-79e14340e34c', '65a5b7b9-a162-4017-aecf-cb932905147b']
# for doc in list(col.find({"_id": {"$in": ids}})):
#     print(doc)
# col = db.class_resource
# for doc in list(col.find({"class_id": class_id})):
#     print(doc)
    # col.delete_one({"_id": doc["_id"]})

{'_id': 'ddc0957a-9b18-4573-b272-d7ff01223533', 'chunk': '2898�5305 · $42.67 paid on August 7, 2023\nPage 1 of 2\nReceipt\nInvoice number\n4206D36B�0007\nReceipt number\n2898�5305\nDate paid\nAugust 7, 2023\nPayment method American Express - 9783\nOpenAI, LLC\n548 Market Street\nPMB 97273\nSan Francisco, California 94104�5401\nUnited States\nar@openai.com\nBill to\nJacob Petterle\n3197 W 2450 N\nLehi, Utah 84043\nUnited States\njacobpetterle@gmail.com\nShip to\nJacob Petterle\n3197 W 2450 N\nLehi, Utah 84043\nUnited States\n$42.67 paid on August 7, 2023\nDescription\nQty', 'class_id': '2c56f557-8aba-49d7-89ba-f296b41e87a3', 'create_timestamp': datetime.datetime(2023, 9, 1, 8, 6, 47, 861000), 'full_resource_url': 'https://tai-service-class-resource-cold-store-dev.s3.amazonaws.com/class_id%3D2c56f557-8aba-49d7-89ba-f296b41e87a3%2Fdocument_hash%3Dbef93d15975061d5f1f4b4f175605010a679f793%2FReceipt-2898-5305_page_1.pdf', 'metadata': {'title': 'Big pdf', 'description': 'This is a dummy pdf f

In [61]:
from langchain.document_loaders import UnstructuredHTMLLoader

In [3]:
from pydantic import Field, BaseSettings

# Minimal settings model: one string field `a` that defaults to "a" and is
# declared with the environment variable name A_B.
class Test(BaseSettings):
    a: str = Field(
        env="A_B",
        description="a",
        default="a",
    )


# Show the settings as a dict, requesting alias names as keys.
test_settings = Test()
print(test_settings.dict(by_alias=True))

{'a': 'a'}
