In [1]:
from typing import Dict, Union
from loguru import logger
import json
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config as BotoConfig
import pymongo
from urllib.parse import quote_plus
from typing import Dict


def get_secret(secret_name: str, region_name: str = "us-east-1") -> Union[Dict[str, str], str]:
    """Fetch a secret from AWS Secrets Manager.

    Args:
        secret_name: Name (or ARN) of the secret, passed as ``SecretId``.
        region_name: AWS region of the Secrets Manager endpoint. Defaults to
            ``"us-east-1"``, the value that used to be hard-coded here.

    Returns:
        The decoded JSON object (``dict``) when the secret string is a JSON
        object, the decoded string when it is a JSON string literal, and the
        raw secret string in every other case.

    Raises:
        botocore.exceptions.ClientError: Propagated unchanged from
            ``get_secret_value``.
        KeyError: If the response carries no ``SecretString`` (for example a
            binary-only secret).
    """
    logger.info(f"Retrieving secret {secret_name}")
    session = boto3.session.Session()
    boto_config = BotoConfig(
        connect_timeout=10,
        retries={
            "max_attempts": 3,
            "mode": "standard",
        },
    )
    client = session.client(
        service_name='secretsmanager',
        config=boto_config,
        region_name=region_name,
    )
    # No try/except here: catching ClientError only to re-raise it added nothing.
    response = client.get_secret_value(SecretId=secret_name)
    if "SecretString" not in response:
        raise KeyError(
            f"Secret {secret_name!r} has no 'SecretString' in the response; "
            "binary secrets are not supported by get_secret"
        )
    secret_string = response["SecretString"]
    try:
        parsed = json.loads(secret_string)
    except json.JSONDecodeError:
        return secret_string
    # json.loads also accepts bare numbers, booleans, null and arrays. Returning
    # those would break the declared return type (e.g. an all-digit password
    # would come back as an int), so fall back to the raw string instead.
    if isinstance(parsed, (dict, str)):
        return parsed
    return secret_string

# Secret holding the DocumentDB read/write user for the TAI service.
# Frontend server alternative:
# secret_name = "dev/tai_backend/mongodb/user"
secret_name = "dev/tai_service/document_DB/read_write_user_password"
credentials = get_secret(secret_name)
# get_secret returns a plain string when the secret is not a JSON object.
# Fail with a clear message instead of an opaque "string indices" TypeError.
if not isinstance(credentials, dict):
    raise TypeError(
        f"Secret {secret_name!r} must be a JSON object with 'username' and "
        f"'password' keys, got {type(credentials).__name__}"
    )
# Percent-escape the credentials so reserved characters (":", "@", "/", ...)
# cannot corrupt the connection URI.
user_name = quote_plus(credentials["username"])
password = quote_plus(credentials["password"])
# Frontend server URI (credentials redacted; never keep real ones in the
# notebook, not even in a comment):
# db_uri = "mongodb+srv://<user>:<password>@cluster0.cx5zd.mongodb.net/?retryWrites=true&w=majority"
# TAI service URI:
db_uri = f"mongodb://{user_name}:{password}@tai-service-645860363137.us-east-1.docdb-elastic.amazonaws.com:27017/?tls=true&retryWrites=false"
# Only the username and password are escaped above. Do not quote_plus the whole
# URI: that would also escape the scheme and host separators.

# Create a MongoDB client for the Amazon DocumentDB cluster. The URI enables TLS
# and disables retryable writes; it does not set a read preference.
client = pymongo.MongoClient(db_uri)
# Print the names of all databases visible to this user.
print(client.list_database_names())

# Database to inspect.
db = client.class_resources
# db = client.TAI_frontend
collection_list = db.list_collection_names()
print(collection_list)

# Divisor used to report byte counts in GB (1024 ** 3 bytes).
BYTES_PER_GB = 1024 ** 3

# Per-collection results, keyed by collection name.
document_counts = {}
indexes = {}
index_sizes = {}
sum_of_indexes = {}
size_of_objects = {}

for collection_name in collection_list:
    col = db[collection_name]
    document_counts[collection_name] = col.estimated_document_count()
    indexes[collection_name] = col.index_information()
    # One collStats call per collection feeds every size figure below.
    coll_stats = db.command('collStats', collection_name)
    index_size = coll_stats['indexSizes']
    # Size of each individual index, converted to GB.
    index_sizes[collection_name] = {
        k: str(v / BYTES_PER_GB) + " GB" for k, v in index_size.items()
    }
    # Total size of all indexes on the collection, in GB.
    sum_of_indexes[collection_name] = str(sum(index_size.values()) / BYTES_PER_GB) + " GB"
    # Average object size as reported by collStats. This dict used to be printed
    # without ever being filled. .get() yields None if the server omits the field.
    size_of_objects[collection_name] = coll_stats.get('avgObjSize')

print(f"Indexes: {indexes}")
print(f"Estimated document counts: {document_counts}")
print(f"Index sizes: {index_sizes}")
print(f"Sum of indexes: {sum_of_indexes}")
print(f"Size of objects: {size_of_objects}")


[32m2023-09-04 22:50:02.493[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m13[0m - [1mRetrieving secret dev/tai_service/document_DB/read_write_user_password[0m
[32m2023-09-04 22:50:02.536[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m28[0m - [1mdev/tai_service/document_DB/read_write_user_password[0m


['class_resources']
['class_resource_chunk', 'class_resource']
Indexes: {'class_resource_chunk': {'class_id_1': {'v': 4, 'key': [('class_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, 'resource_id_1': {'v': 4, 'key': [('resource_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, 'chunk_id_1': {'v': 4, 'key': [('chunk_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, '_id_': {'v': 4, 'key': [('_id', 1)], 'ns': 'class_resources.class_resource_chunk'}}, 'class_resource': {'class_id_1': {'v': 4, 'key': [('class_id', 1)], 'ns': 'class_resources.class_resource'}, 'resource_id_1': {'v': 4, 'key': [('resource_id', 1)], 'ns': 'class_resources.class_resource'}, '_id_': {'v': 4, 'key': [('_id', 1)], 'ns': 'class_resources.class_resource'}}}
Estimated document counts: {'class_resource_chunk': 3271, 'class_resource': 199}
Index sizes: {'class_resource_chunk': {'resource_id_1': '0.00193023681640625 GB', 'chunk_id_1': '0.001556396484375 GB', 'class_id_1': '0.00296783447265625 G

In [2]:

# Print the name of every collection in `db`, followed by each document it holds.
collection_list = db.list_collection_names()
for col_name in collection_list:
    print(col_name)
    col = db[col_name]
    documents = col.find()
    for document in documents:
        print(document)
    # Destructive cleanup, intentionally left disabled:
    # if col_name == "sessions":
    # col.delete_many({})




class_resource_chunk
{'_id': '3f5ca37a-c60e-4517-a15c-3bf1d10f2eca', 'chunk': "while I walk away while Craig continues making\xa0\xa0 the world's greatest swing arm you want to\xa0\nwatch the master over here I finished my part\xa0\xa0 early and started trying to solve the other\xa0\nproblem that we have which is how do we cut\xa0\xa0 these tires in half and have them still hold\xa0\ntheir shape so I first cut some tires in half\xa0\xa0 it's tougher than I thought because they\xa0\nwouldn't this rubber would not slide easily there's someone out there saying", 'class_id': 'bba5546a-9cd1-4113-bda9-1effe6c7ac21', 'create_timestamp': datetime.datetime(2023, 8, 31, 5, 59, 5, 793000), 'full_resource_url': 'https://www.youtube.com/watch?v=-OWL0OEuqrY', 'metadata': {'title': 'dummy.pdf', 'description': 'This is a dummy pdf file.', 'tags': ['dummy', 'pdf'], 'resource_type': 'textbook', 'total_page_count': None, 'class_id': 'bba5546a-9cd1-4113-bda9-1effe6c7ac21', 'page_number': None, 'vector_id'

In [3]:
import pinecone

api_key = get_secret("dev/tai_service/pinecone_db/api_key")

pinecone.init(api_key=api_key, environment="us-east-1-aws")
# Names of all Pinecone indexes visible to this API key.
indexes = pinecone.list_indexes()
print(indexes)
for index_name in indexes:
    print(pinecone.describe_index(index_name))
    # Keep the name and the Index handle in separate variables; the loop
    # variable used to be rebound from the name to the handle.
    index: pinecone.Index = pinecone.Index(index_name)
    # Fetch the stats once and reuse them. The original called
    # describe_index_stats() twice, so the printed stats and the namespaces
    # read below came from two separate requests.
    index_stats = index.describe_index_stats()
    print(index_stats)
    namespaces = index_stats["namespaces"]
    # Destructive cleanup, intentionally left disabled:
    # for namespace in namespaces:
    #     index.delete(delete_all=True, namespace=namespace)

  from tqdm.autonotebook import tqdm
[32m2023-09-02 22:46:59.280[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m13[0m - [1mRetrieving secret dev/tai_service/pinecone_db/api_key[0m
[32m2023-09-02 22:46:59.365[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m28[0m - [1mdev/tai_service/pinecone_db/api_key[0m


['keyword-search-demo', 'tai-index']
IndexDescription(name='keyword-search-demo', metric='dotproduct', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='s1.x1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 19}},
 'total_vector_count': 19}
IndexDescription(name='tai-index', metric='dotproduct', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='s1.x1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'03c24c2a-3b3b-4301-b725-a4c82c117b6e': {'vector_count': 135},
                '04541cb0-792a-423a-9457-06d3ac8694dc': {'vector_count': 35},
                '0633f42d-ebf4-412a-a767-5d2beb43a95f': {'vector_count': 2},
                '11f9358b-741f-45ba-abb9-45314d79ec84': {'vector_count': 2},
                '215c6a90-5a22-435e-9abd-890ef141242c': {'vector_cou

In [2]:
# Look up documents in the class_resource collection of the class_resources
# database whose `_id` equals the value below, and print each match.
# NOTE(review): the variable is named `class_id`, but it is matched against `_id`
# here, and the ID in the previous header comment (a72cceb5-...) did not match
# this value. Confirm which identifier is intended.
class_id = '4dc7704e-c9c3-4c9a-9131-c78a668abbc5'
db = client.class_resources
col = db.class_resource
matching_docs = list(col.find({"_id": class_id}))
for doc in matching_docs:
    print(doc)
    # col.delete_one({"_id": doc["_id"]})
# Alternative queries, left disabled:
# ids = ['a5975bb7-1778-49fe-aa52-bfed14b8ca80', 'da2104ad-b456-480a-a1b3-5eeca40076aa', '7d9d4d68-e4f9-4efb-ad90-4669690895ca', 'ed3930ce-3fb5-429e-baa7-63aca1be4636', '7b118407-20e6-4057-a984-79e14340e34c', '65a5b7b9-a162-4017-aecf-cb932905147b']
# for doc in list(col.find({"_id": {"$in": ids}})):
#     print(doc)
# col = db.class_resource
# for doc in list(col.find({"class_id": class_id})):
#     print(doc)
    # col.delete_one({"_id": doc["_id"]})

{'_id': '4dc7704e-c9c3-4c9a-9131-c78a668abbc5', 'child_resource_ids': [], 'class_id': '6c387fc6-defd-4264-ab31-deb009445876', 'class_resource_chunk_ids': ['4f9fde48-196d-4c9e-84d3-d2327fff1156', '6b5fddac-1e13-497d-81a5-e88b2692417c', '4eab5f4e-813a-4ad2-b751-bb4b5940832f', '38b1634c-bcfb-4564-b686-43bde8fe9d96'], 'create_timestamp': datetime.datetime(2023, 9, 4, 22, 47, 21, 750000), 'data_pointer': 'https://tai-service-class-resource-cold-store-dev.s3.amazonaws.com/class_id%3D6c387fc6-defd-4264-ab31-deb009445876%2Fdocument_hash%3Df9da5cf2cb433d1a499b612f3a88d2033b96b5cb%2FHomework%2B10_page_1.pdf', 'full_resource_url': 'https://tai-service-class-resource-cold-store-dev.s3.amazonaws.com/class_id%3D6c387fc6-defd-4264-ab31-deb009445876%2Fdocument_hash%3Df9da5cf2cb433d1a499b612f3a88d2033b96b5cb%2FHomework%2B10_page_1.pdf', 'hashed_document_contents': '87dc866d62d607bdc02b750455bd1cf04aa69cdf', 'metadata': {'title': 'dummy.pdf', 'description': 'This is a dummy pdf file.', 'tags': ['dummy',

In [61]:
from langchain.document_loaders import UnstructuredHTMLLoader

In [3]:
from pydantic import Field, BaseSettings

# Scratch settings model: a single string field `a` whose default is "a" and
# whose Field declares the environment variable name "A_B".
class Test(BaseSettings):
    a: str = Field("a", description="a", env="A_B")

# Show the model as a dict with by_alias=True.
print(Test().dict(by_alias=True))

{'a': 'a'}
