In [4]:
from typing import Dict, Union
from loguru import logger
import json
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config as BotoConfig
import pymongo
from urllib.parse import quote_plus
from typing import Dict


def get_secret(secret_name: str, region_name: str = "us-east-1") -> Union[Dict[str, str], str]:
    """Fetch a secret from AWS Secrets Manager, decoding it when it is JSON.

    Args:
        secret_name: Identifier passed as ``SecretId`` to ``get_secret_value``.
        region_name: Region used to build the Secrets Manager client. Defaults
            to ``"us-east-1"``, the value that used to be hard-coded.

    Returns:
        The decoded value when ``SecretString`` holds a JSON object or a JSON
        string; otherwise the raw ``SecretString`` exactly as stored.

    Raises:
        botocore.exceptions.ClientError: Propagated unchanged from the API call.
        KeyError: If the response contains no ``SecretString`` field.
    """
    logger.info(f"Retrieving secret {secret_name}")
    boto_config = BotoConfig(
        connect_timeout=10,
        retries={"max_attempts": 3, "mode": "standard"},
    )
    client = boto3.session.Session().client(
        service_name="secretsmanager",
        config=boto_config,
        region_name=region_name,
    )
    # No try/except here: catching ClientError only to re-raise it added nothing.
    response = client.get_secret_value(SecretId=secret_name)
    secret_string = response["SecretString"]
    try:
        parsed = json.loads(secret_string)
    except json.JSONDecodeError:
        return secret_string
    # A plain secret such as "12345" or "true" is also valid JSON; returning the
    # decoded int/bool would violate the declared return type, so only JSON
    # objects and JSON strings are returned in decoded form.
    if isinstance(parsed, (dict, str)):
        return parsed
    return secret_string

# Fetch the DocumentDB credentials; the code below needs a mapping with
# "username" and "password" keys.
credentials = get_secret("dev/tai_service/document_DB/read_ONLY_user_password")
if not isinstance(credentials, dict):
    # get_secret may also return a plain string; fail with a clear message
    # instead of an opaque "string indices must be integers" error.
    raise TypeError("DocumentDB secret must be a JSON object with 'username' and 'password' keys")

# Percent-encode each credential on its own so reserved characters cannot
# corrupt the URI (the URI as a whole must NOT be quoted).
user_name = quote_plus(credentials["username"])
password = quote_plus(credentials["password"])
db_uri = f"mongodb://{user_name}:{password}@tai-service-645860363137.us-east-1.docdb-elastic.amazonaws.com:27017/?tls=true&retryWrites=false"

# Create a MongoDB client for the DocumentDB endpoint. The URI enables TLS and
# disables retryable writes; it sets no replica-set or read-preference option.
client = pymongo.MongoClient(db_uri)
# Print all databases.
print(client.list_database_names())

# Select the database to inspect.
db = client.class_resources
collection_list = db.list_collection_names()
print(collection_list)

# Per-collection statistics, keyed by collection name.
document_counts = {}
indexes = {}
index_sizes = {}
sum_of_indexes = {}
size_of_objects = {}

BYTES_PER_GB = 1024 ** 3

for collection_name in collection_list:
    col = db[collection_name]
    document_counts[collection_name] = col.estimated_document_count()
    indexes[collection_name] = col.index_information()
    # Run collStats once per collection and reuse the result for every metric.
    coll_stats = db.command('collStats', collection_name)
    index_size = coll_stats['indexSizes']
    # Convert each index size to GB.
    index_sizes[collection_name] = {k: str(v / BYTES_PER_GB) + " GB" for k, v in index_size.items()}
    # Total size of all indexes on the collection, in GB.
    sum_of_indexes[collection_name] = str(sum(index_size.values()) / BYTES_PER_GB) + " GB"
    # Average object size as reported by collStats.
    size_of_objects[collection_name] = coll_stats['avgObjSize']

print(f"Indexes: {indexes}")
print(f"Estimated document counts: {document_counts}")
print(f"Index sizes: {index_sizes}")
print(f"Sum of indexes: {sum_of_indexes}")
print(f"Size of objects: {size_of_objects}")


[32m2023-07-19 04:03:59.736[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m13[0m - [1mRetrieving secret dev/tai_service/document_DB/read_ONLY_user_password[0m
[32m2023-07-19 04:03:59.795[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m28[0m - [1mdev/tai_service/document_DB/read_ONLY_user_password[0m


['class_resources']
['class_resource_chunk', 'class_resource']
Indexes: {'class_resource_chunk': {'class_id_1': {'v': 4, 'key': [('class_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, 'resource_id_1': {'v': 4, 'key': [('resource_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, 'chunk_id_1': {'v': 4, 'key': [('chunk_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, '_id_': {'v': 4, 'key': [('_id', 1)], 'ns': 'class_resources.class_resource_chunk'}}, 'class_resource': {'class_id_1': {'v': 4, 'key': [('class_id', 1)], 'ns': 'class_resources.class_resource'}, 'resource_id_1': {'v': 4, 'key': [('resource_id', 1)], 'ns': 'class_resources.class_resource'}, '_id_': {'v': 4, 'key': [('_id', 1)], 'ns': 'class_resources.class_resource'}}}
Estimated document counts: {'class_resource_chunk': 238, 'class_resource': 24}
Index sizes: {'class_resource_chunk': {'resource_id_1': '0.0009765625 GB', 'chunk_id_1': '0.0009765625 GB', '_id_': '0.0009765625 GB', 'class_id_1': '0.000976

In [9]:
# Look up one class_resource document for a specific class and print it.
class_id_filter = {"class_id": "6ddb10a6-81bf-412b-9bdd-0c8070cf5a20"}
col = db["class_resource"]
matching_resource = col.find_one(class_id_filter)
print(matching_resource)

{'_id': 'ac56173f-d54a-41a8-8224-8d5ec35f091c', 'child_resource_ids': None, 'class_id': '6ddb10a6-81bf-412b-9bdd-0c8070cf5a20', 'class_resource_chunk_ids': ['c7c21987-2393-45aa-955a-539cdb3d410c', '9f7bbf7c-5cab-4ce7-a384-3bc81784cd12', '701bd844-15ff-4543-9711-5a9bd64a78d4', 'c6618ae5-1a04-42f0-b4e6-41e18da93f47', 'a7881412-39de-4eda-ab83-46b05384a41f', '72cdb97b-932b-495f-9d4f-439375151f2b', '7980d452-d8b8-46a5-8673-4aca2c3da522', 'af374dcf-9b2c-4560-9c65-9dfd07017a0a', '52b39ea7-86d9-402f-bded-1e8ecc9d2a7b', '38bda1ad-4619-456a-87b3-e3670fe1f1a0', 'c6157741-dd4c-4bfd-9442-8ae9a7a7c14a', '8d5ea600-fd88-4d2b-86e5-bc9cb81d5f0d', '8debed69-1286-4858-be2b-a42125a565a9', 'e2880f1d-2b10-4eb0-97db-c135f6607885', '0d383433-2134-4925-85c9-6d4306088652', '7b7f8a39-8de2-43d7-b93a-6f0f0185f413'], 'create_timestamp': datetime.datetime(2023, 7, 18, 17, 33, 24, 647000), 'full_resource_url': 'https://tai-service-class-resource-cold-store-dev.s3.amazonaws.com/6ddb10a6-81bf-412b-9bdd-0c8070cf5a20%2Fac

In [2]:

# Print every document in class_resource, then every document in
# class_resource_chunk, each preceded by a heading line.
for heading, collection_name in (
    ("class_resources", "class_resource"),
    ("class_resource_chunks", "class_resource_chunk"),
):
    col = db[collection_name]
    print(heading)
    for document in col.find():
        print(document)
    # Destructive cleanup, intentionally left disabled:
    # col.delete_many({})

# TODO: the status update for the document with
# id = b9ba33d3-e4a0-4333-bae6-126f89a74e0d was never written; this cell only
# leaves `col` pointing at class_resource.
col = db["class_resource"]


class_resources
{'_id': '5f8825c9-1698-4239-a31f-3edcbf18c809', 'child_resource_ids': None, 'class_id': '6fab04ec-f3a8-4ca6-a4bf-ac8371496bd4', 'class_resource_chunk_ids': ['90db9692-b767-4704-8974-bfb025d20a32', '142a3405-5dbe-44ea-96d7-8bd92f8689bb', '1086a891-5380-4c08-8add-50d295c2e169', '896de855-8a19-4544-a37b-934df0a6cf6c', '939e5733-5685-43b6-92ef-7159bb7c5e0c', '553317ba-3c31-4793-b9ae-32ba2558977c', '81fe3ab1-f2ac-4788-90e2-5c0c60f175fc', 'e0657359-13f3-44ec-9226-6e3387eb3d99', '74732aac-b456-4dc1-91f9-95ce3882d58f', '1afa19c8-288a-47da-a60d-d4de9aecdcb2', '8007a2a6-3d8d-4444-a0fd-2677f224d892', '652c77ee-3382-4acb-936e-17fddbcac997', 'da53c54e-74c2-4ac0-afe8-2e29dced99bf', 'e800afd4-49e1-49ba-85fc-0179448fdcb2', '76060fee-94a0-4dbb-8c75-87181f7411aa', '3590ad8d-4816-4170-b903-d1518b3fc0c9'], 'create_timestamp': datetime.datetime(2023, 7, 18, 5, 11, 15, 536000), 'full_resource_url': 'https://tai-service-class-resource-cold-store-dev.s3.amazonaws.com/6fab04ec-f3a8-4ca6-a4bf-ac

In [3]:
import pinecone

# Fetch the Pinecone API key (presumably stored as a plain string -- confirm
# against the secret's actual format).
api_key = get_secret("dev/tai_service/pinecone_db/api_key")

pinecone.init(api_key=api_key, environment="us-east-1-aws")
indexes = pinecone.list_indexes()
print(indexes)
for index_name in indexes:
    print(pinecone.describe_index(index_name))
    # Keep the name string and the Index handle in separate variables instead
    # of rebinding the loop variable to a different type mid-iteration.
    index: pinecone.Index = pinecone.Index(index_name)
    # Fetch the stats once and reuse them for both the printout and namespaces.
    index_stats = index.describe_index_stats()
    print(index_stats)
    namespaces = index_stats["namespaces"]
    # Destructive cleanup, intentionally left disabled:
    # for namespace in namespaces:
    #     index.delete(delete_all=True, namespace=namespace)

  from tqdm.autonotebook import tqdm
[32m2023-07-19 03:44:19.937[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m13[0m - [1mRetrieving secret dev/tai_service/pinecone_db/api_key[0m
[32m2023-07-19 03:44:20.011[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m28[0m - [1mdev/tai_service/pinecone_db/api_key[0m


['tai-index']
IndexDescription(name='tai-index', metric='dotproduct', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='s1.x1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')
{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'03919e9f-dd4c-4b7b-a007-d545f7411569': {'vector_count': 16},
                '112022d9-c365-4c37-a392-e76c80227e05': {'vector_count': 1},
                '324dbec1-3cfd-4fc2-a02f-0334535c0ddd': {'vector_count': 16},
                '5bae5c1b-526c-4e8f-a39e-f139f23b0c91': {'vector_count': 1},
                '5e81fdd0-160a-4b12-ae57-ad7e04b8c50e': {'vector_count': 16},
                '6ddb10a6-81bf-412b-9bdd-0c8070cf5a20': {'vector_count': 16},
                '6fab04ec-f3a8-4ca6-a4bf-ac8371496bd4': {'vector_count': 55},
                '7e3727cb-bf48-48e4-a239-5ce6674e6473': {'vector_count': 1},
                '8fab378e-db8e-4ef5-a75e-80916ff4a2cf': {'vector_count': 1},
                '95983f3a-e2d4-402e-