In [1]:
from typing import Dict
from loguru import logger
import json
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config as BotoConfig
import pymongo
from urllib.parse import quote_plus
from typing import Dict


def get_secret(secret_name: str, region_name: str = "us-east-1") -> Dict[str, str]:
    """Fetch a secret from AWS Secrets Manager and decode it as JSON.

    Args:
        secret_name: Identifier passed as ``SecretId`` to Secrets Manager.
        region_name: AWS region of the Secrets Manager endpoint. Defaults to
            "us-east-1", the region this function previously hard-coded.

    Returns:
        The ``SecretString`` payload of the secret parsed with ``json.loads``.

    Raises:
        ClientError: If the Secrets Manager request fails. The error is logged
            and re-raised unchanged.
        KeyError: If the response carries no ``SecretString`` field.
        json.JSONDecodeError: If the secret string is not valid JSON.
    """
    logger.info(f"Retrieving secret {secret_name}")
    session = boto3.session.Session()
    # Bound the connect time and retry count so an unreachable endpoint
    # fails in a predictable amount of time.
    boto_config = BotoConfig(
        connect_timeout=10,
        retries={
            "max_attempts": 3,
            "mode": "standard",
        },
    )
    client = session.client(
        service_name='secretsmanager',
        config=boto_config,
        region_name=region_name,
    )
    try:
        get_secret_value_response = client.get_secret_value(
            SecretId=secret_name
        )
    except ClientError as e:
        # Record which secret failed before propagating; a bare `raise`
        # keeps the original traceback intact.
        logger.error(f"Failed to retrieve secret {secret_name}: {e}")
        raise
    if "SecretString" not in get_secret_value_response:
        # Same exception type the plain dict lookup would raise, but with a
        # message that says what is actually wrong.
        raise KeyError(
            f"Secret {secret_name} has no 'SecretString' field in the response"
        )
    return json.loads(get_secret_value_response["SecretString"])

# Settings for this exploration, gathered in one place so they are easy to change.
SECRET_NAME = "dev/tai_service/document_DB/read_ONLY_user_password"
DOCDB_HOST = "tai-service-645860363137.us-east-1.docdb-elastic.amazonaws.com"
DOCDB_PORT = 27017
DATABASE_NAME = 'class_resources'
BYTES_PER_GB = 1024 ** 3  # sizes are divided by 1024 three times; label stays "GB"

credentials = get_secret(SECRET_NAME)
# Percent-escape only the user name and password: they may contain characters
# that are reserved in a URI. The URI as a whole must NOT be escaped.
user_name = quote_plus(credentials["username"])
password = quote_plus(credentials["password"])
db_uri = (
    f"mongodb://{user_name}:{password}@{DOCDB_HOST}:{DOCDB_PORT}"
    "/?tls=true&retryWrites=false"
)

# Create the client. The URI only enables TLS and disables retryable writes;
# it sets neither a replica set name nor a read preference.
# NOTE(review): the recorded output of this cell is a ServerSelectionTimeoutError,
# presumably because the cluster is not reachable from where the notebook ran —
# confirm network access before re-running.
client = pymongo.MongoClient(db_uri)
# List every database visible to this user.
print(client.list_database_names())

db = client[DATABASE_NAME]

# Fetch the collection names once and reuse the result for printing.
collection_list = db.list_collection_names()
print(collection_list)

document_counts = {}
indexes = {}
index_sizes = {}
sum_of_indexes = {}
size_of_objects = {}
# Collections to gather statistics for (nothing is modified or dropped here).
collection_names = ['class_resource_chunk', 'class_resource']
for collection_name in collection_names:
    col = db[collection_name]
    document_counts[collection_name] = col.estimated_document_count()
    indexes[collection_name] = col.index_information()
    # One collStats round trip per collection; both the index sizes and the
    # average object size come from the same snapshot.
    coll_stats = db.command('collStats', collection_name)
    index_size = coll_stats['indexSizes']
    # Per-index size converted from bytes to GB.
    index_sizes[collection_name] = {
        k: str(v / BYTES_PER_GB) + " GB" for k, v in index_size.items()
    }
    # Total size of all indexes on the collection, in GB.
    sum_of_indexes[collection_name] = str(sum(index_size.values()) / BYTES_PER_GB) + " GB"
    # Average object size as reported by collStats.
    size_of_objects[collection_name] = coll_stats['avgObjSize']

print(f"Indexes: {indexes}")
print(f"Estimated document counts: {document_counts}")
print(f"Index sizes: {index_sizes}")
print(f"Sum of indexes: {sum_of_indexes}")
print(f"Size of objects: {size_of_objects}")


[32m2023-07-09 13:44:43.885[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m13[0m - [1mRetrieving secret dev/tai_service/document_DB/read_ONLY_user_password[0m
[32m2023-07-09 13:44:43.955[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m28[0m - [1mdev/tai_service/document_DB/read_ONLY_user_password[0m


ServerSelectionTimeoutError: No servers found yet, Timeout: 30s, Topology Description: <TopologyDescription id: 64aab9cc5a41b9e93ec89152, topology_type: Unknown, servers: [<ServerDescription ('tai-service-645860363137.us-east-1.docdb-elastic.amazonaws.com', 27017) server_type: Unknown, rtt: None>]>

In [23]:
# Id of the class resource to inspect.
RESOURCE_ID = "d56f532a-e9c9-4839-9264-5857197ca501"

resource_collection = db.class_resource
doc = resource_collection.find_one({"_id": RESOURCE_ID})
if doc is None:
    # find_one returns None when nothing matches; fail with a clear message
    # instead of a TypeError on the subscript below.
    raise LookupError(f"No class resource found with _id {RESOURCE_ID}")
print(f"Class resource: {doc}")

# The field may hold None rather than a list (other id-list fields on this
# document are None); treat that as "no chunks" so the $in query stays valid.
chunk_doc_ids = doc["class_resource_chunk_ids"] or []
chunk_collection = db.class_resource_chunk
# Note: $in matches by membership; results are not returned in the order of
# chunk_doc_ids.
chunk_docs = chunk_collection.find({"_id": {"$in": chunk_doc_ids}})
print("Class resource chunks:")
for chunk_doc in chunk_docs:
    print("\n\n")
    print(chunk_doc['chunk'])

Class resource: {'_id': 'd56f532a-e9c9-4839-9264-5857197ca501', 'child_resource_ids': None, 'class_id': '63558dc3-5a05-4435-bed1-1651f3d04d12', 'class_resource_chunk_ids': ['95106cbd-be35-46fb-95d3-3046f6a65e15', 'a5e76845-04b6-4de3-9276-ccb9b18c30b2', '95b42f9a-b3cd-49b2-b2a7-7eb43183f781', '0404863a-6bbb-42f4-a51f-4b5cca323ee4'], 'create_timestamp': datetime.datetime(2023, 7, 6, 16, 46, 50, 180000), 'full_resource_url': 'https://tai-class-resource-queue.s3.amazonaws.com/Jacob+Petterle+-+Resume+(1).pdf', 'id': 'd56f532a-e9c9-4839-9264-5857197ca501', 'metadata': {'title': 'dummy.pdf', 'description': 'This is a dummy pdf file.', 'tags': ['dummy', 'pdf'], 'resource_type': 'textbook', 'total_page_count': None}, 'modified_timestamp': datetime.datetime(2023, 7, 6, 16, 46, 50, 456000), 'parent_resource_ids': None, 'preview_image_url': 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf', 'status': 'completed'}
Class resource chunks:



JACOB PETTERLE
720-271-5053 | jacob