In [5]:
from typing import Dict
from loguru import logger
import json
import boto3
from botocore.exceptions import ClientError
from botocore.config import Config as BotoConfig
import pymongo
from urllib.parse import quote_plus
from typing import Dict


def get_secret(secret_name: str, region_name: str = "us-east-1") -> Dict[str, str]:
    """Fetch a secret from AWS Secrets Manager and parse its JSON payload.

    Args:
        secret_name: Identifier passed as ``SecretId`` to ``get_secret_value``.
        region_name: AWS region used for the Secrets Manager client. Defaults
            to ``"us-east-1"``, the value that used to be hard-coded here.

    Returns:
        The response's ``SecretString`` decoded with ``json.loads``.

    Raises:
        ClientError: Propagated unchanged from ``get_secret_value``.
        KeyError: If the response has no ``SecretString`` entry.
        json.JSONDecodeError: If ``SecretString`` is not valid JSON.
    """
    logger.info(f"Retrieving secret {secret_name}")
    # Bound the connect time and let botocore retry a few times before failing.
    boto_config = BotoConfig(
        connect_timeout=10,
        retries={
            "max_attempts": 3,
            "mode": "standard",
        },
    )
    client = boto3.session.Session().client(
        service_name="secretsmanager",
        config=boto_config,
        region_name=region_name,
    )
    # No try/except here: the previous handler only re-raised ClientError
    # unchanged, so letting it propagate is equivalent.
    response = client.get_secret_value(SecretId=secret_name)
    return json.loads(response["SecretString"])

# Fetch the read-only DocumentDB credentials and percent-encode them so that
# reserved characters in the user name or password cannot break the URI.
credentials = get_secret("dev/tai_service/document_DB/read_ONLY_user_password")
user_name = quote_plus(credentials["username"])
password = quote_plus(credentials["password"])
db_uri = f"mongodb://{user_name}:{password}@tai-service-645860363137.us-east-1.docdb-elastic.amazonaws.com:27017/?tls=true&retryWrites=false"

# Create a MongoDB client for Amazon DocumentDB. The URI enables TLS and
# disables retryable writes; it sets no replica set or read preference.
client = pymongo.MongoClient(db_uri)
# Print all database names visible to this user.
print(client.list_database_names())

# Database inspected in the rest of the notebook.
db = client['class_resources']

# List the collections once and reuse the result for display.
collection_list = db.list_collection_names()
print(collection_list)


def _bytes_to_gb_label(num_bytes):
    """Format a byte count as a gibibyte figure followed by ' GB'."""
    return str(num_bytes / 1024 / 1024 / 1024) + " GB"


document_counts = {}
indexes = {}
index_sizes = {}
sum_of_indexes = {}
size_of_objects = {}
# Collections to gather statistics for.
collection_names = ['class_resource_chunk', 'class_resource']
for collection_name in collection_names:
    col = db[collection_name]
    document_counts[collection_name] = col.estimated_document_count()
    indexes[collection_name] = col.index_information()
    # Run collStats once per collection and reuse the result for both the
    # index sizes and the average object size.
    coll_stats = db.command('collStats', collection_name)
    index_size = coll_stats['indexSizes']
    # Per-index size converted to GB.
    index_sizes[collection_name] = {
        index_name: _bytes_to_gb_label(size) for index_name, size in index_size.items()
    }
    # Total size of all indexes on the collection.
    sum_of_indexes[collection_name] = _bytes_to_gb_label(sum(index_size.values()))
    # Average object size as reported by collStats.
    size_of_objects[collection_name] = coll_stats['avgObjSize']

print(f"Indexes: {indexes}")
print(f"Estimated document counts: {document_counts}")
print(f"Index sizes: {index_sizes}")
print(f"Sum of indexes: {sum_of_indexes}")
print(f"Size of objects: {size_of_objects}")


[32m2023-07-06 15:02:37.048[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m13[0m - [1mRetrieving secret dev/tai_service/document_DB/read_ONLY_user_password[0m
[32m2023-07-06 15:02:37.110[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_secret[0m:[36m28[0m - [1mdev/tai_service/document_DB/read_ONLY_user_password[0m


['class_resources']
['class_resource_chunk', 'class_resource']
Indexes: {'class_resource_chunk': {'_id_': {'v': 4, 'key': [('_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, 'class_id_1': {'v': 4, 'key': [('class_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, 'resource_id_1': {'v': 4, 'key': [('resource_id', 1)], 'ns': 'class_resources.class_resource_chunk'}, 'chunk_id_1': {'v': 4, 'key': [('chunk_id', 1)], 'ns': 'class_resources.class_resource_chunk'}}, 'class_resource': {'_id_': {'v': 4, 'key': [('_id', 1)], 'ns': 'class_resources.class_resource'}, 'class_id_1': {'v': 4, 'key': [('class_id', 1)], 'ns': 'class_resources.class_resource'}, 'resource_id_1': {'v': 4, 'key': [('resource_id', 1)], 'ns': 'class_resources.class_resource'}}}
Estimated document counts: {'class_resource_chunk': 0, 'class_resource': 0}
Index sizes: {'class_resource_chunk': {'resource_id_1': '1.52587890625e-05 GB', 'chunk_id_1': '1.52587890625e-05 GB', 'class_id_1': '1.52587890625e-05 GB', '_id_

In [18]:
# Look up one class resource by id, print it, then print every chunk document
# referenced by its "class_resource_chunk_ids" field.
resource_id = "d56f532a-e9c9-4839-9264-5857197ca501"
collection = db.class_resource
doc = collection.find_one({"_id": resource_id})
print(f"Class resource: {doc}")
# find_one returns None when nothing matches; fail with a clear message
# instead of a TypeError on the subscript below.
if doc is None:
    raise LookupError(f"No class_resource document found with _id {resource_id!r}")
# The id list may be missing or None; "$in" needs an array, so fall back to
# an empty list (which simply matches no chunks).
chunk_doc_ids = doc.get("class_resource_chunk_ids") or []
collection = db.class_resource_chunk
chunk_docs = collection.find({"_id": {"$in": chunk_doc_ids}})
print("Class resource chunks:")
for chunk_doc in chunk_docs:
    print(chunk_doc)

Class resource: {'_id': 'd56f532a-e9c9-4839-9264-5857197ca501', 'child_resource_ids': None, 'class_id': '63558dc3-5a05-4435-bed1-1651f3d04d12', 'class_resource_chunk_ids': ['04eb35ef-30bd-45da-ae48-cef3cb8849f3', '060abb35-365e-4c78-a1f3-e0e000feae4e'], 'create_timestamp': datetime.datetime(2023, 7, 6, 15, 55, 8, 463000), 'full_resource_url': 'https://tai-class-resource-queue.s3.amazonaws.com/Jacob+Petterle+-+Resume+(1).pdf', 'id': 'd56f532a-e9c9-4839-9264-5857197ca501', 'metadata': {'title': 'dummy.pdf', 'description': 'This is a dummy pdf file.', 'tags': ['dummy', 'pdf'], 'resource_type': 'textbook', 'total_page_count': None}, 'modified_timestamp': datetime.datetime(2023, 7, 6, 15, 55, 8, 492000), 'parent_resource_ids': None, 'preview_image_url': 'https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf', 'status': 'completed'}
Class resource chunks:
{'_id': '04eb35ef-30bd-45da-ae48-cef3cb8849f3', 'chunk': 'JACOB PETTERLE\n720-271-5053 | jacobpetterle@gmail.com | https

In [19]:
# Render a chunk string (hard-coded below) so its "\n" escapes show up as
# real line breaks. Adjacent literals are concatenated by the parser, so this
# prints exactly the same text as a single long literal.
# NOTE(review): the text contains personal contact details (phone number,
# e-mail address) — consider clearing this cell and its output before sharing.
print(
    'JACOB PETTERLE\n'
    '720-271-5053 | jacobpetterle@gmail.com | https://github.com/tai-team-ai\n'
    'SKILLS:\n'
    '●\n'
    'Languages: Python, SQL, Mongo QL, Typescript, React, CSS, Java\n'
    '●\n'
    'Technologies: Docker, Langchain, FASTAPI, pydantic, SPLADE, hybrid search, Spark, pytest, pyTorch\n'
    '●\n'
    'IAC & Databases: CDK, MongoDB, DynamoDB, Postgres, AWS Redshift, API Gateway, Lambda, SQS, S3\n'
    '●\n'
    'Product Management: User-Centric Design, Usability Testing, SCRUM, TDD, Code Lifecycle Management\n'
    'WORK EXPERIENCE:\n'
    'CEO, Applied AI Architect | T.A.I. Education Systems\n'
    'January 2023 - Present\n'
    '●\n'
    'Conducting extensive customer research, with 15 students/professors interviewed and more scheduled\n'
    '●\n'
    'Leading a cross-functional team of 5 to design a product that students and professors love\n'
    '●\n'
    'Architecting a hybrid search engine from scratch covering chunking, indexing, and information retrieval\n'
    '●\n'
    'Demonstrated interest in TAI by securing pilots covering 4500 students across 5 universities this fall\n'
    'Data Engineer II | BENlabs\n'
    'October 2022 - Present\n'
    '●\n'
    'Engineering a recommendation system serving 200 concurrent users; load tested to 1000s\n'
    '●\n'
    'Designing a search engine for tech org, minimizing time to retrieve API docs, customer data, and service docs\n'
    '●\n'
    'Pioneered the use of large scale behavior data, prompting a $100k contract with the vendor\n'
    '○\n'
    'Ingested 200GB of raw unstructured data and created an id system for audience behavior\n'
    '○\n'
    'Iteratively developed customer facing prototypes to validate the VP of the system\n'
    '●\n'
    'Created parameterizable IAC to standardize dev environments, soon to be used by the entire DS team\n'
    '○\n'
    'App Link: https://huggingface.co/spaces/jacob-petterle/cloudtop-deployer\n'
    '●\n'
    'Independently designed & implemented an ETL pipeline that ingests 1.5 TB of data per month\n'
    'Engineering Lead | Magna-Shox\n'
    'August 2020 - June 2022\n'
    '●\n'
    'Architectured a finite element automation python codebase, improving design cycle time by 10x\n'
    '●'
)

JACOB PETTERLE
720-271-5053 | jacobpetterle@gmail.com | https://github.com/tai-team-ai
SKILLS:
●
Languages: Python, SQL, Mongo QL, Typescript, React, CSS, Java
●
Technologies: Docker, Langchain, FASTAPI, pydantic, SPLADE, hybrid search, Spark, pytest, pyTorch
●
IAC & Databases: CDK, MongoDB, DynamoDB, Postgres, AWS Redshift, API Gateway, Lambda, SQS, S3
●
Product Management: User-Centric Design, Usability Testing, SCRUM, TDD, Code Lifecycle Management
WORK EXPERIENCE:
CEO, Applied AI Architect | T.A.I. Education Systems
January 2023 - Present
●
Conducting extensive customer research, with 15 students/professors interviewed and more scheduled
●
Leading a cross-functional team of 5 to design a product that students and professors love
●
Architecting a hybrid search engine from scratch covering chunking, indexing, and information retrieval
●
Demonstrated interest in TAI by securing pilots covering 4500 students across 5 universities this fall
Data Engineer II | BENlabs
October 2022 - Pres

In [20]:
# Render a chunk string (hard-coded below) so its "\n" escapes show up as
# real line breaks. Adjacent literals are concatenated by the parser into one
# string.
# NOTE(review): the text contains personal details — consider clearing this
# cell and its output before sharing.
print(
    'Leading a cross-functional team of 5 to design a product that students and professors love\n'
    '●\n'
    'Architecting a hybrid search engine from scratch covering chunking, indexing, and information retrieval\n'
    '●\n'
    'Demonstrated interest in TAI by securing pilots covering 4500 students across 5 universities this fall\n'
    'Data Engineer II | BENlabs\n'
    'October 2022 - Present\n'
    '●\n'
    'Engineering a recommendation system serving 200 concurrent users; load tested to 1000s\n'
    '●\n'
    'Designing a search engine for tech org, minimizing time to retrieve API docs, customer data, and service docs\n'
    '●\n'
    'Pioneered the use of large scale behavior data, prompting a $100k contract with the vendor\n'
    '○\n'
    'Ingested 200GB of raw unstructured data and created an id system for audience behavior\n'
    '○\n'
    'Iteratively developed customer facing prototypes to validate the VP of the system\n'
    '●\n'
    'Created parameterizable IAC to standardize dev environments, soon to be used by the entire DS team\n'
    '○\n'
    'App Link: https://huggingface.co/spaces/jacob-petterle/cloudtop-deployer\n'
    '●\n'
    'Independently designed & implemented an ETL pipeline that ingests 1.5 TB of data per month\n'
    'Engineering Lead | Magna-Shox\n'
    'August 2020 - June 2022\n'
    '●\n'
    'Architectured a finite element automation python codebase, improving design cycle time by 10x\n'
    '●\n'
    'Architectured a python convolutional neural network vehicle model to predict passenger comfort\n'
    '○\n'
    'Processed 45 million data points including visualization, filtering, and outlier detection\n'
    '●\n'
    'Designed a company wide business strategy and VP, raising over $20,000 in seed capital\n'
    '●\n'
    'Managed multiple teams to design, manufacture, and validate 7 unique prototypes over a 20 month period\n'
    '○\n'
    'Independently designed, analyzed, & manufactured a 1100+ part MVP with 28 unique parts\n'
    '●\n'
    'Established an intellectual property strategy & filed a system-wide provisional utility patent\n'
    'Manufacturing Engineering Intern | Sierra Space\n'
    'May 2021 - August 2021\n'
    '●\n'
    # NOTE(review): the original literal was broken across two physical lines
    # after "multiple "; the halves are rejoined here with nothing inserted —
    # confirm no character (e.g. a "\n" escape) belonged at the break.
    'Decreased run time of FEA software algorithm used across multiple design teams by 39%\n'
    '●\n'
    'Designed a procedure allowing the measurement of inaccessible locations on the launch vehicle\n'
    'Controls Engineering Intern | JR Automation, SetPoint\n'
    'January 2020 - May 2020\n'
    '●'
)

Leading a cross-functional team of 5 to design a product that students and professors love
●
Architecting a hybrid search engine from scratch covering chunking, indexing, and information retrieval
●
Demonstrated interest in TAI by securing pilots covering 4500 students across 5 universities this fall
Data Engineer II | BENlabs
October 2022 - Present
●
Engineering a recommendation system serving 200 concurrent users; load tested to 1000s
●
Designing a search engine for tech org, minimizing time to retrieve API docs, customer data, and service docs
●
Pioneered the use of large scale behavior data, prompting a $100k contract with the vendor
○
Ingested 200GB of raw unstructured data and created an id system for audience behavior
○
Iteratively developed customer facing prototypes to validate the VP of the system
●
Created parameterizable IAC to standardize dev environments, soon to be used by the entire DS team
○
App Link: https://huggingface.co/spaces/jacob-petterle/cloudtop-deployer
●
Ind