# Lecture 4 - Using Embedding Models with Vector Databases

In [4]:
# import novel_pages
import chromadb
import os



In [5]:

########################################################################
# Step 1. Extract the text from the PDF file - get each page of Lord of the Rings
########################################################################
document_dictionary = { 
                    1 : """The United States of America, often referred to as the USA, is a diverse nation with a rich history and a population of over 330 million people. Founded on July 4, 1776, with the signing of the Declaration of Independence, the USA emerged as a nation built on principles of democracy, liberty, and opportunity. Over the centuries, the country has experienced significant growth, welcoming immigrants from all corners of the globe and becoming a melting pot of cultures. Today, the USA remains a global superpower, known for its economic strength, technological innovation, and cultural influence. """,
                    2 : """Russia, the largest country in the world by land area, boasts a population of approximately 145 million people. Its history traces back to the medieval state of Kievan Rus', with the modern Russian state emerging in the late 17th century. With its foundation formally established in 1721, Russia has played a pivotal role in global politics, particularly during the era of the Russian Empire and the Soviet Union. Despite facing challenges, including political upheavals and economic transitions, Russia maintains its status as a major player on the world stage, with vast natural resources and a rich cultural heritage. """,
                    3 : """China, the most populous country in the world, is home to over 1.4 billion people. With a history spanning thousands of years, China's civilization is one of the oldest in the world. The founding of the People's Republic of China on October 1, 1949, marked a significant turning point in its modern history, as the country underwent rapid industrialization and economic reforms. Today, China stands as a global economic powerhouse, boasting the world's second-largest economy and exerting considerable influence in international affairs. With its ancient traditions juxtaposed against modern advancements, China continues to shape the course of the 21st century. """
                   } 




In [6]:
########################################################################
# Step 2 + 3. Split the text into pages and then make an embedding of each page USING an embedding model. Store this in the vector database
########################################################################
client = chromadb.Client() # internally, it is using: all-MiniLM-L6-v2

collection_name = 'random_book_pages'

client.create_collection(name=collection_name)

## get the collection
collection = client.get_or_create_collection(name = collection_name)

## store text into the database - behind the scenes it will automtically create embeddings from the text
collection.add(
    documents = [x for x in list(document_dictionary.values())],
    metadatas = [{"source": "book"} for x in document_dictionary],
    ids = [str(x) for x in document_dictionary]
)



C:\Users\babaa\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|███████████████████████████████████████| 79.3M/79.3M [00:52<00:00, 1.58MiB/s]


In [7]:
########################################################################
# Step 4. + 5. Have a Question you Ask - find the most similar document/page from the Novel -> Chroma handles this for us -> 
### using the embedding function that we already have
########################################################################
question = "What is the largest country by population and by land area?"

results = collection.query(
    query_texts=[question],
    n_results=2
)


results

{'ids': [['2', '3']],
 'distances': [[0.9670072793960571, 1.3053419589996338]],
 'metadatas': [[{'source': 'book'}, {'source': 'book'}]],
 'embeddings': None,
 'documents': [["Russia, the largest country in the world by land area, boasts a population of approximately 145 million people. Its history traces back to the medieval state of Kievan Rus', with the modern Russian state emerging in the late 17th century. With its foundation formally established in 1721, Russia has played a pivotal role in global politics, particularly during the era of the Russian Empire and the Soviet Union. Despite facing challenges, including political upheavals and economic transitions, Russia maintains its status as a major player on the world stage, with vast natural resources and a rich cultural heritage. ",
   "China, the most populous country in the world, is home to over 1.4 billion people. With a history spanning thousands of years, China's civilization is one of the oldest in the world. The founding 

In [11]:

print(type(results['documents']))
results['documents'][0][0] # document 1
results['documents'][0][1] # document 2

<class 'list'>


"Russia, the largest country in the world by land area, boasts a population of approximately 145 million people. Its history traces back to the medieval state of Kievan Rus', with the modern Russian state emerging in the late 17th century. With its foundation formally established in 1721, Russia has played a pivotal role in global politics, particularly during the era of the Russian Empire and the Soviet Union. Despite facing challenges, including political upheavals and economic transitions, Russia maintains its status as a major player on the world stage, with vast natural resources and a rich cultural heritage. "

# Creating a collection with different Embedding Function

In [None]:
# https://docs.trychroma.com/usage-guide 
# we can use any sentence transformer model: https://docs.trychroma.com/embeddings 
# https://www.sbert.net/docs/pretrained_models.html
from chromadb.utils import embedding_functions
embedding_model_name = 'all-mpnet-base-v2'
emb_fn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=embedding_model_name)

new_collection = client.create_collection(name="another_collection", embedding_function=emb_fn)


new_collection.add(
    documents = ["Here is some sample text"],
    metadatas = [{"source": "typed_text"}],
    ids = ['1']
)



