In [1]:
import chromadb

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [8]:
files = [
    {
        "title": "Pizza",
        "source_url": "https://raw.githubusercontent.com/selva86/datasets/refs/heads/master/lda_sports_politics_docs/pasta.txt",
        "filename": "pizza.txt"
  },
  {
        "title": "Pasta",
        "source_url": "https://raw.githubusercontent.com/selva86/datasets/refs/heads/master/lda_sports_politics_docs/pasta.txt",
        "filename": "pasta.txt"
  }
]


In [11]:
import requests
file_path = f"./{files[0]['filename']}"
response = requests.get(files[0]["source_url"])

if response.status_code == 200:
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"Downloaded: {files[0]['filename']}")
else:
    raise Exception(f"Failed to download: {files[0]['filename']}")

Downloaded: pizza.txt


In [9]:
# Instantiate a Chroma persistent client
client = chromadb.PersistentClient("./")


## YOUR SOLUTION HERE ##
collection  = client.get_or_create_collection(name = "RAG_Assistant", metadata= {"hnsw:space": "cosine"})


In [12]:
#Read first file content
with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

# Create a text splitter
## YOUR SOLUTION HERE ##
text_splitter = RecursiveCharacterTextSplitter(
  separators = ["\n\n", "\n", ". ", "? ", "! "],
  chunk_size = 1500,
  chunk_overlap = 200 
  )



# Split the 'content' into chunks
chunks = text_splitter.create_documents([content])

# Print the first document
chunks[:1]

[Document(metadata={}, page_content='Pasta is a staple food of traditional Italian cuisine, with the first reference dating to 1154 in Sicily. It is also commonly used to refer to the variety of pasta dishes. Typically, pasta is a noodle made from an unleavened dough of a durum wheat flour mixed with water or eggs and formed into sheets or various shapes, then cooked by boiling or baking. It can also be made with flour from other cereals or grains. Pastas may be divided into two broad categories, dried (pasta secca) and fresh (pasta fresca).\nMost dried pasta is commercially produced via an extrusion process. Fresh pasta was traditionally produced by hand, sometimes with the aid of simple machines, but today many varieties of fresh pasta are also commercially produced by large-scale machines, and the products are widely available in supermarkets.\nBoth dried and fresh pasta come in a number of shapes and varieties, with 310 specific forms known variably by over 1300 names having been d

In [14]:
#Create empty lists to store each document, metadata, and id
documents = []
metadatas = []
ids = []

#Loop through each file in files
for file_info in files:
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()
        #Use text_splitter to create documents
        chunks = text_splitter.create_documents([content])
        #iterate over every chunk
        for index, chunk in enumerate(chunks):
            #Append to metadata list with "title", "source_url", and "index"
            metadatas.append({
                "title": file_info["title"],
                "source_url": file_info["source_url"],
                "chunk_idx": index
            })
            #Append to ids each index
            ids.append(f"{file_info['filename']}_{index}")
            
            #Append to documents each chunk.page_content
            ### YOUR SOLUTION HERE ###
            documents.append(chunk.page_content)
        
            

In [16]:
#Add all documents to the collection
collection.add(documents=documents, metadatas=metadatas, ids=ids)

#Verify documents were added to collection with a sample query
### YOUR SOLUTION HERE ###
collection.query(query_texts=["When was pasta invented?"], n_results = 1)


Add of existing embedding ID: pizza.txt_0
Add of existing embedding ID: pasta.txt_0
Insert of existing embedding ID: pizza.txt_0
Insert of existing embedding ID: pasta.txt_0


{'ids': [['pizza.txt_0']],
 'embeddings': None,
 'documents': [['Pasta is a staple food of traditional Italian cuisine, with the first reference dating to 1154 in Sicily. It is also commonly used to refer to the variety of pasta dishes. Typically, pasta is a noodle made from an unleavened dough of a durum wheat flour mixed with water or eggs and formed into sheets or various shapes, then cooked by boiling or baking. It can also be made with flour from other cereals or grains. Pastas may be divided into two broad categories, dried (pasta secca) and fresh (pasta fresca).\nMost dried pasta is commercially produced via an extrusion process. Fresh pasta was traditionally produced by hand, sometimes with the aid of simple machines, but today many varieties of fresh pasta are also commercially produced by large-scale machines, and the products are widely available in supermarkets.\nBoth dried and fresh pasta come in a number of shapes and varieties, with 310 specific forms known variably by o

The results are returned as a dictionary and the cosine similarities for each result is also returned with the key `distances`.