### Imports and Path setup

In [2]:
from pathlib import Path
import chromadb
import pickle
import os
from dotenv import load_dotenv
load_dotenv()

# --- Vector store --------------------------------------------------------
# Folder handed to the persistent ChromaDB client. The relative form is
# resolved against the current working directory to get an absolute path.
Relative_Database_path = "./chroma_Data"
Absolute_Database_path = Path(Relative_Database_path).resolve()
# Name of the ChromaDB collection the chunks are stored in.
collection_name = "harry_potter_collection"

# --- Input / output files ------------------------------------------------
# Pickle file holding the previously computed hierarchical chunks.
file_path = "../Chunking/Chunk_files/harry_potter_chunks_hierarchical.pkl"
# Not referenced by the cells shown here; presumably the destination for
# multi-query RAG results -- TODO confirm against the cells that use it.
multiquery_rag_output_path = "../RAG Results/multiquery_rag_results.txt"

# Disabled: copy the Gemini key into the variable name the Google SDK reads.
# os.environ["GOOGLE_API_KEY"] = os.environ.get("GEMINI_API_KEY")


### Chroma Setup and Chunk Loading
Sets up the persistent ChromaDB client and loads the previously computed chunks.

In [3]:
# Initialize the persistent client on the database folder configured above.
client = chromadb.PersistentClient(path=Absolute_Database_path)
print(f"[INFO] ChromaDB client initialized at: {Absolute_Database_path}")

# List existing collections.
# Depending on the installed chromadb release, list_collections() yields
# either Collection objects or plain name strings; fall back to the item
# itself when it has no `.name` attribute so this cell works with both.
existing_collections = client.list_collections()
print(f"Existing collections: {[getattr(c, 'name', c) for c in existing_collections]}")

[INFO] ChromaDB client initialized at: C:\Users\Gaming window\Desktop\ANLP_Assignment_2\RAG-A2\VectorDB\chroma_Data
Existing collections: ['harry_potter_collection']


In [4]:

# The chunks were computed earlier and pickled, so no PDF parsing (fitz) or
# text splitting (RecursiveCharacterTextSplitter) is needed here.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load chunk files that you produced yourself.

loaded_docs = []

try:
    with open(file_path, "rb") as f:  # 'rb': pickle data is binary
        loaded_docs = pickle.load(f)
    print(f"Successfully loaded {len(loaded_docs)} chunks from '{file_path}'.")
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found.")
    # Re-raise so "Run All" stops here: a later cell deletes and rebuilds the
    # collection, and continuing with no chunks would leave it empty.
    raise
except Exception as e:
    print(f"Error loading file: {e}")
    raise

# Inspect one loaded document to verify the load; the header is only printed
# when there is actually a chunk to show.
if loaded_docs:
    print("\nHere is the metadata of a loaded chunk:")
    print(loaded_docs[0].metadata)

Successfully loaded 160 chunks from '../Chunking/Chunk_files/harry_potter_chunks_hierarchical.pkl'.

Here is the metadata of a loaded chunk:
{'source': '../julius-caesar_PDF_FolgerShakespeare.pdf', 'page_number': 5, 'chunk_type': 'section', 'chunk_level': 1, 'section_id': 'page_5_section_0', 'parent_id': 'page_5', 'chunk_index': 0, 'c': 'hierarchical_section', 'ischunk': True}


### Set up Embedding Function
Uses a SentenceTransformer model (`all-MiniLM-L6-v2`) to generate the embeddings.

In [5]:
# Install if needed (run once, in its own cell)
# %pip install sentence_transformers

# Set up embedding function
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Keep the model name in one place so the log line below can never drift from
# the model that is actually loaded.
embedding_model_name = "all-MiniLM-L6-v2"
embedding_function = SentenceTransformerEmbeddingFunction(model_name=embedding_model_name)
print(f"Embedding function initialized with model: {embedding_model_name}")

  from .autonotebook import tqdm as notebook_tqdm


Embedding function initialized with model: all-MiniLM-L6-v2


### Creating new Collection

In [6]:
from datetime import datetime

# Rebuild the collection from scratch on every run.
# delete_collection() raises when the collection does not exist (e.g. the
# first run against an empty database), so only delete it when it is present.
# list_collections() yields Collection objects or plain name strings depending
# on the chromadb release; getattr(..., "name", item) covers both.
existing_names = {getattr(c, "name", c) for c in client.list_collections()}
if collection_name in existing_names:
    client.delete_collection(name=collection_name)

# Get or create the collection, attaching the embedding function used to
# embed documents added to it.
collection = client.get_or_create_collection(
    name=collection_name,
    embedding_function=embedding_function,
    metadata={
        "description": "Harry Potter book chunks",
        "created": str(datetime.now())
    }
)

print(f"Collection '{collection_name}' created or accessed successfully")

Collection 'harry_potter_collection' created or accessed successfully


### Add data to collection
Each chunk is assigned an ID and then added to the collection.

In [7]:
import uuid

# Build the three parallel lists passed to collection.add(): an ID, the text,
# and the metadata dict for every loaded chunk. IDs are positional
# ("hp_chunk_<index>"), so they are the same on every run.
ids = [f"hp_chunk_{position}" for position in range(len(loaded_docs))]
documents = [chunk.page_content for chunk in loaded_docs]
metadatas = [chunk.metadata for chunk in loaded_docs]

# Insert in fixed-size batches rather than in one call.
batch_size = 500
total_added = 0

for start in range(0, len(ids), batch_size):
    stop = min(start + batch_size, len(ids))
    window = slice(start, stop)

    collection.add(
        ids=ids[window],
        documents=documents[window],
        metadatas=metadatas[window],
    )

    added_now = stop - start
    total_added += added_now
    print(f"Added batch: {start} to {stop-1} ({added_now} items)")

print(f"Successfully added {total_added} documents to collection '{collection_name}'")

Added batch: 0 to 159 (160 items)
Successfully added 160 documents to collection 'harry_potter_collection'


In [8]:
# Report how many entries the collection holds.
count = collection.count()
print(f"Total documents in collection: {count}")

# Fetch a few stored entries and print a short preview of each.
peek = collection.peek(limit=3)
print("\nSample entries:")
sample_rows = zip(peek['ids'], peek['documents'], peek['metadatas'])
for position, (entry_id, entry_text, entry_meta) in enumerate(sample_rows, start=1):
    print(f"\n--- Document {position} ---")
    print(f"ID: {entry_id}")
    # Only the first 100 characters of the text are shown.
    print(f"Text: {entry_text[:100]}...")
    print(f"Metadata: {entry_meta}")

Total documents in collection: 160

Sample entries:

--- Document 1 ---
ID: hp_chunk_0
Text: Hamlet: “O farewell, honest soldier.  Who hath relieved/you?”). At
any point in the text, you can ho...
Metadata: {'section_id': 'page_5_section_0', 'page_number': 5, 'c': 'hierarchical_section', 'parent_id': 'page_5', 'chunk_level': 1, 'ischunk': True, 'source': '../julius-caesar_PDF_FolgerShakespeare.pdf', 'chunk_type': 'section', 'chunk_index': 0}

--- Document 2 ---
ID: hp_chunk_1
Text: Hamlet: “O farewell, honest soldier.  Who hath relieved/you?”). At
any point in the text, you can ho...
Metadata: {'section_id': 'page_5_section_0', 'parent_id': 'page_5_section_0', 'paragraph_id': 'page_5_section_0_para_0', 'chunk_index': 0, 'ischunk': True, 'c': 'hierarchical_paragraph', 'page_number': 5, 'source': '../julius-caesar_PDF_FolgerShakespeare.pdf', 'chunk_type': 'paragraph', 'chunk_level': 2}

--- Document 3 ---
ID: hp_chunk_2
Text: FLAVIUS
CARPENTER
MARULLUS
COBBLER
MARULLUS
COBBLER
FLAVIUS
En