In [None]:
# Import document loaders for loading PDF files from directories
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader

In [None]:
# Configure a DirectoryLoader over ../data/pdf_files: the recursive glob '**/*.pdf'
# selects PDFs in that directory and its subdirectories, each match is parsed with
# PyPDFLoader, and show_progress=True enables a progress bar.
# NOTE(review): `document` holds the loader object, not a document; the loaded
# documents are assigned to `dir_document` by the following cell.
document = DirectoryLoader(
    "../data/pdf_files",
    loader_cls=PyPDFLoader,
    glob='**/*.pdf',
    show_progress=True
)

In [None]:
# Run the loader and keep the resulting Documents.
# In the recorded run 3 PDF files were read, and the later split step reports 170
# input Documents; the metadata carries 'page' and 'total_pages', so this looks like
# one Document per PDF page rather than one per file.
dir_document = document.load()

100%|██████████| 3/3 [00:06<00:00,  2.31s/it]


In [None]:
# Import text splitter and define function to chunk documents for better RAG performance
from langchain_text_splitters import RecursiveCharacterTextSplitter

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Break loaded documents into overlapping chunks suited to retrieval.

    Parameters:
    - documents: Loaded documents, handed unchanged to the splitter's split_documents()
    - chunk_size: Target maximum chunk length, measured with len() (i.e. characters)
    - chunk_overlap: Number of characters shared between neighbouring chunks

    Returns:
    - The chunks produced by the splitter

    Side effects:
    - Prints a summary line and, when at least one chunk was produced, a preview
      of the first chunk (first 200 characters of content plus its metadata).
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,  # Lengths are counted in characters
        # Separators are tried in order: paragraph, line, word, then single characters
        separators=["\n\n", "\n", " ", ""],
    )
    chunks = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks")

    # Nothing to preview when the splitter produced no chunks
    if not chunks:
        return chunks

    first_chunk = chunks[0]
    print("\nExample chunk:")
    print(f"Content: {first_chunk.page_content[:200]}...")
    print(f"Metadata: {first_chunk.metadata}")

    return chunks

In [None]:
# Chunk every loaded Document. The arguments repeat split_documents' defaults
# (1000-character chunks, 200-character overlap) so the settings are visible at the call site.
chunked_documents = split_documents(dir_document, chunk_size=1000, chunk_overlap=200)

Split 170 documents into 720 chunks

Example chunk:
Content: . 
. 
Latest updates: hps://dl.acm.org/doi/10.1145/3773084
. 
. 
RESEARCH-ARTICLE
Large Language Models for Constructing and Optimizing Machine
Learning Workflows: A Survey
YANG GU, Shanghai Jiao Ton...
Metadata: {'producer': 'iText 4.2.0 by 1T3XT', 'creator': 'PyPDF', 'creationdate': '2026-01-21T22:02:52-08:00', 'moddate': '2026-01-21T22:02:53-08:00', 'subject': 'ACM Trans. Softw. Eng. Methodol. 0.0', 'title': 'Large Language Models for Constructing and Optimizing Machine Learning Workflows: A Survey', 'source': '..\\data\\pdf_files\\Constructing and Optimizing Machine.pdf', 'total_pages': 45, 'page': 0, 'page_label': '1'}


In [None]:
# Take the first loaded (un-chunked) Document and end the cell with a bare expression
# so the notebook displays its full repr: the metadata dict plus the entire page_content.
doc = dir_document[0]
doc

Document(metadata={'producer': 'iText 4.2.0 by 1T3XT', 'creator': 'PyPDF', 'creationdate': '2026-01-21T22:02:52-08:00', 'moddate': '2026-01-21T22:02:53-08:00', 'subject': 'ACM Trans. Softw. Eng. Methodol. 0.0', 'title': 'Large Language Models for Constructing and Optimizing Machine Learning Workflows: A Survey', 'source': '..\\data\\pdf_files\\Constructing and Optimizing Machine.pdf', 'total_pages': 45, 'page': 0, 'page_label': '1'}, page_content='. \n. \nLatest updates: h\ue03cps://dl.acm.org/doi/10.1145/3773084\n. \n. \nRESEARCH-ARTICLE\nLarge Language Models for Constructing and Optimizing Machine\nLearning Workflows: A Survey\nYANG GU, Shanghai Jiao Tong University, Shanghai, China\n. \nHENGYU YOU, Shanghai Jiao Tong University, Shanghai, China\n. \nJIAN CAO, Shanghai Jiao Tong University, Shanghai, China\n. \nMURAN YU, Stanford University, Stanford, CA, United States\n. \nHAORAN FAN, Shanghai Jiao Tong University, Shanghai, China\n. \nSHIYOU QIAN, Shanghai Jiao Tong University, Sh

In [None]:
# Show the concrete class of the loaded object
# (langchain_core.documents.base.Document in the recorded output)
type(doc)

langchain_core.documents.base.Document

In [None]:
# Print the 'subject' metadata field (.get() yields None when the key is absent)
# followed by a preview of the page text.
print(f"Document subject : {doc.metadata.get('subject')}")
print(f"\n\nDocument content : {doc.page_content[:500]}")  # Only the first 500 characters of the content

Document subject : ACM Trans. Softw. Eng. Methodol. 0.0


Document content : . 
. 
Latest updates: hps://dl.acm.org/doi/10.1145/3773084
. 
. 
RESEARCH-ARTICLE
Large Language Models for Constructing and Optimizing Machine
Learning Workflows: A Survey
YANG GU, Shanghai Jiao Tong University, Shanghai, China
. 
HENGYU YOU, Shanghai Jiao Tong University, Shanghai, China
. 
JIAN CAO, Shanghai Jiao Tong University, Shanghai, China
. 
MURAN YU, Stanford University, Stanford, CA, United States
. 
HAORAN FAN, Shanghai Jiao Tong University, Shanghai, China
. 
SHIYOU QIAN, Shanghai


In [None]:
# Import required libraries for embeddings generation
import numpy as np
from sentence_transformers import SentenceTransformer
from typing import List, Dict, Any, Tuple

In [None]:
# Load the SentenceTransformer checkpoint used to embed the chunks
model_name = "all-MiniLM-L6-v2"
print(f"Loading model : {model_name}")
model = SentenceTransformer(model_name)

# Length of each embedding vector this model produces (384 in the recorded output)
model_dimension = model.get_sentence_embedding_dimension()
print(f"\nModel loaded with dimension : {model_dimension}")

Loading model : all-MiniLM-L6-v2


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 869.08it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.



Model loaded with dimension : 384


In [None]:
# Embed the text of every chunk with a single encode() call. The texts are passed in
# chunked_documents order; add_documents later pairs chunks and embeddings by position.
print(f"Generating embedding...")
embeddings = model.encode([d.page_content for d in chunked_documents], show_progress_bar=True)  # One vector per chunk text
# Recorded output: shape (720, 384), dtype float32
print(f"Generated embeddings with shape : {embeddings.shape}")
# Bare expression so the notebook displays the (truncated) array
embeddings

Generating embedding...


Batches: 100%|██████████| 23/23 [00:25<00:00,  1.10s/it]

Generated embeddings with shape : (720, 384)





array([[-0.04726338,  0.00032324,  0.0190388 , ...,  0.00690563,
         0.00175409,  0.05870934],
       [-0.026851  , -0.00267686,  0.02108338, ...,  0.01690894,
         0.02223114,  0.03617563],
       [-0.00562933, -0.04167898,  0.00799582, ...,  0.05661926,
         0.10684081,  0.01380027],
       ...,
       [-0.11117143,  0.0337267 , -0.0059452 , ...,  0.0478947 ,
         0.02891634,  0.07149113],
       [-0.216594  , -0.03837626, -0.04161664, ...,  0.02678554,
        -0.00071634,  0.10696496],
       [ 0.02199661, -0.0140572 , -0.01843296, ...,  0.07923552,
         0.07359981,  0.02747879]], shape=(720, 384), dtype=float32)

In [None]:
# Import libraries for ChromaDB vector store management
import os
import chromadb
import uuid

In [None]:
# Define VectorStore class to manage ChromaDB operations and add documents with embeddings
class VectorStore:
    """Small wrapper around one persistent ChromaDB collection of document chunks.

    Creating an instance opens (or creates) the on-disk store and the named
    collection; add_documents() then writes chunks together with embeddings that
    were computed by the caller.
    """

    def __init__(self, collection_name: str = "pdf_documents", persist_directory: str = "../data/vector_store") -> None:
        """Remember the settings and connect to the store immediately.

        Parameters:
        - collection_name: Name of the collection to get or create
        - persist_directory: Directory used by the persistent client (created if missing)

        Raises:
        - Whatever _initialize_store() raises when the store cannot be opened
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self) -> None:
        """Create the persist directory, open the client, and get or create the collection.

        Prints the collection and its current item count on success. On failure the
        error is printed and then re-raised, so construction never yields a
        half-initialized store.
        """
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)  # Create persistent client for disk storage

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embedding for RAG"}
            )

            print(f"Vector store initialized. Collection : {self.collection}")
            print(f"Existing documents in collection : {self.collection.count()}")

        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray, batch_size: int = 5000) -> None:
        """Store documents and their precomputed embeddings in the collection.

        Parameters:
        - documents: Objects exposing `.page_content` (text) and `.metadata` (mapping)
        - embeddings: One embedding per document, in the same order; each row must
          support `.tolist()`
        - batch_size: Maximum number of items sent per `collection.add` call.
          NOTE(review): Chroma limits the size of a single add(); 5000 is meant as a
          conservative default - confirm against the client's maximum batch size.

        Raises:
        - ValueError: if the counts differ or batch_size is not positive
        - Any exception raised by `collection.add`; it is printed and re-raised.
          Batches written before the failing one stay in the collection.

        Every call generates new random IDs, so adding the same documents twice
        stores them twice.
        """
        if len(documents) != len(embeddings):
            raise ValueError(
                f"Number of documents ({len(documents)}) must match "
                f"number of embeddings ({len(embeddings)})"
            )
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if len(documents) == 0:
            # Skip the call entirely instead of sending empty lists to the collection
            print("No documents to add to vector store")
            return

        print(f"Adding {len(documents)} documents to vector store...")

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc, embedding) in enumerate(zip(documents, embeddings)):
            # Random 8-hex-digit prefix plus the position within this call
            ids.append(f"doc_{uuid.uuid4().hex[:8]}_{i}")

            metadata = dict(doc.metadata)  # Copy so the caller's metadata is not modified
            metadata['doc_index'] = i  # Position of the chunk within this call
            metadata['content_length'] = len(doc.page_content)  # Chunk size in characters
            metadatas.append(metadata)

            documents_text.append(doc.page_content)
            embeddings_list.append(embedding.tolist())  # Plain Python lists for ChromaDB

        try:
            for start in range(0, len(ids), batch_size):
                stop = start + batch_size
                self.collection.add(
                    ids=ids[start:stop],
                    embeddings=embeddings_list[start:stop],
                    metadatas=metadatas[start:stop],
                    documents=documents_text[start:stop]
                )
            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection : {self.collection.count()}")
        except Exception as e:
            print(f"Error adding document to vector store: {e}")
            # Propagate so later cells do not run against an empty or partial store
            raise


# Rebuild the collection from scratch. add_documents() generates fresh random IDs on
# every call, so re-running this cell against an existing collection would store
# every chunk a second time.
PERSIST_DIRECTORY = "../data/vector_store"
COLLECTION_NAME = "pdf_documents"

temp_client = chromadb.PersistentClient(path=PERSIST_DIRECTORY)
try:
    temp_client.delete_collection(name=COLLECTION_NAME)  # Remove old collection to avoid duplicates
    print("Deleted existing collection")
except Exception as exc:
    # Expected on a first run, when the collection does not exist yet. Only Exception
    # is caught (not a bare except) so interrupts still stop the cell, and the actual
    # error is printed so an unrelated failure is not mistaken for "nothing to delete".
    print(f"No existing collection to delete ({type(exc).__name__}: {exc})")

# Use the same location and name that were just reset
vectorstore = VectorStore(collection_name=COLLECTION_NAME, persist_directory=PERSIST_DIRECTORY)
vectorstore.add_documents(chunked_documents, embeddings)

Deleted existing collection
Vector store initialized. Collection : Collection(name=pdf_documents)
Existing documents in collection : 0
Adding 720 documents to vector store...
Successfully added 720 documents to vector store
Total documents in collection : 720
