### Data Injection


In [94]:
from langchain_core.documents import Document

doc = Document(
    page_content="This is the main text content",
    metadata={
        "source": "example.txt",
        "pages": 1,
        "author": "Syed",
        "data_created": "2025-09-01",
    },
)

doc

Document(metadata={'source': 'example.txt', 'pages': 1, 'author': 'Syed', 'data_created': '2025-09-01'}, page_content='This is the main text content')

In [95]:
##Text Loader

from langchain_community.document_loaders import TextLoader

loader = TextLoader("../data/textfiles/python.txt", "utf-8")

document = loader.load()
print(document)

[Document(metadata={'source': '../data/textfiles/python.txt'}, page_content='Python is a high-level, interpreted programming language known for its simplicity, readability, and versatility. It was created by Guido van Rossum and first released in 1991.\nHere’s a breakdown of what makes Python special:\n🧠 Key Features\nEasy to Read and Write – Python’s syntax is close to natural English, which makes it beginner-friendly.\nInterpreted – You don’t need to compile Python code; it runs line by line.\nDynamically Typed – You don’t need to declare variable types (e.g., x = 10 just works).\nObject-Oriented and Functional – Supports multiple programming paradigms.\nExtensive Libraries – Comes with a large standard library and third-party modules (e.g., NumPy, Pandas, TensorFlow).\nCross-Platform – Runs on Windows, macOS, Linux, and more.\n⚙️ Common Uses\nWeb development – with frameworks like Django and Flask\nData science & machine learning – using NumPy, Pandas, scikit-learn, TensorFlow, PyTo

In [96]:
##Directory Loader

from langchain_community.document_loaders import DirectoryLoader


dir_loader = DirectoryLoader(
    "../data/textfiles",
    glob="**/*.txt",  # pattern to match filename
    loader_cls=TextLoader,  # loader class to use
    loader_kwargs={"encoding": "utf-8"},
    show_progress=False,
)

documents = dir_loader.load()
documents

[Document(metadata={'source': '../data/textfiles/machinelearning.txt'}, page_content='Machine Learning (ML) is a branch of Artificial Intelligence (AI) that enables computers to learn from data and improve their performance over time — without being explicitly programmed to perform specific tasks.\n🧠 In Simple Terms\nInstead of writing rules for every possible situation, we feed data to an algorithm, and the algorithm finds patterns and makes decisions or predictions on its own.\n⚙️ How It Works\nCollect Data → Example: pictures of cats and dogs.\nTrain a Model → The algorithm learns patterns (e.g., shapes, colors, features) from the data.\nTest the Model → See how well it performs on new, unseen data.\nMake Predictions → The model can now predict, for example, whether a new image is a cat or a dog.\n🧩 Types of Machine Learning\nSupervised Learning\nThe model learns from labeled data (input and correct output).\n📘 Example: Predicting house prices from features like size and location.\n

In [98]:
from langchain_community.document_loaders import PyMuPDFLoader

dir_loader = DirectoryLoader(
    "../data/pdf",
    glob="**/*.pdf",  # pattern to match filename
    loader_cls=PyMuPDFLoader,  # loader class to use
    show_progress=False,
)

pdf_documents = dir_loader.load()


### RAG Pipelines - Data Injection to VectorDB Pipeline


In [99]:
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader

In [100]:
### Read all pdfs inside directory


def process_all_pdfs(pdf_directory):
    """Process all PDF files in directory"""
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # find all pdf recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to process")

    for pdf_file in pdf_files:
        print(f"\n Processing : {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add metadata
            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = "pdf"

            all_documents.extend(documents)
            print(f"✅ Loaded {len(documents)} pages")
        except Exception as e:
            print(f"❌ Error {e}")

    print(f"\n Total documents loaded:{len(all_documents)}")
    return all_documents

In [101]:
all_pdf_documents = process_all_pdfs("../data")


Found 1 PDF files to process

 Processing : repealedfileopen.pdf
✅ Loaded 119 pages

 Total documents loaded:119


In [102]:
##Text splitting into chunks


def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance"""
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", " ", ""],
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"split {len(documents)} documents into {len(split_docs)} chunks")

    # show one chunk
    if split_docs:
        print(f"\nExample chunk:")
        print(f"Content : {split_docs[0].page_content[:200]}...")
        print(f"Metadata : {split_docs[0].metadata}")

    return split_docs

In [103]:
chunk = split_documents(all_pdf_documents)


split 119 documents into 643 chunks

Example chunk:
Content : 1 
 
THE INDIAN PENAL CODE 
___________ 
ARRANGEMENT OF SECTIONS 
__________ 
CHAPTER I 
INTRODUCTION 
PREAMBLE 
SECTIONS 
1. Title and extent of operation of the Code.  
2. Punishment of offences com...
Metadata : {'producer': 'Online2PDF.com', 'creator': 'Online2PDF.com', 'creationdate': '2023-06-28T10:58:56+02:00', 'source': '../data/pdf/repealedfileopen.pdf', 'total_pages': 119, 'page': 0, 'page_label': '1', 'source_file': 'repealedfileopen.pdf', 'file_type': 'pdf'}


### Embedding and Vector Store DB


In [104]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [105]:
class EmbeddingManager:
    """Handles document embedding generation using sentence transformer"""

    def __init__(self,model_name:str ='all-MiniLM-L6-v2'):
        """
        Initialize the embedding manager

        Args:
            model_name: HuggingFace model name for sentence embeddings
        """

        self.model_name = model_name
        self.model = None
        self._load_model()
    
    def _load_model(self):
        """Load sentence transformer model"""
        try:
            print(f"Loading embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model Loaded Successfully. Embedding dimension: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error loading model {self.model_name} : {e}")
            raise

    def generate_embeddings(self,texts: List[str]) -> np.ndarray:
        """
        Generate embeddings for a list of texts

        Args:
            text : List of text strings to embed
        
        Returns:
            numpay array of embedding with length and embedded dimension
        """

        if not self.model:
            raise ValueError("Model not loaded. ")
        
        print(f"Generating embeddings for {len(texts)} texts..")
        embeddings = self.model.encode(texts,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

embedding_manager = EmbeddingManager()
embedding_manager

Loading embedding model: all-MiniLM-L6-v2
Model Loaded Successfully. Embedding dimension: 384


<__main__.EmbeddingManager at 0x1619fe510>

### Vector Store

In [106]:
class VectorStore:
    """Manages document embeddings in a chromaDb vector store"""

    def __init__(
        self,
        collection_name: str = "pdf_documents",
        persist_directory: str = "../data/vector_store",
    ):
        """
        Initialize the vector store

        Args:
            collection_name: Name of the chromaDB collection,
            persist_directory: Directory to persist vector store
        """

        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Initialize chromaDB client and collection"""
        try:
            os.makedirs(self.persist_directory, exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)

            # Get or create collection

            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "PDF document embeddings for RAG"},
            )

            print(f"Vector store initialized. Collection : {self.collection_name}")
            print(f"Existing document in collection : {self.collection.count()}")
        except Exception as e:
            print(f"Error initializing vector store: {e}")
            raise

    def add_documents(self, documents: List[Any], embeddings: np.ndarray):
        """
        Add documents and their embeddings to the vector store

        Args:
            documents: List of Langchain documents
            embeddings: Corresponding embeddings for the documents
        """

        if len(documents) != len(embeddings):
            raise ValueError("Number of documents must match numnber of documents")

        print(f"Adding {len(documents)} documents to vector store.....")

        # prepare data for chromadb

        ids = []
        metadatas = []
        documents_text = []
        embeddings_list = []

        for i, (doc,embedding) in enumerate(zip(documents, embeddings)):
            #Generate unique ID
            doc_id = f"doc_{uuid.uuid4().hex[:8]}_{i}"
            ids.append(doc_id)

            #Preparing metadata
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)

            metadatas.append(metadata)

            #Document content
            documents_text.append(doc.page_content)

            #embedding
            embeddings_list.append(embedding.tolist())

        # Add to collection

        try:
            self.collection.add(
                ids=ids,
                metadatas=metadatas,
                documents=documents_text,
                embeddings=embeddings_list
            )

            print(f"Successfully added {len(documents)} documents to vector store")
            print(f"Total documents in collection: {self.collection.count()}")
        except Exception as e:
            print(f"Eroor adding documents to chromaDB: {e}")
            raise



vector_store = VectorStore()
vector_store

Vector store initialized. Collection : pdf_documents
Existing document in collection : 1286


<__main__.VectorStore at 0x1680e5010>

In [107]:
texts = [doc.page_content for doc in chunk]


In [109]:
##Generate embeddings

embeddings = embedding_manager.generate_embeddings(texts)


Generating embeddings for 643 texts..


Batches: 100%|██████████| 21/21 [00:04<00:00,  4.82it/s]

Generated embeddings with shape: (643, 384)





In [110]:
#store in the vector database
vector_store.add_documents(chunk,embeddings)

Adding 643 documents to vector store.....
Successfully added 643 documents to vector store
Total documents in collection: 1929


## Retriver Pipeline from VectorStore

In [111]:
class RAGRetriever:
    """Handles query based retrievel from the vector store"""

    def __init__(self, vector_store: VectorStore, embedding_manager: EmbeddingManager):
        """
        Initialize the retriever

        Args:
            vector_store: Vector store containing document embeddings,
            embedding_manager: Manager for generating quer embeddings
        """

        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    def retrieve(self, query: str, top_k:int = 5, score_threshold:float = 0.0) -> List[Dict[str,Any]]:
        """ 
        Retrieve relevant documents for a query

        Args:
            query: The search query
            top_k: Number of top results to return
            score_threshold: Minimum similarity score threshold
        
        Returns:
            List of dictionaries containing retrieved documents and metadata    

        """

        print(f"Retrieving documents for query: '{query}'")
        print(f"Top K: {top_k}, Score threshold: {score_threshold}")

        # Generate query embedding
        query_embedding = self.embedding_manager.generate_embeddings([query])[0]

        #search in vector store
        try:
            results = self.vector_store.collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            retrieved_docs = []

            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i,(doc_id,document,metadata,distance) in enumerate(zip(ids,documents,metadatas,distances)):
                    #Convert distance to similarity
                    similarity_score = 1 - distance

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id': doc_id,
                            'content': document,
                            'metadata':metadata,
                            'similarity_score':similarity_score,
                            'distance':distance,
                            'rank': i + 1
                        })
                print(f"Retrieved {len(retrieved_docs)} documents (after filtering)")
            else: 
                print("No documents found")
            
            return retrieved_docs
        except Exception as e:
            print(f"Error during retrievel: {e}")
            return []

rag_retriever = RAGRetriever(vector_store,embedding_manager)

In [112]:
rag_retriever

<__main__.RAGRetriever at 0x1619fd940>

In [113]:
rag_retriever.retrieve("What is punishment for theft?")

Retrieving documents for query: 'What is punishment for theft?'
Top K: 5, Score threshold: 0.0
Generating embeddings for 1 texts..


Batches: 100%|██████████| 1/1 [00:00<00:00, 16.08it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





[{'id': 'doc_cabfd986_503',
  'content': "a term which may extend to seven years, and shall also be liable to fine.  \n382. Theft after preparation made for causing death, hurt or restraint in order to the \ncommitting of the theft .—Whoever commits theft, having made preparation for causing death, or hurt, \nor restraint, or fear of death, or of hurt, or of restraint, to any pe rson, in order to the committing of such \ntheft, or in order to the effecting of his escape after the committing of such theft, or in order to the \nretaining of property taken by such theft, shall be punished with rigorous impri sonment for a term which \nmay extend to ten years, and shall also be liable to fine. \n  Illustrations \n(a) A commits theft on property in Z's possession; and while committin g this theft, he has a loaded pistol under his garment \nhaving provided this pistol for the purpose of hurting Z in case Z sh ould resist. A has committed the offence defined in this \nsection.",
  'metadata':

## Vectordb context integration with LLM

In [114]:
##Simple RAG Pipeline with gemini
from dotenv import load_dotenv
load_dotenv()

GEMINI_KEY = os.getenv('GEMINI_API')


In [115]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",  
    api_key=GEMINI_KEY
)

def rag_simple(query,retriver,llm,top_k=3):
    ##Retriever the context
    results = retriver.retrieve(query,top_k=top_k)
    context = "\n\n".join([doc['content']for doc in results]) if results else ""
    if not context:
        return "No relevent context found to answer the question"
    
    ##Generate anser using gemini
    prompt = f"""
    Use the following context to answer question concisely
        Context: {context}

        Question: {query}

        Answer:
    """
        
    response = llm.invoke([prompt.format(context=context,query=query)])
    return response.content

In [116]:
answer = rag_simple("Murder offences?",rag_retriever,llm)
print(answer)

Retrieving documents for query: 'Murder offences?'
Top K: 3, Score threshold: 0.0
Generating embeddings for 1 texts..


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.40it/s]


Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)
The provided context does not define or describe murder offences. It discusses culpable homicide and gives an example where an act leading to death was *not* considered culpable homicide due to lack of intent or knowledge.


## Enhanced RAG Pipeline features

In [117]:
def rag_advanced(query,retriver,llm,top_k=5,min_score=0.2,return_context = False):
    """ 
    RAG Pipeline with extra features:
        returns answer,sources,confidence score and optionally full context
    """

    results = retriver.retrieve(query,top_k=top_k,score_threshold=min_score)
    if not results:
        return {'answer':'No relevant context found','sources':[],'confidence':0.0,'context':''}
    
    # Prepare context and sources

    context = "\n\n".join([doc['content']for doc in results])
    sources = [{
        'source': doc['metadata'].get('source','unknown'),
        'page': doc['metadata'].get('page','unknown'),
        'score': doc['similarity_score'],
        'preview': doc['content'][:120]+"....."
    } for doc in results]
    confidence = max([doc['similarity_score'] for doc in results])

    # Generate answer

    prompt =f""" 
        Use the following context to answer question concisely
        context:{context}
        question:{query}
        answer:
    """

    response = llm.invoke([prompt.format(context=context, query=query)])

    output = {
        'answer' :  response.content,
        'sources' : sources,
        'confidence' : confidence
    }

    if return_context:
        output['context'] = context
    
    return output

In [118]:
result = rag_advanced("Punishment for sexual harrasment",retriver=rag_retriever,llm=llm)
result

Retrieving documents for query: 'Punishment for sexual harrasment'
Top K: 5, Score threshold: 0.2
Generating embeddings for 1 texts..


Batches: 100%|██████████| 1/1 [00:00<00:00, 12.36it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents (after filtering)





{'answer': 'For sexual harassment involving a demand or request for sexual favours, or showing pornography against the will of a woman, the punishment is rigorous imprisonment for a term which may extend to three years, or with fine, or with both.\n\nFor sexual harassment involving making sexually coloured remarks, the punishment is imprisonment of either description for a term which may extend to one year, or with fine, or with both.',
 'sources': [{'source': '../data/pdf/repealedfileopen.pdf',
   'page': 83,
   'score': 0.3714944124221802,
   'preview': '84 \n \n(ii) a demand or request for sexual favours; or \n(iii) showing pornography against the will of a woman; or \n(iv) m.....'},
  {'source': '../data/pdf/repealedfileopen.pdf',
   'page': 83,
   'score': 0.3714944124221802,
   'preview': '84 \n \n(ii) a demand or request for sexual favours; or \n(iii) showing pornography against the will of a woman; or \n(iv) m.....'},
  {'source': '../data/pdf/repealedfileopen.pdf',
   'page': 

In [119]:
# Advanced pipeline with steaming,History,citations,summarization...

from typing import Dict, Any
import time


class AdvancedRAGPipeline:
    def __init__(self, retriever, llm):
        self.retriever = retriever
        self.llm = llm
        self.history = []  # To store query history

    def query(
        self,
        question: str,
        top_k: int = 5,
        min_score: float = 0.2,
        stream: bool = False,
        summarize: bool = False,
    ) -> Dict[str, Any]:
        # retrieve relevent documents
        results = self.retriever.retrieve(
            question, top_k=top_k, score_threshold=min_score
        )
        if not results:
            answer = "No relevant context found."
            sources = []
            context = ""
        else:
            context = "\n\n".join([doc["content"] for doc in results])
            sources = [
                {
                    "source": doc["metadata"].get("source", "unknown"),
                    "page": doc["metadata"].get("page", "unknown"),
                    "score": doc["similarity_score"],
                    "preview": doc["content"][:120] + ".....",
                }
                for doc in results
            ]

            # Streaming answer simulation
            prompt = f""" 
            Use the following context to answer question concisely
            context:{context}
            question:{question}
            answer:
            """

            if stream:
                print("Streaming answer:")
                for i in range(0, len(prompt), 80):
                    print(prompt[i:i+80], end='', flush=True)
                    time.sleep(0.05)
                print()

            response = self.llm.invoke([prompt.format(context=context, question=question)])
            answer = response.content

        #Adding citation

        citations = [f"[{i+1}] {src['source']} page{src['page']}" for i, src in enumerate(sources)]
        answer_with_citation = answer + "\n\nCitations:\n" + "\n".join(citations) if citations else answer


        #Summarize the answer
        summary = None

        if summarize and answer:
            summary_prompt= f"Summarize the following answer in 2 snetences:\n{answer}"
            summary_response = self.llm.invoke([summary_prompt])
            summary = summary_response.content

        #Store query history

        self.history.append({
            'question':question,
            'answer':answer,
            'sources':sources,
            'summary': summary
        })


        return {
            'question': question,
            'answer': answer_with_citation,
            'sources': sources,
            'summary': summary,
            'history': self.history
        }

In [120]:
adv_rag = AdvancedRAGPipeline(rag_retriever,llm)
result = adv_rag.query(
    "Offences related to religion",
    top_k=3,
    min_score=0.1,
    
    summarize=True
)
print("\nFinal Answer:", result['answer'])
print("Summary:", result['summary'])
print("History:", result['history'][-1])

Retrieving documents for query: 'Offences related to religion'
Top K: 3, Score threshold: 0.1
Generating embeddings for 1 texts..


Batches: 100%|██████████| 1/1 [00:00<00:00,  3.79it/s]


Generated embeddings with shape: (1, 384)
Retrieved 3 documents (after filtering)

Final Answer: *   295. Injuring or defiling place of worship, with intent to insult the religion of any class.
*   295A. Deliberate and malicious acts, intended to outrage religious feelings of any class by insulting its religion or religious beliefs.
*   296. Disturbing religious assembly.
*   297. Trespassing on burial places, etc.
*   298. Uttering words, etc., with deliberate intent to wound the religious feelings.

Citations:
[1] ../data/pdf/repealedfileopen.pdf page7
[2] ../data/pdf/repealedfileopen.pdf page7
[3] ../data/pdf/repealedfileopen.pdf page7
Summary: These legal provisions criminalize deliberate acts intended to insult or outrage religious feelings and beliefs, encompassing actions like defiling places of worship or uttering offensive words. Additionally, they address disturbing religious assemblies and trespassing on sacred burial grounds.
History: {'question': 'Offences related to relig