In [2]:
import os
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path

In [4]:
#read all the pdfs from the directory
def process_all_pdfs(directory):
    """Load every PDF found under ``directory`` (searched recursively).

    Files are visited in sorted path order so repeated runs yield the
    documents in the same order, and the ``.pdf`` extension is matched
    case-insensitively. Every loaded document gets two extra metadata keys:
    ``source_file`` (the file's base name) and ``file_type`` (``'pdf'``).

    A file that fails to load is reported on stdout and skipped; it does not
    abort processing of the remaining files.

    Args:
        directory: Folder to scan (anything ``pathlib.Path`` accepts).

    Returns:
        list: The documents returned by the loader for all readable PDFs.
    """
    all_documents = []
    pdf_dir = Path(directory)

    # Directory iteration order depends on the filesystem, so sort explicitly
    # to make the resulting document order reproducible.
    pdf_files = sorted(
        path for path in pdf_dir.rglob("*")
        if path.is_file() and path.suffix.lower() == ".pdf"
    )
    print(f'Found {len(pdf_files)} PDF files')

    for file in pdf_files:
        print(f'\n processing: {file.name}:')
        try:
            pdf_loader = PyMuPDFLoader(str(file))
            docs = pdf_loader.load()

            # Tag each document so its origin survives chunking.
            for doc in docs:
                doc.metadata['source_file'] = file.name
                doc.metadata['file_type'] = 'pdf'

            all_documents.extend(docs)
            print(f'loaded {len(docs)} pages')

        except Exception as e:
            # Best effort: one unreadable PDF must not lose the others.
            print(f'Error : {e}')

    print(f'Total document loaded {len(all_documents)}')
    return all_documents


In [5]:
# Load every PDF under ../pdfs (path is relative to the notebook's working directory).
all_pdf_documents = process_all_pdfs('../pdfs')

Found 2 PDF files

 processing: encoder-decoder.pdf:
loaded 9 pages

 processing: transformers_comparision.pdf:
loaded 58 pages
Total document loaded 67


In [6]:
#create chunks
def split_document(document,chunk_size=1000,chunk_overlap=200):
    """Break documents into overlapping text chunks.

    Args:
        document: Sequence of documents handed to the splitter's
            ``split_documents``.
        chunk_size: Upper bound on chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        The list of chunks produced by the splitter. A short preview of the
        first chunk is printed when at least one chunk exists.
    """
    # Separators are tried in order: paragraph, line, word, then character.
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
        "separators": ["\n\n","\n"," ", ""],
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)

    pieces = chunker.split_documents(document)
    print(f"split {len(document)} into {len(pieces)} chunks")

    if not pieces:
        return pieces

    first_piece = pieces[0]
    print("Example chunk")
    print(f"content: {first_piece.page_content[:200]}...")
    print(f"metadata: {first_piece.metadata}")
    return pieces

In [7]:
# Chunk all loaded documents with the default chunk_size=1000 / chunk_overlap=200.
chunks = split_document(all_pdf_documents)

split 67 into 374 chunks
Example chunk
content: arXiv:1409.3215v3  [cs.CL]  14 Dec 2014
Sequence to Sequence Learning
with Neural Networks
Ilya Sutskever
Google
ilyasu@google.com
Oriol Vinyals
Google
vinyals@google.com
Quoc V. Le
Google
qvl@google....
metadata: {'producer': 'GPL Ghostscript GIT PRERELEASE 9.08', 'creator': 'dvips(k) 5.991 Copyright 2011 Radical Eye Software', 'creationdate': '2014-12-15T20:32:57-05:00', 'source': '..\\pdfs\\encoder-decoder.pdf', 'file_path': '..\\pdfs\\encoder-decoder.pdf', 'total_pages': 9, 'format': 'PDF 1.4', 'title': 'arXiv:1409.3215v3  [cs.CL]  14 Dec 2014', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2014-12-15T20:32:57-05:00', 'trapped': '', 'modDate': "D:20141215203257-05'00'", 'creationDate': "D:20141215203257-05'00'", 'page': 0, 'source_file': 'encoder-decoder.pdf', 'file_type': 'pdf'}


## Vector Embeddings

In [8]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
import uuid
from typing import List, Dict, Any, Tuple
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
class EmbeddingManager:
    """Loads a SentenceTransformer model once and produces embeddings with it."""

    def __init__(self,model_name="all-MiniLM-L6-v2"):
        """Store the model name and load the model immediately.

        Args:
            model_name: Name passed to ``SentenceTransformer``.

        Raises:
            Exception: Whatever the model loader raises; it is logged and
                re-raised by ``_load_model``.
        """
        self.model_name = model_name
        self.model = None
        self._load_model()

    def _load_model(self):
        """Instantiate the model, logging and re-raising any failure."""
        try:
            print(f"Loading Embedding model: {self.model_name}")
            self.model = SentenceTransformer(self.model_name)
            print(f"Model successfully loaded with Embedding dimensions: {self.model.get_sentence_embedding_dimension()}")
        except Exception as e:
            print(f"Error: {e}")
            raise

    def generateEmbeddings(self,text: List[str]) -> np.ndarray:
        """Encode a list of strings.

        Args:
            text: Strings to embed.

        Returns:
            The array returned by the model's ``encode`` call.

        Raises:
            ValueError: If no model has been loaded.
        """
        # Compare against None explicitly: a truthiness test on the model
        # object goes through __len__/__bool__, which says nothing about
        # whether a model was loaded.
        if self.model is None:
            raise ValueError("Model not loaded")

        print(f"Genetating embedding for {len(text)} texts...")
        embeddings = self.model.encode(text,show_progress_bar=True)
        print(f"Generated embeddings with shape: {embeddings.shape}")
        return embeddings

    def get_embedding_dimension(self)-> int:
        """Return the embedding dimension reported by the loaded model.

        Raises:
            ValueError: If no model has been loaded.
        """
        if self.model is None:
            raise ValueError("model not loaded")
        return self.model.get_sentence_embedding_dimension()
    

# Shared instance for later cells; constructing it loads the default "all-MiniLM-L6-v2" model.
embedding_manager = EmbeddingManager()

Loading Embedding model: all-MiniLM-L6-v2
Model successfully loaded with Embedding dimensions: 384


In [10]:
class VectorStore:
    """Persistent ChromaDB collection holding chunk texts, metadata and embeddings."""

    def __init__(self, collection_name:str = "pdf_document", persist_directory:str= "../vector_store"):
        """Open (or create) the collection on disk.

        Args:
            collection_name: Name of the Chroma collection.
            persist_directory: Folder used by the persistent client; created
                if it does not exist.

        Raises:
            Exception: Any error raised while opening the store (logged and
                re-raised by ``_initialize_store``).
        """
        self.collection_name = collection_name
        self.persist_directory = persist_directory
        self.client = None
        self.collection = None
        self._initialize_store()

    def _initialize_store(self):
        """Create the persistence folder, the client and the collection."""
        try:
            os.makedirs(self.persist_directory,exist_ok=True)
            self.client = chromadb.PersistentClient(path=self.persist_directory)
            #get or create collection
            self.collection = self.client.get_or_create_collection(
                name=self.collection_name,
                metadata={"description":"PDF document embedding for RAG"}
            )
        except Exception as e:
            print(f"Error while loading vector store: {e}")
            # Re-raise: continuing with collection=None only defers the
            # failure to an opaque AttributeError on first use.
            raise

    @staticmethod
    def _content_digest(doc) -> str:
        """Return a stable hex digest derived from a chunk's source, page and text."""
        key = f"{doc.metadata.get('source', '')}|{doc.metadata.get('page', '')}|{doc.page_content}"
        return uuid.uuid5(uuid.NAMESPACE_URL, key).hex[:16]

    def add_document(self,document:list[Any], embeddings :np.ndarray):
        """Store chunks and their embeddings in the collection.

        Ids are derived from each chunk's content, so storing the same chunks
        again overwrites the existing entries instead of duplicating them.

        Args:
            document: Chunk objects exposing ``page_content`` and ``metadata``.
            embeddings: One embedding per chunk, in the same order.

        Raises:
            ValueError: If the number of chunks and embeddings differ.
            Exception: Any error raised by the collection while writing
                (logged and re-raised).
        """
        if len(document) != len(embeddings):
            raise ValueError("Number of documents does not match number of embeddings")

        if len(document) == 0:
            # Nothing to write; avoid sending empty lists to the collection.
            print("No documents to add to the vector store")
            return

        print(f"Adding {len(document)} to the vector store...")

        #prepare the data for chromadb
        ids = []
        metadatas = []
        document_texts = []
        occurrences = {}

        for i, doc in enumerate(document):
            # Deterministic id: content digest plus an occurrence counter so
            # identical chunks inside one batch still get distinct ids.
            digest = self._content_digest(doc)
            occurrence = occurrences.get(digest, 0)
            occurrences[digest] = occurrence + 1
            ids.append(f"doc_{digest}_{occurrence}")

            #prepare metadata (copy so the caller's chunk is left untouched)
            metadata = dict(doc.metadata)
            metadata['doc_index'] = i
            metadata['content_length'] = len(doc.page_content)
            metadatas.append(metadata)

            #Document content
            document_texts.append(doc.page_content)

        # Plain nested lists of floats for the collection API.
        embedding_lists = np.asarray(embeddings).tolist()

        #Write into the collection; upsert keeps re-runs idempotent
        try:
            self.collection.upsert(
                ids=ids,
                metadatas=metadatas,
                embeddings=embedding_lists,
                documents=document_texts
            )
            print(f"Successfully add {len(document)} into the vector store")
            print(f"Total documents in the collection: {self.collection.count()}")
        except Exception as e:
            print(f"Error adding document into the collection: {e}")
            raise

# Shared store using the defaults: collection "pdf_document" persisted under ../vector_store.
vectorstore = VectorStore()

In [11]:
# Extract the raw text of every chunk; order matches `chunks`.
text = [doc.page_content for doc in chunks]
# Embed all chunk texts in one call (one embedding per chunk, same order).
embeddings = embedding_manager.generateEmbeddings(text)
# Persist the chunks together with their embeddings.
# NOTE(review): the recorded output reports a collection count of 748 after
# adding 374 items, i.e. the persistent store already held entries when this ran.
vectorstore.add_document(chunks,embeddings)

Genetating embedding for 374 texts...


Batches: 100%|██████████| 12/12 [00:16<00:00,  1.39s/it]


Generated embeddings with shape: (374, 384)
Adding 374 to the vector store...
Successfully add 374 into the vector store
Total documents in the collection: 748


## Retriever Pipeline for Vector Store

In [12]:
class RAGRetriever:
    """Embeds a query and fetches the closest chunks from the vector store."""

    def __init__(self,vector_store: "VectorStore", embedding_manager: "EmbeddingManager"):
        """Keep references to the store to search and the embedder to use.

        Args:
            vector_store: Object exposing a Chroma ``collection`` attribute.
            embedding_manager: Object exposing ``generateEmbeddings``.
        """
        self.vector_store = vector_store
        self.embedding_manager = embedding_manager

    @staticmethod
    def _to_similarity(distance: float, space: str) -> float:
        """Convert a Chroma distance into a similarity score.

        Chroma reports ``1 - cos`` for ``"cosine"`` and ``1 - dot`` for
        ``"ip"``, so ``1 - distance`` is the similarity there. For ``"l2"``
        (Chroma's default) the distance is the squared Euclidean distance,
        which for unit-length vectors equals ``2 - 2*cos``; the cosine
        similarity is then ``1 - distance / 2``.
        NOTE(review): the l2 branch assumes unit-normalised embeddings -
        confirm for the embedding model in use.
        """
        if space == "l2":
            return 1.0 - distance / 2.0
        return 1.0 - distance

    def retrieve(self, query:str, top_k:int = 5, score_threshold:float=0.0,)->List[Dict[str,Any]]:
        """Return the chunks most similar to ``query``.

        Args:
            query: Free-text query.
            top_k: Maximum number of results requested from the store.
            score_threshold: Results with a similarity below this are dropped.

        Returns:
            A list of dicts with keys ``id``, ``content``, ``metadata``,
            ``similarity_score``, ``distance`` and ``rank`` (1-based position
            in the store's result list). Empty if nothing matches or the
            search fails; search errors are printed rather than raised.
        """
        print(f"Retrieving documents for query: {query}")
        print(f"Top_k: {top_k}, Score_threshold: {score_threshold}")

        #Generate embedding vector for query
        query_embedding = self.embedding_manager.generateEmbeddings([query])[0]

        #search in the vector store
        try:
            collection = self.vector_store.collection
            # The distance metric is fixed when the collection is created;
            # Chroma falls back to "l2" when "hnsw:space" is not set.
            # Only the collection metadata key is consulted here.
            collection_metadata = getattr(collection, "metadata", None) or {}
            space = collection_metadata.get("hnsw:space", "l2")

            results = collection.query(
                query_embeddings=[query_embedding.tolist()],
                n_results=top_k
            )

            #process the results
            retrieved_docs = []
            if results['documents'] and results['documents'][0]:
                documents = results['documents'][0]
                metadatas = results['metadatas'][0]
                distances = results['distances'][0]
                ids = results['ids'][0]

                for i, (doc_id,document,metadata,distance,) in enumerate(zip(ids,documents,metadatas,distances)):
                    #convert the store's distance into a similarity score
                    similarity_score = self._to_similarity(distance, space)

                    if similarity_score >= score_threshold:
                        retrieved_docs.append({
                            'id' :doc_id,
                            'content':document,
                            'metadata': metadata,
                            'similarity_score': similarity_score,
                            'distance' : distance,
                            'rank': 1+i
                        })
                print(f"Retrieved {len(retrieved_docs)} documents afer filter")
            else:
                print("No document found")
            return retrieved_docs
        except Exception as e:
            print(f"Error: {e}")
            return []
        
# Wire the retriever to the shared vector store and embedding manager instances.
rag_retriever = RAGRetriever(vectorstore,embedding_manager)

In [13]:
# Sanity check: query with a passage taken from an ingested paper's abstract, using the default top_k=5.
rag_retriever.retrieve(" Deep Neural Networks (DNNs) are powerful models that have achieved excellent performance on difficult learning tasks. Although DNNs work well whenever large labeled training sets are available, they cannot be used to map sequences to sequences. In this paper, we present a general ")

Retrieving documents for query:  Deep Neural Networks (DNNs) are powerful models that have achieved excellent performance on difficult learning tasks. Although DNNs work well whenever large labeled training sets are available, they cannot be used to map sequences to sequences. In this paper, we present a general 
Top_k: 5, Score_threshold: 0.0
Genetating embedding for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 11.13it/s]

Generated embeddings with shape: (1, 384)
Retrieved 5 documents afer filter





[{'id': 'doc_c6097cfa_0',
  'content': 'arXiv:1409.3215v3  [cs.CL]  14 Dec 2014\nSequence to Sequence Learning\nwith Neural Networks\nIlya Sutskever\nGoogle\nilyasu@google.com\nOriol Vinyals\nGoogle\nvinyals@google.com\nQuoc V. Le\nGoogle\nqvl@google.com\nAbstract\nDeep Neural Networks (DNNs) are powerful models that have achieved excel-\nlent performance on difﬁcult learning tasks. Although DNNs work well whenever\nlarge labeled training sets are available, they cannot be used to map sequences to\nsequences. In this paper, we present a general end-to-end approach to sequence\nlearning that makes minimal assumptions on the sequence structure. Our method\nuses a multilayered Long Short-Term Memory (LSTM) to map the input sequence\nto a vector of a ﬁxed dimensionality, and then another deep LSTM to decode the\ntarget sequence from the vector. Our main result is that on an English to French\ntranslation task from the WMT’14 dataset, the translations produced by the LSTM\nachieve a BLEU sc

## Integrating Vector Database with LLM Output

In [3]:
#simple RAG pipeline with groq llm
from langchain_groq import ChatGroq
from dotenv import load_dotenv

# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Read the API key from the environment instead of hardcoding it; os.getenv returns None when unset.
groq_api_key = os.getenv("GROQ_API_KEY")
# Low temperature (0.1) and a 1024-token cap on the generated answer.
llm = ChatGroq(api_key=groq_api_key, model='llama-3.3-70b-versatile', temperature=0.1, max_tokens=1024)

#simple rag function to retrieve doc and generate output
def simple_rag(query, rag_retriever,llm,top_k=3):
    """Answer ``query`` using retrieved chunks as context for the LLM.

    Args:
        query: The user's question.
        rag_retriever: Object whose ``retrieve(query=..., top_k=...)`` returns
            a list of dicts containing a ``'content'`` key.
        llm: Object whose ``invoke(prompt)`` returns a response with a
            ``content`` attribute.
        top_k: Number of chunks to request from the retriever.

    Returns:
        str: The LLM's answer, or a fixed fallback message when no context
        was retrieved.
    """
    #retrieve context
    result = rag_retriever.retrieve(query=query,top_k=top_k)
    context = "\n\n".join([doc['content'] for doc in result]) if result else ""
    if not context:
        return "No relevent context found to answer the question."

    #generate the answer using groq llm
    # The f-string below already substitutes context and query. Do NOT call
    # .format() on the result: any "{" or "}" in the retrieved text or the
    # question would then be parsed as a replacement field and raise.
    prompt = f"""use the following context to answer the question concisely
        context:
        {context}

        Question: {query}
    
        Answer:"""

    response = llm.invoke(prompt)
    return response.content

In [25]:
# End-to-end check: retrieve context (simple_rag's default top_k=3) and let the LLM answer from it.
answer = simple_rag("what is encoder-decoder architecture?",rag_retriever,llm)
print(answer)

Retrieving documents for query: what is encoder-decoder architecture?
Top_k: 3, Score_threshold: 0.0
Genetating embedding for 1 texts...


Batches: 100%|██████████| 1/1 [00:00<00:00, 64.03it/s]

Generated embeddings with shape: (1, 384)
Retrieved 3 documents afer filter





The encoder-decoder architecture refers to the transformer architecture, which consists of two main modules: 

1. **Encoder Module**: Comprises Feed-Forward Layer, Multi-Head Attention Layer, Residual connections, and Add and Norm layers. It receives embedding input and generates parameter matrices (Q, K, V) for the Multi-Head Attention layer.
2. **Decoder Module**: Similar to the Encoder module, but with additional layers such as Masked Multi-Head Attention. It also includes Feed-Forward, Multi-Head Attention, Residual connection, and Add layers.
