### Experimenting on RAG


###### PDFs : Text, Image, Table data,  minimum 200 pages

In [None]:
import os
from uuid import uuid4
import time
from pinecone import Pinecone
import docx
from typing import List
from langchain_pinecone import PineconeVectorStore
# from langchain_community.vectorstores import Pinecone as LC_Pinecone 
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.retrievers import BM25Retriever, MultiVectorRetriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from sentence_transformers import SentenceTransformer
from langchain.evaluation.qa import QAEvalChain
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from dotenv import load_dotenv
from pinecone import ServerlessSpec
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import ChatGoogleGenerativeAI



load_dotenv()


PINECONE_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENVIRONMENT")


In [56]:
# Load a PDF and validate the page count
file_path = "C:\\Users\\sunny\\OneDrive\\Documents\\Agentic AI Course\\agentic_AI\\2023050195.pdf"
loader = PyPDFLoader(file_path)
pages = loader.load()

if len(pages) < 200:
    raise ValueError("The PDF must have at least 200 pages.")

#Using semantic chunking with recursive text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = text_splitter.split_documents(pages)


# Hugging Face embedding
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


In [67]:
pinecone = Pinecone(api_key=PINECONE_KEY)
index_name = "rag-index-new"

if not pinecone.has_index(index_name):
    pinecone.create_index(index_name, dimension=384, metric="cosine",spec=ServerlessSpec(
    cloud="aws",
    region="us-east-1"
  ))
index = pinecone.Index(index_name)


In [68]:
chunk_texts = [chunk.page_content for chunk in chunks]
docs = []
uuids = []
for chunk in chunk_texts:
    uuids.append(str(uuid4()))
    doc = Document( page_content = chunk,metadata={"source":"Constitution of India"})
    docs.append(doc)
    

#### Pinecone natively uses HNSW

In [69]:
# Store embeddings in Pinecone
docsearch = PineconeVectorStore(index=index, embedding=embedding_model)

# docsearch = LC_Pinecone.from_documents(documents=chunks, embedding=embedding_model, index_name=index_name)


In [70]:
docsearch.add_documents(documents=docs, ids=uuids)

['2aaf63fc-d822-4b3f-863e-e2012733adcb',
 '8a6f5d6a-0c71-460c-942c-dea2a33d9427',
 '258dd332-5270-46ed-be97-a768cf69a62a',
 'ca9e9af2-0745-4fbc-b59e-ec72bb808e2e',
 'b69ca3b7-ebbe-4e27-86b1-484688ceef31',
 '6e2ee2f9-bb08-4d18-9fe6-ac5ae630e490',
 'b84e919c-77d8-4280-a4ee-45d627c18445',
 '9ffaa33e-d5cc-41c9-bf17-e8997f36c870',
 '643838c4-cb4e-48d5-8cff-21e4f74ba697',
 'ed333d83-12dc-4849-8f3f-7cd0becf3564',
 '962744a3-d834-49a1-8def-915db82a6cc7',
 '26ca447e-3a71-42f6-ad34-58dbad05073d',
 'bd4dcb02-c92a-47d1-b955-ccb8b85e841e',
 '0bfc99eb-6f41-4aec-b5b0-4a142da0ee10',
 '5e73dec4-56e3-4f8c-b00f-175685a4d890',
 'ff2f2ba7-8a89-4e4f-ac9e-120d875c640c',
 '0cfe665e-22cf-41ee-b04b-9f14cb7eff8c',
 '381219b7-e9fc-47f3-aa3f-30cabfcbc7b8',
 'cbd8962f-a785-4625-b7b1-cb2a15b455ea',
 '6a2f710b-a52e-4eb6-9591-5c3f6f7352e7',
 'eb9357b3-37c5-4f1d-8d9e-0f3c2972ddbd',
 '1334164d-8828-4696-b7ca-03767f0fe734',
 '7f3f0081-7333-48cc-a481-41f0815fa526',
 '34161a6a-fd08-4731-a7d9-d9984e92f451',
 'c2ece01f-e6f0-

##### For Flat/IVF comparisons, use FAISS:

In [71]:
from langchain.vectorstores import FAISS

faiss_flat = FAISS.from_documents(chunks, embedding_model)
faiss_ivf = FAISS.from_documents(chunks, embedding_model)  # IVF only by clustering before indexing (simulated)


#### Create Retriever Pipelines

In [72]:
retriever_pinecone = docsearch.as_retriever()
retriever_flat = faiss_flat.as_retriever()
retriever_ivf = faiss_ivf.as_retriever()


#### Measure Retrieval Time

In [73]:
query = "What are the fundamental duties?"

def measure_time(retriever):
    start = time.time()
    docs = retriever.invoke(query)
    return time.time() - start, docs

pine_time, pine_docs = measure_time(retriever_pinecone)
flat_time, flat_docs = measure_time(retriever_flat)
ivf_time, ivf_docs = measure_time(retriever_ivf)

print(f"Pinecone (HNSW): {pine_time:.3f}s | Flat: {flat_time:.3f}s | IVF: {ivf_time:.3f}s")


Pinecone (HNSW): 1.107s | Flat: 0.017s | IVF: 0.016s


In [74]:
print(pine_docs)

print(flat_docs)

[Document(id='42954f73-e16b-4724-84c6-25d6a0bdb44f', metadata={'source': 'Constitution of India'}, page_content='25 \n1[PART  IVA \n \nFUNDAMENTAL DUTIES \n51A. Fundamental duties.—It shall be the duty of every citizen of \nIndia— \n(a) to abide by the Constitution and respect its ideals and \ninstitutions, the National Flag and the National Anthem; \n(b) to cherish and follow the noble ideals which inspired our \nnational struggle for freedom; \n(c) to uphold and protect the sovereignty, unity and integrity of \nIndia; \n(d) to defend the country and render national service when called \nupon to do so;'), Document(id='a5a62c22-00e3-4cf4-8422-469fff77ff2d', metadata={'source': 'Constitution of India'}, page_content='duties of a legal character, as may from time to time be referred or assigned to \nhim by the Governor, and to discharge the functions conferred on him by or \nunder this Constitution or any other law for the time being in force. \n(3) The Advocate-General shall hold office

#### Accuracy Score for Each

In [75]:
def compute_accuracy(query, docs):
    model = SentenceTransformer("all-MiniLM-L6-v2")
    query_emb = model.encode(query)
    doc_embs = model.encode([doc.page_content for doc in docs])
    print("Calculating Similarity Scores")
    sims = cosine_similarity([query_emb], doc_embs)[0]
    return float(np.max(sims))

print("Accuracy Scores:")
print("Pinecone:", compute_accuracy(query, pine_docs))
print("Flat:", compute_accuracy(query, flat_docs))
print("IVF:", compute_accuracy(query, ivf_docs))


Accuracy Scores:
Calculating Similarity Scores
Pinecone: 0.6157674193382263
Calculating Similarity Scores
Flat: 0.6157674193382263
Calculating Similarity Scores
IVF: 0.6157674193382263


#### Reranking: BM25 or MMR


In [77]:
# BM25 reranker example
bm25 = BM25Retriever.from_documents(chunks)
bm25.k = 5
bm25_docs = bm25.get_relevant_documents(query)


#### Prompt Template

In [78]:
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="You are an assistant. Use the following context to answer the question.\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"
)


#### Generate Output with LLM

In [80]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [82]:
model=ChatGoogleGenerativeAI(model='gemini-1.5-flash')

rag_chain = (
    {"context": retriever_pinecone | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | model
    | StrOutputParser()
)


In [84]:
result = rag_chain.invoke("what are fundamental duties")

#### Render to DOCX

In [85]:
doc = docx.Document()
doc.add_heading("RAG Output", 0)
doc.add_paragraph(result)
doc.save("rag_output.docx")
