In [None]:

!pip install langchain openai faiss-cpu tiktoken PyPDF2 chromadb sentence-transformers


In [None]:

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
from langchain.vectorstores import FAISS, Chroma
from langchain.chat_models import ChatOpenAI
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import pipeline
import os
from dotenv import load_dotenv


In [None]:

# Read key=value pairs from a local .env file (if one is found) into the process environment.
# NOTE(review): presumably this is how OPENAI_API_KEY reaches the OpenAI clients created below; confirm.
load_dotenv()


In [None]:

# Single-file sanity check: load one known contract and report how many
# documents the loader produced for it.
contract_pdf = "data/contract_detailed.pdf"
loader = PyPDFLoader(contract_pdf)
documents = loader.load()
print(f"Loaded {len(documents)} pages from contract_detailed.pdf")


In [None]:

from pathlib import Path

# Collect the documents of every PDF under data/ into one flat list.
data_path = Path("data")

# Sort so files are loaded in the same order on every run; glob order depends
# on the filesystem, which would otherwise reorder the chunks built later.
# Materialising the list once also avoids globbing the directory a second time
# just to count the files.
pdf_files = sorted(data_path.glob("*.pdf"))
if not pdf_files:
    # Fail here with a clear message instead of passing an empty corpus to the
    # splitter and vector stores in later cells.
    raise FileNotFoundError(f"No PDF files found in {data_path.resolve()}")

all_docs = []
for file in pdf_files:
    docs = PyPDFLoader(str(file)).load()
    all_docs.extend(docs)
print(f"Loaded total {len(all_docs)} pages from {len(pdf_files)} PDFs")


In [None]:

# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Split every loaded document into overlapping chunks for embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(all_docs)
print(f"Total chunks created: {len(chunks)}")


In [None]:

# Two embedding backends to compare: OpenAI's embeddings with default settings...
embedding_oa = OpenAIEmbeddings()

# ...and a sentence-transformers model loaded through HuggingFaceEmbeddings.
embedding_hf = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)


In [None]:

# Index the same chunks twice: FAISS with the OpenAI embeddings, and a Chroma
# collection named "rag-demo" with the HuggingFace embeddings.
faiss_store = FAISS.from_documents(documents=chunks, embedding=embedding_oa)
chroma_store = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_hf,
    collection_name="rag-demo",
)


In [None]:

# Plain similarity search over the FAISS store, returning 3 chunks per query.
faiss_retriever = faiss_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# MMR search over the Chroma store, also returning 3 chunks per query.
mmr_retriever = chroma_store.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 3},
)


In [None]:

# Chat model shared by both QA chains below, configured with temperature 0.
llm_oa = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)

# Optional local alternative (left disabled): wrap a HuggingFace
# text-generation pipeline as the LLM instead of using OpenAI.
# hf_pipe = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2")
# llm_hf = HuggingFacePipeline(pipeline=hf_pipe)


In [None]:

# Both chains share the same LLM and return their source documents; only the
# retriever differs between them.
_qa_kwargs = {"llm": llm_oa, "return_source_documents": True}

qa_openai_faiss = RetrievalQA.from_chain_type(retriever=faiss_retriever, **_qa_kwargs)
qa_openai_mmr = RetrievalQA.from_chain_type(retriever=mmr_retriever, **_qa_kwargs)


In [None]:

def print_qa_result(label, result):
    """Print one chain's answer followed by the source of each returned document.

    Args:
        label: Heading text identifying the pipeline (e.g. "FAISS + OpenAI").
        result: Mapping returned by a QA chain; its 'result' and
            'source_documents' entries are read.
    """
    print(f"{label} Answer:\n", result['result'])
    print("\nSources:")
    for doc in result['source_documents']:
        # Fall back to "N/A" when a document carries no "source" metadata.
        print("-", doc.metadata.get("source", "N/A"))


# Ask both pipelines the same question so their answers can be compared.
query = "What are the termination conditions in the contract?"
result_faiss = qa_openai_faiss({"query": query})
result_mmr = qa_openai_mmr({"query": query})

print_qa_result("FAISS + OpenAI", result_faiss)
print()  # blank line separating the two reports
print_qa_result("MMR + OpenAI", result_mmr)



## 🔍 Comparison Notes

### Embeddings
- **OpenAIEmbeddings**: High quality; hosted API billed per token (requires an API key and network access)
- **HuggingFaceEmbeddings**: Local, fast, free

### Vector Stores
- **FAISS**: Great for local/offline
- **Chroma**: Convenient metadata filtering and optional persistence; used here for the MMR retriever (LangChain's FAISS wrapper supports MMR as well)

### Retrievers
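- **Similarity**: Returns the top-k chunks closest to the query embedding
- **MMR** (Maximal Marginal Relevance): Balances relevance against diversity, so the returned chunks are less redundant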

### Generators
- **OpenAI Chat**: Hosted model with strong answer quality; billed per token and requires network access
- **HuggingFacePipeline**: Local model option


In [None]:

# Persist the FAISS store to disk so it can be reused without re-embedding.
# NOTE(review): reloading presumably requires the same embedding model used to build it; confirm.
FAISS_INDEX_DIR = "vectorstore/faiss"
faiss_store.save_local(FAISS_INDEX_DIR)
