In [3]:
import os

import chromadb
from dotenv import load_dotenv
from langchain.chains import LLMChain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from openai import OpenAI

In [4]:
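# One-time setup: uncomment to install the dependencies into the kernel's environment
# (%pip targets the running kernel, unlike !pip).
# %pip install langchain
# %pip install langchain-openai
# %pip install langchain-community
# %pip install langchain-chroma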

In [5]:
# Load variables from a local .env file into the process environment so the
# os.getenv() lookups below can see them. Left as a bare expression so the cell
# displays the call's return value.
load_dotenv()

True

In [6]:
# Pull runtime settings out of the environment (after the earlier load_dotenv() call).
# Each name is None when its variable is unset.
# NOTE(review): the later cells shown here hard-code "./chroma_db" instead of
# reading chroma_db_dir -- confirm which location is intended.
openai_api_key, chroma_db_dir = (
    os.environ.get(var_name) for var_name in ("OPENAI_API_KEY", "CHROMA_DB_DIR")
)

## Loading the Dataset

In [7]:
# Load every PDF found directly inside the data folder into one flat list.
# Each loader.load() call returns a list of Document objects that is appended
# to all_documents (PyPDFLoader presumably yields one Document per page, so the
# count printed below is larger than the number of files -- confirm).
all_documents = []

file_path = "./db"
# Sort the listing: os.listdir() returns entries in arbitrary, platform-dependent
# order, which would make the document (and later chunk) order differ between runs.
for file in sorted(os.listdir(file_path)):
    # Compare the extension case-insensitively so files such as "Report.PDF"
    # are not silently skipped.
    if file.lower().endswith(".pdf"):
        loader = PyPDFLoader(os.path.join(file_path, file))
        all_documents.extend(loader.load())

print(f"Total documents loaded: {len(all_documents)}")

Total documents loaded: 109


## Chunking the Dataset

In [8]:
# Splitter used to cut the loaded documents into overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len,        # chunk length is measured with len(), i.e. in characters
    chunk_size=1000,            # upper bound on the length of a chunk
    chunk_overlap=200,          # length shared between neighbouring chunks
    is_separator_regex=False,   # separators are treated as literal strings, not regexes
)

In [9]:
# Split the loaded documents into chunks with the splitter configured above.
chunked_docs = text_splitter.split_documents(documents=all_documents)

# Optional inspection of the result (uncomment as needed):
# print(f"Total chunks created: {len(chunked_docs)}")
# print(chunked_docs[0].page_content[:])
# print(chunked_docs[0].metadata)

## Embedding the Dataset

In [10]:
# Embedding model handed to the vector store below; it is passed as the
# store's embedding_function when the persisted DB is opened.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
)

In [11]:
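# One-time index build: uncomment to embed the chunks and create the local Chroma DB.
# Keep it commented out on normal runs so the documents are not re-embedded every time.
# vectorstore = Chroma.from_documents(
#     documents=chunked_docs,
#     embedding=embeddings,
#     persist_directory="./chroma_db"   # folder where Chroma stores data
# )

# No explicit persist step is needed: with persist_directory set, the store is written
# to disk automatically, and recent langchain_chroma releases no longer provide
# vectorstore.persist().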

In [11]:
# Sanity check: open the on-disk Chroma store directly and list the collections
# it contains, to confirm the collection name used when loading it via LangChain.
# (chromadb is imported in the notebook's top import cell rather than here, so all
# dependencies are visible in one place.)
client = chromadb.PersistentClient(path="./chroma_db")
print(client.list_collections())

[Collection(name=langchain)]


In [12]:
# Load vectordb from disk
chroma_db = Chroma(persist_directory="./chroma_db",
                   collection_name="langchain",
                  embedding_function = embeddings)


## Semantic Similarity-Based Retrieval

In [13]:
# Retriever configured with a score threshold: k=3 and score_threshold=0.2 are
# passed through as search arguments (presumably "at most 3 hits whose relevance
# score is at least 0.2" -- confirm against the retriever docs).
similarity_retriever = chroma_db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.2},
)

In [None]:
# Smoke-test the thresholded retriever with a sample question. `query` is reused
# by the QA chain at the end of the notebook.
query = "what are training dataset"
# Retrievers implement the Runnable interface: .invoke() is the supported entry
# point and replaces the deprecated get_relevant_documents().
results = similarity_retriever.invoke(query)

# Uncomment to inspect the retrieved chunks:
# for i, doc in enumerate(results, 1):
#     print(f"\nResult {i}:")
#     print("Content:", doc.page_content)
#     print("Metadata:", doc.metadata)

## Generative Logic using LangChain and GPT

In [None]:
# Chat model that writes the final answer; temperature is set to 0.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-4o-mini",
)

In [None]:
# Prompt text for the QA chain. It contains two placeholders, {context} and
# {question}, which are declared as the input variables of the PromptTemplate
# built from this string. The text tells the model to answer from the supplied
# context and to reply with a fixed out-of-scope message only when the context
# holds nothing relevant.
prompt_template = """
You are a helpful assistant. Use the provided context to answer the question.
If you can find relevant information in the context, provide an answer based on that information.
Only if you cannot find ANY relevant information should you respond with:
"This tool only answers questions based on the documents in its database. Please ask something within that scope."

Context:
{context}

Question: {question}

Answer:
"""


In [None]:
# Wrap the raw prompt text in a PromptTemplate, declaring the two placeholders
# ("context" and "question") that must be supplied when it is formatted.
custom_prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"],
)

In [None]:
# Create a new retriever with more lenient settings for testing
test_retriever = chroma_db.as_retriever(
    search_type="similarity", 
    search_kwargs={'k': 3}  # Remove score threshold temporarily
)

In [None]:
# Assemble the retrieval-augmented QA chain from the test retriever, the LLM
# and the custom prompt.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=test_retriever,                     # the lenient retriever, not the thresholded one
    chain_type_kwargs={"prompt": custom_prompt},  # use the custom prompt defined above
    return_source_documents=True,                 # result also carries "source_documents"
    verbose=True,                                 # show what is being passed to the LLM
)

In [None]:
# Run the QA chain on the sample query. Chains are invoked through .invoke();
# calling the chain object directly (qa_chain({...})) is deprecated.
# The returned dict holds the answer under "result" and, because the chain was
# built with return_source_documents=True, the retrieved chunks under
# "source_documents".
result = qa_chain.invoke({"query": query})
print("Answer:", result["result"])
print("Number of source documents:", len(result["source_documents"]))