In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader, UnstructuredFileLoader

In [2]:
def get_documents(folder_path, file_extension):
    documents = []
    if file_extension == 'pdf':
        pdf_loader = DirectoryLoader(folder_path, glob="./*.pdf", loader_cls=PyPDFLoader)  # Select PDF files
        documents += pdf_loader.load()
    elif file_extension == 'txt':
        txt_loader = DirectoryLoader(folder_path, glob="./*.txt")  # Select TXT files
        documents += txt_loader.load()
    elif file_extension == 'docx':
        docx_loader = DirectoryLoader(folder_path, glob="./*.docx", loader_cls=UnstructuredWordDocumentLoader)
        documents += docx_loader.load()
    elif file_extension == 'combined':
        pdf_loader = DirectoryLoader(folder_path, glob="./*.pdf", loader_cls=PyPDFLoader)  # Select PDF files
        documents += pdf_loader.load()
        txt_loader = DirectoryLoader(folder_path, glob="./*.txt")  # Select TXT files
        documents += txt_loader.load()
    else:
        return None

    return documents

In [3]:
docs = get_documents("C:\\Users\\vishi\\Downloads\\New folder","docx")

In [45]:
print(docs[0].metadata["source"])

C:\Users\vishi\Downloads\New folder\Dotnet Resume.docx


In [13]:
#%pip install langchain-text-splitters

from langchain_text_splitters import CharacterTextSplitter

# use text from the first loaded document
#text = docs[0].page_content

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

chunks = text_splitter.split_documents(docs)

print(f"Divided into {len(chunks)} chunks")
print(f"First chunk:\n\n{chunks[0]}")

Created a chunk of size 1086, which is longer than the specified 1000


Divided into 62 chunks
First chunk:

page_content='Professional Summary

I am full stack .Net Developer with 13+ years of experience in all the phases of Software development life cycle, which includes software analysis, design, development, testing, implementation, maintenance and documentation of Client/Server and Web-based applications using N-Tier Architecture. Experience in various domains like Banking, Finance and Insurance industries.

Expertise on ASP.NET, C#, .NET, VB.NET, VB, Web API, LINQ, Entity Framework, .NET Core, MVC, ADO.NET, MS SQL Server, Restful API, Web API, Microservices.

I am a professional skilled in Deployment, Bug fixes, Production support, and maintenance using tools such as Microsoft Visual Studio, JIRA, SVN, TFS, VSTS, Visual Studio Code etc.

Efficient UI developer with efficient skills on Angular 2/4/6/8, Twitter Bootstrap, HTML, jQuery, HTML, HTML5, CSS, CSS3 JavaScript, Ajax, XML, JSON, Typescript with good hands on with browser side debugging and deve

In [13]:
#%pip install sentence_transformers
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [14]:

db_name="resume_db"
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

NameError: name 'chunks' is not defined

In [18]:
#load db if already created
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = Chroma(persist_directory=db_name, embedding_function=embeddings)

In [21]:
retriever = vectorstore.as_retriever()

In [16]:
from openai import OpenAI
openrouter_api_key = "sk-or-v1-fb3e8464dda71446faa8f7f7e659e2f0"

openrouter_url = "https://openrouter.ai/api/v1"
llm = OpenAI(base_url=openrouter_url, api_key=openrouter_api_key)

In [18]:
SYSTEM_PROMPT_TEMPLATE = """
You are a HR person in a company who got multiple resume for the 
candidates. You are chatting with your manager.
If relevant, use the given context to answer any question.
If you don't know the answer, say so.
Context:
{context}
"""

In [39]:
from langchain_core.messages import SystemMessage, HumanMessage
def answer_question(question: str, history):
    docs = retriever.invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)
    system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=context)
    #print(context)
    response = llm.chat.completions.create(
        model="mistralai/devstral-2512:free",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question}
        ]
    )
    return response.choices[0].message.content

In [40]:
ans = answer_question("tell the state in which Touchrate Inc client is located",[])
print(ans)

Touchrate Inc is located in Orlando, FL.


# ADVANCED RAG INSTEAD OF INBUILT CHUNKING USE LLM TO CHUNK

In [2]:
from pydantic import BaseModel, Field
class Result(BaseModel):
    page_content: str
    metadata: dict

    # A class to perfectly represent a chunk

class Chunk(BaseModel):
    headline: str = Field(description="A brief heading for this chunk, typically a few words, that is most likely to be surfaced in a query")
    summary: str = Field(description="A few sentences summarizing the content of this chunk to answer common questions")
    original_text: str = Field(description="The original text of this chunk from the provided document, exactly as is, not changed in any way")

    def as_result(self, document):
        metadata = {"source": document.metadata["source"]}
        return Result(page_content=self.headline + "\n\n" + self.summary + "\n\n" + self.original_text,metadata=metadata)


class Chunks(BaseModel):
    chunks: list[Chunk]

In [3]:
AVERAGE_CHUNK_SIZE = 500
def make_prompt(document):
    how_many = (len(document.page_content) // AVERAGE_CHUNK_SIZE) + 1
    return f"""
You take a document and you split the document into overlapping chunks for a KnowledgeBase.

The document is from the shared drive of a HR person.
The document has been retrieved from: {document.metadata["source"]}

A chatbot will use these chunks to answer questions about the candidates experience and qualifications.
You should divide up the document as you see fit, being sure that the entire document is returned in the chunks - don't leave anything out.
This document should probably be split into {how_many} chunks, but you can have more or less as appropriate.
There should be overlap between the chunks as appropriate; typically about 25% overlap or about 50 words, so you have the same text in multiple chunks for best retrieval results.

For each chunk, you should provide a headline, a summary, and the original text of the chunk.
Together your chunks should represent the entire document with overlap.

Here is the document:

{document.page_content}

Respond with the chunks.
"""

In [57]:
def make_messages(document):
    return [
        {"role": "user", "content": make_prompt(document)},
    ]

def process_document(document):
    messages = make_messages(document)
    response = llm.chat.completions.parse(
        model="mistralai/devstral-2512:free",
        messages=messages, response_format=Chunks
    )
    reply = response.choices[0].message.content
    doc_as_chunks = Chunks.model_validate_json(reply).chunks
    return [chunk.as_result(document) for chunk in doc_as_chunks]

In [58]:
from tqdm import tqdm

def create_chunks(documents):
    chunks = []
    for doc in tqdm(documents):
        chunks.extend(process_document(doc))
    return chunks

chunks = create_chunks(docs)

100%|██████████| 3/3 [02:32<00:00, 50.71s/it]


In [60]:
print(chunks[0])

page_content='Professional Summary Overview\n\nThe candidate is a full stack .Net Developer with over 13 years of experience in various phases of the software development life cycle, including analysis, design, development, testing, and maintenance. They have expertise in multiple domains such as Banking, Finance, and Insurance.\n\nProfessional Summary\n\nI am full stack .Net Developer with 13+ years of experience in all the phases of Software development life cycle, which includes software analysis, design, development, testing, implementation, maintenance and documentation of Client/Server and Web-based applications using N-Tier Architecture. Experience in various domains like Banking, Finance and Insurance industries.' metadata={'source': 'C:\\Users\\vishi\\Downloads\\New folder\\Dotnet Resume.docx'}


In [63]:
from chromadb import PersistentClient
collection_name = "docs"

def create_embeddings(chunks):
    chroma = PersistentClient(path="advancedRAG")
    if collection_name in [c.name for c in chroma.list_collections()]:
        chroma.delete_collection(collection_name)

    texts = [chunk.page_content for chunk in chunks]
    emb = embeddings.embed_documents(texts)
    emb = [type('obj', (object,), {'embedding': e})() for e in emb]
    vectors = [e.embedding for e in emb]

    collection = chroma.get_or_create_collection(collection_name)

    ids = [str(i) for i in range(len(chunks))]
    metas = [chunk.metadata for chunk in chunks]

    collection.add(ids=ids, embeddings=vectors, documents=texts, metadatas=metas)
    print(f"Vectorstore created with {collection.count()} documents")

In [64]:
create_embeddings(chunks)

Vectorstore created with 77 documents


In [4]:
class RankOrder(BaseModel):
    order: list[int] = Field(
        description="The order of relevance of chunks, from most relevant to least relevant, by chunk id number"
    )

In [5]:
def rerank(question, chunks):
    system_prompt = """
You are a document re-ranker.
You are provided with a question and a list of relevant chunks of text from a query of a knowledge base.
The chunks are provided in the order they were retrieved; this should be approximately ordered by relevance, but you may be able to improve on that.
You must rank order the provided chunks by relevance to the question, with the most relevant chunk first.
Reply only with the list of ranked chunk ids, nothing else. Include all the chunk ids you are provided with, reranked.
"""
    user_prompt = f"The user has asked the following question:\n\n{question}\n\nOrder all the chunks of text by relevance to the question, from most relevant to least relevant. Include all the chunk ids you are provided with, reranked.\n\n"
    user_prompt += "Here are the chunks:\n\n"
    for index, chunk in enumerate(chunks):
        user_prompt += f"# CHUNK ID: {index + 1}:\n\n{chunk.page_content}\n\n"
    user_prompt += "Reply only with the list of ranked chunk ids, nothing else."
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = llm.chat.completions.parse(
        model="mistralai/devstral-2512:free",
        messages=messages, response_format=RankOrder
    )
    #response = completion(model=MODEL, messages=messages, response_format=RankOrder)
    reply = response.choices[0].message.content
    order = RankOrder.model_validate_json(reply).order
    print(order)
    return [chunks[i - 1] for i in order]

In [7]:
from chromadb import PersistentClient
collection_name = "docs"
RETRIEVAL_K = 10
chroma = PersistentClient(path="advancedRAG")
collection = chroma.get_or_create_collection(collection_name)



def fetch_context_unranked(question):
    query = embeddings.embed_query(question)
    results = collection.query(query_embeddings=[query], n_results=RETRIEVAL_K)
    chunks = []
    for result in zip(results["documents"][0], results["metadatas"][0]):
        chunks.append(Result(page_content=result[0], metadata=result[1]))
    return chunks

In [68]:
question = "Who is Rohith?"
chunks = fetch_context_unranked(question)

for chunk in chunks:
    print(chunk.page_content[:15]+"...")

Education Backg...
Rohith Kumar - ...
Technical Skill...
Jayam Solutions...
Technical Skill...
Security and Da...
First Data Expe...
Development and...
DRS IT Groups E...
H&R Block Exper...


In [69]:
reranked = rerank(question, chunks)

[2, 1, 3, 5, 6, 7, 4, 9, 10, 8]


In [8]:
def fetch_context(question):
    chunks = fetch_context_unranked(question)
    return rerank(question, chunks)

In [9]:
# In the context, include the source of the chunk

def make_rag_messages(question, history, chunks):
    context = "\n\n".join(f"Extract from {chunk.metadata['source']}:\n{chunk.page_content}" for chunk in chunks)
    system_prompt = SYSTEM_PROMPT_TEMPLATE.format(context=context)
    return [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": question}]

In [10]:
def answer_question(question: str, history: list[dict] = []) -> tuple[str, list]:
    """
    Answer a question using RAG and return the answer and the retrieved context
    """
    #query = rewrite_query(question, history), we can also improve the query using LLM
    query = question
    print(query)
    chunks = fetch_context(query)
    messages = make_rag_messages(question, history, chunks)
    #response = completion(model=MODEL, messages=messages)
    response = llm.chat.completions.create(
        model="mistralai/devstral-2512:free",
        messages=messages
    )
    return response.choices[0].message.content, chunks

In [22]:
ans = answer_question("Did varun work for UPS client?",[])
print(ans)

Did varun work for UPS client?
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
('Yes, Varun Nerella worked for **UPS** as a **Sr Systems Programmer Tech/.Net Full Stack Developer** from **March 2022 to Present** in **Charlotte, NC**.\n\nWould you like more details on his role or achievements there?', [Result(page_content='Professional Experience at UPS\n\nThe candidate worked as a Sr Systems Programmer Tech/.Net Full Stack Developer at UPS, focusing on full-stack development and modernization.\n\nUPS - Charlotte, NC | March 2022 - Present\n\nSr Systems Programmer Tech/ .Net Full Stack Developer\n\nFull-Stack Development: Built web applications using ASP.NET MVC/Core, C#, Angular20/19/17, React 19/18, Blazor, and HTML/CSS with responsive UI design.\n\nImplemented Native AOT (Ahead-of-Time compilation) in .NET 8 to create self-contained, optimized executables with faster startup times and reduced memory footprint.', metadata={'source': 'C:\\Users\\vishi\\Downloads\\New folder\\Varun Nerella .Net resume.