In [None]:
# Resume loading and chunking logic
import re
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from dotenv import load_dotenv
import os


  from .autonotebook import tqdm as notebook_tqdm


In [20]:
load_dotenv()  # Loads .env into environment

api_key = os.getenv("MISTRAL_API_KEY")

#print(api_key)  # just to test (remove later)


In [None]:
import re
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_classic.schema import Document

SECTION_HEADERS = [
    "SUMMARY",
    "SKILLS",
    "TECHNICAL SKILLS",
    "EXPERIENCE",
    "WORK EXPERIENCE",
    "EDUCATION",
    "PROJECTS",
    "INTERNSHIPS",
    "CERTIFICATIONS"
]


def clean_text(text: str) -> str:
    """
    Fix character spaced text and noisy formatting from PDFs
    """
    text = re.sub(r'(?<=\w)\s(?=\w)', '', text)
    text = re.sub(r'\n+', '\n', text)
    return text.strip()


def split_by_sections(doc: Document):
    """
    Split resume into semantic sections
    """
    chunks = []
    text = doc.page_content

    # Regex pattern for headers
    pattern = "|".join([fr"\n{h}\n" for h in SECTION_HEADERS])
    sections = re.split(pattern, text, flags=re.IGNORECASE)

    for section in sections:
        section = section.strip()
        if len(section) > 200:
            chunks.append(
                Document(
                    page_content=section,
                    metadata=doc.metadata
                )
            )

    return chunks


def load_resumes(folder_path: str):
    final_chunks = []

    for file in Path(folder_path).iterdir():
        if file.suffix.lower() == ".pdf":
            loader = PyPDFLoader(str(file))
        elif file.suffix.lower() == ".docx":
            loader = Docx2txtLoader(str(file))
        else:
            continue

        for doc in loader.load():
            # ✅ Clean text
            doc.page_content = clean_text(doc.page_content)

            # ✅ Attach metadata
            doc.metadata["candidate_id"] = file.stem
            doc.metadata["source"] = str(file)

            # ✅ Section-aware chunking
            chunks = split_by_sections(doc)
            final_chunks.extend(chunks)

    return final_chunks


In [38]:
docs = load_resumes(r"C:\Users\hp\Desktop\MLops\LLMOPS_RESUME_CHATBOT\LLMOPS_CHATBOT\data\resumes")
print("Cleaned Docs : --> ",docs)
print("Length of total documents: ",len(docs))


Cleaned Docs : -->  [Document(metadata={'producer': 'Canva', 'creator': 'Canva', 'creationdate': '2025-12-27T17:18:05+00:00', 'title': 'White Black Minimalist Professional Resume', 'moddate': '2025-12-27T17:18:05+00:00', 'keywords': 'DAG8tjQXWxs,BADzcwhu8_w,0', 'author': 'Kaustubh kshirsagar', 'source': 'C:\\Users\\hp\\Desktop\\MLops\\LLMOPS_RESUME_CHATBOT\\LLMOPS_CHATBOT\\data\\resumes\\PRASANNA BORA Curriculum Vitae.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'candidate_id': 'PRASANNA BORA Curriculum Vitae'}, page_content="PRASANNABORA8766941791Pune ,  Maharashtra ,  Indiabprasanna98765 @ gmail . comhttps : / / www . linkedin . com / in / prasanna - bora - 96ab1b316Detail - oriented  BBA  ( Finance )  student  with  practicalacademic  exposure  to  financialstatements ,  budgeting ,  andcost  analysis .  Proficient  in  MS  Excel  and  financialdatamanagement ,  with  a  focused  interest  in  corporatefinance  and  financial  analysis . Seeking  to  apply  academicknowledg

In [39]:
# docs = load_resumes("data/resumes")

for d in docs[:3]:
    print(d.metadata["candidate_id"])
    print(d.page_content[:200])
    print("-" * 40)


PRASANNA BORA Curriculum Vitae
PRASANNABORA8766941791Pune ,  Maharashtra ,  Indiabprasanna98765 @ gmail . comhttps : / / www . linkedin . com / in / prasanna - bora - 96ab1b316Detail - oriented  BBA  ( Finance )  student  with  pra
----------------------------------------
PRASANNA BORA Curriculum Vitae
HOBBIESReading  and  building
' Acumen '  of  Bussinessand  Self - Developmentarticles .
Playing  Gully / TurfCricket .
SOLVING  SUDOKU .
Streaming /  Binge  watchMovies / SeriesCO - CURRICULAR  ACTIV
----------------------------------------
Tushar_Moon_Resume_Data_Scientist_4YOE_1
TUSHARMOON
+919175756834 ⋄ Pune (MH),Indiatusharmoon2020@outlook.com ⋄ tushar-moon-0650b618b ⋄ github.com/tusharmoonOBJECTIVEPassionateDataScientistandMachineLearningEngineerwitharobustbackgroundinbui
----------------------------------------


In [40]:
# FAISS / Chroma vector store logic
from langchain_community.vectorstores import FAISS
#from langchain.embeddings import HuggingFaceEmbeddings
from langchain_mistralai import MistralAIEmbeddings

def create_vector_store(docs):
    
    embeddings = MistralAIEmbeddings(
        model="mistral-embed"
    )
    return FAISS.from_documents(docs, embeddings)


In [41]:
vector_store = create_vector_store(docs)

print("Vector store : ",vector_store)


Vector store :  <langchain_community.vectorstores.faiss.FAISS object at 0x000001FD447DACD0>


In [49]:
retriever=vector_store.as_retriever()

In [43]:
# Perform similarity search
query = "What are the name of candidatesid?"
docs = vector_store.similarity_search(query, k=4)

# Display the results 
for i, doc in enumerate(docs):
    print(f"Document {i+1}:")
    print(doc.page_content)
    print("-" * 50)


Document 1:
HOBBIESReading  and  building
' Acumen '  of  Bussinessand  Self - Developmentarticles .
Playing  Gully / TurfCricket .
SOLVING  SUDOKU .
Streaming /  Binge  watchMovies / SeriesCO - CURRICULAR  ACTIVITIESFINANCE  OPERATIONS  TRAINEE  –  FAMILYTEXTILE  BUSINESSAssisted  in  daily  bookkeeping ,  billing ,  and  expensetracking .
Learned  basics  of  cash  flow  management ,  costing ,  andpricing .
Supported  invoice  preparation  and  payment  follow -
ups .
Gained  exposure  to  GST  billing  and  vendor / customertransactions .
--------------------------------------------------
Document 2:
PRASANNABORA8766941791Pune ,  Maharashtra ,  Indiabprasanna98765 @ gmail . comhttps : / / www . linkedin . com / in / prasanna - bora - 96ab1b316Detail - oriented  BBA  ( Finance )  student  with  practicalacademic  exposure  to  financialstatements ,  budgeting ,  andcost  analysis .  Proficient  in  MS  Excel  and  financialdatamanagement ,  with  a  focused  interest  in  corporatef

In [50]:
from langchain_core.prompts import ChatPromptTemplate

template="""You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use ten sentences maximum and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""

In [51]:
prompt=ChatPromptTemplate.from_template(template)

In [52]:
prompt

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks.\nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse ten sentences maximum and keep the answer concise.\nQuestion: {question}\nContext: {context}\nAnswer:\n"), additional_kwargs={})])

In [53]:
from langchain_classic.schema.output_parser import StrOutputParser
output_parser=StrOutputParser()

In [54]:
from langchain_mistralai import ChatMistralAI

llm = ChatMistralAI(
    model="mistral-small",
    api_key=api_key
)

In [55]:
from langchain_classic.schema.runnable import RunnablePassthrough

# {"context": retriever,  "question": RunnablePassthrough()}
# This entire cabove code will create an object with question and context and it will be fed to "prompt" a it is...

rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()}
    | prompt
    | llm
    | output_parser
)

In [57]:
rag_chain.invoke("Tell me about the Candidates ?")

'Based on the provided context, there are two candidates mentioned:\n\n1. **Prasanna Bora**:\n   - A BBA (Finance) student with experience in financial operations, including bookkeeping, billing, and GST management.\n   - Skilled in MS Excel, financial data management, and corporate finance.\n   - Interested in applying academic knowledge to real-world financial operations.\n\n2. **Tushar Moon**:\n   - A Data Scientist with 4 years of experience.\n   - Specialized in customer churn prediction, AI-driven insights, and customer segmentation.\n   - Trained over 100 data analytics trainees in Python.\n   - Holds certifications in Data Science, Machine Learning, and AWS Cloud Practitioner.\n\nBoth candidates have distinct backgrounds and skill sets.'