In [1]:
# Imports
from pathlib import Path
from typing import List, Dict, Any
import os
import json
import pickle
import numpy as np




from pinecone import Pinecone, ServerlessSpec

from sklearn.feature_extraction.text import TfidfVectorizer

from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from bs4 import BeautifulSoup

# Config

In [2]:
import os
from dotenv import load_dotenv

load_dotenv()

gemini_api_key = os.getenv("GEMINI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

In [3]:
## Define an Index name
RESUME_INDEX_NAME = "resume-hybrid-index"

# Embeddings model
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

#yf-idf file
TFIDF_PATH = "tfidf_vectorizer.pkl"

In [4]:
# Getting pinecone Index
pc = Pinecone(api_key=pinecone_api_key)

index = pc.Index(RESUME_INDEX_NAME)
index

<pinecone.db_data.index.Index at 0x21afd2c6f90>

In [5]:
from sentence_transformers import SentenceTransformer, CrossEncoder
#Dense Model
DENSE_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
dense_model = SentenceTransformer(DENSE_MODEL_NAME)
# Reranking defines
RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"
rerank_model = CrossEncoder(RERANK_MODEL_NAME)

In [6]:
# LLM for MqE 
llm = ChatGoogleGenerativeAI(
    model = "gemini-2.5-flash",
    api_key = gemini_api_key,
)

# Retirval

In [7]:
def multi_query_ext(user_query):
    prompt = f"""
    Imagine you are helping retiver the most sutable canidates resume and you will be receving sample JD or user query about resume

    Write 3 variations based on the user question focusing different of reterival.. Do not explain what you are doing or the process

    Return only the below points
    1. The output should be sematically related to the user question
    2. Use different works compared to question but they should be related 
    3. Cover different aspects for resume filtering
    
    query_id:
    {user_query}
    """
    resp = llm.invoke(prompt)
    content = getattr(resp, "content", resp)
    return content

In [8]:
user_query = "Resume about data science"

In [9]:
# Hydbrid Retrival
def hybrid_query(query_text: str, alpha: float = 0.5, top_k: int = 8):
    """
    Hybrid search combining dense (1 - alpha) and sparse (alpha) scores.
    alpha = 0.0 => dense-only
    alpha = 1.0 => sparse-only (keyword)
    """
    alpha = float(alpha)
    alpha = max(0.0, min(1.0, alpha))
    # Load TF-IDF vectorizer trained during ingest
    with open(TFIDF_PATH, "rb") as f:
        vectorizer = pickle.load(f)
    # Dense query embedding
    q_dense = dense_model.encode([query_text], normalize_embeddings=True)[0]
    q_dense = (np.asarray(q_dense, dtype=float) * (1.0 - alpha)).tolist()
    # Sparse query vector
    q_sparse_csr = vectorizer.transform([query_text]).tocoo()
    if q_sparse_csr.nnz == 0:
        q_sparse = {"indices": [0], "values": [0.0]}
    else:
        q_sparse = {
            "indices": q_sparse_csr.col.tolist(),
            "values": (q_sparse_csr.data.astype(float) * alpha).tolist(),
        }
     # Query Pinecone
    res = index.query(
        vector=q_dense,
        sparse_vector=q_sparse,
        top_k=top_k,
        include_metadata=True,
    )
    out = []
    for m in res.get("matches", []):
        md = m.get("metadata", {}) or {}
        text = md.get("text", "") or ""
        preview = " ".join(text.split()[:120])  # short snippet
        out.append(
            {
                "id": m["id"],                 # resume_id
                "score": float(m["score"]),    # hybrid score
                "preview": preview,
                "metadata": md,
            }
        )
    return out

In [10]:
mqe_query = multi_query_ext(user_query)
mqe_query

'1. Profiles showcasing expertise in machine learning, statistical inference, and predictive analytics.\n2. Candidates proficient with Python, R, SQL, and experience in deploying AI models.\n3. Individuals with a background in quantitative analysis, data-driven decision making, and advanced statistical methods.'

In [11]:
final_query = user_query + '\n' + multi_query_ext(user_query)
final_query

'Resume about data science\n1. Profiles demonstrating expertise in statistical analysis, machine learning methodologies, and predictive modeling.\n2. Candidates skilled in Python, R, SQL, and relevant big data technologies for advanced analytics.\n3. Individuals with experience in developing data-driven solutions, interpreting complex datasets, and generating actionable insights.'

In [17]:
results = hybrid_query(mqe_query)
results

[{'id': 'SunilBanakar',
  'score': 0.268110514,
  'preview': 'Summary: Sunil Banakar is a highly innovative and results-driven Technical Architect with over 12 years of experience in Microsoft technologies and cloud-native development, primarily on Azure and AWS. He has a proven track record in designing and leading high-scale, resilient solutions, modernizing legacy systems, and implementing Microservices Architecture. Key accomplishments include architecting a high-availability Microservices solution capable of supporting over 1 million users, leading cloud-native migrations that resulted in 40% cost savings, and implementing CI/CD strategies that reduced deployment time by 80%. His expertise spans Kubernetes, Docker, Azure DevOps, and emerging AI applications such as Generative AI, Prompt Engineering, and ML Model Development. He has led teams in developing cloud-native applications, driving automation initiatives, and ensuring successful project delivery, including',
  'metadata': 

In [18]:
# Re-ranker
def reranker_crossencoder(query, results):
    pairs = [(query, r['preview']) for r in results]
    scores = rerank_model.predict(pairs)
    #scores = np.mod(scores)
    rescored = []
    for r, s in zip(results,scores):
        r2 = dict(r)
        r2['rerank_cross_score'] = float(s)
        rescored.append(r2)
    return sorted(rescored, key = lambda x:x['rerank_cross_score'], reverse= True)

In [19]:
rerank_results = reranker_crossencoder(final_query, results)
rerank_results

[{'id': 'SunilBanakar',
  'score': 0.268110514,
  'preview': 'Summary: Sunil Banakar is a highly innovative and results-driven Technical Architect with over 12 years of experience in Microsoft technologies and cloud-native development, primarily on Azure and AWS. He has a proven track record in designing and leading high-scale, resilient solutions, modernizing legacy systems, and implementing Microservices Architecture. Key accomplishments include architecting a high-availability Microservices solution capable of supporting over 1 million users, leading cloud-native migrations that resulted in 40% cost savings, and implementing CI/CD strategies that reduced deployment time by 80%. His expertise spans Kubernetes, Docker, Azure DevOps, and emerging AI applications such as Generative AI, Prompt Engineering, and ML Model Development. He has led teams in developing cloud-native applications, driving automation initiatives, and ensuring successful project delivery, including',
  'metadata': 

In [27]:
# Prompt
# --- set the path OUTSIDE the functions ---
PROMPT_PATH = r"prompt.yaml" 
# --- tiny YAML loader (file only) ---
def load_prompt(path=PROMPT_PATH):
    """Load prompt config strictly from a YAML file."""
    from pathlib import Path
    import yaml  # pip install pyyaml
    text = Path(path).read_text(encoding="utf-8")
    cfg = yaml.safe_load(text) or {}
    return cfg

def render_prompt(cfg: dict, query: str, sources: str) -> str:
    vars_all = dict(cfg.get("vars", {}), query=query, sources=sources)
    return cfg["template"].format(**vars_all)

In [29]:
# Wihtout doing any changes it will pass the input to next stage
# It will always take the input as dict 
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
chain = (
RunnablePassthrough()
    .assign(multi_query = RunnableLambda(lambda q:multi_query_ext(q["query"])))
    .assign(results = RunnableLambda(lambda q:hybrid_query(q["multi_query"])))
    .assign(rerank_results = RunnableLambda(lambda q:reranker_crossencoder(q['multi_query'] , q["results"])))
    .assign(prompt_temp = RunnableLambda(lambda q:load_prompt(PROMPT_PATH)))
    .assign(prompt = RunnableLambda(lambda q:render_prompt(q['prompt_temp'], q['multi_query'], q['rerank_results'])))
    .assign(output = RunnableLambda(lambda q:llm.invoke(q['prompt'])))
)
user_q = "Need candidate for data science"
final_output = chain.invoke({"query": user_q})
final_output

{'query': 'Need candidate for data science',
 'multi_query': '1.  **Seeking professionals proficient in machine learning, statistical modeling, and advanced analytics.**\n2.  **Locate individuals with a background in quantitative research and predictive algorithm development.**\n3.  **Find resumes demonstrating expertise in Python, R, SQL, and big data technologies for data-driven insights.**',
 'results': [{'id': 'SunilBanakar',
   'score': 0.23148334,
   'preview': 'Summary: Sunil Banakar is a highly innovative and results-driven Technical Architect with over 12 years of experience in Microsoft technologies and cloud-native development, primarily on Azure and AWS. He has a proven track record in designing and leading high-scale, resilient solutions, modernizing legacy systems, and implementing Microservices Architecture. Key accomplishments include architecting a high-availability Microservices solution capable of supporting over 1 million users, leading cloud-native migrations that 