In [9]:
#!pip install pinecone

In [1]:
# Cell 1: Install (if needed) + Imports

# If running fresh, uncomment:
# !pip install langchain-community langchain-huggingface langchain-google-genai \
#             sentence-transformers pinecone beautifulsoup4 scikit-learn

from pathlib import Path
from typing import List, Dict, Any
import os
import json
import pickle

from pinecone import Pinecone, ServerlessSpec

from sklearn.feature_extraction.text import TfidfVectorizer

from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader, TextLoader
from bs4 import BeautifulSoup


# Config

In [4]:
import os
from dotenv import load_dotenv

load_dotenv()

gemini_api_key = os.getenv("GEMINI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY")

In [5]:
## Define resume path
RESUME_DIR = Path("resume_dir")

## Define an Index name
RESUME_INDEX_NAME = "resume-hybrid-index"

# Embeddings model
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

In [11]:
from typing import List
from langchain_core.documents import Document

def csr_row_to_pinecone_sparse(csr_row) -> Dict[str, List[float]]:
    coo = csr_row.tocoo()
    return {
        "indices": coo.col.tolist(),
        "values": coo.data.astype(float).tolist()
    }


# LLM for parsing resumes into structured JSON
llm = ChatGoogleGenerativeAI(
    model = "gemini-2.5-flash",
    api_key = gemini_api_key,
)

# Instering the data into Vecotr DB

In [6]:
# Step 1
pc = Pinecone(api_key=pinecone_api_key)

if RESUME_INDEX_NAME not in [idx["name"] for idx in pc.list_indexes()]:
    pc.create_index(
        name=RESUME_INDEX_NAME,
        dimension=384,            # must match dense model
        metric="dotproduct",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    
index = pc.Index(RESUME_INDEX_NAME)

In [7]:
index

<pinecone.db_data.index.Index at 0x1beaf146120>

# Loading the resume and pre-process

In [18]:
def load_resume_text(path:Path):
    suffix = path.suffix.lower()
    if(suffix == '.pdf'):
        loader = PyPDFLoader(str(path))
        docs = loader.load()
        return "\n".join(d.page_content for d in docs)

In [19]:
#Testing 
test_path = Path("resume_dir/Andrew_Green_Resume_27.pdf")

In [22]:
parser_text = load_resume_text(test_path)
parser_text

'ANDREW GREEN\nRecent Graduate\nContact Information:\nEmail: andrew.green@email.com\nPhone: (714) 826-3519\nLocation: Long Beach, CA\nLinkedIn: linkedin.com/in/andrewgreen\nPROFESSIONAL SUMMARY\nRecent accounting graduate with strong academic foundation and internship experience. Eager to\nbegin career in accounting with focus on financial reporting and analysis. Detail-oriented, organized,\nand committed to professional growth and development.\nTECHNICAL SKILLS\n\x7f Tax Preparation\n\x7f Financial Reporting\n\x7f Fixed Asset Management\n\x7f Payroll Processing\n\x7f Accounts Payable\n\x7f Year-End Closing\n\x7f Bank Reconciliation\n\x7f Variance Analysis\nSOFTWARE PROFICIENCY\n\x7f PowerPoint\n\x7f Microsoft Excel\n\x7f Adobe Acrobat\n\x7f SQL\n\x7f Outlook\n\x7f Sage\nPROFESSIONAL EXPERIENCE\nAccounting Intern | Premier Financial Advisors | Summer 2023\n\x7f Assisted with month-end closing procedures and financial reporting\n\x7f Processed accounts payable invoices and vendor paymen

In [48]:
def llm_parse_resume(raw_text):
    prompt = f"""
        You are a strict JSON resume parser.
        
        Return ONLY valid minified JSON. No markdown, no commentary.
        In the summary generate a summary of his experience and projects
        Schema (use exactly these keys):
        {{
          "summary": "string",
          "skills": ["string", ...],
          "CERTIFICATIONS" : ["string",...],
          "email" : ["string"],
          "Location" : ["string"],
          "experiences": [
            {{
              "title": "string",
              "company": "string",
              "location": "string",
              "start_date": "string",
              "end_date": "string",
              "description": "string",
              "skills": ["string", ...]
            }}
          ],
          "education": [
            {{
              "degree": "string",
              "institution": "string",
              "year": "string"
            }}
          ],
          "projects": [
            {{
              "name": "string",
              "description": "string",
              "skills": ["string", ...]
            }}
          ]
        }}
        
        If something is missing, use "" or [].
        
        Resume:
        \"\"\"{raw_text[:12000]}\"\"\"
    """.strip()
    
    resp = llm.invoke(prompt)
    content = getattr(resp, "content", resp)
    return json.loads(content)

In [49]:
llm_parserd_text = llm_parse_resume(parser_text)
llm_parserd_text

{'summary': 'Andrew Green is a recent accounting graduate with a Master of Accountancy from Columbia University (2022) and practical internship experience at Premier Financial Advisors. His experience includes assisting with month-end closing, processing accounts payable, performing bank reconciliations, maintaining general ledger accounts, and creating Excel spreadsheets for financial analysis. He possesses a strong foundation in financial reporting, tax preparation, fixed asset management, and is proficient in software such as Microsoft Excel, SQL, and Sage. Andrew is detail-oriented and committed to a career in accounting, focusing on financial reporting and analysis.',
 'skills': ['Tax Preparation',
  'Financial Reporting',
  'Fixed Asset Management',
  'Payroll Processing',
  'Accounts Payable',
  'Year-End Closing',
  'Bank Reconciliation',
  'Variance Analysis',
  'PowerPoint',
  'Microsoft Excel',
  'Adobe Acrobat',
  'SQL',
  'Outlook',
  'Sage'],
 'CERTIFICATIONS': ['PMP', 'C

# LLM Ouput required Document format

In [58]:
overall_text = []
def build_resume_doc(parsed_text, resume_id, filename):
    summary = parsed_text.get("summary")
    skills = parsed_text.get("skills")
    experiences = parsed_text.get("experiences")
    education = parsed_text.get("education")
    CERTIFICATIONS =  parsed_text.get("CERTIFICATIONS")
    projects =  parsed_text.get("projects")
    # Emebedding part
    if summary: 
        overall_text.append(f"Summary: {summary}")
    if skills: 
        overall_text.append("Skills: " + ", ".join(skills))
    
    full_text = "\n".join(overall_text).strip()

    # Metadata part
    all_skills = set()
    for s in skills:
        all_skills.add(s.strip())
    roles = set()
    companies = set()
    for e in experiences:
        roles.add((e.get("title") or "").strip())
        companies.add((e.get("company") or "").strip())
    metadata = {"resume_id": resume_id, "filename": filename, "skills": sorted(all_skills), "roles": sorted(roles), "companies": sorted(companies)}
    
    return Document(page_content = full_text, metadata = metadata)

In [59]:
build_resume_doc(llm_parserd_text, 1, "resume_dir/Andrew_Green_Resume_27.pdf")

Document(metadata={'resume_id': 1, 'filename': 'resume_dir/Andrew_Green_Resume_27.pdf', 'skills': ['Accounts Payable', 'Adobe Acrobat', 'Bank Reconciliation', 'Financial Reporting', 'Fixed Asset Management', 'Microsoft Excel', 'Outlook', 'Payroll Processing', 'PowerPoint', 'SQL', 'Sage', 'Tax Preparation', 'Variance Analysis', 'Year-End Closing'], 'roles': ['Accounting Intern'], 'companies': ['Premier Financial Advisors']}, page_content='Summary: Andrew Green is a recent accounting graduate with a Master of Accountancy from Columbia University (2022) and practical internship experience at Premier Financial Advisors. His experience includes assisting with month-end closing, processing accounts payable, performing bank reconciliations, maintaining general ledger accounts, and creating Excel spreadsheets for financial analysis. He possesses a strong foundation in financial reporting, tax preparation, fixed asset management, and is proficient in software such as Microsoft Excel, SQL, and S

In [64]:
### Step 4: Chunk and split for all the resumes in the document folder
def step4_load_and_split(cfg):
    resume_dir = cfg['resume_dir']
    docs = []
    for fp in resume_dir.iterdir():
        raw  = load_resume_text(fp).strip()
        resume_id = fp.stem
        parsed = llm_parse_resume(raw)
        doc = build_resume_doc(parsed, resume_id, fp.name)
        docs.append(doc)
    return {"docs": docs}

In [65]:
payload = step4_load_and_split({"resume_dir" : RESUME_DIR})

In [66]:
payload

{'docs': [Document(metadata={'resume_id': 'Andrew_Green_Resume_27', 'filename': 'Andrew_Green_Resume_27.pdf', 'skills': ['Accounts Payable', 'Adobe Acrobat', 'Bank Reconciliation', 'Financial Reporting', 'Fixed Asset Management', 'Microsoft Excel', 'Outlook', 'Payroll Processing', 'PowerPoint', 'SQL', 'Sage', 'Tax Preparation', 'Variance Analysis', 'Year-End Closing'], 'roles': ['Accounting Intern'], 'companies': ['Premier Financial Advisors']}, page_content='Summary: Andrew Green is a recent accounting graduate with a Master of Accountancy from Columbia University (2022) and practical internship experience at Premier Financial Advisors. His experience includes assisting with month-end closing, processing accounts payable, performing bank reconciliations, maintaining general ledger accounts, and creating Excel spreadsheets for financial analysis. He possesses a strong foundation in financial reporting, tax preparation, fixed asset management, and is proficient in software such as Micro

In [67]:
# Creating embeddings
def step5_encode(payload):
    docs = payload["docs"]
    corpus = [d.page_content for d in docs]
    embed = HuggingFaceEmbeddings(
        model_name=EMBED_MODEL,
        encode_kwargs={"normalize_embeddings": True},
    )
    dense_vectors = embed.embed_documents(corpus)
    vectorizer = TfidfVectorizer(
        lowercase=True,
        stop_words="english",
        ngram_range=(1, 2),
        min_df=1,
    )
    tfidf_matrix = vectorizer.fit_transform(corpus)
    return {"docs": docs, "dense_vectors": dense_vectors, "tfidf_matrix":tfidf_matrix}

In [68]:
encoded_payload = step5_encode(payload)

In [72]:
encoded_payload

{'docs': [Document(metadata={'resume_id': 'Andrew_Green_Resume_27', 'filename': 'Andrew_Green_Resume_27.pdf', 'skills': ['Accounts Payable', 'Adobe Acrobat', 'Bank Reconciliation', 'Financial Reporting', 'Fixed Asset Management', 'Microsoft Excel', 'Outlook', 'Payroll Processing', 'PowerPoint', 'SQL', 'Sage', 'Tax Preparation', 'Variance Analysis', 'Year-End Closing'], 'roles': ['Accounting Intern'], 'companies': ['Premier Financial Advisors']}, page_content='Summary: Andrew Green is a recent accounting graduate with a Master of Accountancy from Columbia University (2022) and practical internship experience at Premier Financial Advisors. His experience includes assisting with month-end closing, processing accounts payable, performing bank reconciliations, maintaining general ledger accounts, and creating Excel spreadsheets for financial analysis. He possesses a strong foundation in financial reporting, tax preparation, fixed asset management, and is proficient in software such as Micro

In [73]:
def step6_package_and_upsert(payload: Dict[str, Any]) -> Dict[str, Any]:
    docs: List[Document] = payload["docs"]
    dense_vectors = payload.get("dense_vectors", [])
    tfidf_matrix = payload.get("tfidf_matrix")

    if not docs or tfidf_matrix is None:
        return {"upserted": 0, "index": RESUME_INDEX_NAME}

    vectors = []

    for i, (doc, dense) in enumerate(zip(docs, dense_vectors)):
        sparse = csr_row_to_pinecone_sparse(tfidf_matrix[i])
        resume_id = doc.metadata.get("resume_id")
        vid = resume_id
        meta = {
            **{k: v for k, v in doc.metadata.items() if k != "text"},
            "text": doc.page_content[:1200],  # snippet for preview
        }
        vectors.append(
            {
                "id": vid,
                "values": dense,
                "sparse_values": sparse,
                "metadata": meta,
            }
        )
    if vectors:
        index.upsert(vectors=vectors)
    return {"upserted": len(vectors), index: RESUME_INDEX_NAME}

In [74]:
step6_package_and_upsert(encoded_payload)

{'upserted': 3,
 <pinecone.db_data.index.Index at 0x1beaf146120>: 'resume-hybrid-index'}