In [1]:
# Imports
import os
from dotenv import load_dotenv
from datetime import datetime
import pandas as pd
import pdfplumber
import docx2txt
import json
import re
import numpy as np
import random
import time
import urllib.parse
import logging
from typing import Dict, List, Optional, Any
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

from langchain_core.output_parsers import JsonOutputParser

from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import StrOutputParser
from langchain.chains import LLMChain
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_community.embeddings import HuggingFaceEmbeddings

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
warnings.filterwarnings('ignore')


import google.generativeai as genai
import spacy

from langchain_community.llms import Ollama
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException, NoSuchElementException, ElementClickInterceptedException

### Approach 1 – Raw text → LLM

- Extract text using PyMuPDF/docx2txt/Tesseract

- Call LLM to structure into JSON

- Log accuracy, tokens, latency

In [2]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
genai.configure(api_key=GEMINI_API_KEY)

model = genai.GenerativeModel("gemini-2.5-flash")

In [3]:
def extract_text_from_file(file_path: str, file_type: str) -> str:
    """Extract text from PDF or DOCX resume."""
    if file_type.lower() == "pdf":
        text_parts = []
        with pdfplumber.open(file_path) as pdf:
            max_pages = min(2, len(pdf.pages))
            for i in range(max_pages):
                text_parts.append(pdf.pages[i].extract_text() or "")
        text = "\n".join(text_parts)
    elif file_type.lower() == "docx":
        text = docx2txt.process(file_path)
    else:
        raise ValueError("Unsupported file type")

    # Optional cleaning
    text = text.replace('\n', ' ').replace('\t', ' ').strip()
    return text


In [None]:
# def parse_resume_with_gemini(resume_text):
#     prompt = f"""
# You are an expert Resume Intelligence Agent specialized in extracting structured data and evaluating resumes for ATS compatibility.

# Analyze the following resume text and return ONLY a valid JSON object with these exact keys:

# {{
#   "name": "",
#   "first_name": "",
#   "last_name": "",
#   "email_address": "",
#   "mobile_number": "",
#   "country_code": "",
#   "location": "",
#   "city": "",
#   "state": "",
#   "country": "",
#   "zip_postal_code": "",
#   "summary": "",
#   "skills": [],
#   "extra_skills": [],
#   "work_experience": [],
#   "projects": [],
#   "certifications": [],
#   "education": [],
#   "experience_level": "",
#   "experience_level_number": 0,
#   "role_keywords": [],
#   "tech_stack_summary": "",
#   "current_employer": "",
#   "experience_by_skill": {{}},
#   "availability_questions": {{
#     "own_laptop_for_evaluation": "Yes",
#     "willing_for_2_3_month_evaluation": "Yes",
#     "available_for_fulltime_internship": "Yes",
#     "preferred_work_timings": "Flexible",
#     "can_start_immediately": "Yes",
#     "work_authorization": "Yes"
#   }}
# }}

# CRITICAL EXTRACTION RULES FOR ALL SECTIONS:

# 1. **NAME / FIRST_NAME / LAST_NAME**:  
#    - Extract full name exactly as written (usually top of resume).  
#    - Split logically into first_name and last_name; if single name, leave last_name empty.

# 2. **EMAIL_ADDRESS / MOBILE_NUMBER / COUNTRY_CODE**:  
#    - Extract directly from text.  
#    - Derive `country_code` if phone number includes it (e.g., "+91"), otherwise leave empty.

# 3. **LOCATION / CITY / STATE / COUNTRY / ZIP_POSTAL_CODE**:  
#    - Extract location only from the contact section.  
#    - Derive state/country if missing, e.g., "Bangalore" → "Karnataka, India".  
#    - If city is present but ZIP/postal code is missing, **infer it automatically using known mapping for that city** (e.g., Bangalore → "560001").  
#    - If city is absent, leave ZIP/postal code empty.  
#    - Never use locations mentioned under work experience or projects.

# 4. **SUMMARY**:  
#    - Extract text from “Summary”, “Profile”, “Objective”, or “About Me” sections.

# 5. **SKILLS / EXTRA_SKILLS**:  
#    - `skills`: primary professional and technical competencies (normalize capitalization).  
#    - `extra_skills`: secondary, contextual, or soft skills (avoid duplicates).

# 6. **WORK_EXPERIENCE**:  
#    - Each record must include title, company, start_date, end_date (or "Present"), duration, and key achievements.  
#    - **Set `current_employer` only if the role’s end_date is marked as “Present” or “Ongoing”**.  
#    - If all roles are completed (no ongoing job), `current_employer` must be an empty string.

# 7. **PROJECTS**:  
#    - Each must include name, duration, tech_stack, and short description.  
#    - Prefer projects from “Projects” or “Academic Work” sections.

# 8. **CERTIFICATIONS**:  
#    - Include certification/course name, issuing body, and year if present.

# 9. **EDUCATION**:  
#    - Include degree, institution, field, and graduation year.  
#    - If currently studying, mark as “Pursuing” or “Ongoing”.

# 10. **EXPERIENCE_LEVEL**:  
#    - Based on total experience:  
#      - Internship → 0 years or studying  
#      - Entry Level → 0–1 years  
#      - Associate → 1–3 years  
#      - Mid-Senior Level → 3–7 years  
#      - Director → 7+ years  

# 11. **EXPERIENCE_LEVEL_NUMBER**:  
#    - Internship → 1  
#    - Entry Level → 2  
#    - Associate → 3  
#    - Mid-Senior Level → 4  
#    - Director → 5  

# 12. **ROLE_KEYWORDS**:  
#    - Extract key professional or domain-related keywords (e.g., “Full Stack Development”, “MLOps”, “Data Engineering”, etc.)

# 13. **EXPERIENCE_BY_SKILL**:  
#    - Map each detected skill to an estimated **integer number of years** based on resume content.  
#    - Round intelligently:  
#      - < 1 year → 1  
#      - 1–1.5 years → 1  
#      - 1.6–2.4 years → 2  
#      - and so on (round to nearest integer).  
#    - Example:  
#      "experience_by_skill": {{"Python": 2, "React": 1, "Spring Boot": 2}}  
#    - Output must use **numeric values only**, without “months” or “years” suffix.

# 14. **TECH_STACK_SUMMARY**:  
#    - Combine all technical tools, libraries, and frameworks (from skills + projects + experience) into a concise, comma-separated list.

# 15. **INFERENCE RULES**:
#    - Normalize capitalization for consistency.  
#    - Avoid duplicate entries across fields.  
#    - Do not fabricate data.  
#    - Infer logically only where reasonable (like ZIP from city).

# Return ONLY the JSON object, with no explanations, markdown, or text.

# Resume Text:
# {resume_text}
# """


#     start = time.time()
#     response = model.generate_content(prompt)
#     latency = time.time() - start

#     # --- Clean and Parse Gemini Output ---
#     raw_output = response.text.strip()

#     # Remove code block wrappers if present
#     raw_output = re.sub(r"^```(json)?", "", raw_output)
#     raw_output = re.sub(r"```$", "", raw_output)
#     raw_output = raw_output.strip()

#     # Try parsing clean JSON
#     try:
#         structured = json.loads(raw_output)
#     except json.JSONDecodeError:
#         structured = {"raw_output": raw_output}

#     return structured, latency, len(prompt.split()), len(response.text.split())

In [None]:
# def process_resume(file_path):
#     ext = file_path.split(".")[-1].lower()
#     if ext == "pdf":
#         resume_text = extract_text_from_file(file_path, "pdf")
#     elif ext == "docx":
#         resume_text = extract_text_from_file(file_path, "docx")
#     else:
#         raise ValueError("Unsupported file type!")

#     structured_data, latency, prompt_tokens, response_tokens = parse_resume_with_gemini(resume_text)

#     log = {
#         "timestamp": datetime.now().isoformat(),
#         "file": file_path,
#         "latency_sec": round(latency, 2),
#         "prompt_tokens": prompt_tokens,
#         "response_tokens": response_tokens,
#         "output": structured_data
#     }

#     return log

In [None]:
# resume_path = "resumes/AnupamSharma.pdf"
# result = process_resume(resume_path)

# print(json.dumps(result, indent=2))

In [None]:
# output_path = "extraction_outputs/output_approach1.json"

# # Create directory if not exists
# import os
# os.makedirs(os.path.dirname(output_path), exist_ok=True)

# # Write to JSON file
# with open(output_path, "w", encoding="utf-8") as f:
#     json.dump(result, f, indent=2, ensure_ascii=False)

# print(f"\n✅ Resume data saved successfully to {output_path}")

### Approach 2 – Text → spaCy → LLM
- Extract text

- Run spaCy NER + rules to pre-parse fields

- Send spaCy output to LLM for final structuring

- Log metrics


In [None]:
# nlp = spacy.load("en_core_web_sm")

In [None]:
# def preprocess_with_spacy(text: str):
#     doc = nlp(text)
#     pre_data = {
#         "name": None,
#         "email": None,
#         "phone": None,
#         "location": None,
#         "entities": [],
#         "sections": {}
#     }

#     # --- Named Entities ---
#     for ent in doc.ents:
#         pre_data["entities"].append({"text": ent.text, "label": ent.label_})
#         if ent.label_ == "PERSON" and not pre_data["name"]:
#             pre_data["name"] = ent.text
#         if ent.label_ in ["GPE", "LOC"] and not pre_data["location"]:
#             pre_data["location"] = ent.text

#     # --- Regex extraction ---
#     email_match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
#     phone_match = re.search(r"\+?\d[\d\s\-]{7,}\d", text)
#     pre_data["email"] = email_match.group() if email_match else None
#     pre_data["phone"] = phone_match.group() if phone_match else None

#     # --- Section Splitting ---
#     section_titles = [
#         "Education", "Experience", "Projects", "Certifications",
#         "Technical Skills", "Skills", "Summary", "Profile", "Objective"
#     ]
#     pattern = r"(?i)\b(" + "|".join(section_titles) + r")\b[:\s]?"
#     sections = re.split(pattern, text)
#     for i in range(1, len(sections), 2):
#         header = sections[i].strip().title()
#         content = sections[i + 1].strip() if i + 1 < len(sections) else ""
#         pre_data["sections"][header] = content[:2000]  # limit long text

#     return pre_data


In [None]:
# def parse_resume_with_gemini(resume_text, pre_data):
#     prompt = f"""You are an expert Resume Intelligence Agent that parses resumes into structured JSON.
# Below is the raw resume text and the pre-parsed information from spaCy and regex.

# Use the pre-parsed data to enhance extraction accuracy.

# Return ONLY a valid JSON matching this schema:

# {{
#   "name": "",
#   "first_name": "",
#   "last_name": "",
#   "email_address": "",
#   "mobile_number": "",
#   "country_code": "",
#   "location": "",
#   "city": "",
#   "state": "",
#   "country": "",
#   "zip_postal_code": "",
#   "summary": "",
#   "skills": [],
#   "extra_skills": [],
#   "work_experience": [],
#   "projects": [],
#   "certifications": [],
#   "education": [],
#   "experience_level": "",
#   "experience_level_number": 0,
#   "role_keywords": [],
#   "tech_stack_summary": "",
#   "current_employer": "",
#   "experience_by_skill": {{}},
#   "availability_questions": {{
#     "own_laptop_for_evaluation": "Yes",
#     "willing_for_2_3_month_evaluation": "Yes",
#     "available_for_fulltime_internship": "Yes",
#     "preferred_work_timings": "Flexible",
#     "can_start_immediately": "Yes",
#     "work_authorization": "Yes"
#   }}
# }}

# CRITICAL EXTRACTION RULES FOR ALL SECTIONS:

# 1. **NAME / FIRST_NAME / LAST_NAME**:  
#    - Extract full name exactly as written (usually top of resume).  
#    - Split logically into first_name and last_name; if single name, leave last_name empty.

# 2. **EMAIL_ADDRESS / MOBILE_NUMBER / COUNTRY_CODE**:  
#    - Extract directly from text.  
#    - Derive `country_code` if phone number includes it (e.g., "+91"), otherwise leave empty.

# 3. **LOCATION / CITY / STATE / COUNTRY / ZIP_POSTAL_CODE**:  
#    - Extract location only from the contact section.  
#    - Derive state/country if missing, e.g., "Bangalore" → "Karnataka, India".  
#    - If city is present but ZIP/postal code is missing, **infer it automatically using known mapping for that city** (e.g., Bangalore → "560001").  
#    - If city is absent, leave ZIP/postal code empty.  
#    - Never use locations mentioned under work experience or projects.

# 4. **SUMMARY**:  
#    - Extract text from “Summary”, “Profile”, “Objective”, or “About Me” sections.

# 5. **SKILLS / EXTRA_SKILLS**:  
#    - `skills`: primary professional and technical competencies (normalize capitalization).  
#    - `extra_skills`: secondary, contextual, or soft skills (avoid duplicates).

# 6. **WORK_EXPERIENCE**:  
#    - Each record must include title, company, start_date, end_date (or "Present"), duration, and key achievements.  
#    - **Set `current_employer` only if the role’s end_date is marked as “Present” or “Ongoing”**.  
#    - If all roles are completed (no ongoing job), `current_employer` must be an empty string.

# 7. **PROJECTS**:  
#    - Each must include name, duration, tech_stack, and short description.  
#    - Prefer projects from “Projects” or “Academic Work” sections.

# 8. **CERTIFICATIONS**:  
#    - Include certification/course name, issuing body, and year if present.

# 9. **EDUCATION**:  
#    - Include degree, institution, field, and graduation year.  
#    - If currently studying, mark as “Pursuing” or “Ongoing”.

# 10. **EXPERIENCE_LEVEL**:  
#    - Based on total experience:  
#      - Internship → 0 years or studying  
#      - Entry Level → 0–1 years  
#      - Associate → 1–3 years  
#      - Mid-Senior Level → 3–7 years  
#      - Director → 7+ years  

# 11. **EXPERIENCE_LEVEL_NUMBER**:  
#    - Internship → 1  
#    - Entry Level → 2  
#    - Associate → 3  
#    - Mid-Senior Level → 4  
#    - Director → 5  

# 12. **ROLE_KEYWORDS**:  
#    - Extract key professional or domain-related keywords (e.g., “Full Stack Development”, “MLOps”, “Data Engineering”, etc.)

# 13. **EXPERIENCE_BY_SKILL**:  
#    - Map each detected skill to an estimated **integer number of years** based on resume content.  
#    - Round intelligently:  
#      - < 1 year → 1  
#      - 1–1.5 years → 1  
#      - 1.6–2.4 years → 2  
#      - and so on (round to nearest integer).  
#    - Example:  
#      "experience_by_skill": {{"Python": 2, "React": 1, "Spring Boot": 2}}  
#    - Output must use **numeric values only**, without “months” or “years” suffix.

# 14. **TECH_STACK_SUMMARY**:  
#    - Combine all technical tools, libraries, and frameworks (from skills + projects + experience) into a concise, comma-separated list.

# 15. **INFERENCE RULES**:
#    - Normalize capitalization for consistency.  
#    - Avoid duplicate entries across fields.  
#    - Do not fabricate data.  
#    - Infer logically only where reasonable (like ZIP from city).

# Return ONLY the JSON object, with no explanations.

# Pre-parsed Data:
# {json.dumps(pre_data, indent=2)}


# Resume Text:
# {resume_text}
# """

#     start = time.time()
#     response = model.generate_content(prompt)
#     latency = time.time() - start

#     # --- Clean and Parse Gemini Output ---
#     raw_output = response.text.strip()

#     # Remove ```json or ``` wrappers if present
#     raw_output = re.sub(r"^```(json)?", "", raw_output)
#     raw_output = re.sub(r"```$", "", raw_output)
#     raw_output = raw_output.strip()

#     # Try parsing clean JSON
#     try:
#         structured = json.loads(raw_output)
#     except json.JSONDecodeError:
#       structured = {"raw_output": raw_output}
#     return structured, latency, len(prompt.split()), len(response.text.split())

In [None]:
# def process_resume(file_path):
#     if file_path.endswith(".pdf"):
#         text = extract_text_from_file(file_path, "pdf")
#     elif file_path.endswith(".docx"):
#         text = extract_text_from_file(file_path, "docx")
#     else:
#         raise ValueError("Unsupported file type!")
    
#     pre_data = preprocess_with_spacy(text)
#     structured, latency, prompt_tokens, response_tokens = parse_resume_with_gemini(text, pre_data)

#     result = {
#         "timestamp": datetime.now().isoformat(),
#         "file": file_path,
#         "latency_sec": latency,
#         "prompt_tokens": prompt_tokens,
#         "response_tokens": response_tokens,
#         "output": structured
#     }

#     print(json.dumps(result, indent=2))
#     return result


In [None]:
# resume_path = "resumes/VedantResume.pdf"  # update path if needed
# result2 = process_resume(resume_path)

In [None]:
# output_path = "extraction_outputs/output_approach2.json"

# # Create directory if not exists
# import os
# os.makedirs(os.path.dirname(output_path), exist_ok=True)

# # Write to JSON file
# with open(output_path, "w", encoding="utf-8") as f:
#     json.dump(result2, f, indent=2, ensure_ascii=False)

# print(f"\n✅ Resume data saved successfully to {output_path}")

In [None]:
# from difflib import SequenceMatcher
# from tabulate import tabulate
# import json

# def normalize_list_items(lst):
#     """Convert all list items (strings or dicts) into comparable lowercase strings."""
#     normalized = []
#     for item in lst:
#         if isinstance(item, dict):
#             normalized.append(json.dumps(item, sort_keys=True).lower())
#         else:
#             normalized.append(str(item).lower())
#     return normalized

# def jaccard_similarity(list1, list2):
#     if not list1 and not list2:
#         return 1.0
#     if not list1 or not list2:
#         return 0.0
#     set1 = set(normalize_list_items(list1))
#     set2 = set(normalize_list_items(list2))
#     return len(set1 & set2) / len(set1 | set2)

# def text_similarity(a, b):
#     if not a or not b:
#         return 0.0
#     return SequenceMatcher(None, a.lower(), b.lower()).ratio()

# def compare_resume_outputs(approach1_output, approach2_output):
#     fields_to_compare = [
#         "skills", "extra_skills", "projects", "certifications",
#         "education", "role_keywords", "tech_stack_summary",
#         "work_experience", "summary", "experience_level",
#         "experience_by_skill"
#     ]

#     results = []
#     summary = {}

#     for field in fields_to_compare:
#         val1 = approach1_output.get(field)
#         val2 = approach2_output.get(field)
#         sim = None

#         # --- List-based fields ---
#         if isinstance(val1, list) and isinstance(val2, list):
#             sim = jaccard_similarity(val1, val2)
#             summary[field] = {
#                 "common": list(set(normalize_list_items(val1)) & set(normalize_list_items(val2))),
#                 "only_in_approach1": list(set(normalize_list_items(val1)) - set(normalize_list_items(val2))),
#                 "only_in_approach2": list(set(normalize_list_items(val2)) - set(normalize_list_items(val1))),
#                 "similarity": round(sim, 2)
#             }

#         # --- Dict-based fields ---
#         elif isinstance(val1, dict) and isinstance(val2, dict):
#             common_keys = set(val1.keys()) & set(val2.keys())
#             diff1 = set(val1.keys()) - set(val2.keys())
#             diff2 = set(val2.keys()) - set(val1.keys())
#             sim = len(common_keys) / len(set(val1.keys()) | set(val2.keys()) or [1])
#             summary[field] = {
#                 "common_keys": list(common_keys),
#                 "only_in_approach1": list(diff1),
#                 "only_in_approach2": list(diff2),
#                 "similarity": round(sim, 2)
#             }

#         # --- String fields ---
#         elif isinstance(val1, str) and isinstance(val2, str):
#             sim = text_similarity(val1, val2)
#             summary[field] = {"similarity": round(sim, 2)}

#         else:
#             summary[field] = {"similarity": None}

#         results.append([field, round(sim, 2) if sim is not None else "-", "✅" if sim and sim > 0.7 else "⚠️"])

#     print("\n📊 **Resume Comparison Summary (Approach 1 vs Approach 2)**\n")
#     print(tabulate(results, headers=["Field", "Similarity", "Status"], tablefmt="github"))
    
#     return summary


# # --- Load both outputs ---
# with open("extraction_outputs/output_approach1.json", "r") as f1:
#     approach1_data = json.load(f1)

# with open("extraction_outputs/output_approach2.json", "r") as f2:
#     approach2_data = json.load(f2)

# comparison_summary = compare_resume_outputs(
#     approach1_data["output"],
#     approach2_data["output"]
# )

# # Save the detailed report
# with open("comparison_report.json", "w") as f:
#     json.dump(comparison_summary, f, indent=2)

# print("\n✅ Comparison report saved as comparison_report.json")


In [4]:
!pip install pymupdf

Defaulting to user installation because normal site-packages is not writeable
Collecting pymupdf
  Downloading pymupdf-1.26.5-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.5


In [4]:
import fitz
import re
import logging
import json

# =========================
# Logging setup
# =========================
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


# =========================
# Helper: Calculate average font size
# =========================
def calculate_average_font_size(doc):
    total_font_size = 0
    num_fonts = 0
    for page in doc:
        blocks = page.get_text("dict")['blocks']
        for block in blocks:
            if block['type'] == 0:  # text block
                for line in block.get('lines', []):
                    for span in line.get('spans', []):
                        total_font_size += span['size']
                        num_fonts += 1
    return total_font_size / num_fonts if num_fonts > 0 else 12  # fallback avg font size


# =========================
# Extract resume sections
# =========================
def extract_sections(pdf_path, headings=None):
    if headings is None:
        headings = [
            r"Education", r"Work Experience", r"Professional Experience", r"Experience", r"Projects",
            r"Skills", r"Certifications", r"Summary", r"Contact", r"Technical Skills",
            r"Experience", r"Location" , r"Extra Curricular Activities" , r"Languages"
        ]

    heading_pattern = re.compile(r"^(" + r"|".join(headings) + r")\s*$", flags=re.IGNORECASE)

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logging.error(f"Failed to open PDF: {e}")
        return {}

    average_font_size = calculate_average_font_size(doc)
    logging.info(f"Average font size calculated: {average_font_size:.2f}")

    sections = {}
    current_heading = None
    current_section_text = []

    for page in doc:
        blocks = page.get_text("dict")['blocks']
        for block in blocks:
            if block['type'] == 0:  # Only text blocks
                for line in block.get('lines', []):
                    for span in line.get('spans', []):
                        text = span['text'].strip()
                        if not text:
                            continue

                        font_size = span.get('size', 0)

                        # Heading detection
                        if heading_pattern.match(text) or font_size > 1.5 * average_font_size:
                            if current_heading:
                                sections[current_heading] = "\n".join(current_section_text).strip()
                            current_heading = text.strip().title()
                            current_section_text = []
                        elif current_heading:
                            current_section_text.append(text)

    # Add last section
    if current_heading:
        sections[current_heading] = "\n".join(current_section_text).strip()

    # =========================
    # Rename top heading → Personal Info
    # =========================
    if sections:
        first_key = list(sections.keys())[0]
        first_value = sections[first_key]
        # Append name to value and rename key
        sections["Personal Info"] = f"{first_key}\n{first_value}".strip()
        del sections[first_key]
        
    return sections


# =========================
# Save JSON
# =========================
def save_sections_to_json(pdf_path, sections):
    # --- Start of Modification ---

    # Hardcoded Key-Value Pairs
    # Create a new dictionary to hold the sections and the hardcoded values
    output_data = sections.copy() 
    output_data["preferred_title"] = "Machine Learning Engineer"
    output_data["preferred_job_location"] = "India"
    
    # --- End of Modification ---

    output_path = pdf_path.replace(".pdf", "_sections.json")
    with open(output_path, "w", encoding="utf-8") as f:
        # Save the new dictionary (output_data) instead of the original sections
        json.dump(output_data, f, indent=4, ensure_ascii=False)
    logging.info(f"✅ Sections saved to {output_path}")


# =========================
# Print sections for debugging
# =========================
def print_sections(pdf_path):
    sections = extract_sections(pdf_path)
    if not sections:
        logging.error(f"No sections extracted from {pdf_path}")
        return
    for section, content in sections.items():
        print(f"=== Section: {section} ===")
        print(content)
        print("\n")


# =========================
# Run Example
# =========================
if __name__ == "__main__":
    pdf_path = "resumes/Yeswanth_Yerra_CV.pdf"  # update your path
    # NOTE: print_sections uses extract_sections directly and won't show the hardcoded values
    print_sections(pdf_path) 
    
    # Extract sections, then save including the hardcoded values
    sections = extract_sections(pdf_path)
    save_sections_to_json(pdf_path, sections)


2025-10-27 23:30:45,071 - INFO - Average font size calculated: 9.63
2025-10-27 23:30:45,080 - INFO - Average font size calculated: 9.63
2025-10-27 23:30:45,083 - INFO - ✅ Sections saved to resumes/Yeswanth_Yerra_CV_sections.json


=== Section: Education ===
Pragati Engineering College
Surampalem, AP
Bachelor of Technology, Computer Science and Engineering (GPA- 8.14/10.0)
Aug. 2019 – May 2023
Sasi junior College
Eluru, AP
Intermediate (GPA-9.8/10.0)
Aug 2017 – May 2019


=== Section: Technical Skills ===



=== Section: Languages ===
: Java, JavaScript, Python
Core Skills
: Data Structures & Algorithms, Computer Networks , Operating Systems,DBMS
Frameworks
: Spring Boot, Hibernate,React.js,Flask
Developer Tools
: Git, Docker, Terraform, Ansible, Jenkins
Databases
: MySQL, MongoDB, PostgreSQL
Cloud Computing
: AWS,Kubernetes


=== Section: Experience ===
Full Stack Development Intern
June 2024 - October 2024
Pantech Prolabs Pvt Ltd
Chennai,TN(Remote)
•
Engineered a web application using Spring Boot for the backend, employing RESTful APIs and efficient database
querying with MongoDB.
•
Gained expertise in design patterns and microservice architecture for building efficient, modular systems.
•
Applied scalable web 

In [5]:
import fitz
import re
import logging
import json

# =========================
# Logging setup
# =========================
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


# =========================
# Helper: Calculate average font size
# =========================
def calculate_average_font_size(doc):
    total_font_size = 0
    num_fonts = 0
    for page in doc:
        blocks = page.get_text("dict")['blocks']
        for block in blocks:
            if block['type'] == 0:  # text block
                for line in block.get('lines', []):
                    for span in line.get('spans', []):
                        total_font_size += span['size']
                        num_fonts += 1
    return total_font_size / num_fonts if num_fonts > 0 else 12


# =========================
# Extract resume sections
# =========================
def extract_sections(pdf_path, headings=None):
    if headings is None:
        headings = [
            r"Objective", r"Summary", r"Education", r"Work Experience", r"Professional Experience",
            r"Projects", r"Skills", r"Certifications", r"Technical Skills", r"Experience",
            r"Achievements", r"Internship", r"Hobbies", r"Interests"
        ]

    heading_pattern = re.compile(r"^(" + r"|".join(headings) + r")\s*$", flags=re.IGNORECASE)
    separator_pattern = re.compile(r"^[-–—_=#*]{3,}$")  # e.g. "----" or "=====" or "____"

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logging.error(f"Failed to open PDF: {e}")
        return {}

    avg_font = calculate_average_font_size(doc)
    logging.info(f"📏 Average font size: {avg_font:.2f}")

    # Extract all lines + font sizes
    lines = []
    for page in doc:
        for block in page.get_text("dict")['blocks']:
            if block['type'] != 0:
                continue
            for line in block.get('lines', []):
                text = " ".join(span.get("text", "") for span in line.get("spans", [])).strip()
                if text:
                    max_font = max(span.get("size", 0) for span in line.get("spans", []))
                    lines.append((text, max_font))

    # Detect heading-like lines
    heading_indices = []
    for idx, (text, size) in enumerate(lines):
        if heading_pattern.match(text) or size > 1.5 * avg_font:
            heading_indices.append(idx)

    # --- Smart boundary detection for Personal Info ---
    personal_info_lines = []
    boundary_index = len(lines)  # default end

    # Case 1: Stop before second heading (if present)
    if len(heading_indices) >= 2:
        boundary_index = heading_indices[1]

    # Case 2: Stop at separator line (if present before boundary)
    for idx, (text, _) in enumerate(lines[:boundary_index]):
        if separator_pattern.match(text):
            boundary_index = idx
            break

    # Extract Personal Info
    for text, _ in lines[:boundary_index]:
        personal_info_lines.append(text)
    personal_info_text = "\n".join(personal_info_lines).strip()

    # Initialize sections dict
    sections = {"Personal Info": personal_info_text}

    # --- Extract remaining sections ---
    current_heading = None
    current_text = []
    for idx, (text, size) in enumerate(lines[boundary_index:], start=boundary_index):
        if heading_pattern.match(text) or size > 1.5 * avg_font:
            if current_heading:
                sections[current_heading] = "\n".join(current_text).strip()
            current_heading = text.strip().title()
            current_text = []
        elif current_heading:
            current_text.append(text)

    # Add last section
    if current_heading:
        sections[current_heading] = "\n".join(current_text).strip()

    return sections


# =========================
# Save JSON
# =========================
def save_sections_to_json(pdf_path, sections):
    # --- Start of Modification ---

    # Hardcoded Key-Value Pairs
    # Create a new dictionary to hold the sections and the hardcoded values
    output_data = sections.copy() 
    output_data["preferred_title"] = "Machine Learning Engineer"
    output_data["preferred_job_location"] = "India"
    
    # --- End of Modification ---

    output_path = pdf_path.replace(".pdf", "_sections.json")
    with open(output_path, "w", encoding="utf-8") as f:
        # Save the new dictionary (output_data) instead of the original sections
        json.dump(output_data, f, indent=4, ensure_ascii=False)
    logging.info(f"✅ Sections saved to {output_path}")



# =========================
# Debug print
# =========================
def print_sections(pdf_path):
    sections = extract_sections(pdf_path)
    for section, content in sections.items():
        print(f"\n=== Section: {section} ===\n{content}\n")


# =========================
# Run Example
# =========================
if __name__ == "__main__":
    pdf_path = "resumes/Yeswanth_Yerra_CV.pdf"
    sections = extract_sections(pdf_path)
    print_sections(pdf_path)
    save_sections_to_json(pdf_path, sections)


2025-10-27 23:32:13,273 - INFO - 📏 Average font size: 9.63
2025-10-27 23:32:13,282 - INFO - 📏 Average font size: 9.63
2025-10-27 23:32:13,286 - INFO - ✅ Sections saved to resumes/Yeswanth_Yerra_CV_sections.json



=== Section: Personal Info ===
Yeswanth Yerra
+91 9550413132  | yeswanthyerra07@gmail.com  |  Linkedin  |  Github


=== Section: Education ===
Pragati Engineering College
Surampalem, AP
Bachelor of Technology, Computer Science and Engineering (GPA- 8.14/10.0)
Aug. 2019 – May 2023
Sasi junior College
Eluru, AP
Intermediate (GPA-9.8/10.0)
Aug 2017 – May 2019


=== Section: Technical Skills ===
Languages : Java, JavaScript, Python
Core Skills : Data Structures & Algorithms, Computer Networks , Operating Systems,DBMS
Frameworks : Spring Boot, Hibernate,React.js,Flask
Developer Tools : Git, Docker, Terraform, Ansible, Jenkins
Databases : MySQL, MongoDB, PostgreSQL
Cloud Computing : AWS,Kubernetes


=== Section: Experience ===
Full Stack Development Intern
June 2024 - October 2024
Pantech Prolabs Pvt Ltd
Chennai,TN(Remote)
•  Engineered a web application using Spring Boot for the backend, employing RESTful APIs and efficient database
querying with MongoDB.
•  Gained expertise in design patt

## Final Tweaked Resume Extraction Code

In [6]:
import fitz
import re
import json
import logging
import unicodedata
from typing import Dict, List

# =========================
# Logger Setup
# =========================
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# =========================
# Regex Patterns
# =========================
EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
# Improved phone regex: ensures 10–15 digits, avoids years and short numbers
PHONE_RE = re.compile(r"(\+?\d[\d\s\-\(\)]{8,}\d)")
LINK_RE = re.compile(
    r"(?:https?://)?(?:www\.)?(?:linkedin|github|portfolio|medium|personal|behance)\.[^\s,]+",
    re.IGNORECASE,
)

# =========================
# Text Utilities
# =========================
def clean_text(text: str) -> str:
    if not text:
        return ""
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r"[^\x20-\x7E\n]+", "", text)
    text = re.sub(r"[•●–~►|#]+", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()


# =========================
# Font Analysis
# =========================
def calculate_average_font_size(doc: fitz.Document) -> float:
    total, count = 0, 0
    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block.get("type") == 0:
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        total += span.get("size", 0)
                        count += 1
    return total / count if count else 12


# =========================
# Step 1: Name Extraction
# =========================
def extract_name_from_font(page: fitz.Page) -> str:
    """Detect candidate's name using largest font on the first page."""
    max_font = 0
    name_candidate = ""
    for block in page.get_text("dict")["blocks"]:
        if block.get("type") != 0:
            continue
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                if span["size"] > max_font:
                    max_font = span["size"]
                    name_candidate = span["text"].strip()
    logging.info(f"🧠 Name candidate (largest font): {name_candidate}")
    return name_candidate


# =========================
# Step 2: Contact Info Extraction (Improved)
# =========================
def extract_contact_info(text: str) -> Dict:
    emails = EMAIL_RE.findall(text)
    raw_phones = [m.group(0).strip() for m in PHONE_RE.finditer(text)]
    links = LINK_RE.findall(text)

    # Filter out false positives like "2018-2019" or "2022-present"
    phones = []
    for p in raw_phones:
        digits = re.sub(r"\D", "", p)  # keep only digits
        if len(digits) < 9 or len(digits) > 15:  # ignore short/long sequences
            continue
        if re.match(r"20\d{2}", digits[:4]):  # avoid year-like numbers
            continue
        phones.append(p.strip())

    return {
        "emails": list(set(emails)),
        "phones": list(set(phones)),
        "links": list(set(links)),
    }


# =========================
# Step 3: Section Extraction
# =========================
def extract_sections_from_resume(pdf_path: str, headings: List[str] = None) -> Dict:
    if headings is None:
        headings = [
            r"Objective", r"Summary", r"Education", r"Experience", r"Work Experience",
            r"Professional Experience", r"Projects", r"Skills", r"Technical Skills",
            r"Certifications", r"Internship", r"Achievements", r"Hobbies", r"Interests"
        ]

    heading_pattern = re.compile(r"^\s*(" + r"|".join(headings) + r")\s*$", re.IGNORECASE)

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logging.error(f"❌ Failed to open PDF: {e}")
        return {}

    avg_font = calculate_average_font_size(doc)
    logging.info(f"📏 Average font size: {avg_font:.2f}")

    sections = {}
    current_heading = None
    current_text = []
    first_heading_seen = False

    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block.get("type") != 0:
                continue
            for line in block.get("lines", []):
                line_text = "".join(span.get("text", "") for span in line.get("spans", [])).strip()
                if not line_text:
                    continue

                max_font = max((span.get("size", 0) for span in line.get("spans", [])), default=0)
                is_heading = bool(heading_pattern.match(line_text)) or max_font > 1.5 * avg_font

                if is_heading:
                    if current_heading:
                        sections[current_heading] = clean_text("\n".join(current_text))
                    current_heading = line_text.strip().title()
                    current_text = []
                    first_heading_seen = True
                else:
                    if current_heading:
                        current_text.append(line_text)

    if current_heading:
        sections[current_heading] = clean_text("\n".join(current_text))

    return sections


# =========================
# Step 4: Combine All Logic
# =========================
def extract_resume_data(pdf_path: str) -> Dict:
    doc = fitz.open(pdf_path)
    first_page = doc[0]

    # Personal Info Region (first page)
    first_page_text = first_page.get_text("text")
    name = extract_name_from_font(first_page)
    contact_info = extract_contact_info(first_page_text)

    personal_info = {
        "name": name,
        "emails": contact_info.get("emails", []),
        "phones": contact_info.get("phones", []),
        "links": contact_info.get("links", []),
        "raw": clean_text(first_page_text.split("\n")[0:10].__str__()),
    }

    # Other structured sections
    sections = extract_sections_from_resume(pdf_path)

    # Merge into final structure
    result = {"Personal Info": personal_info}
    result.update(sections)
    return result


# =========================
# Step 5: Save to JSON
# =========================
def save_to_json(pdf_path: str, data: Dict):
    # --- Modification Start ---
    # Add the hardcoded key-value pairs to the data dictionary
    data["preferred_title"] = "Machine Learning Engineer"
    data["preferred_job_location"] = "India"
    # --- Modification End ---

    output_path = pdf_path.replace(".pdf", "_structured.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    logging.info(f"✅ Extracted structured resume saved to: {output_path}")


# =========================
# Run Example
# =========================
if __name__ == "__main__":
    pdf_path = "resumes/Yeswanth_Yerra_CV.pdf"  # update your path
    data = extract_resume_data(pdf_path)
    print(json.dumps(data, indent=4, ensure_ascii=False))
    save_to_json(pdf_path, data)


2025-10-27 23:33:29,885 - INFO - 🧠 Name candidate (largest font): Yeswanth Yerra
2025-10-27 23:33:29,892 - INFO - 📏 Average font size: 9.63
2025-10-27 23:33:29,896 - INFO - ✅ Extracted structured resume saved to: resumes/Yeswanth_Yerra_CV_structured.json


{
    "Personal Info": {
        "name": "Yeswanth Yerra",
        "emails": [
            "yeswanthyerra07@gmail.com"
        ],
        "phones": [
            "+91 9550413132"
        ],
        "links": [],
        "raw": "['Yeswanth Yerra', '+91 9550413132 yeswanthyerra07@gmail.com Linkedin Github', 'Education', 'Pragati Engineering College', 'Surampalem, AP', 'Bachelor of Technology, Computer Science and Engineering (GPA- 8.14/10.0)', 'Aug. 2019 May 2023', 'Sasi junior College', 'Eluru, AP', 'Intermediate (GPA-9.8/10.0)']"
    },
    "Yeswanth Yerra": "+91 9550413132 yeswanthyerra07@gmail.com Linkedin Github",
    "Education": "Pragati Engineering College\nSurampalem, AP\nBachelor of Technology, Computer Science and Engineering (GPA- 8.14/10.0)\nAug. 2019 May 2023\nSasi junior College\nEluru, AP\nIntermediate (GPA-9.8/10.0)\nAug 2017 May 2019",
    "Technical Skills": "Languages: Java, JavaScript, Python\nCore Skills: Data Structures & Algorithms, Computer Networks , Operating Sy

In [2]:
import fitz
import re
import json
import logging
import unicodedata
from typing import Dict, List


# Logger Setup

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")


# Regex Patterns

EMAIL_RE = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
PHONE_RE = re.compile(r"(\+?\d[\d\s\-\(\)]{8,}\d)")
LINK_RE = re.compile(
    r"(?:https?://)?(?:www\.)?(?:linkedin|github|portfolio|medium|personal|behance)\.[^\s,]+",
    re.IGNORECASE,
)


# Text Utilities

def clean_text(text: str) -> str:
    if not text:
        return ""
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r"[^\x20-\x7E\n]+", "", text)
    text = re.sub(r"[•●–~►|#]+", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = re.sub(r"\n{2,}", "\n", text)
    return text.strip()


# Font Analysis

def calculate_average_font_size(doc: fitz.Document) -> float:
    total, count = 0, 0
    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block.get("type") == 0:
                for line in block.get("lines", []):
                    for span in line.get("spans", []):
                        total += span.get("size", 0)
                        count += 1
    return total / count if count else 12


# Name Extraction

def extract_name_from_font(page: fitz.Page) -> str:
    """Detect candidate's name using largest font on the first page."""
    max_font = 0
    name_candidate = ""
    for block in page.get_text("dict")["blocks"]:
        if block.get("type") != 0:
            continue
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                if span["size"] > max_font:
                    max_font = span["size"]
                    name_candidate = span["text"].strip()
    logging.info(f"🧠 Name candidate (largest font): {name_candidate}")
    return name_candidate


# Contact Info Extraction

def extract_contact_info(text: str) -> Dict:
    emails = EMAIL_RE.findall(text)
    raw_phones = [m.group(0).strip() for m in PHONE_RE.finditer(text)]
    links = LINK_RE.findall(text)

    phones = []
    for p in raw_phones:
        digits = re.sub(r"\D", "", p)
        if len(digits) < 9 or len(digits) > 15:
            continue
        if re.match(r"20\d{2}", digits[:4]):  # avoid years
            continue
        phones.append(p.strip())

    return {
        "emails": list(set(emails)),
        "phones": list(set(phones)),
        "links": list(set(links)),
    }


# Section Extraction

def extract_sections_from_resume(pdf_path: str, headings: List[str] = None) -> Dict:
    if headings is None:
        headings = [
            r"Objective", r"Summary", r"Education", r"Experience", r"Work Experience",
            r"Professional Experience", r"Projects", r"Skills", r"Technical Skills",
            r"Certifications", r"Internship", r"Achievements", r"Hobbies", r"Interests"
        ]

    heading_pattern = re.compile(r"^\s*(" + r"|".join(headings) + r")\s*$", re.IGNORECASE)

    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logging.error(f" Failed to open PDF: {e}")
        return {}

    avg_font = calculate_average_font_size(doc)
    logging.info(f" Average font size: {avg_font:.2f}")

    sections = {}
    current_heading = None
    current_text = []

    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block.get("type") != 0:
                continue
            for line in block.get("lines", []):
                line_text = "".join(span.get("text", "") for span in line.get("spans", [])).strip()
                if not line_text:
                    continue

                max_font = max((span.get("size", 0) for span in line.get("spans", [])), default=0)
                is_heading = bool(heading_pattern.match(line_text)) or max_font > 1.5 * avg_font

                if is_heading:
                    if current_heading:
                        sections[current_heading] = clean_text("\n".join(current_text))
                    current_heading = line_text.strip().title()
                    current_text = []
                else:
                    if current_heading:
                        current_text.append(line_text)

    if current_heading:
        sections[current_heading] = clean_text("\n".join(current_text))

    return sections


# Combine All Logic

def extract_resume_data(pdf_path: str) -> Dict:
    doc = fitz.open(pdf_path)
    first_page = doc[0]

    first_page_text = first_page.get_text("text")
    name = extract_name_from_font(first_page)
    contact_info = extract_contact_info(first_page_text)

    personal_info = {
        "name": name,
        "emails": contact_info.get("emails", []),
        "phones": contact_info.get("phones", []),
        "links": contact_info.get("links", []),
        "raw": clean_text(str(first_page_text.split("\n")[0:10]))
    }

    sections = extract_sections_from_resume(pdf_path)

    result = {"Personal Info": personal_info}
    result.update(sections)
    return result


# Split Multiline Sections (No Regex)

def split_multiline_sections(data: dict) -> dict:
    """Convert multiline strings into list items safely (no regex)."""
    for key, value in list(data.items()):
        if isinstance(value, dict):
            data[key] = split_multiline_sections(value)
        elif isinstance(value, str) and "\n" in value:
            lines = [line.strip() for line in value.split("\n") if line.strip()]
            data[key] = lines
    return data


# Save to JSON

def save_to_json(pdf_path: str, data: Dict):
    # --- Modification Start ---
    # Add the hardcoded key-value pairs to the data dictionary
    data["preferred_title"] = "Machine Learning Engineer"
    data["preferred_job_location"] = "India"
    # --- Modification End ---

    output_path = pdf_path.replace(".pdf", "_structured.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)
    logging.info(f"✅ Extracted structured resume saved to: {output_path}")


# Run Example

if __name__ == "__main__":
    pdf_path = "resumes/Yeswanth_Yerra_CV.pdf"  # Change path to your PDF
    data = extract_resume_data(pdf_path)
    data = split_multiline_sections(data)
    print(json.dumps(data, indent=4, ensure_ascii=False))
    save_to_json(pdf_path, data)


2025-10-28 07:18:40,456 - INFO - 🧠 Name candidate (largest font): Yeswanth Yerra
2025-10-28 07:18:40,464 - INFO -  Average font size: 9.63
2025-10-28 07:18:40,473 - INFO - ✅ Extracted structured resume saved to: resumes/Yeswanth_Yerra_CV_structured.json


{
    "Personal Info": {
        "name": "Yeswanth Yerra",
        "emails": [
            "yeswanthyerra07@gmail.com"
        ],
        "phones": [
            "+91 9550413132"
        ],
        "links": [],
        "raw": "['Yeswanth Yerra', '+91 9550413132 yeswanthyerra07@gmail.com Linkedin Github', 'Education', 'Pragati Engineering College', 'Surampalem, AP', 'Bachelor of Technology, Computer Science and Engineering (GPA- 8.14/10.0)', 'Aug. 2019 May 2023', 'Sasi junior College', 'Eluru, AP', 'Intermediate (GPA-9.8/10.0)']"
    },
    "Yeswanth Yerra": "+91 9550413132 yeswanthyerra07@gmail.com Linkedin Github",
    "Education": [
        "Pragati Engineering College",
        "Surampalem, AP",
        "Bachelor of Technology, Computer Science and Engineering (GPA- 8.14/10.0)",
        "Aug. 2019 May 2023",
        "Sasi junior College",
        "Eluru, AP",
        "Intermediate (GPA-9.8/10.0)",
        "Aug 2017 May 2019"
    ],
    "Technical Skills": [
        "Languages: Java,