<a href="https://colab.research.google.com/github/tej-mahender/Resume-Matcher/blob/main/Notebook/LLM_Resume.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Step - 1: Resume Ingestion**

In [None]:
# Install dependencies (run once in Colab)
!pip install PyPDF2 python-docx pytesseract pdf2image Pillow
!apt-get install -y poppler-utils tesseract-ocr

In [None]:
import PyPDF2
import docx
import re
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

In [None]:
# ------------------ Extraction Functions ------------------
def extract_text_from_pdf(file_path):
    """Return the text of a PDF, using OCR when PyPDF2 yields nothing.

    Pages are read with PyPDF2 first; each page that produces text contributes
    that text followed by a newline. Any exception raised while reading is
    printed and whatever was gathered up to that point is kept. If the
    gathered text is empty or whitespace-only, the result of ``ocr_pdf`` is
    returned instead.
    """
    collected = []
    try:
        with open(file_path, "rb") as pdf_handle:
            for pdf_page in PyPDF2.PdfReader(pdf_handle).pages:
                extracted = pdf_page.extract_text()
                if extracted:
                    collected.append(extracted + "\n")
    except Exception as e:
        print("PyPDF2 extraction error:", e)

    text = "".join(collected)
    if text.strip():
        return text

    # Nothing usable came out of PyPDF2 (error or no extractable text): run OCR.
    print("PyPDF2 failed or empty text, using OCR...")
    return ocr_pdf(file_path)

def ocr_pdf(file_path):
    """OCR every page of a PDF and return the recognised text.

    The PDF is rendered to images with ``convert_from_path`` and each image is
    passed to ``pytesseract.image_to_string``; every page's text is followed
    by a newline.
    """
    page_images = convert_from_path(file_path)
    return "".join(
        pytesseract.image_to_string(image) + "\n" for image in page_images
    )

def extract_text_from_docx(file_path):
    """Return the non-blank paragraphs of a DOCX file joined by newlines.

    Any exception raised while opening or reading the document is printed
    and an empty string is returned instead.
    """
    try:
        document = docx.Document(file_path)
        kept = []
        for paragraph in document.paragraphs:
            # Paragraphs that are empty or whitespace-only are dropped.
            if paragraph.text.strip():
                kept.append(paragraph.text)
        return "\n".join(kept)
    except Exception as e:
        print("Error reading DOCX:", e)
        return ""

def extract_text_from_txt(file_path):
    """Return the entire contents of a text file decoded as UTF-8."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents

In [None]:
# ------------------ Normalization ------------------
def normalize_text(text):
    """Clean and normalize resume text into a single line.

    The text is lowercased, every character other than ``a-z``, ``0-9``,
    ``+``, ``.``, ``#`` and whitespace is removed, and each run of whitespace
    (spaces, tabs, newlines) is collapsed to one space.

    Disallowed characters are removed *before* whitespace is collapsed, so
    dropping a symbol that sat between two spaces (``"a - b"``) cannot leave
    a double space behind.

    NOTE(review): the allowed set also drops ``@``, ``/``, ``-`` and ``:``,
    so e-mail addresses and URLs are altered -- confirm that is intended.

    Returns:
        The normalized text with no leading or trailing whitespace.
    """
    text = text.lower()
    # Keep alphanumerics and common tech symbols (+, #, .); whitespace is
    # kept for now so that words separated only by a newline/tab stay apart.
    text = re.sub(r"[^a-z0-9+.#\s]", "", text)
    # Replace multiple spaces/tabs/newlines with a single space.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
# ---------- Normalization ----------
def normalize_text_readable(text):
    """Normalize text for NLP while keeping one output line per input line.

    Each line (split on ``"\\n"``) is lowercased, stripped of every character
    other than ``a-z``, ``0-9``, ``+``, ``.``, ``#`` and whitespace, and has
    its whitespace runs collapsed to single spaces. Lines that end up empty
    are dropped.

    Disallowed characters are removed *before* whitespace is collapsed and
    trimmed. Doing it the other way round leaves double spaces
    (``"a - b"``), leading spaces (``"- item"``) and whitespace-only lines
    (``"- -"``) in the output.

    NOTE(review): the allowed set also drops ``@``, ``/``, ``-`` and ``:``,
    so e-mail addresses and URLs are altered -- confirm that is intended.

    Returns:
        The normalized lines joined with ``"\\n"``.
    """
    normalized_lines = []
    for line in text.split("\n"):
        line = line.lower()
        line = re.sub(r"[^a-z0-9+.#\s]", "", line)  # keep tech symbols
        line = re.sub(r"\s+", " ", line).strip()    # collapse and trim spaces
        if line:                                    # skip empty lines
            normalized_lines.append(line)
    return "\n".join(normalized_lines)

In [None]:
# ------------------ Main Extraction Function ------------------
def extract_resume_text(file_path):
    """Extract and normalize the text of a resume file.

    The extractor is chosen from the file extension (``.pdf``, ``.docx`` or
    ``.txt``), matched case-insensitively so that names such as
    ``Resume.PDF`` are accepted. The raw text is then passed through
    ``normalize_text_readable``; a short summary and the full normalized text
    are printed.

    Args:
        file_path: Path of the resume file as a string.

    Returns:
        The normalized resume text.

    Raises:
        ValueError: If the extension is not one of the supported formats.
    """
    # Only the lowercased copy is used for dispatch; the original path is
    # what gets opened and reported.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        raw_text = extract_text_from_pdf(file_path)
    elif lowered_path.endswith(".docx"):
        raw_text = extract_text_from_docx(file_path)
    elif lowered_path.endswith(".txt"):
        raw_text = extract_text_from_txt(file_path)
    else:
        raise ValueError("Unsupported file format. Use PDF, DOCX, or TXT.")

    normalized_text = normalize_text_readable(raw_text)

    # ------------------ Debug / Output ------------------
    print(f"--- Extraction Complete for {file_path} ---")
    print(f"Original Length: {len(raw_text)} chars | Normalized Length: {len(normalized_text)} chars\n")
    print("------ FULL NORMALIZED TEXT ------\n")
    print(normalized_text)

    return normalized_text

In [None]:
# Step 1 driver: extract and normalize the resume. `resume_text` is a
# module-level name that the later cells (segmentation, Gemini parsing and
# evaluation) read.
if __name__ == "__main__":
    file_path = "/content/TejMahendraResume.pdf"  # upload your resume here
    resume_text = extract_resume_text(file_path)   # Step 1

# **Step - 2 : Section Segmentation**

In [None]:
import re
from sentence_transformers import SentenceTransformer, util

In [None]:
# ---------- Load Embedding Model ----------
# Shared sentence-embedding model; it is used below to encode both the
# canonical section headers and the individual resume lines.
model = SentenceTransformer("all-MiniLM-L6-v2")  # lightweight, free, runs on CPU/GPU

In [None]:
# Canonical section headers: each key is the section name used in the
# segmentation output, each value lists the lowercase heading variants that
# map to it (matched exactly first, then by embedding similarity).
SECTION_HEADERS = {
    "skills": ["skills", "technical skills", "core skills", "technologies"],
    "education": ["education", "academics", "qualifications", "academic background", "studies", "academic qualifications"],
    "experience": ["experience", "work experience", "employment history", "professional background", "work history", "internship"],
    "projects": ["projects", "academic projects", "personal projects", "research work"],
    "certifications": ["certifications", "licenses", "achievements", "awards", "training", "courses"],
    "responsibilities": ["positions of responsibility", "leadership", "roles", "activities"],
}


# Flatten SECTION_HEADERS into two parallel lists -- canonical_texts[i] is a
# heading variant and canonical_labels[i] is the section it belongs to --
# then embed all variants once so each resume line is compared against a
# ready-made tensor.
canonical_labels = []
canonical_texts = []
for section, variations in SECTION_HEADERS.items():
    canonical_labels.extend([section] * len(variations))
    canonical_texts.extend(variations)

canonical_embeddings = model.encode(canonical_texts, convert_to_tensor=True)

# ---------- Hybrid Segmentation ----------
def segment_resume_hybrid(text, similarity_threshold=0.6):
    """Split resume text into sections keyed by canonical section name.

    Hybrid method: a line that exactly equals (case-insensitively) one of the
    ``SECTION_HEADERS`` variants starts that section; every other line is
    embedded and compared with the canonical header embeddings, and switches
    the current section when its best cosine similarity reaches
    ``similarity_threshold``.

    Every non-blank line -- including a line that triggered a section
    switch -- is appended to the section that is current after it has been
    inspected. Lines seen before any heading go to ``"general"``.

    A heading that occurs more than once keeps appending to the same section;
    content collected earlier under that section is never discarded.

    Args:
        text: Resume text with one logical line per ``"\\n"``-separated line.
        similarity_threshold: Minimum cosine similarity for a line to be
            treated as a heading by the embedding step.

    Returns:
        A dict mapping section name to the list of (stripped) lines in it,
        in order of first appearance.
    """
    # Exact-match lookup from heading variant to section. setdefault keeps
    # the first section that lists a variant, matching SECTION_HEADERS order.
    exact_headers = {}
    for section, variations in SECTION_HEADERS.items():
        for variation in variations:
            exact_headers.setdefault(variation, section)

    sections = {}
    current_section = "general"

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue
        line_lower = line.lower()

        # --- Step 1: exact heading match (plain comparison, no regex) ---
        heading = exact_headers.get(line_lower)

        # --- Step 2: embedding similarity if no exact match ---
        if heading is None:
            line_embedding = model.encode(line_lower, convert_to_tensor=True)
            cosine_scores = util.cos_sim(line_embedding, canonical_embeddings)[0]
            best_idx = int(cosine_scores.argmax())
            if float(cosine_scores[best_idx]) >= similarity_threshold:
                heading = canonical_labels[best_idx]

        if heading is not None:
            current_section = heading

        # --- Add line to current section ---
        sections.setdefault(current_section, []).append(line)

    return sections

# ---------- Pretty Print ----------
def print_sections(sections):
    """Print each section as an upper-cased header followed by its lines.

    Every line of a section is printed as a ``- `` bullet, each section is
    followed by blank lines, and the whole listing is wrapped in start/end
    banner lines.
    """
    print("------ SEGMENTED RESUME (HYBRID) ------\n")
    for name, entries in sections.items():
        print(f"### {name.upper()} ###")
        bullets = [f"- {entry}" for entry in entries]
        for bullet in bullets:
            print(bullet)
        print("\n")
    print("------ END OF SEGMENTS ------\n")

In [None]:
# Step 2 driver: segment the text produced by the Step 1 cell and print it.
# `file_path` is re-assigned here but is not read by anything in this cell;
# the input is the `resume_text` global.
if __name__ == "__main__":
    file_path = "/content/TejMahendraResume.pdf"  # upload your resume here
    sections = segment_resume_hybrid(resume_text)  # Step 2
    print_sections(sections)



In [None]:
import os
from google import genai
import json

# Ask for the API key without echoing it: getpass hides the typed value,
# whereas input() would display the key in the cell output.
from getpass import getpass

os.environ["GOOGLE_API_KEY"] = getpass("🔑 Enter your Google API key: ").strip()



In [None]:
from google import genai

# ---------- Initialize Gemini ----------
# Build the Gemini client from the key stored in the GOOGLE_API_KEY
# environment variable; a missing variable raises KeyError here.
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

# ---------- Function to parse resume using Gemini ----------
def parse_resume_with_gemini(resume_text):
    """Ask Gemini to turn resume text into structured JSON.

    The resume text is embedded in a fixed instruction prompt and sent to the
    ``gemini-2.5-flash`` model through the module-level ``client``.

    Returns:
        The ``text`` of the model's response, exactly as received; it is not
        parsed or validated as JSON here.
    """
    instructions = f"""
Extract a structured JSON from the resume text below.
- Name
- Email
- Phone
- Links (LinkedIn, GitHub, LeetCode, Portfolio if present)
- Career Objective
- Education (degree, institute, year, score/CGPA)
- Skills (group by: languages, frontend, backend, databases, ML, cloud, tools)
- Projects (title + description)
- Certifications
- Responsibilities

Resume Text:
{resume_text}

Output strictly as JSON.
"""
    reply = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=instructions,
    )
    return reply.text

# ---------- Parse ----------
# Send the Step 1 resume text to Gemini and show the reply. The value kept in
# `structured_resume` is the model's text as returned (it is not passed
# through json.loads).
structured_resume = parse_resume_with_gemini(resume_text)
print(structured_resume)


# **Step - 3 : JD ingestion**

In [None]:
import re
import json
from google import genai

# ---------- Initialize Gemini Client ----------
# Rebinds the module-level `client` using the key from the GOOGLE_API_KEY
# environment variable. `os` is not imported in this cell, so it must already
# be in scope from an earlier cell.
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

# ---------- Function: Call Gemini and parse JD ----------
def _json_candidates(raw_text):
    """Yield progressively looser guesses at the JSON payload in *raw_text*."""
    text = raw_text.strip()
    # 1. The whole reply is wrapped in a ```json ... ``` (or bare ```) fence.
    yield re.sub(r"^```(?:json)?\s*|\s*```$", "", text, flags=re.IGNORECASE)
    # 2. A fenced block embedded in surrounding prose.
    fenced = re.search(
        r"```(?:json)?\s*(.*?)\s*```", text, flags=re.DOTALL | re.IGNORECASE
    )
    if fenced:
        yield fenced.group(1)
    # 3. Everything between the first "{" and the last "}".
    start, end = text.find("{"), text.rfind("}")
    if start != -1 and end > start:
        yield text[start:end + 1]


def extract_jd_with_gemini(jd_text, model="gemini-2.5-flash"):
    """
    Parses a job description using Gemini LLM and returns structured JSON.
    Handles triple backticks and malformed JSON gracefully.

    The reply is searched for JSON in three places, in order: the reply with
    leading/trailing code fences removed, the contents of a code fence found
    anywhere in the reply, and the span from the first ``{`` to the last
    ``}``. The first candidate that parses is returned.

    Args:
        jd_text: The job description to analyse.
        model: Name of the Gemini model to call.

    Returns:
        The parsed JSON value. The prompt asks for an object with the keys:
        - education
        - years_experience
        - must_have_skills
        - optional_skills
        If no candidate parses, ``{"raw_output": <reply without outer code
        fences>}`` is returned instead.
    """
    prompt = f"""
Extract the following from the Job Description:
- Required Education
- Required Years of Experience
- Must-have Skills
- Optional Skills (good-to-have)

Output strictly as JSON with keys:
education, years_experience, must_have_skills, optional_skills

Job Description:
{jd_text}
"""

    response = client.models.generate_content(
        model=model,
        contents=prompt
    )

    # Guard against a reply that carries no text (response.text being None),
    # which would otherwise make .strip() raise AttributeError.
    raw_text = (response.text or "").strip()

    candidates = list(_json_candidates(raw_text))
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue

    # Nothing parsed: hand back the de-fenced reply so the caller can inspect it.
    return {"raw_output": candidates[0]}

# ---------- Example usage ----------
# Sample job description used for the rest of the notebook. `job_description`
# (raw text) and `jd_structured` (the value returned by
# extract_jd_with_gemini) are both read by the Step 4 cells.
job_description = """
Must-have
2+ years of hands-on experience in programming languages such as C/C++, Java, JavaScript, NodeJS, Python, Groovy, or ReactJS.
B. Tech in Computer Science from a reputed college.
Excellent computer science fundamentals and a solid understanding of architecture, design, and performance.
Working knowledge and experience with REST APIs and Kafka.
A good understanding of object-oriented design and knowledge of product life cycles.
Strong proficiency in version control systems (e.g., Git) and source code management practices.
Experience with CI/CD tools (e.g., Jenkins) and build automation.
Familiarity with configuration management tools (e.g., Ansible, Puppet, Chef) is a plus.
Good-to-have
Knowledge of databases such as MySQL, Postgres, Cassandra, Redis, MongoDB, Elastic Search, Spark.
Experience with AI/ML/DL technologies and their applications.
"""

jd_structured = extract_jd_with_gemini(job_description)
# Pretty-print the returned value as JSON.
print(json.dumps(jd_structured, indent=2))


# **Step - 4 : Final Comparison**

In [None]:
from google import genai

# Rebinds the module-level `client` using the key from the GOOGLE_API_KEY
# environment variable (`os` must already be imported by an earlier cell).
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

def evaluate_resume_with_gemini(resume_text, jd_text):
    """Ask Gemini to score a resume against a job description (JSON report).

    Both arguments are interpolated into the prompt with an f-string, so any
    object is accepted and rendered with ``str()``; the function no longer
    reads the ``structured_resume`` / ``jd_structured`` globals.

    Args:
        resume_text: Resume content to place under "Resume:" in the prompt.
        jd_text: Job description content to place under "Job Description:".

    Returns:
        The ``text`` of the model's response, exactly as received. The prompt
        requests JSON with the keys ``overall_score``, ``section_scores``,
        ``missing_items`` and ``suggestions``, but the reply is not parsed
        here.
    """
    prompt = f"""
You are a recruitment assistant. Compare the candidate's resume with the job description.

Resume:
{resume_text}

Job Description:
{jd_text}

Tasks:
1. Give an overall match score (0-100%).
2. Give section-wise match scores (education, experience, skills, projects, certifications, responsibilities).
3. List missing skills or experience per section.
4. Provide actionable suggestions to improve the resume for this job.

Output strictly as JSON with keys:
overall_score, section_scores, missing_items, suggestions
"""

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt
    )

    return response.text

# ---------- Call LLM ----------
# Pass the structured resume and structured JD explicitly: these are the
# values the prompt has always been built from, now supplied as arguments
# instead of being picked up from globals inside the function.
report_json = evaluate_resume_with_gemini(structured_resume, jd_structured)
print(report_json)


In [None]:
def evaluate_resume_with_gemini(resume_text, jd_text):
    """Ask Gemini for a human-readable resume-vs-job-description report.

    The two arguments are interpolated into a fixed prompt that requests an
    overall score, a per-section score table, missing items and suggestions
    in a Markdown-style layout (explicitly not JSON). The prompt is sent to
    the ``gemini-2.5-flash`` model through the module-level ``client``.

    Returns:
        The ``text`` of the model's response, exactly as received.
    """
    report_request = f"""
You are a recruitment assistant. Compare the candidate's resume with the job description.

Resume:
{resume_text}

Job Description:
{jd_text}

Tasks:
1. Give an overall match score (0-100%).
2. Give section-wise match scores (education, experience, skills, projects, certifications, responsibilities).
3. List missing skills or experience per section.
4. Provide actionable suggestions to improve the resume for this job.

Format your answer in this clean, readable structure (NOT JSON):

**Overall Match Score:** X%

### Section Scores
| Section | Score (%) |
|----------|------------|
| Education | ... |
| Experience | ... |
| Skills | ... |
| Projects | ... |
| Certifications | ... |
| Responsibilities | ... |

---

### Missing Items
**Experience:**
- ...

**Skills:**
- ...

**Projects:**
- ...

**General / Explicit Mentions:**
- ...

---

### Suggestions
1. ...
2. ...
3. ...

Keep the layout minimal, clean, and professional.
"""

    reply = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=report_request,
    )
    return reply.text


In [None]:
# ---------- Call LLM ----------
# Despite its name, `report_json` holds the formatted text report requested by
# the prompt of the function defined in the previous cell ("NOT JSON"); it is
# printed as-is. The inputs are the raw resume text and the raw job description.
report_json = evaluate_resume_with_gemini(resume_text, job_description)
print(report_json)