1. Dataset preparation

In [2]:
import pandas as pd

# Small dummy dataset of (job description, resume, label) pairs.
# NOTE(review): Label looks like 1 = resume fits the job, 0 = it does not — confirm.
PAIR_FIELDS = ("JD", "Resume", "Label")

labelled_pairs = [
    ("Looking for a Python developer with experience in ML and AWS.",
     "Experienced Python engineer skilled in ML, cloud computing, and AWS.",
     1),
    ("Frontend developer required with React and CSS experience.",
     "Expert in backend Python, Django, and database management.",
     0),
    ("Data scientist needed with Python, ML, and deep learning skills.",
     "Python data analyst experienced in ML, deep learning, and visualization.",
     1),
    ("Hiring Java developer with Spring Boot experience.",
     "Frontend React developer with CSS and HTML skills.",
     0),
    ("Looking for DevOps engineer familiar with AWS and Docker.",
     "Experienced in AWS, Docker, CI/CD pipelines, and DevOps practices.",
     1),
    ("UI/UX designer needed for mobile app design.",
     "Graphic designer with expertise in Photoshop and Illustrator.",
     0),
    ("Python developer required with ML experience.",
     "Python programmer skilled in machine learning and data analysis.",
     1),
    ("Looking for backend developer with Node.js and MongoDB skills.",
     "Experienced backend developer using Node.js and MongoDB.",
     1),
    ("Senior accountant with Excel and taxation knowledge.",
     "Accountant experienced in bookkeeping and Excel.",
     1),
    ("Marketing specialist with SEO and social media skills.",
     "Content writer with experience in social media marketing.",
     1),
]

# One dict per pair, keyed by PAIR_FIELDS, then loaded into a DataFrame
data = [dict(zip(PAIR_FIELDS, pair)) for pair in labelled_pairs]

df = pd.DataFrame(data)

2. Text Preprocessing

In [3]:
import re
from sklearn.feature_extraction.text import TfidfVectorizer

def clean_text(text):
    """Lowercase ``text`` and reduce it to letters, digits and single spaces.

    Punctuation is replaced by a space instead of being deleted, so that
    "node.js" becomes "node js" rather than "nodejs". The ``clean_text``
    defined later in this notebook (used when scoring) also turns punctuation
    into spaces; deleting it here made the TF-IDF vocabulary fitted below
    contain fused tokens ("nodejs", "cicd", "uiux") that never occur at
    scoring time.

    Parameters
    ----------
    text : str
        Raw job-description or resume text.

    Returns
    -------
    str
        The cleaned text, with runs of whitespace collapsed and ends trimmed.
    """
    text = text.lower()
    # Replace, don't delete: keeps the word boundaries that punctuation marked
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Cleaned copies of both text columns
for raw_col in ('JD', 'Resume'):
    df[raw_col + '_clean'] = df[raw_col].map(clean_text)

# One document per row: cleaned JD followed by cleaned resume
df['Combined'] = df['JD_clean'].str.cat(df['Resume_clean'], sep=" ")

# Fit TF-IDF on the combined documents; X is the feature matrix, y the labels
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['Combined'])
y = df['Label']

3. Model Training

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows; the fixed seed keeps the split reproducible
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)

# Logistic regression with library defaults, fitted on the training rows only
model = LogisticRegression()
model.fit(X_train, y_train)

4. Improving matched skills

In [5]:
import re
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# -------------------------
# Skills list & synonyms (so that random filler words don't get selected in missing skills)
# -------------------------
SKILLS_LIST = [
    "python", "java", "c++", "c#", "javascript", "node.js", "react", "angular",
    "sql", "mysql", "postgresql", "mongodb", "aws", "azure", "gcp",
    "docker", "kubernetes", "machine learning", "deep learning", "nlp",
    "tensorflow", "pytorch", "scikit-learn", "excel", "power bi", "tableau",
    "ui/ux", "figma", "adobe xd", "django", "flask", "spring boot",
    "git", "ci/cd", "rest api", "graphql", "data analysis", "communication"
]

# Alternative spellings mapped to the canonical entry in SKILLS_LIST
SKILL_SYNONYMS = {
    "ml": "machine learning",
    "deeplearning": "deep learning",
    "deep-learning": "deep learning",
    "nodejs": "node.js",
    "js": "javascript",
    "reactjs": "react",
    "postgres": "postgresql",
    "mongo": "mongodb",
    "c plus plus": "c++",
    "cplusplus": "c++",
    "ci cd": "ci/cd",
    "restapi": "rest api",
    "uiux": "ui/ux"
}

# -------------------------
# Text cleaning utility
# -------------------------
def clean_text(text: str) -> str:
    """Lowercase ``text`` and keep only a-z, 0-9, '+', '-' and single spaces.

    Every other character (including '.', '/', '#') is replaced by a space,
    then runs of whitespace are collapsed and the ends trimmed.
    ``None`` is treated as an empty string.
    """
    if text is None:
        return ""
    text = text.lower()
    # '+' survives so that "c++" stays intact; '-' survives for "scikit-learn"
    text = re.sub(r'[^a-z0-9+\s\-]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# -------------------------
# Robust skill extractor   (so that model matches from SKILLS_LIST + synonyms and becomes phrase-aware)
# -------------------------
def _term_pattern(term: str) -> str:
    """Build a regex matching ``term`` as a whole token in lowercased text.

    ``\\b`` cannot be used: after a term that ends in punctuation ("c++", "c#")
    it only matches when a word character follows, so "c++ developer" would be
    missed. Instead an alphanumeric guard is added on each side of the term
    that itself starts/ends with an alphanumeric character.
    """
    term = term.lower()
    head = r'(?<![a-z0-9])' if term[:1].isalnum() else ''
    tail = r'(?![a-z0-9])' if term[-1:].isalnum() else ''
    return head + re.escape(term) + tail


def extract_skills(text: str) -> set:
    """Return the set of canonical skills (from SKILLS_LIST) mentioned in ``text``.

    Each skill and synonym is searched in two views of the text:

    * the lowercased text with punctuation intact, so "ui/ux", "node.js",
      "ci/cd", "c#" and "c++" are found as written;
    * ``clean_text(text)``, where punctuation has become spaces, so forms such
      as "ci cd" or "rest/api" still resolve.

    Synonyms are mapped to their canonical name, and a bare "node" counts as
    "node.js". ``None`` or empty input yields an empty set.
    """
    text_clean = clean_text(text)
    raw_norm = "" if text is None else re.sub(r'\s+', ' ', text.lower()).strip()
    haystacks = (raw_norm, text_clean)

    def mentioned(term: str) -> bool:
        pattern = _term_pattern(term)
        return any(re.search(pattern, hay) for hay in haystacks)

    # Canonical skills, single- and multi-word alike
    found = {skill for skill in SKILLS_LIST if mentioned(skill)}

    # synonyms mapping
    found.update(canon for syn, canon in SKILL_SYNONYMS.items() if mentioned(syn))

    # special case: a standalone "node" means node.js
    if re.search(r'\bnode\b', text_clean):
        found.add('node.js')

    return found

5. Scoring function

In [6]:
def _tfidf_similarity(jd_clean: str, resume_clean: str) -> float:
    """Cosine similarity between the TF-IDF vectors of two cleaned texts."""
    try:
        # Preferred path: reuse the vectorizer fitted during training
        jd_vec = vectorizer.transform([jd_clean])
        resume_vec = vectorizer.transform([resume_clean])
        return float(cosine_similarity(jd_vec, resume_vec)[0][0])
    except Exception:
        # Deliberate best-effort fallback: if the shared vectorizer is not
        # available, fit a throwaway one on just these two texts (less stable)
        from sklearn.feature_extraction.text import TfidfVectorizer
        try:
            tmp_vec = TfidfVectorizer().fit_transform([jd_clean, resume_clean])
        except ValueError:
            # Neither text produced a token (e.g. both empty): nothing to compare
            return 0.0
        return float(cosine_similarity(tmp_vec[0:1], tmp_vec[1:2])[0][0])


def _blend_weights(has_model: bool, n_jd_skills: int):
    """Return normalised (w_model, w_tfidf, w_skill) for the final blend."""
    if has_model:
        w_model, w_tfidf, w_skill = 0.60, 0.25, 0.15
    else:
        # no model available: rely on tfidf + skills
        w_model, w_tfidf, w_skill = 0.0, 0.75, 0.25

    if n_jd_skills == 0:
        # Coverage is undefined without any recognised JD skill; giving a
        # hard-coded 0% any weight would only be a constant penalty
        shift = w_skill
    elif n_jd_skills <= 2:
        # Very few skills make coverage noisy: halve its weight
        shift = w_skill * 0.5
    else:
        shift = 0.0

    # Hand the removed weight to model/tfidf in proportion to their ORIGINAL
    # weights; the denominator is fixed before either weight is changed
    base = w_model + w_tfidf
    if shift > 0 and base > 0:
        w_skill -= shift
        w_model, w_tfidf = (w_model + shift * (w_model / base),
                            w_tfidf + shift * (w_tfidf / base))

    # Normalize weights to sum to 1 (avoid rounding issues)
    total_w = w_model + w_tfidf + w_skill
    if total_w == 0:
        return 0.0, 1.0, 0.0
    return w_model / total_w, w_tfidf / total_w, w_skill / total_w


def compute_match_score(job_desc: str, resume_text: str, debug: bool = False):
    """Score how well ``resume_text`` matches ``job_desc`` as a percentage.

    Three signals, each expressed as a percentage, are blended:

    1. TF-IDF cosine similarity between the cleaned JD and the cleaned resume.
    2. ``model.predict_proba`` on the cleaned JD + resume text (skipped, and
       reported as ``None``, if the model or vectorizer cannot be used).
    3. Skill coverage: share of the skills extracted from the JD that were
       also extracted from the resume (0.0 when the JD has none).

    Base weights are 0.60/0.25/0.15 (model/tfidf/skill) with a model and
    0/0.75/0.25 without. The skill weight is halved when the JD has one or two
    recognised skills and dropped when it has none; the removed weight is
    redistributed proportionally to the other two signals.

    Parameters
    ----------
    job_desc, resume_text : str
        Raw texts; cleaning is done here via ``clean_text``.
    debug : bool
        When True the details dict also carries the weights and the sorted
        skill lists.

    Returns
    -------
    tuple
        ``(final_pct, details)``. ``details`` always has ``model_prob_pct``,
        ``tfidf_pct``, ``skill_pct``, ``matched_skills`` and ``missing_skills``
        (sets when ``debug`` is False, sorted lists when True); with ``debug``
        it additionally has ``weights``, ``jd_skills`` and ``resume_skills``.
    """
    jd_clean = clean_text(job_desc)
    resume_clean = clean_text(resume_text)

    # 1) TF-IDF cosine similarity
    tfidf_pct = round(_tfidf_similarity(jd_clean, resume_clean) * 100, 2)

    # 2) Model probability (if model exists and was trained on combined JDs+Resumes)
    try:
        combined = jd_clean + " " + resume_clean
        p = model.predict_proba(vectorizer.transform([combined]))[0][1]
        model_prob_pct = round(float(p) * 100, 2)
    except Exception:
        # Best effort by design: any failure means "no model signal"
        model_prob_pct = None

    # 3) Skill coverage (using controlled skill list)
    jd_skills = extract_skills(job_desc)
    resume_skills = extract_skills(resume_text)
    matched_skills = jd_skills.intersection(resume_skills)
    missing_skills = jd_skills - resume_skills

    skill_pct = 0.0
    if jd_skills:
        skill_pct = round((len(matched_skills) / len(jd_skills)) * 100, 2)

    # 4) Combine signals into final score
    w_model, w_tfidf, w_skill = _blend_weights(model_prob_pct is not None, len(jd_skills))

    # Compose final pct (all inputs are 0..100)
    m_pct = model_prob_pct if model_prob_pct is not None else 0.0
    final_pct = round(w_model * m_pct + w_tfidf * tfidf_pct + w_skill * skill_pct, 2)

    if debug:
        return final_pct, {
            "model_prob_pct": model_prob_pct,
            "tfidf_pct": tfidf_pct,
            "skill_pct": skill_pct,
            "weights": {"model": w_model, "tfidf": w_tfidf, "skill": w_skill},
            "jd_skills": sorted(jd_skills),
            "resume_skills": sorted(resume_skills),
            "matched_skills": sorted(matched_skills),
            "missing_skills": sorted(missing_skills)
        }
    return final_pct, {
        "model_prob_pct": model_prob_pct,
        "tfidf_pct": tfidf_pct,
        "skill_pct": skill_pct,
        "matched_skills": matched_skills,
        "missing_skills": missing_skills
    }

6. Dashboard using Gradio UI (creative add-on)

In [7]:
# Using the Gradio UI because it is easier to run inside a Colab notebook

def gradio_interface(resume_text, job_desc, show_debug=False):
    """Gradio callback: score a resume against a job description.

    Calls ``compute_match_score`` and formats its result for display.

    Returns
    -------
    tuple of str
        ``(final_text, debug_text)``: the Markdown summary, and a plain-text
        breakdown that is empty unless ``show_debug`` is truthy.
    """
    final_pct, details = compute_match_score(job_desc, resume_text, debug=show_debug)

    def pick(key):
        # Numeric fields are read defensively in case details is not a dict
        return details.get(key) if isinstance(details, dict) else None

    def as_list(skills):
        # Alphabetical, comma-separated; "None" for an empty collection
        if not skills:
            return "None"
        return ", ".join(sorted(skills))

    model_prob = pick("model_prob_pct")
    tfidf = pick("tfidf_pct")
    skill_pct = pick("skill_pct")

    # human-friendly strings
    model_str = "N/A" if model_prob is None else f"{model_prob} %"
    tfidf_str = f"{tfidf} %"
    skill_str = f"{skill_pct} %"
    matched_str = as_list(details.get("matched_skills", set()))
    missing_str = as_list(details.get("missing_skills", set()))

    if show_debug:
        debug_lines = [
            f"Model prob: {model_str}",
            f"TF-IDF sim: {tfidf_str}",
            f"Skill coverage: {skill_str}",
            f"Matched: {matched_str}",
            f"Missing: {missing_str}",
            f"Weights: {details.get('weights')}",
        ]
        debug_text = "\n".join(debug_lines) + "\n"
    else:
        debug_text = ""

    final_text = (
        f"✅ Final Match: **{final_pct} %**\n\n"
        f"• Model prob: {model_str}\n"
        f"• TF-IDF similarity: {tfidf_str}\n"
        f"• Skill coverage: {skill_str}\n\n"
        f"• Matched skills: {matched_str}\n"
        f"• Missing skills: {missing_str}"
    )

    return final_text, debug_text

# Dashboard layout: two side-by-side input columns, an Analyze button, and two
# output areas. Components are created in the order they should appear.
with gr.Blocks() as demo:
    gr.Markdown("## 📊 Resume Match Dashboard — improved scoring")
    with gr.Row():
        # Left column: resume text plus the debug toggle
        with gr.Column(scale=1):
            resume_input = gr.Textbox(lines=10, label="Resume (paste text)")
            show_debug = gr.Checkbox(False, label="Show debug info")
        # Right column: job description text
        with gr.Column(scale=1):
            jd_input = gr.Textbox(lines=10, label="Job Description (paste text)")

    analyze_btn = gr.Button("Analyze")
    # Outputs: Markdown summary and a read-only box for the debug breakdown
    result_md = gr.Markdown()
    debug_box = gr.Textbox(label="Debug (optional)", interactive=False, lines=10)

    # Input order follows gradio_interface(resume_text, job_desc, show_debug);
    # output order follows its (final_text, debug_text) return value
    analyze_btn.click(fn=gradio_interface, inputs=[resume_input, jd_input, show_debug], outputs=[result_md, debug_box])

# Launch inline in Colab/Jupyter, without opening a browser tab or a public share link
demo.launch(inbrowser=False, share=False)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.
* To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

