In [12]:
import pandas as pd
# Load the cleaned resume table and display rows at positions 3, 4 and 5.
# NOTE(review): this cell reads the CSV from the working directory, whereas the
# matching cell reads "../1_data_cleaning/resume_cleaned_100.csv" -- confirm both
# paths refer to the same file.
resume_df = pd.read_csv("resume_cleaned_100.csv")
resume_df.iloc[3:6]

Unnamed: 0,career_objective,skills,degree_names,major_field_of_studies,positions,responsibilities
3,As a Data Analyst I always look into more inno...,"['Machine Learning', 'Artificial Intelligence'...","['B.Tech', 'M.Tech']","[None, None]",['Data Analyst'],Mikrotik Router Configuration\nOLT Device Setu...
4,Financial and Accounting professional with exp...,['Power User of Microsoft Excel Epicor NetSuit...,['Bachelor of Business Administration'],['Accounting'],"['Senior Accountant', 'Senior Accountant/Finan...",Design Creation\nCAD Drawings\nDesign Optimiza...
5,"Fresher starting out with Business Analysis, a...","['Business Analyst', 'Data Analysis', 'Busines...",['BBA'],['N/A'],['Part-Time Analyst'],"Full Stack Development\nFront-end: ReactJS, Ne..."


# Step 1: two-stage matching (SBERT coarse filter, then LLM top-3 selection)

In [35]:
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import ollama
from sentence_transformers import SentenceTransformer

# ===============================
# Embedding model
# ===============================
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# ===============================
# Load data
# ===============================
# Job descriptions: job_id is taken from the frame's index, and VALID_JOB_IDS
# is the set of ids that the matching code accepts from the LLM.
jd_df = pd.read_excel("../1_data_cleaning/filtered_jd_sections2.xlsx")
jd_df["job_id"] = jd_df.index.astype(int)
VALID_JOB_IDS = set(jd_df["job_id"].tolist())

# Resume fields used to build the profile text, and the positional row window
# processed by this run (edit RESUME_ROWS to process a different batch).
RESUME_COLUMNS = [
    "career_objective", "skills", "degree_names",
    "major_field_of_studies", "positions", "responsibilities",
]
RESUME_ROWS = slice(8, 20)

# .copy() makes the subset an independent frame, so adding "resume_id" below
# neither raises SettingWithCopyWarning nor depends on view/copy semantics.
resume_df = (
    pd.read_csv("../1_data_cleaning/resume_cleaned_100.csv")[RESUME_COLUMNS]
    .iloc[RESUME_ROWS]
    .copy()
)
# resume_id keeps the original index label of each row (no reset_index here).
resume_df["resume_id"] = resume_df.index.astype(int)

# ===============================
# Utilities
# ===============================
def safe_json_loads(text):
    """Parse an LLM reply into a list of JSON items, never raising.

    The reply is first parsed as-is. If that fails and ``text`` is a string,
    the outermost ``[...]`` and ``{...}`` spans are tried, whichever opens
    first being tried first. This recovers JSON wrapped in prose or markdown
    fences, and avoids returning a list nested inside an object instead of
    the object itself.

    Args:
        text: Raw reply text (any other type yields ``[]`` unless
            ``json.loads`` accepts it directly).

    Returns:
        A list: the parsed array, a one-element list wrapping a parsed
        object, or ``[]`` when nothing usable could be parsed.
    """
    try:
        data = json.loads(text)
    except (TypeError, ValueError, RecursionError):
        data = None
        if isinstance(text, str):
            spans = []
            for opener, closer in (("[", "]"), ("{", "}")):
                start = text.find(opener)
                end = text.rfind(closer)
                if start != -1 and end > start:
                    spans.append((start, end + 1))
            # Earliest-opening span first: it is the outermost JSON value.
            for start, end in sorted(spans):
                try:
                    data = json.loads(text[start:end])
                    break
                except (ValueError, RecursionError):
                    continue

    if isinstance(data, dict):
        data = [data]
    return data if isinstance(data, list) else []

def summarize_text(text, max_length=900):
    """Return ``text`` stripped of surrounding whitespace and capped in length.

    Non-string values (for example NaN or None) produce an empty string.
    Strings longer than ``max_length`` characters after stripping are cut
    to their first ``max_length`` characters.
    """
    if isinstance(text, str):
        trimmed = text.strip()
        if len(trimmed) <= max_length:
            return trimmed
        return trimmed[:max_length]
    return ""

def build_resume_profile(row):
    """Render one resume row as a labelled, multi-section text block.

    Each section is a heading line followed by the corresponding field
    passed through ``summarize_text``. Sections are separated by a blank
    line, and the whole block starts and ends with a newline.
    """
    sections = (
        ("Career Objective", "career_objective"),
        ("Skills", "skills"),
        ("Degree Names", "degree_names"),
        ("Major Field of Studies", "major_field_of_studies"),
        ("Positions", "positions"),
        ("Responsibilities", "responsibilities"),
    )
    body = "\n\n".join(
        f"{heading}:\n{summarize_text(row[column])}"
        for heading, column in sections
    )
    return "\n" + body + "\n"

# ===============================
# Stage 1: SBERT coarse filter
# ===============================
def embed_text(text: str):
    """Encode ``text`` with the module-level ``embedder``.

    Anything that is not a string is encoded as the empty string.
    """
    payload = text if isinstance(text, str) else ""
    return embedder.encode(payload)

# Encode every job description once, up front. Missing descriptions become ""
# so the list keeps one entry per jd_df row: fast_filter_candidates indexes
# jd_df with .iloc using positions taken from the similarity vector, so row i
# of jd_embeddings must correspond to row i of jd_df.
print("Building JD embeddings...")
jd_texts = jd_df["job_description"].fillna("").tolist()
jd_embeddings = embedder.encode(jd_texts, show_progress_bar=True)

def fast_filter_candidates(resume_text, top_k=20):
    """Return the ``top_k`` rows of ``jd_df`` most similar to the resume.

    The resume text is embedded and compared to the precomputed
    ``jd_embeddings`` by cosine similarity. Rows come back ordered from
    most to least similar.
    """
    query = embed_text(resume_text).reshape(1, -1)
    scores = cosine_similarity(query, jd_embeddings)[0]
    # Ascending argsort reversed -> positions in descending similarity.
    ranked = np.argsort(scores)[::-1]
    return jd_df.iloc[ranked[:top_k]]

# ===============================
# Stage 2: LLM fine reasoning (Top-3)
# ===============================
# Model name passed as `model=` to ollama.chat in score_batch_with_llm.
LLM_MODEL = "phi3:mini"

def build_prompt(resume_text, jd_batch):
    """Assemble the LLM prompt for one resume and one batch of jobs.

    Every row of ``jd_batch`` contributes a block with its job id, title,
    location and a description capped at 900 characters. The blocks are
    appended after the instructions and the resume text. The prompt asks
    for a JSON array of three ``{"job_id", "reason"}`` objects.
    """
    blocks = []
    for _, job in jd_batch.iterrows():
        desc = summarize_text(job["job_description"], max_length=900)
        blocks.append(
            "\n---\n"
            f"Job ID: {job['job_id']}\n"
            f"Job Title: {job['job_title']}\n"
            f"Location: {job['location_cleaned']}\n"
            "Description:\n"
            f"{desc}\n"
        )
    jd_section = "".join(blocks)

    return f"""
You are a senior hiring expert.

Given the following resume and job descriptions, identify the three best matching jobs overall.

Base your judgment primarily on:
- Skills and responsibilities alignment
- Relevant experience
- Educational background
- Location fit (lowest weight)

Return STRICT JSON ONLY in this format:
[
  {{"job_id": <best_job_id_1>, "reason": "<brief explanation>"}},
  {{"job_id": <best_job_id_2>, "reason": "<brief explanation>"}},
  {{"job_id": <best_job_id_3>, "reason": "<brief explanation>"}}
]

Resume:
{resume_text}

Job Descriptions:
{jd_section}
"""

def score_batch_with_llm(resume_text, jd_batch, retry=3):
    """Ask the LLM for its best matches within one batch of jobs.

    Args:
        resume_text: Resume profile text to embed in the prompt.
        jd_batch: DataFrame slice of candidate jobs for this call.
        retry: Maximum number of attempts.

    Returns:
        The non-empty list parsed from the model reply by
        ``safe_json_loads``, or ``[]`` if every attempt either raised or
        produced nothing parseable.
    """
    prompt = build_prompt(resume_text, jd_batch)
    for attempt in range(1, retry + 1):
        try:
            resp = ollama.chat(
                model=LLM_MODEL,
                messages=[{"role": "user", "content": prompt}],
                options={"num_ctx": 4096},
            )
            data = safe_json_loads(resp["message"]["content"])
            if data:
                return data
            # Empty/unparseable reply: retry immediately, as before.
        except Exception as e:
            # Deliberate best-effort: a failed call must not abort the run.
            print(f"⚠️ Error (attempt {attempt}/{retry}):", e)
            # Back off only when another attempt follows; sleeping after the
            # final failure just delays returning [].
            if attempt < retry:
                time.sleep(1)
    return []

# ===============================
# Matching logic (Top-3)
# ===============================
def chunk_df(df, size=10):
    """Yield consecutive positional slices of ``df`` with up to ``size`` rows.

    The final slice holds the remainder and may be shorter. An empty frame
    yields nothing.
    """
    total = len(df)
    for start in range(0, total, size):
        stop = start + size
        yield df.iloc[start:stop]

def match_resume_to_jobs(resume_text):
    """Pick up to three job ids for a resume (SBERT filter + LLM choice).

    The 20 most similar jobs are split into batches of 10 and each batch is
    sent to the LLM concurrently. Replies are consumed in submission order
    (most similar batch first) so the outcome does not depend on which LLM
    call happens to finish first.

    An id is accepted only if:
      * the reply item is a dict that actually carries ``job_id`` (a missing
        key is no longer read as job 0),
      * it converts to ``int``,
      * it belongs to the batch the LLM was shown (guards against invented
        ids), and
      * it has not been selected already.

    Args:
        resume_text: Resume profile text.

    Returns:
        A list of exactly three ints. When fewer than three ids are
        accepted the list is padded with 0.
        NOTE(review): 0 is also the id of the first job in ``jd_df``, so
        padding is indistinguishable from a real match on job 0; the padding
        is kept because the calling cell expects three ids.
    """
    filtered = fast_filter_candidates(resume_text, top_k=20)
    best_jobs = []

    with ThreadPoolExecutor(max_workers=4) as executor:
        submitted = []
        for batch in chunk_df(filtered, size=10):
            shown_ids = set(batch["job_id"].astype(int).tolist())
            fut = executor.submit(score_batch_with_llm, resume_text, batch)
            submitted.append((shown_ids, fut))

        for shown_ids, fut in submitted:
            for r in fut.result() or []:
                if not isinstance(r, dict) or "job_id" not in r:
                    continue
                try:
                    jid = int(r["job_id"])
                except (TypeError, ValueError, OverflowError):
                    continue
                if jid in shown_ids and jid not in best_jobs:
                    best_jobs.append(jid)
            # Leaving the loop early does not cancel running calls; the
            # executor still waits for them when the with-block exits.
            if len(best_jobs) >= 3:
                break

    best_jobs = best_jobs[:3]
    best_jobs += [0] * (3 - len(best_jobs))
    return best_jobs

# ===============================
# Run all resumes
# ===============================
# One record per resume: its id plus id/title/location for each of the three
# selected jobs, written to an Excel sheet at the end.
output = []

for _, row in tqdm(resume_df.iterrows(), total=len(resume_df), desc="Resumes"):
    rid = row["resume_id"]
    resume_text = build_resume_profile(row)
    top3 = match_resume_to_jobs(resume_text)

    # Resolve each selected id to its jd_df row; ids outside VALID_JOB_IDS
    # fall back to the first row of jd_df.
    job_rows = []
    for jid in top3:
        if jid in VALID_JOB_IDS:
            job_rows.append(jd_df[jd_df["job_id"] == jid].iloc[0])
        else:
            job_rows.append(jd_df.iloc[0])

    record = {"resume_id": rid}
    for rank in (1, 2, 3):
        job_row = job_rows[rank - 1]
        record[f"top{rank}_match_job_id"] = top3[rank - 1]
        record[f"top{rank}_match_job_title"] = job_row["job_title"]
        record[f"top{rank}_match_job_location"] = job_row["location_cleaned"]
    output.append(record)

results_df = pd.DataFrame(output)
results_df.to_excel("resume_job_top3.xlsx", index=False)
print("\n DONE! Saved to resume_job_top3.xlsx")


Building JD embeddings...


Batches: 100%|██████████| 39/39 [00:05<00:00,  7.64it/s]
Resumes: 100%|██████████| 12/12 [08:21<00:00, 41.77s/it]


 DONE! Saved to resume_job_top3.xlsx





In [None]:
# import pandas as pd
# import json
# import numpy as np
# from tqdm import tqdm
# from sklearn.metrics.pairwise import cosine_similarity
# from concurrent.futures import ThreadPoolExecutor, as_completed
# import time
# import ollama
# from sentence_transformers import SentenceTransformer

# # ===============================
# # Embedding model
# # ===============================
# embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# # ===============================
# # Load data
# # ===============================
# jd_df = pd.read_excel("../1_data_cleaning/filtered_jd_sections2.xlsx")
# jd_df["job_id"] = jd_df.index.astype(int)
# VALID_JOB_IDS = set(jd_df["job_id"].tolist())

# resume_df = pd.read_csv("../1_data_cleaning/resume_cleaned_100.csv")
# resume_df = resume_df[
#     ["career_objective", "skills", "degree_names",
#      "major_field_of_studies", "positions", "responsibilities"]
# ].iloc[0:2]
# resume_df["resume_id"] = resume_df.index.astype(int)

# # ===============================
# # Utilities
# # ===============================
# def safe_json_loads(text):
#     try:
#         data = json.loads(text)
#         if isinstance(data, dict):
#             data = [data]
#         return data if isinstance(data, list) else []
#     except Exception:
#         try:
#             start = text.index("[")
#             end = text.rindex("]") + 1
#             return json.loads(text[start:end])
#         except Exception:
#             return []

# def summarize_text(text, max_length=900):
#     if not isinstance(text, str):
#         return ""
#     text = text.strip()
#     return text[:max_length] if len(text) > max_length else text

# def build_resume_profile(row):
#     return f"""
# Career Objective:
# {summarize_text(row['career_objective'])}

# Skills:
# {summarize_text(row['skills'])}

# Degree Names:
# {summarize_text(row['degree_names'])}

# Major Field of Studies:
# {summarize_text(row['major_field_of_studies'])}

# Positions:
# {summarize_text(row['positions'])}

# Responsibilities:
# {summarize_text(row['responsibilities'])}
# """

# # ===============================
# # Stage 1: SBERT coarse filter
# # ===============================
# def embed_text(text: str):
#     return embedder.encode(text if isinstance(text, str) else "")

# print("Building JD embeddings...")
# jd_texts = jd_df["job_description"].fillna("").tolist()
# jd_embeddings = embedder.encode(jd_texts, show_progress_bar=True)

# def fast_filter_candidates(resume_text, top_k=20):
#     emb = embed_text(resume_text).reshape(1, -1)
#     sims = cosine_similarity(emb, jd_embeddings)[0]
#     top_ids = np.argsort(sims)[::-1][:top_k]
#     return jd_df.iloc[top_ids]

# # ===============================
# # Stage 2: LLM fine reasoning (Top-1 only)
# # ===============================
# LLM_MODEL = "phi3:mini"

# def build_prompt(resume_text, jd_batch):
#     jd_section = ""
#     for _, row in jd_batch.iterrows():
#         desc = summarize_text(row["job_description"], max_length=900)
#         jd_section += f"""
# ---
# Job ID: {row['job_id']}
# Job Title: {row['job_title']}
# Location: {row['location_cleaned']}
# Description:
# {desc}
# """
#     return f"""
# You are a senior hiring expert.

# Given the following resume and job descriptions, identify the single best matching job overall.

# Base your judgment primarily on:
# - Skills and responsibilities alignment
# - Relevant experience
# - Educational background
# - Location fit (lowest weight)

# Return STRICT JSON ONLY in this format:
# {{
#   "job_id": <best_job_id>,
#   "reason": "<brief explanation>"
# }}

# Resume:
# {resume_text}

# Job Descriptions:
# {jd_section}
# """

# def score_batch_with_llm(resume_text, jd_batch, retry=3):
#     prompt = build_prompt(resume_text, jd_batch)
#     for _ in range(retry):
#         try:
#             resp = ollama.chat(
#                 model=LLM_MODEL,
#                 messages=[{"role": "user", "content": prompt}],
#                 options={"num_ctx": 4096}
#             )
#             txt = resp["message"]["content"]
#             data = safe_json_loads(txt)
#             if data:
#                 return data
#         except Exception as e:
#             print("⚠️ Error:", e)
#             time.sleep(1)
#     return []

# # ===============================
# # Matching logic (Top-1 only)
# # ===============================
# def chunk_df(df, size=10):
#     for i in range(0, len(df), size):
#         yield df.iloc[i:i+size]

# def match_resume_to_jobs(resume_text):
#     filtered = fast_filter_candidates(resume_text, top_k=20)
#     best_id = None

#     with ThreadPoolExecutor(max_workers=4) as executor:
#         futures = [
#             executor.submit(score_batch_with_llm, resume_text, batch)
#             for batch in chunk_df(filtered, size=10)
#         ]
#         for fut in as_completed(futures):
#             results = fut.result()
#             if not results:
#                 continue

#             if isinstance(results, dict) or isinstance(results, (int, str)):
#                 results = [results]

#             for r in results:
#                 jid = None
#                 if isinstance(r, dict):
#                     if "job_id" in r:
#                         jid = r["job_id"]
#                     elif "best_job_id" in r:
#                         jid = r["best_job_id"]
#                 elif isinstance(r, (int, str)):
#                     jid = r

#                 if jid is None:
#                     continue

#                 try:
#                     jid = int(jid)
#                 except ValueError:
#                     continue

#                 if jid in VALID_JOB_IDS:
#                     best_id = jid
#                     break

#             if best_id is not None:
#                 break

#     return best_id if best_id is not None else 0

# # ===============================
# # Run all resumes
# # ===============================
# output = []

# for _, row in tqdm(resume_df.iterrows(), total=len(resume_df), desc="Resumes"):
#     rid = row["resume_id"]
#     resume_text = build_resume_profile(row)
#     best_jid = match_resume_to_jobs(resume_text)

#     job_row = jd_df[jd_df["job_id"] == best_jid].iloc[0] if best_jid in VALID_JOB_IDS else jd_df.iloc[0]

#     output.append({
#         "resume_id": rid,
#         "match_job_id": best_jid,
#         "match_job_title": job_row["job_title"],
#         "match_job_location": job_row["location_cleaned"]
#     })

# pd.DataFrame(output).to_excel("resume_job_groundtrue.xlsx", index=False)
# print("\n DONE! Saved to resume_job_groundtrue.xlsx")


Building JD embeddings...


Batches: 100%|██████████| 39/39 [00:05<00:00,  7.15it/s]
Resumes: 100%|██████████| 2/2 [00:56<00:00, 28.38s/it]


 DONE! Saved to resume_job_groundtrue.xlsx





In [None]:
# import pandas as pd
# import json
# import numpy as np
# from tqdm import tqdm
# from sklearn.metrics.pairwise import cosine_similarity
# from concurrent.futures import ThreadPoolExecutor, as_completed
# import time
# import ollama
# from sentence_transformers import SentenceTransformer

# # ========== Embedding model ==========
# embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# # ========== Load JD data ==========
# jd_df = pd.read_excel("jobdescription.xlsx")
# jd_df["job_id"] = jd_df.index.astype(int)
# VALID_JOB_IDS = set(jd_df["job_id"].tolist())

# # ========== Load Resume data ==========
# resume_df = pd.read_csv("resume_cleaned_100.csv")
# resume_df = resume_df[
#     ['career_objective', 'skills', 'degree_names',
#      'major_field_of_studies', 'positions', 'responsibilities']
# ].iloc[10:11]

# #  resume_id
# resume_df["resume_id"] = resume_df.index.astype(int)

# # =====================================================================
# # JSON loader
# # =====================================================================
# def safe_json_loads(text):
#     try:
#         data = json.loads(text)
#         if isinstance(data, dict):
#             data = [data]
#         return data if isinstance(data, list) else []
#     except:
#         try:
#             start = text.index("[")
#             end = text.rindex("]") + 1
#             return json.loads(text[start:end])
#         except:
#             return []

# # =====================================================================
# # Text summarizer
# # =====================================================================
# def summarize_text(text, max_length=900):
#     if not isinstance(text, str):
#         return ""
#     text = text.strip()
#     return text[:max_length] if len(text) > max_length else text

# # =====================================================================
# # Build Resume text (UPDATED for your dataset)
# # =====================================================================
# def build_resume_profile(row):
#     return f"""
# Career Objective:
# {summarize_text(row['career_objective'])}

# Skills:
# {summarize_text(row['skills'])}

# Degree Names:
# {summarize_text(row['degree_names'])}

# Major Field of Studies:
# {summarize_text(row['major_field_of_studies'])}

# Positions:
# {summarize_text(row['positions'])}

# Responsibilities:
# {summarize_text(row['responsibilities'])}
# """

# # =====================================================================
# # Stage 1: Embedding coarse filter
# # =====================================================================
# def embed_text(text: str):
#     return embedder.encode(text if isinstance(text, str) else "")

# print("Building JD embeddings...")
# jd_texts = jd_df["job_description"].fillna("").tolist()
# jd_embeddings = embedder.encode(jd_texts, show_progress_bar=True)

# def fast_filter_candidates(resume_text, top_k=20):
#     emb = embed_text(resume_text).reshape(1, -1)
#     sims = cosine_similarity(emb, jd_embeddings)[0]
#     top_ids = np.argsort(sims)[::-1][:top_k]
#     return jd_df.iloc[top_ids]

# # =====================================================================
# # LLM scoring
# # =====================================================================
# LLM_MODEL = "phi3:mini"

# def build_prompt(resume_text, jd_batch):
#     jd_section = ""
#     for _, row in jd_batch.iterrows():
#         desc = summarize_text(row["job_description"], max_length=900)
#         jd_section += f"""
# ---
# Job ID: {row['job_id']}
# Job Title: {row['job_title']}
# Location: {row['location_cleaned']}
# Description:
# {desc}
# """
#     return f"""
# You are a senior hiring expert.

# Score each job from 0–1 based ONLY on:
# 1. Skills match
# 2. Work experience match
# 3. Education match
# 4. Location match (lowest weight)

# Return STRICT JSON ONLY:
# [
#   {{"job_id": 0, "score": 0.00}}
# ]

# Resume:
# {resume_text}

# Job Descriptions:
# {jd_section}

# Return TOP 5.
# """

# def score_batch_with_llm(resume_text, jd_batch, retry=3):
#     prompt = build_prompt(resume_text, jd_batch)
#     for _ in range(retry):
#         try:
#             resp = ollama.chat(
#                 model=LLM_MODEL,
#                 messages=[{"role": "user", "content": prompt}],
#                 options={"num_ctx": 4096}
#             )
#             txt = resp["message"]["content"]
#             data = safe_json_loads(txt)
#             if data:
#                 return data
#         except:
#             time.sleep(1)
#     return []

# # =====================================================================
# # GUARANTEE top1 exists
# # =====================================================================
# def ensure_top1(scored):
#     return scored[:1] if len(scored) >= 1 else [(0, 0.0)]

# # =====================================================================
# # Match per resume
# # =====================================================================
# def chunk_df(df, size=10):
#     for i in range(0, len(df), size):
#         yield df.iloc[i:i+size]

# def match_resume_to_jobs(resume_text):
#     filtered = fast_filter_candidates(resume_text, top_k=20)
#     scored = {}

#     with ThreadPoolExecutor(max_workers=8) as executor:
#         futures = [
#             executor.submit(score_batch_with_llm, resume_text, batch)
#             for batch in chunk_df(filtered, size=10)
#         ]
#         for fut in as_completed(futures):
#             for r in fut.result():
#                 try:
#                     jid = int(r["job_id"])
#                     score = float(r["score"])
#                     if jid in VALID_JOB_IDS:
#                         scored[jid] = max(scored.get(jid, 0.0), score)
#                 except:
#                     continue

#     ranked = sorted(scored.items(), key=lambda x: x[1], reverse=True)
#     return ensure_top1(ranked)

# # =====================================================================
# # Run all resumes
# # =====================================================================
# output = []

# for _, row in tqdm(resume_df.iterrows(), total=len(resume_df), desc="Resumes"):
#     rid = row["resume_id"]
#     resume_text = build_resume_profile(row)

#     top1 = match_resume_to_jobs(resume_text)
#     jid, score = top1[0]

#     job_row = jd_df[jd_df["job_id"] == jid].iloc[0] if jid in VALID_JOB_IDS else jd_df.iloc[0]

#     output.append({
#         "resume_id": rid,
#         "top1_job_id": jid,
#         "top1_job_title": job_row["job_title"],
#         "top1_job_location": job_row["location_cleaned"],
#         "top1_score": score
#     })

# pd.DataFrame(output).to_excel("resume_job_groundtrue.xlsx", index=False)
# print("\nDONE! Saved to resume_job_groundtrue.xlsx")


Building JD embeddings...


Batches: 100%|██████████| 39/39 [00:06<00:00,  5.84it/s]
Resumes: 100%|██████████| 1/1 [01:25<00:00, 85.32s/it]


DONE! Saved to resume_job_groundtrue.xlsx





# Step 2: re-match only the resumes that Step 1 missed or matched poorly

In [None]:
# import pandas as pd
# import numpy as np
# import re
# import ast
# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity

# # ============================
# # Load Model
# # ============================
# embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# # ============================
# # Load JD and Resume Data
# # ============================
# jd_df = pd.read_excel("../1_data_cleaning/filtered_jd_sections2.xlsx")
# jd_df["job_id"] = jd_df.index.astype(int)

# resume_df = pd.read_csv("../1_data_cleaning/resume_cleaned_100.csv")
# resume_df = resume_df[
#     [
#         "career_objective",
#         "skills",
#         "degree_names",
#         "major_field_of_studies",
#         "positions",
#         "responsibilities",
#     ]
# ]

# # ---------------------------
# # Build FULL resume index
# # ---------------------------
# resume_df = resume_df.reset_index(drop=True)
# resume_df["resume_id"] = resume_df.index.astype(int)

# # ---------------------------
# # SPECIAL IDs 
# # ---------------------------
# special_manual = [29,82,83,86,94]
# # special_range = list(range(80, 100))

# SPECIAL_IDS = sorted(list(set(special_manual)))

# print("🔍 Will match resume IDs:", SPECIAL_IDS)

# # ============================
# # Utility Functions
# # ============================
# def clean_text(x):
#     if isinstance(x, float) or pd.isna(x):
#         return ""
#     x = str(x)
#     x = re.sub(r"\s+", " ", x)
#     return x.strip()

# def build_resume_text(row):
#     """Merge all resume fields into one text block."""
#     return (
#         clean_text(row["career_objective"])
#         + " "
#         + clean_text(row["skills"])
#         + " "
#         + clean_text(row["degree_names"])
#         + " "
#         + clean_text(row["major_field_of_studies"])
#         + " "
#         + clean_text(row["positions"])
#         + " "
#         + clean_text(row["responsibilities"])
#     )

# def extract_keywords(position_text):
#     """Parse list-like strings."""
#     if not isinstance(position_text, str):
#         return []

#     try:
#         items = ast.literal_eval(position_text)
#         if not isinstance(items, list):
#             items = [items]
#     except:
#         items = [position_text]

#     keywords = []
#     for item in items:
#         if not isinstance(item, str):
#             continue
#         words = item.lower().strip().split()
#         words = [w for w in words if len(w) > 2]
#         keywords.extend(words)

#     return list(set(keywords))

# def filter_jd_by_keywords(jd_df, keywords):
#     if not keywords:
#         return jd_df
#     mask = jd_df["job_description"].fillna("").str.lower().apply(
#         lambda x: any(kw in x for kw in keywords)
#     )
#     filtered = jd_df[mask]
#     return filtered if len(filtered) > 0 else jd_df

# # ============================
# # Matching Logic
# # ============================
# def match_resume(resume_row):
#     rid = resume_row["resume_id"]
#     resume_text = build_resume_text(resume_row)

#     keywords = extract_keywords(resume_row["positions"])
#     candidate_jd = filter_jd_by_keywords(jd_df, keywords)

#     resume_emb = embedder.encode(resume_text)
#     jd_texts = candidate_jd["job_description"].fillna("").tolist()
#     jd_embs = embedder.encode(jd_texts)

#     sims = cosine_similarity(resume_emb.reshape(1, -1), jd_embs)[0]
#     best_idx = np.argmax(sims)
#     best_jd_row = candidate_jd.iloc[best_idx]

#     return {
#         "resume_id": rid,
#         "top1_job_id": int(best_jd_row["job_id"]),
#         "top1_job_title": best_jd_row["job_title"],
#         "top1_job_location": best_jd_row["location_cleaned"],
#         "top1_score": float(sims[best_idx]),
#     }


# # ============================
# # Run Matching for SPECIAL + 80–100
# # ============================
# results = []

# for rid in SPECIAL_IDS:
#     if rid not in resume_df["resume_id"].values:
#         print(f"⚠️ Warning: resume_id {rid} not found. Skipped.")
#         continue

#     row = resume_df[resume_df["resume_id"] == rid].iloc[0]
#     result = match_resume(row)
#     results.append(result)

# output_df = pd.DataFrame(results)
# output_df.to_excel("resume_job_special_rematch.xlsx", index=False)

# print("\n🎉 DONE! Saved to resume_job_special_rematch.xlsx ")


🔍 Will match resume IDs: [29, 82, 83, 86, 94]

🎉 DONE! Saved to resume_job_special_rematch.xlsx 


In [None]:
# import pandas as pd
# import json
# import numpy as np
# from tqdm import tqdm
# from sklearn.metrics.pairwise import cosine_similarity
# from concurrent.futures import ThreadPoolExecutor, as_completed
# import time
# import ollama
# from sentence_transformers import SentenceTransformer

# # ========== Embedding model ==========
# embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# # ========== Load data ==========
# jd_df = pd.read_excel("jobdescription.xlsx")
# jd_df["job_id"] = jd_df.index.astype(int)

# resume_df = pd.read_csv("resume_cleaned_100.csv")
# resume_df = resume_df[['career_objective', 'skills', 'degree_names', 'major_field_of_studies',
#        'positions', 'responsibilities']]

# # =====================================================================
# # Utility: robust JSON loader
# # =====================================================================
# def safe_json_loads(text):
#     try:
#         data = json.loads(text)
#         if isinstance(data, dict):
#             data = [data]
#         if isinstance(data, list):
#             return data
#         return []
#     except:
#         try:
#             start = text.index("[")
#             end = text.rindex("]") + 1
#             return json.loads(text[start:end])
#         except:
#             return []

# # =====================================================================
# # Text summarizer
# # =====================================================================
# def summarize_text(text, max_length=900):
#     if not isinstance(text, str):
#         return ""
#     text = text.strip()
#     if len(text) <= max_length:
#         return text
#     return text[:max_length]

# # =====================================================================
# # Build Resume text (NEW for your dataset)
# # =====================================================================
# def build_resume_profile(row):
#     return f"""
# Resume Text:
# {summarize_text(row['Resume_str'])}

# Category:
# {row['Category']}
# """

# # =====================================================================
# # Stage 1: Embedding coarse filter
# # =====================================================================
# def embed_text(text: str):
#     return embedder.encode(text if isinstance(text, str) else "")

# print("Building JD embeddings...")
# jd_texts = jd_df["job_description"].fillna("").tolist()
# jd_embeddings = embedder.encode(jd_texts, show_progress_bar=True)

# def fast_filter_candidates(resume_text, top_k=20):
#     emb = embed_text(resume_text).reshape(1, -1)
#     sims = cosine_similarity(emb, jd_embeddings)[0]
#     top_ids = np.argsort(sims)[::-1][:top_k]
#     return jd_df.iloc[top_ids]

# # =====================================================================
# # LLM scoring
# # =====================================================================
# LLM_MODEL = "phi3:mini"

# def build_prompt(resume_text, jd_batch):
#     jd_section = ""
#     for _, row in jd_batch.iterrows():
#         desc = summarize_text(row["job_description"], max_length=900)
#         jd_section += f"""
# ---
# Job ID: {row['job_id']}
# Job Title: {row['job_title']}
# Location: {row['location_cleaned']}
# Description:
# {desc}
# """
#     return f"""
# You are a senior hiring expert.

# Score each job from 0–1 based ONLY on:
# 1. Skills match
# 2. Work experience match
# 3. Education match
# 4. Location

# Output STRICT JSON ONLY:
# [
#   {{"job_id": 0, "score": 0.00}}
# ]

# Resume:
# {resume_text}

# Job Descriptions:
# {jd_section}

# Return TOP 5.
# """

# def score_batch_with_llm(resume_text, jd_batch, retry=3):
#     prompt = build_prompt(resume_text, jd_batch)
#     for _ in range(retry):
#         try:
#             resp = ollama.chat(
#                 model=LLM_MODEL,
#                 messages=[{"role": "user", "content": prompt}],
#                 options={"num_ctx": 4096}
#             )
#             txt = resp["message"]["content"]
#             data = safe_json_loads(txt)
#             if data:
#                 return data
#         except:
#             time.sleep(1)
#     return []

# # =====================================================================
# # GUARANTEE top1 exists
# # =====================================================================
# def ensure_top1(scored, jd_df):
#     if len(scored) >= 1:
#         return scored[:1]
#     fallback_id = int(jd_df.iloc[0]["job_id"])
#     return [(fallback_id, 0.0)]

# # =====================================================================
# # Match per resume
# # =====================================================================
# def chunk_df(df, size=10):
#     for i in range(0, len(df), size):
#         yield df.iloc[i:i+size]

# def match_resume_to_jobs(resume_text, jd_df):
#     filtered = fast_filter_candidates(resume_text, top_k=20)
#     scored = {}

#     with ThreadPoolExecutor(max_workers=8) as executor:
#         futures = [
#             executor.submit(score_batch_with_llm, resume_text, batch)
#             for batch in chunk_df(filtered, size=10)
#         ]
#         for fut in as_completed(futures):
#             results = fut.result()
#             for r in results:
#                 try:
#                     jid = int(r["job_id"])
#                     score = float(r["score"])
#                     scored[jid] = max(scored.get(jid, 0.0), score)
#                 except:
#                     continue

#     if not scored:
#         return ensure_top1([], jd_df)

#     ranked = sorted(scored.items(), key=lambda x: x[1], reverse=True)
#     return ensure_top1(ranked, jd_df)

# # =====================================================================
# # Run all resumes
# # =====================================================================
# output = []

# for _, row in tqdm(resume_df.iterrows(), total=len(resume_df), desc="Resumes"):
#     rid = row["ID"]
#     resume_text = build_resume_profile(row)

#     top1 = match_resume_to_jobs(resume_text, jd_df)

#     jid, score = top1[0]
#     job_row = jd_df[jd_df["job_id"] == jid].iloc[0]

#     rec = {
#         "resume_id": rid,
#         "top1_job_id": jid,
#         "top1_job_title": job_row["job_title"],
#         "top1_job_location": job_row["location_cleaned"],
#         "top1_score": score
#     }
#     output.append(rec)

# pd.DataFrame(output).to_excel("resume_job_groundtrue.xlsx", index=False)
# print("\nDONE! Saved to resume_job_groundtrue.xlsx")


Building JD embeddings...


Batches: 100%|██████████| 39/39 [00:05<00:00,  7.24it/s]
Resumes:  24%|██▍       | 24/100 [38:35<2:57:56, 140.49s/it]

In [None]:
# import pandas as pd
# import json
# import numpy as np
# from tqdm import tqdm
# from sklearn.metrics.pairwise import cosine_similarity
# from concurrent.futures import ThreadPoolExecutor, as_completed
# import time
# import ollama
# from sentence_transformers import SentenceTransformer

# # ========== Embedding model ==========
# embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# # ========== Load data ==========
# jd_df = pd.read_excel("jobdescription.xlsx")
# jd_df["job_id"] = jd_df.index.astype(int)

# resume_df = pd.read_csv("Resume.csv")
# resume_df = resume_df[
#     [ "ID",
#     "Resume_str", "Category"]
# ]

# # =====================================================================
# # Utility: robust JSON loader
# # =====================================================================
# def safe_json_loads(text):
#     try:
#         data = json.loads(text)
#         if isinstance(data, dict):
#             data = [data]
#         if isinstance(data, list):
#             return data
#         return []
#     except:
#         try:
#             start = text.index("[")
#             end = text.rindex("]") + 1
#             return json.loads(text[start:end])
#         except:
#             return []

# # =====================================================================
# # Text summarizer
# # =====================================================================
# def summarize_text(text, max_length=900):
#     if not isinstance(text, str):
#         return ""
#     text = text.strip()
#     if len(text) <= max_length:
#         return text

#     lines = text.split("\n")
#     summary = []
#     for ln in lines:
#         ln = ln.strip()
#         if len(ln) > 20:
#             summary.append(ln)
#         if len(" ".join(summary)) > max_length:
#             break

#     return " ".join(summary)[:max_length]

# # =====================================================================
# # Build Resume text
# # =====================================================================
# def build_resume_profile(row):
#     return f"""
# Resume Title: {row['Resume Title']}

# Work Experience:
# {summarize_text(row['Work Experience'])}

# Education:
# {summarize_text(row['Education'])}

# Skills:
# {summarize_text(row['Skills'])}

# Additional Information:
# {summarize_text(row['Additional Information'])}
# """

# # =====================================================================
# # Stage 1: Embedding coarse filter
# # =====================================================================
# def embed_text(text: str):
#     return embedder.encode(text if isinstance(text, str) else "")

# print("Building JD embeddings...")
# jd_texts = jd_df["job_description"].fillna("").tolist()
# jd_embeddings = embedder.encode(jd_texts, show_progress_bar=True)

# def fast_filter_candidates(resume_text, top_k=20):
#     emb = embed_text(resume_text).reshape(1, -1)
#     sims = cosine_similarity(emb, jd_embeddings)[0]
#     top_ids = np.argsort(sims)[::-1][:top_k]
#     return jd_df.iloc[top_ids]

# # =====================================================================
# # LLM scoring
# # =====================================================================
# LLM_MODEL = "phi3:mini"

# def build_prompt(resume_text, jd_batch):
#     jd_section = ""
#     for _, row in jd_batch.iterrows():
#         desc = summarize_text(row["job_description"], max_length=900)
#         jd_section += f"""
# ---
# Job ID: {row['job_id']}
# Job Title: {row['job_title']}
# Location: {row['location_cleaned']}
# Description:
# {desc}
# """

#     return f"""
# You are a senior hiring expert.

# Score each job from 0–1 based ONLY on:
# 1. Skills match
# 2. Work experience match
# 3. Education match
# 4. Location

# Output STRICT JSON ONLY:
# [
#   {{"job_id": 0, "score": 0.00}}
# ]

# NO explanations.

# Resume:
# {resume_text}

# Job Descriptions:
# {jd_section}

# Return TOP 5.
# """

# def score_batch_with_llm(resume_text, jd_batch, retry=3):
#     prompt = build_prompt(resume_text, jd_batch)

#     for _ in range(retry):
#         try:
#             resp = ollama.chat(
#                 model=LLM_MODEL,
#                 messages=[{"role": "user", "content": prompt}],
#                 options={"num_ctx": 4096}
#             )
#             txt = resp["message"]["content"]
#             data = safe_json_loads(txt)
#             if data:
#                 return data
#         except:
#             time.sleep(1)

#     return []

# # =====================================================================
# # GUARANTEE top1 exists
# # =====================================================================
# def ensure_top1(scored, jd_df):
#     """
#     Only keep top1. If empty, fill with dummy job.
#     """
#     if len(scored) >= 1:
#         return scored[:1]

#     # pick any job as filler
#     fallback_id = int(jd_df.iloc[0]["job_id"])
#     return [(fallback_id, 0.0)]

# # =====================================================================
# # Match per resume
# # =====================================================================
# def chunk_df(df, size=10):
#     for i in range(0, len(df), size):
#         yield df.iloc[i:i+size]

# def match_resume_to_jobs(resume_text, jd_df):
#     filtered = fast_filter_candidates(resume_text, top_k=20)
#     scored = {}

#     with ThreadPoolExecutor(max_workers=8) as executor:
#         futures = [
#             executor.submit(score_batch_with_llm, resume_text, batch)
#             for batch in chunk_df(filtered, size=10)
#         ]
#         for fut in as_completed(futures):
#             results = fut.result()
#             for r in results:
#                 try:
#                     jid = int(r["job_id"])
#                     score = float(r["score"])
#                     scored[jid] = max(scored.get(jid, 0.0), score)
#                 except:
#                     continue

#     if not scored:
#         return ensure_top1([], jd_df)

#     ranked = sorted(scored.items(), key=lambda x: x[1], reverse=True)
#     return ensure_top1(ranked, jd_df)

# # =====================================================================
# # Run all resumes
# # =====================================================================
# output = []

# for _, row in tqdm(resume_df.iterrows(), total=len(resume_df), desc="Resumes"):
#     rid = row["Uniq Id"]
#     resume_text = build_resume_profile(row)

#     top1 = match_resume_to_jobs(resume_text, jd_df)

#     jid, score = top1[0]
#     job_row = jd_df[jd_df["job_id"] == jid].iloc[0]

#     rec = {
#         "resume_id": rid,
#         "top1_job_id": jid,
#         "top1_job_title": job_row["job_title"],
#         "top1_job_location": job_row["location_cleaned"],
#         "top1_score": score
#     }

#     output.append(rec)

# pd.DataFrame(output).to_excel("resume_job_matching_results.xlsx", index=False)
# print("\nDONE! Saved to resume_job_matching_results.xlsx")


'(ProtocolError('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer')), '(Request ID: 114ad6d4-4e87-434b-8eba-2446435aec9c)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


Building JD embeddings...


Batches: 100%|██████████| 39/39 [00:05<00:00,  7.45it/s]
Resumes: 100%|██████████| 10/10 [07:57<00:00, 47.73s/it]


DONE! Saved to resume_job_matching_results.xlsx





In [None]:
# import pandas as pd
# import json
# import numpy as np
# from tqdm import tqdm
# from sklearn.metrics.pairwise import cosine_similarity
# from concurrent.futures import ThreadPoolExecutor, as_completed
# import time
# import ollama
# from sentence_transformers import SentenceTransformer

# # ========== Embedding model ==========
# embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# # ========== Load data ==========
# jd_df = pd.read_excel("jobdescription.xlsx")
# jd_df["job_id"] = jd_df.index.astype(int)

# resume_df = pd.read_csv("resumes.csv")
# resume_df = resume_df[
#     ["Uniq Id", "Resume Title", "Introduction",
#      "Work Experience", "Education",
#      "Skills", "Additional Information"]
# ]

# # =====================================================================
# # Utility: robust JSON loader
# # =====================================================================
# def safe_json_loads(text):
#     """Never crash; always return [] or valid list."""
#     try:
#         data = json.loads(text)
#         if isinstance(data, dict):
#             data = [data]
#         if isinstance(data, list):
#             return data
#         return []
#     except:
#         # Try extract JSON part
#         try:
#             start = text.index("[")
#             end = text.rindex("]") + 1
#             return json.loads(text[start:end])
#         except:
#             return []

# # =====================================================================
# # Text summarizer to reduce prompt size
# # =====================================================================
# def summarize_text(text, max_length=900):
#     """Shorten long resume or job description."""
#     if not isinstance(text, str):
#         return ""

#     text = text.strip()
#     if len(text) <= max_length:
#         return text

#     # Simple extraction (fast)
#     lines = text.split("\n")
#     summary = []
#     for ln in lines:
#         ln = ln.strip()
#         if len(ln) > 20:
#             summary.append(ln)
#         if len(" ".join(summary)) > max_length:
#             break

#     return " ".join(summary)[:max_length]

# # =====================================================================
# # Build Resume text with summarization
# # =====================================================================
# def build_resume_profile(row):
#     return f"""
# Resume Title: {row['Resume Title']}

# Work Experience:
# {summarize_text(row['Work Experience'])}

# Education:
# {summarize_text(row['Education'])}

# Skills:
# {summarize_text(row['Skills'])}

# Additional Information:
# {summarize_text(row['Additional Information'])}
# """

# # =====================================================================
# # Stage 1: Embedding coarse filter
# # =====================================================================
# def embed_text(text: str) -> np.ndarray:
#     return embedder.encode(text if isinstance(text, str) else "")

# print("Building JD embeddings...")
# jd_texts = jd_df["job_description"].fillna("").tolist()
# jd_embeddings = embedder.encode(jd_texts, show_progress_bar=True)

# def fast_filter_candidates(resume_text, top_k=20):
#     emb = embed_text(resume_text).reshape(1, -1)
#     sims = cosine_similarity(emb, jd_embeddings)[0]
#     top_ids = np.argsort(sims)[::-1][:top_k]
#     return jd_df.iloc[top_ids]

# # =====================================================================
# # Stable LLM scoring (with retry + summarization)
# # =====================================================================
# LLM_MODEL = "phi3:mini"

# def build_prompt(resume_text, jd_batch):
#     jd_section = ""
#     for _, row in jd_batch.iterrows():
#         # compress JD text
#         desc = summarize_text(row["job_description"], max_length=900)
#         jd_section += f"""
# ---
# Job ID: {row['job_id']}
# Job Title: {row['job_title']}
# Location: {row['location_cleaned']}
# Description:
# {desc}
# """

#     return f"""
# You are a senior hiring expert.

# Score each job from 0–1 based ONLY on:
# 1. Skills match (highest weight)
# 2. Work experience match
# 3. Education match
# 4. Location match (lowest weight)

# Output STRICT JSON ONLY in this exact format:
# [
#   {{"job_id": 0, "score": 0.00}}
# ]

# NO explanations. NO extra text.

# Resume:
# {resume_text}

# Job Descriptions:
# {jd_section}

# Return TOP 5 highest scoring entries.
# """

# def score_batch_with_llm(resume_text, jd_batch, retry=3):
#     prompt = build_prompt(resume_text, jd_batch)

#     for _ in range(retry):
#         try:
#             resp = ollama.chat(
#                 model=LLM_MODEL,
#                 messages=[{"role": "user", "content": prompt}],
#                 options={"num_ctx": 4096}
#             )
#             txt = resp["message"]["content"]
#             data = safe_json_loads(txt)
#             if data:
#                 return data
        
#         except Exception:
#             time.sleep(1)

#     return []  # fallback

# # =====================================================================
# # Guarantee top1/top2/top3 always exist
# # =====================================================================
# def ensure_top3(scored, jd_df):
#     if len(scored) >= 3:
#         return scored[:3]

#     # fill missing with dummy lowest score
#     needed = 3 - len(scored)

#     all_job_ids = set(jd_df["job_id"])
#     used = {jid for jid, _ in scored}
#     remaining = list(all_job_ids - used)[:needed]

#     for jid in remaining:
#         scored.append((jid, 0.0))

#     return scored[:3]

# # =====================================================================
# # Master function for each resume
# # =====================================================================
# def chunk_df(df, size=10):
#     for i in range(0, len(df), size):
#         yield df.iloc[i:i+size]

# def match_resume_to_jobs(resume_text, jd_df):
#     filtered = fast_filter_candidates(resume_text, top_k=20)
#     scored = {}

#     with ThreadPoolExecutor(max_workers=8) as executor:
#         futures = [
#             executor.submit(score_batch_with_llm, resume_text, batch)
#             for batch in chunk_df(filtered, size=10)
#         ]
#         for fut in as_completed(futures):
#             results = fut.result()
#             for r in results:
#                 try:
#                     jid = int(r["job_id"])
#                     score = float(r["score"])
#                     scored[jid] = max(scored.get(jid, 0.0), score)
#                 except:
#                     continue

#     if not scored:
#         return ensure_top3([], jd_df)

#     ranked = sorted(scored.items(), key=lambda x: x[1], reverse=True)
#     return ensure_top3(ranked, jd_df)

# # =====================================================================
# # Run all
# # =====================================================================
# output = []

# for _, row in tqdm(resume_df.iterrows(), total=len(resume_df), desc="Resumes"):
#     rid = row["Uniq Id"]
#     resume_text = build_resume_profile(row)
#     top3 = match_resume_to_jobs(resume_text, jd_df)

#     rec = {"resume_id": rid}
#     for i, (jid, score) in enumerate(top3, start=1):
#         job_row = jd_df[jd_df["job_id"] == jid].iloc[0]
#         rec[f"top{i}_job_id"] = jid
#         rec[f"top{i}_job_title"] = job_row["job_title"]
#         rec[f"top{i}_job_location"] = job_row["location_cleaned"]
#         rec[f"top{i}_score"] = score

#     output.append(rec)

# pd.DataFrame(output).to_excel("resume_job_matching_results.xlsx", index=False)
# print("\nDONE! Saved to resume_job_matching_results.xlsx")


Building JD embeddings...


Batches: 100%|██████████| 39/39 [00:05<00:00,  7.36it/s]
Resumes: 100%|██████████| 10/10 [07:50<00:00, 47.03s/it]


DONE! Saved to resume_job_matching_results.xlsx



