# 1. Baseline model: Top-3 match accuracy against ground truth

In [3]:
import pandas as pd
from difflib import SequenceMatcher

# 1. Read the annotated ground truth (Excel) and the baseline predictions (CSV)
truth = pd.read_excel("../5_checking_accuracy/ground_true.xlsx")
pred = pd.read_csv("../4_model_outputs/baseline_outputs.csv")

# 2. Inner join on the resume identifier: a resume is kept only when it
#    appears in both tables (truth.resume_id == pred.resume_index)
merged = pd.merge(
    truth,
    pred,
    how="inner",
    left_on="resume_id",
    right_on="resume_index",
)

# 3. Helper function for fuzzy string similarity
def similar(a, b, threshold=0.9):
    """Tell whether two strings are at least ``threshold`` similar.

    Similarity is ``difflib.SequenceMatcher(None, a, b).ratio()``; the
    comparison is inclusive (``ratio >= threshold``).
    """
    return SequenceMatcher(None, a, b).ratio() >= threshold


def _clean_text(value):
    """Normalise one cell to stripped, lower-cased text; missing values become ""."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    text = str(value).strip().lower()
    # A cell holding the literal text "nan" has always been treated as a
    # missing title here; keep that so existing data is scored the same way.
    return "" if text == "nan" else text


# 4. Matching logic
def check_match(row, fuzzy_threshold=0.8, top_k=3):
    """Flag whether any predicted job matches any ground-truth job for one resume.

    Titles and locations are compared after stripping and lower-casing.
    A missing title (NaN / None / empty / the text "nan") on either side
    never takes part in matching, so a missing prediction can no longer be
    fuzzy-matched as the literal string "nan". Two missing locations still
    compare equal, as they did before.

    Parameters
    ----------
    row : pandas.Series or mapping
        One merged record. Ground truth is read from
        ``top{1..3}_match_job_title`` / ``top{1..3}_match_job_location`` and
        from the legacy ``match_job_title`` / ``match_job_location`` pair;
        any ground-truth pair whose columns are absent is skipped.
        Predictions are read from ``top{k}_job_title`` /
        ``top{k}_location_cleaned`` for ``k = 1..top_k``.
    fuzzy_threshold : float, default 0.8
        Minimum (inclusive) similarity ratio for a fuzzy title match.
    top_k : int, default 3
        Number of ranked predictions to compare against.

    Returns
    -------
    pandas.Series
        Three 0/1 integers, in order: exact title + location match,
        exact title match, fuzzy title match.

    Raises
    ------
    KeyError
        If a prediction column for some ``k`` is missing from ``row``.
    """
    # --- Collect ground-truth (title, location) pairs: Top-3 first, legacy pair last ---
    gt_columns = [
        (f"top{g}_match_job_title", f"top{g}_match_job_location") for g in (1, 2, 3)
    ]
    gt_columns.append(("match_job_title", "match_job_location"))

    gt_pairs = []
    for tcol, lcol in gt_columns:
        if tcol in row and lcol in row:
            gt_title = _clean_text(row[tcol])
            if gt_title:
                gt_pairs.append((gt_title, _clean_text(row[lcol])))

    # --- Normalise each prediction once instead of once per ground-truth pair ---
    pred_pairs = []
    for k in range(1, top_k + 1):
        pred_title = _clean_text(row[f"top{k}_job_title"])
        pred_loc = _clean_text(row[f"top{k}_location_cleaned"])
        if pred_title:  # a missing predicted title cannot match anything
            pred_pairs.append((pred_title, pred_loc))

    match_both = 0          # Exact match for title + location
    match_title = 0         # Exact match for title only
    match_title_fuzzy = 0   # Fuzzy match for title (similarity >= fuzzy_threshold)

    # --- Compare every ground-truth pair against every usable prediction ---
    for gt_title, gt_loc in gt_pairs:
        for pred_title, pred_loc in pred_pairs:
            if gt_title == pred_title:
                match_title = 1
                if gt_loc == pred_loc:
                    match_both = 1

            # Skip the comparatively expensive ratio once the flag is already set
            if not match_title_fuzzy and similar(
                gt_title, pred_title, threshold=fuzzy_threshold
            ):
                match_title_fuzzy = 1

    return pd.Series([match_both, match_title, match_title_fuzzy])

# 5. Apply matching function to each row; the three returned values are
#    assigned positionally to the flag columns below
flag_cols = ["match_flag_both", "match_flag_title", "match_flag_title_fuzzy"]
merged[flag_cols] = merged.apply(check_match, axis=1)

# 6. Select relevant output columns
output_cols = [
    "resume_id",
    # Ground truth Top-3
    "top1_match_job_title", "top1_match_job_location",
    "top2_match_job_title", "top2_match_job_location",
    "top3_match_job_title", "top3_match_job_location",
    # Predicted Top-3
    "top1_job_title", "top1_location_cleaned",
    "top2_job_title", "top2_location_cleaned",
    "top3_job_title", "top3_location_cleaned",
    # Matching flags
    "match_flag_both", "match_flag_title", "match_flag_title_fuzzy",
]

result = merged[output_cols].copy()

# 7. Compute accuracy metrics: each is the mean of a 0/1 flag over the merged rows
acc_both = result["match_flag_both"].mean()
acc_title = result["match_flag_title"].mean()
acc_title_fuzzy = result["match_flag_title_fuzzy"].mean()

print(f"Top-3 Accuracy (Title + Location exact): {acc_both:.3f}")
print(f"Top-3 Accuracy (Title exact only): {acc_title:.3f}")
print(f"Top-3 Accuracy (Title fuzzy >0.8): {acc_title_fuzzy:.3f}")

# 8. Save results to CSV
# The baseline gets its own file name: the cross-encoder evaluation later in
# this notebook writes "job_match_results.csv", which used to overwrite the
# baseline results whenever the notebook was run top to bottom.
output_path = "job_match_results_baseline.csv"
result.to_csv(output_path, index=False)
print(f"✅ Output file saved as: {output_path}")


Top-3 Accuracy (Title + Location exact): 0.280
Top-3 Accuracy (Title exact only): 0.310
Top-3 Accuracy (Title fuzzy >0.8): 0.450
✅ Output file saved as: job_match_results.csv


In [5]:
# import pandas as pd
# from difflib import SequenceMatcher

# # 1. Load data
# truth = pd.read_excel("../5_checking_accuracy/ground_true.xlsx")  # Columns: match_job_title, match_job_location
# pred = pd.read_csv("../4_model_outputs/baseline_outputs.csv")

# # 2. Merge by resume ID
# merged = truth.merge(pred, left_on="resume_id", right_on="resume_index", how="inner")

# # 3. Helper: fuzzy similarity
# def similar(a, b, threshold=0.9):
#     return SequenceMatcher(None, a, b).ratio() >= threshold

# # 4. Matching logic
# def check_match(row):
#     gt_title = str(row["match_job_title"]).strip().lower()
#     gt_loc = str(row["match_job_location"]).strip().lower()

#     match_both = 0
#     match_title = 0
#     match_title_fuzzy = 0

#     for k in [1, 2, 3]:
#         pred_title = str(row[f"top{k}_job_title"]).strip().lower()
#         pred_loc = str(row[f"top{k}_location_cleaned"]).strip().lower()

#         # exact title
#         if gt_title == pred_title:
#             match_title = 1
#         # title + location
#         if gt_title == pred_title and gt_loc == pred_loc:
#             match_both = 1
#         # fuzzy title
#         if similar(gt_title, pred_title, threshold=0.8):
#             match_title_fuzzy = 1

#     return pd.Series([match_both, match_title, match_title_fuzzy])

# # 5. Apply to each row
# merged[["match_flag_both", "match_flag_title", "match_flag_title_fuzzy"]] = merged.apply(check_match, axis=1)

# # 6. Select output columns
# output_cols = [
#     "resume_id",
#     "match_job_title", "match_job_location",
#     "top1_job_title", "top1_location_cleaned",
#     "top2_job_title", "top2_location_cleaned",
#     "top3_job_title", "top3_location_cleaned",
#     "match_flag_both", "match_flag_title", "match_flag_title_fuzzy",
# ]

# result = merged[output_cols].copy()

# # 7. Compute accuracies
# acc_both = result["match_flag_both"].mean()
# acc_title = result["match_flag_title"].mean()
# acc_title_fuzzy = result["match_flag_title_fuzzy"].mean()

# print(f"Top-3 Accuracy (Title + Location exact): {acc_both:.3f}")
# print(f"Top-3 Accuracy (Title exact only): {acc_title:.3f}")
# print(f"Top-3 Accuracy (Title fuzzy >0.8): {acc_title_fuzzy:.3f}")

# # 8. Save results
# result.to_csv("job_match_results.csv", index=False)
# print("✅ Output file saved as: job_match_results.csv")


# 2. Cross-encoder model: Top-3 match accuracy against ground truth

In [6]:
import pandas as pd
from difflib import SequenceMatcher

# 1. Read the annotated ground truth (Excel) and the cross-encoder predictions (CSV).
#    The ground-truth columns used further down are resume_id and
#    top{1..3}_match_job_title / top{1..3}_match_job_location.
truth = pd.read_excel("../5_checking_accuracy/ground_true.xlsx")
pred = pd.read_csv("../4_model_outputs/crossencoder_outputs.csv")

# 2. Inner join on the resume identifier: a resume is kept only when it
#    appears in both tables (truth.resume_id == pred.resume_index)
merged = pd.merge(
    truth,
    pred,
    how="inner",
    left_on="resume_id",
    right_on="resume_index",
)

# 3. Helper function for fuzzy string similarity
def similar(a, b, threshold=0.9):
    """Tell whether two strings are at least ``threshold`` similar.

    Similarity is ``difflib.SequenceMatcher(None, a, b).ratio()``; the
    comparison is inclusive (``ratio >= threshold``).
    """
    return SequenceMatcher(None, a, b).ratio() >= threshold


def _clean_text(value):
    """Normalise one cell to stripped, lower-cased text; missing values become ""."""
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    text = str(value).strip().lower()
    # A cell holding the literal text "nan" has always been treated as a
    # missing title here; keep that so existing data is scored the same way.
    return "" if text == "nan" else text


# 4. Matching logic
def check_match(row, fuzzy_threshold=0.8, top_k=3):
    """Flag whether any predicted job matches any ground-truth job for one resume.

    Titles and locations are compared after stripping and lower-casing.
    A missing title (NaN / None / empty / the text "nan") on either side
    never takes part in matching, so a missing prediction can no longer be
    fuzzy-matched as the literal string "nan". Two missing locations still
    compare equal, as they did before.

    Parameters
    ----------
    row : pandas.Series or mapping
        One merged record. Ground truth is read from
        ``top{1..3}_match_job_title`` / ``top{1..3}_match_job_location`` and
        from the legacy ``match_job_title`` / ``match_job_location`` pair;
        any ground-truth pair whose columns are absent is skipped.
        Predictions are read from ``top{k}_job_title`` /
        ``top{k}_location_cleaned`` for ``k = 1..top_k``.
    fuzzy_threshold : float, default 0.8
        Minimum (inclusive) similarity ratio for a fuzzy title match.
    top_k : int, default 3
        Number of ranked predictions to compare against.

    Returns
    -------
    pandas.Series
        Three 0/1 integers, in order: exact title + location match,
        exact title match, fuzzy title match.

    Raises
    ------
    KeyError
        If a prediction column for some ``k`` is missing from ``row``.
    """
    # --- Collect ground-truth (title, location) pairs: Top-3 first, legacy pair last ---
    gt_columns = [
        (f"top{g}_match_job_title", f"top{g}_match_job_location") for g in (1, 2, 3)
    ]
    gt_columns.append(("match_job_title", "match_job_location"))

    gt_pairs = []
    for tcol, lcol in gt_columns:
        if tcol in row and lcol in row:
            gt_title = _clean_text(row[tcol])
            if gt_title:
                gt_pairs.append((gt_title, _clean_text(row[lcol])))

    # --- Normalise each prediction once instead of once per ground-truth pair ---
    pred_pairs = []
    for k in range(1, top_k + 1):
        pred_title = _clean_text(row[f"top{k}_job_title"])
        pred_loc = _clean_text(row[f"top{k}_location_cleaned"])
        if pred_title:  # a missing predicted title cannot match anything
            pred_pairs.append((pred_title, pred_loc))

    match_both = 0          # Exact match for title + location
    match_title = 0         # Exact match for title only
    match_title_fuzzy = 0   # Fuzzy match for title (similarity >= fuzzy_threshold)

    # --- Compare every ground-truth pair against every usable prediction ---
    for gt_title, gt_loc in gt_pairs:
        for pred_title, pred_loc in pred_pairs:
            if gt_title == pred_title:
                match_title = 1
                if gt_loc == pred_loc:
                    match_both = 1

            # Skip the comparatively expensive ratio once the flag is already set
            if not match_title_fuzzy and similar(
                gt_title, pred_title, threshold=fuzzy_threshold
            ):
                match_title_fuzzy = 1

    return pd.Series([match_both, match_title, match_title_fuzzy])

# 5. Score every merged row; the three values returned by check_match are
#    assigned positionally to the flag columns
flag_names = ["match_flag_both", "match_flag_title", "match_flag_title_fuzzy"]
merged[flag_names] = merged.apply(check_match, axis=1)

# 6. Columns kept in the report: id, ground-truth Top-3, predicted Top-3, flags
ground_truth_cols = [
    f"top{rank}_match_job_{field}"
    for rank in (1, 2, 3)
    for field in ("title", "location")
]
predicted_cols = [
    name
    for rank in (1, 2, 3)
    for name in (f"top{rank}_job_title", f"top{rank}_location_cleaned")
]
output_cols = ["resume_id", *ground_truth_cols, *predicted_cols, *flag_names]

result = merged[output_cols].copy()

# 7. Accuracy = mean of each 0/1 flag over the merged rows
acc_both, acc_title, acc_title_fuzzy = (result[name].mean() for name in flag_names)

for label, value in (
    ("Title + Location exact", acc_both),
    ("Title exact only", acc_title),
    ("Title fuzzy >0.8", acc_title_fuzzy),
):
    print(f"Top-3 Accuracy ({label}): {value:.3f}")

# 8. Write the per-resume report next to the notebook
result.to_csv("job_match_results.csv", index=False)
print("✅ Output file saved as: job_match_results.csv")


Top-3 Accuracy (Title + Location exact): 0.190
Top-3 Accuracy (Title exact only): 0.250
Top-3 Accuracy (Title fuzzy >0.8): 0.500
✅ Output file saved as: job_match_results.csv


In [7]:
# import pandas as pd
# from difflib import SequenceMatcher

# # 1. Load data
# truth = pd.read_excel("../5_checking_accuracy/ground_true.xlsx")  # Columns: match_job_title, match_job_location
# pred = pd.read_csv("../4_model_outputs/crossencoder_outputs.csv")

# # 2. Merge by resume ID
# merged = truth.merge(pred, left_on="resume_id", right_on="resume_index", how="inner")

# # 3. Helper: fuzzy similarity
# def similar(a, b, threshold=0.9):
#     return SequenceMatcher(None, a, b).ratio() >= threshold

# # 4. Matching logic
# def check_match(row):
#     gt_title = str(row["match_job_title"]).strip().lower()
#     gt_loc = str(row["match_job_location"]).strip().lower()

#     match_both = 0
#     match_title = 0
#     match_title_fuzzy = 0

#     for k in [1, 2, 3]:
#         pred_title = str(row[f"top{k}_job_title"]).strip().lower()
#         pred_loc = str(row[f"top{k}_location_cleaned"]).strip().lower()

#         # exact title
#         if gt_title == pred_title:
#             match_title = 1
#         # title + location
#         if gt_title == pred_title and gt_loc == pred_loc:
#             match_both = 1
#         # fuzzy title
#         if similar(gt_title, pred_title, threshold=0.8):
#             match_title_fuzzy = 1

#     return pd.Series([match_both, match_title, match_title_fuzzy])

# # 5. Apply to each row
# merged[["match_flag_both", "match_flag_title", "match_flag_title_fuzzy"]] = merged.apply(check_match, axis=1)

# # 6. Select output columns
# output_cols = [
#     "resume_id",
#     "match_job_title", "match_job_location",
#     "top1_job_title", "top1_location_cleaned",
#     "top2_job_title", "top2_location_cleaned",
#     "top3_job_title", "top3_location_cleaned",
#     "match_flag_both", "match_flag_title", "match_flag_title_fuzzy",
# ]

# result = merged[output_cols].copy()

# # 7. Compute accuracies
# acc_both = result["match_flag_both"].mean()
# acc_title = result["match_flag_title"].mean()
# acc_title_fuzzy = result["match_flag_title_fuzzy"].mean()

# print(f"Top-3 Accuracy (Title + Location exact): {acc_both:.3f}")
# print(f"Top-3 Accuracy (Title exact only): {acc_title:.3f}")
# print(f"Top-3 Accuracy (Title fuzzy >0.8): {acc_title_fuzzy:.3f}")

# # # 8. Save results
# # result.to_csv("job_match_results.csv", index=False)
# # print("Output file saved as: job_match_results.csv")


In [3]:
# df=pd.read_csv('job_match_results.csv')
# df.iloc[0]

In [4]:
# import pandas as pd

# # 1.Load data
# # truth = pd.read_excel("../5_checking_accuracy/ground_true.xlsx")  # Columns: match_job_title, match_job_location
# # pred = pd.read_csv("../4_model_outputs/baseline_outputs.csv")

# # 2. Merge the two tables on resume ID
# merged = truth.merge(pred, left_on="resume_id", right_on="resume_index", how="inner")

# # 3. Define matching logic
# def check_match(row):
#     gt_title = str(row["match_job_title"]).strip().lower()
#     gt_loc = str(row["match_job_location"]).strip().lower()

#     # Loop through top1–top3 predictions
#     for k in [1, 2, 3]:
#         pred_title = str(row[f"top{k}_job_title"]).strip().lower()
#         pred_loc = str(row[f"top{k}_location_cleaned"]).strip().lower()
#         if gt_title == pred_title and gt_loc == pred_loc:
#             return 1  # Match found
#     return 0  # No match

# # 4. Apply matching function to each row
# merged["match_flag"] = merged.apply(check_match, axis=1)

# # 5. Select columns for output
# output_cols = [
#     "resume_id",
#     "match_job_title", "match_job_location",
#     "top1_job_title", "top1_location_cleaned",
#     "top2_job_title", "top2_location_cleaned",
#     "top3_job_title", "top3_location_cleaned",
#     "match_flag",
# ]

# result2 = merged[output_cols].copy()

# # 6. Compute Top-3 Accuracy
# top3_acc2 = result2["match_flag"].mean()
# print(f"Top-3 Accuracy (Title + Location match): {top3_acc2:.3f}")

# # 7. Save the results
# result2.to_csv("job_match_results2.csv", index=False)
# print("Output file saved as: job_match_results2.csv")


In [5]:
# import numpy as np
# import pandas as pd
# from sklearn.metrics.pairwise import cosine_similarity


# # =====================================================
# # Metric Functions
# # =====================================================
# def topk_accuracy(y_true, y_pred_topk, k=3):
#     """
#     Compute Top-k Accuracy and return hit flags.
#     """
#     hits = np.array([y_true[i] in y_pred_topk[i, :k] for i in range(len(y_true))])
#     return hits.mean(), hits


# def mrr_at_k(y_true, y_pred_topk, k=3):
#     """
#     Compute Mean Reciprocal Rank at K.
#     """
#     scores = []
#     for gt, preds in zip(y_true, y_pred_topk):
#         if gt in preds[:k]:
#             rank = list(preds[:k]).index(gt) + 1
#             scores.append(1.0 / rank)
#         else:
#             scores.append(0.0)
#     return np.mean(scores)


# def mean_cosine_similarity(y_true, y_pred_top3, resume_embs, job_embs, hits):
#     """
#     Compute Mean Cosine Similarity between resume embeddings and
#     ground-truth / predicted job embeddings.
#     """
#     true_sims, miss_sims = [], []

#     for i, gt in enumerate(y_true):
#         try:
#             # Retrieve embeddings
#             r_emb = resume_embs[i]
#             gt_emb = job_embs[gt]
#         except KeyError:
#             continue  # Skip if embedding not found

#         # Cosine similarity between resume and ground-truth job
#         true_sim = cosine_similarity(r_emb.reshape(1, -1), gt_emb.reshape(1, -1))[0, 0]
#         true_sims.append(true_sim)

#         # Cosine similarity between ground-truth job and top-3 predicted jobs
#         pred_embs = np.array([job_embs[j] for j in y_pred_top3[i] if j in job_embs])
#         if len(pred_embs) > 0:
#             sims = cosine_similarity(gt_emb.reshape(1, -1), pred_embs)[0]
#             best_sim = sims.max()
#             if not hits[i]:
#                 miss_sims.append(best_sim)

#     mean_all = np.mean(true_sims)
#     mean_miss = np.mean(miss_sims) if len(miss_sims) > 0 else None
#     return mean_all, mean_miss


# # =====================================================
# # Main Evaluation Function
# # =====================================================
# def evaluate_model_output(
#     ground_truth_path,
#     model_output_path,
#     resume_embs,
#     job_embs,
#     col_map=None,
# ):
#     """
#     Evaluate Top-3 Accuracy, MRR@3, and Mean Cosine Similarity.

#     Parameters
#     ----------
#     ground_truth_path : str
#         Path to the ground-truth CSV file (must include resume_id and top1_job_id).
#     model_output_path : str
#         Path to the model output CSV file.
#     resume_embs : np.ndarray or dict
#         Resume embeddings (either aligned by index or mapped by resume_id).
#     job_embs : np.ndarray or dict
#         Job embeddings (aligned by job_id or index).
#     col_map : dict, optional
#         Column name mapping for model output file, e.g.:
#         {
#             "resume_id": "resume_index",
#             "top1": "top1_jd_index",
#             "top2": "top2_jd_index",
#             "top3": "top3_jd_index"
#         }
#     """
#     # Default column mapping
#     default_cols = {
#         "resume_id": "resume_index",
#         "top1": "top1_jd_index",
#         "top2": "top2_jd_index",
#         "top3": "top3_jd_index",
#     }
#     if col_map is not None:
#         default_cols.update(col_map)

#     # Load data
#     truth = pd.read_csv(ground_truth_path)
#     pred = pd.read_csv(model_output_path)

#     # Merge on resume ID
#     merged = truth.merge(
#         pred, left_on="resume_id", right_on=default_cols["resume_id"]
#     )

#     # Build arrays
#     y_true = merged["top1_job_id"].astype(int).values
#     y_pred_top3 = (
#         merged[[default_cols["top1"], default_cols["top2"], default_cols["top3"]]]
#         .astype(int)
#         .values
#     )

#     # Compute metrics
#     top3_acc, hits = topk_accuracy(y_true, y_pred_top3, k=3)
#     mrr3 = mrr_at_k(y_true, y_pred_top3, k=3)
#     mean_all, mean_miss = mean_cosine_similarity(
#         y_true, y_pred_top3, resume_embs, job_embs, hits
#     )

#     # Print summary
#     print("======================================")
#     print(f"Top-3 Accuracy: {top3_acc:.3f}")
#     print(f"MRR@3: {mrr3:.3f}")
#     print(f"Mean Cosine Similarity (all): {mean_all:.3f}")
#     if mean_miss is not None:
#         print(f"Mean Cosine Similarity (miss only): {mean_miss:.3f}")
#     print("======================================")

#     return {
#         "Top3_Accuracy": top3_acc,
#         "MRR@3": mrr3,
#         "MeanCosSim_all": mean_all,
#         "MeanCosSim_miss": mean_miss,
#     }
