In [1]:
import os
import re
import spacy
import fitz
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.decomposition import TruncatedSVD
from sentence_transformers import SentenceTransformer, CrossEncoder
from scipy.stats import loguniform
from scipy import sparse

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path to the raw folders (e.g. archive(1)/data/data/ACCOUNTANT/...)
# load_data_from_folders walks this directory and uses the name of each
# sub-folder that contains PDFs as the resume category.
PATH_RAW_FOLDERS = "archive(1)/data/data"
# CSV file that evaluate_ranking_performance writes its per-resume results to.
OUTPUT_METRICS_FILE = "ranking_evaluation_results.csv"

##### 2. DATA PROCESSING

In [3]:
# Load spaCy
print("Loading spaCy model...")
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model package is not installed. Say so explicitly instead of
    # silently leaving `nlp` as None, so a later reader of the output knows
    # which code path was taken.
    print("Warning: spaCy model not found. Using regex cleaning only.")
    nlp = None

Loading spaCy model...


In [4]:
# Cleans raw resume text: removes newlines, special chars, extra spaces.
def clean_text(text):
    """Flatten raw resume text into a single, ASCII-only line.

    Newlines, carriage returns and tabs become spaces, every non-ASCII
    character is replaced by a space, runs of spaces are collapsed to one and
    the result is stripped. Letter case is left untouched.

    Args:
        text: Raw text; any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        The cleaned string.
    """
    if not text:
        return ""

    # Line breaks and tabs each turn into one space.
    flattened = text
    for control_char in ("\n", "\r", "\t"):
        flattened = flattened.replace(control_char, " ")

    # Anything outside 7-bit ASCII (messy PDF artifacts) turns into a space.
    ascii_only = re.sub(r'[^\x00-\x7f]', r' ', flattened)

    # Squeeze consecutive spaces, then trim both ends.
    squeezed = re.sub(' +', ' ', ascii_only)
    return squeezed.strip()

In [5]:
def load_data_from_folders(base_path):
    """Read every PDF resume under ``base_path`` into a DataFrame.

    The name of the folder that directly contains a PDF becomes that resume's
    ``Category``. Text is extracted page by page with ``fitz`` and passed
    through ``clean_text``. Files that cannot be read are reported and
    skipped.

    Folders and files are visited in sorted order so that the row order of
    the result (and any seeded train/test split derived from it) does not
    depend on the order in which the filesystem lists directory entries.

    Args:
        base_path: Root directory to scan. PDFs lying directly in this
            directory (not in a sub-folder) are ignored.

    Returns:
        pandas.DataFrame with the columns ``Category``, ``Filename`` and
        ``Cleaned_Text``. The columns are present even when no PDF was found.

    Raises:
        FileNotFoundError: If ``base_path`` does not exist.
    """
    if not os.path.exists(base_path):
        raise FileNotFoundError(f"CRITICAL ERROR: Folder '{base_path}' not found.")

    print(f"--- Scanning folders in {base_path} ---")
    records = []

    # Walk through all subfolders
    for root, dirs, files in os.walk(base_path):
        # Sorting in place steers the (top-down) traversal order of os.walk.
        dirs.sort()

        # Skip the base folder itself
        if root == base_path:
            continue

        category = os.path.basename(root)
        pdf_files = sorted(f for f in files if f.lower().endswith(".pdf"))
        if not pdf_files:
            continue

        print(f"Processing {category:<20} | Found {len(pdf_files)} resumes")

        for file in pdf_files:
            file_path = os.path.join(root, file)
            try:
                # EXTRACT TEXT WITH FITZ
                with fitz.open(file_path) as doc:
                    text = "".join(page.get_text() for page in doc)

                # Store clean data
                records.append({
                    "Category": category,
                    "Filename": file,
                    "Cleaned_Text": clean_text(text)
                })

            except Exception as e:
                # Best effort: one unreadable file must not abort the scan.
                print(f"  [Error] Could not read {file}: {e}")

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(records, columns=["Category", "Filename", "Cleaned_Text"])

##### 3. EVALUATION LOGIC

In [6]:
def evaluate_ranking_performance(y_true, y_pred_proba, classes, output_path):
    """Score ranked category predictions, print a report and save details.

    For every sample the classes are ordered by descending score and the
    1-based position of the true label in that ordering is recorded. From
    those ranks the function prints MRR, Precision@1, Precision@3, NDCG@3 and
    the average rank, writes one row per sample to ``output_path`` as CSV,
    and prints the distribution of ranks 1-10 together with the five worst
    and five best predictions.

    Args:
        y_true: True labels, one per sample. Converted with ``np.asarray`` so
            that a pandas Series with a non-default index is read by
            position rather than by index label.
        y_pred_proba: 2-D array-like; row ``i`` holds the scores of sample
            ``i``, with columns in the same order as ``classes``.
        classes: Class labels matching the columns of ``y_pred_proba``.
        output_path: Destination of the per-sample CSV file.

    Returns:
        None. When there are no samples a notice is printed and nothing is
        written.
    """
    # Plain arrays guarantee positional indexing below.
    y_true = np.asarray(y_true)
    y_pred_proba = np.asarray(y_pred_proba)
    class_labels = list(classes)
    n_samples = len(y_true)

    # Without samples every mean is undefined and the percentage computation
    # below would divide by zero, so stop early.
    if n_samples == 0:
        print("\nNo test resumes to evaluate; skipping ranking evaluation.")
        return

    ranks = []
    reciprocal_ranks = []
    detailed_results = []

    print(f"\nEvaluating predictions for {n_samples} test resumes...")

    for i in range(n_samples):
        true_label = y_true[i]
        probs = y_pred_proba[i]

        # Sort predictions by probability (Highest first)
        sorted_indices = np.argsort(probs)[::-1]
        ranked_classes = [class_labels[idx] for idx in sorted_indices]

        # Find where the TRUE category is in the ranked list
        try:
            rank = ranked_classes.index(true_label) + 1
        except ValueError:
            # Label unknown to the classifier: rank it after every class.
            rank = len(class_labels) + 1

        reciprocal_rank = 1.0 / rank
        ranks.append(rank)
        reciprocal_ranks.append(reciprocal_rank)

        detailed_results.append({
            "resume_idx": i,
            "true_category": true_label,
            "top1_prediction": ranked_classes[0],
            "rank_of_true_category": rank,
            "reciprocal_rank": reciprocal_rank
        })

    # --- Calculate Aggregate Metrics ---
    mrr = np.mean(reciprocal_ranks)
    acc_top1 = np.mean([r == 1 for r in ranks])
    acc_top3 = np.mean([r <= 3 for r in ranks])
    avg_rank = np.mean(ranks)

    # NDCG@3: with a single relevant item the ideal DCG is 1, so the score is
    # the discounted gain at the true label's rank (0 outside the top 3).
    ndcg_scores = [(1.0 / np.log2(r + 1)) if r <= 3 else 0.0 for r in ranks]
    mean_ndcg_3 = np.mean(ndcg_scores)

    # --- PRINT THE REPORT ---
    print("\n" + "="*70 + "FINAL EVALUATION SUMMARY (All Data, Proper Ranking Metrics)" + "="*70)
    print(f"Total resumes evaluated: {n_samples}")
    print(f"Total categories: {len(class_labels)}")
    print(f"Mean Reciprocal Rank (MRR):        {mrr:.4f}")
    print(f"Precision@1 (Accuracy):           {acc_top1:.4f}")
    print(f"Precision@3:                      {acc_top3:.4f}")
    print(f"Mean NDCG@3:                      {mean_ndcg_3:.4f}")
    print(f"Average rank of true category:    {avg_rank:.2f} / {len(class_labels)}")
    print("="*180)

    # Save CSV
    df_res = pd.DataFrame(detailed_results)
    df_res.to_csv(output_path, index=False)
    print(f"Detailed results saved to: {os.path.abspath(output_path)}")

    # Distribution
    correct_count = sum(r == 1 for r in ranks)
    print(f"Correct predictions (P@1): {correct_count} / {n_samples} ({acc_top1*100:.1f}%)")

    print("Distribution of true category ranks:")
    rank_counts = pd.Series(ranks).value_counts().sort_index()
    for r in range(1, 11):
        c = rank_counts.get(r, 0)
        print(f"  Rank {r:>2}: {c:>4} resumes ({(c/n_samples)*100:>5.1f}%)")

    print("\nWorst predictions (lowest reciprocal rank):")
    print(df_res.sort_values("reciprocal_rank").head(5).to_string(index=False))

    print("\nBest predictions (highest reciprocal rank):")
    print(df_res.sort_values(["reciprocal_rank", "resume_idx"], ascending=[False, True]).head(5).to_string(index=False))

##### 4. MAIN FUNCTION

In [None]:
# SBERT setup: install with `pip install -U sentence-transformers` if needed
# The model id is kept in one place so the log line below can never name a
# different model than the one that is actually loaded.
SBERT_MODEL_NAME = 'all-MiniLM-L6-v2'
print(f"Loading SBERT model ({SBERT_MODEL_NAME})...")
model_sbert = SentenceTransformer(SBERT_MODEL_NAME)

def embed_texts(texts, batch_size=64):
    """Embed ``texts`` with the module-level ``model_sbert`` encoder.

    Args:
        texts: The texts to encode, passed through to ``model_sbert.encode``.
        batch_size: Batch size forwarded to the encoder.

    Returns:
        Whatever ``model_sbert.encode`` returns when asked for numpy output
        with the progress bar enabled.
    """
    encode_options = {
        "batch_size": batch_size,
        "show_progress_bar": True,
        "convert_to_numpy": True,
    }
    return model_sbert.encode(texts, **encode_options)

Loading SBERT model (all-mpnet-base-v2)...


In [17]:
if __name__ == "__main__":

    # A. Load Data (Reading PDFs from Folder)
    # Errors are allowed to propagate: calling exit() inside a notebook ends
    # the kernel session and hides the traceback, whereas an exception only
    # stops this cell.
    df = load_data_from_folders(PATH_RAW_FOLDERS)
    print(f"\nSuccessfully loaded {len(df)} resumes.")

    if df.empty:
        raise RuntimeError("Error: No PDF files found. Check your folder path.")

    # B. Split Data
    print("Splitting data into Train/Test sets...")
    X = df['Cleaned_Text']
    y = df['Category']

    # Stratify ensures we have examples of every job type in the test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # C1. TF-IDF Features (fitted on the training split only)
    print("Fitting TF-IDF vectorizer on training data...")
    tfidf_vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
    tfidf_train = tfidf_vectorizer.fit_transform(X_train.values.astype('U'))
    tfidf_test = tfidf_vectorizer.transform(X_test.values.astype('U'))

    # C2. Embedding (SBERT)
    print("Embedding texts with SBERT model...")
    # Convert pandas Series to list and embed
    X_train_vec = embed_texts(X_train.tolist())
    X_test_vec = embed_texts(X_test.tolist())

    # Combine SBERT embeddings with TF-IDF features (sparse-safe)
    print("Combining SBERT embeddings with TF-IDF features using sparse hstack...")
    # Convert dense SBERT embeddings to sparse and horizontally stack with TF-IDF sparse matrices
    X_train_vec_sparse = sparse.csr_matrix(X_train_vec)
    X_test_vec_sparse = sparse.csr_matrix(X_test_vec)

    X_train_combined = sparse.hstack([X_train_vec_sparse, tfidf_train]).tocsr()
    X_test_combined = sparse.hstack([X_test_vec_sparse, tfidf_test]).tocsr()

    # D. Hyperparameter tuning + Train Model (Logistic Regression)
    print("Running RandomizedSearchCV to tune LogisticRegression (C parameter)...")
    # Use class_weight='balanced' to mitigate class imbalance.
    # - `multi_class` is omitted: it is deprecated in recent scikit-learn and
    #   the saga solver already fits a multinomial model for multi-class data.
    # - `random_state` makes the stochastic saga solver repeatable.
    # - Parallelism is left to the search (n_jobs=-1 below) to avoid nesting
    #   two process pools.
    base_clf = LogisticRegression(
        max_iter=2000,
        solver='saga',
        class_weight='balanced',
        random_state=42,
    )
    param_dist = {'C': loguniform(1e-4, 1e4)}
    search = RandomizedSearchCV(
        base_clf,
        param_distributions=param_dist,
        n_iter=4,     # Reduce from 20 to 4 (checks fewer random settings)
        cv=3,         # Reduce from 5 to 3 (less validation, fine for demos)
        scoring='accuracy',
        n_jobs=-1,
        random_state=42,
    )
    search.fit(X_train_combined, y_train)
    clf = search.best_estimator_
    print("Best params:", search.best_params_)

    # E. Predict Probabilities (Crucial for Ranking)
    print("Predicting probabilities on test set...")
    y_pred_proba = clf.predict_proba(X_test_combined)
    classes = clf.classes_

    # F. Run Evaluation
    evaluate_ranking_performance(
        y_true=y_test.values,
        y_pred_proba=y_pred_proba,
        classes=classes,
        output_path=OUTPUT_METRICS_FILE
    )

--- Scanning folders in archive(1)/data/data ---
Processing ACCOUNTANT           | Found 118 resumes
Processing ADVOCATE             | Found 118 resumes
Processing AGRICULTURE          | Found 63 resumes
Processing APPAREL              | Found 97 resumes
Processing ARTS                 | Found 103 resumes
Processing AUTOMOBILE           | Found 36 resumes
Processing AVIATION             | Found 117 resumes
Processing BANKING              | Found 115 resumes
Processing BPO                  | Found 22 resumes
Processing BUSINESS-DEVELOPMENT | Found 120 resumes
Processing CHEF                 | Found 118 resumes
Processing CONSTRUCTION         | Found 112 resumes
Processing CONSULTANT           | Found 115 resumes
Processing DESIGNER             | Found 107 resumes
Processing DIGITAL-MEDIA        | Found 96 resumes
Processing ENGINEERING          | Found 118 resumes
Processing FINANCE              | Found 118 resumes
Processing FITNESS              | Found 117 resumes
Processing HEALTHCAR

KeyboardInterrupt: 

In [16]:
import numpy as np
import re
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# ==========================================
# 1. SETUP & LOAD MODEL
# ==========================================
# NOTE(review): an earlier cell of this notebook already assigns `model_sbert`
# with the same model id; this cell loads it again and rebinds the name.
print("Loading SBERT model (all-MiniLM-L6-v2)...")
# This model handles the "Semantic" understanding of the text
model_sbert = SentenceTransformer('all-MiniLM-L6-v2')

# ==========================================
# 2. PRE-PROCESSING FUNCTIONS
# ==========================================
def clean_text(text):
    """Normalise text to one lower-case, ASCII-only line for comparison.

    Newlines, carriage returns and tabs become spaces, non-ASCII characters
    are replaced by spaces, runs of spaces are collapsed, and the stripped
    result is lower-cased.

    NOTE: this redefines the ``clean_text`` from the data-processing section
    of this notebook; unlike that one, this version also lower-cases.

    Args:
        text: Raw text; any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        The cleaned, lower-cased string.
    """
    if not text:
        return ""

    # Map "\n" (10), "\r" (13) and "\t" (9) to a space in one pass.
    without_breaks = text.translate({10: " ", 13: " ", 9: " "})

    # Replace every non-ASCII character with a space.
    ascii_only = re.sub(r'[^\x00-\x7f]', r' ', without_breaks)

    # Collapse runs of spaces, trim, then fold case.
    single_spaced = re.sub(' +', ' ', ascii_only)
    trimmed = single_spaced.strip()
    return trimmed.lower()

def calculate_matching_score(job_description, resume_text):
    """Return the cosine similarity between a job description and a resume.

    Both texts are normalised with ``clean_text``, encoded together by the
    module-level ``model_sbert``, and the two resulting vectors are compared
    with ``cosine_similarity``.

    Args:
        job_description: Raw job description text.
        resume_text: Raw resume text.

    Returns:
        The single similarity value taken from the 1x1 result of
        ``cosine_similarity``.
    """
    # Normalise both inputs the same way so the comparison is fair.
    cleaned_pair = [clean_text(job_description), clean_text(resume_text)]

    # One encode call; the first row is the job description, the second the resume.
    jd_vector, resume_vector = model_sbert.encode(cleaned_pair)

    # cosine_similarity works on 2-D inputs, so each vector is wrapped in a
    # list and the only cell of the resulting matrix is returned.
    similarity_matrix = cosine_similarity([jd_vector], [resume_vector])
    return similarity_matrix[0][0]

# ==========================================
# 3. RUN THE MATCHING SYSTEM
# ==========================================
if __name__ == "__main__":

    # Demo input 1: the job description to match against.
    job_desc = """
    We are looking for a Senior Python Developer.
    Requirements:
    - Strong experience in Python and Django/Flask.
    - Knowledge of SQL and NoSQL databases.
    - Experience with REST APIs and Cloud services (AWS).
    - Understanding of machine learning concepts is a plus.
    """

    # Demo input 2: the candidate resume being scored.
    candidate_resume = """
    Experienced Software Engineer with a focus on backend systems.
    Proficient in Python, Java, and C++.
    Built scalable web applications using Django and PostgreSQL.
    Familiar with AWS (EC2, S3) and Docker.
    Interested in Data Science and AI.
    """

    print("\n--- CALCULATING MATCHING SCORE ---")
    print(f"Job Description Length: {len(job_desc)} chars")
    print(f"Resume Length: {len(candidate_resume)} chars")

    # Score the pair.
    similarity_score = calculate_matching_score(job_desc, candidate_resume)

    # Report the score between two horizontal rules.
    rule = "-" * 40
    print(rule)
    print(f"Matching Score: {similarity_score:.4f}")
    print(f"Match Percentage: {similarity_score * 100:.2f}%")
    print(rule)

    # Turn the score into a verdict: above 0.75 is excellent, above 0.5 is
    # good, anything else is low.
    if similarity_score > 0.75:
        verdict = ">> VERDICT: Excellent Match"
    elif similarity_score > 0.5:
        verdict = ">> VERDICT: Good Match"
    else:
        verdict = ">> VERDICT: Low Match"
    print(verdict)

Loading SBERT model (all-MiniLM-L6-v2)...

--- CALCULATING MATCHING SCORE ---
Job Description Length: 287 chars
Resume Length: 261 chars
----------------------------------------
Matching Score: 0.7026
Match Percentage: 70.26%
----------------------------------------
>> VERDICT: Good Match


In [13]:
# After training finishes and you have `clf` and `tfidf_vectorizer`
from resume_pipeline import save_artifacts
import json, os

outdir = "model_artifacts"

# Create the output directory before anything is asked to write into it.
os.makedirs(outdir, exist_ok=True)

# NOTE(review): in the training cell `clf` is fitted on SBERT embeddings
# stacked with TF-IDF features, but only the TF-IDF vectorizer is passed
# here. Confirm that whatever consumes these artifacts rebuilds the same
# SBERT features.
save_artifacts(outdir, clf, tfidf_vectorizer)

# optional metadata (class ordering)
with open(os.path.join(outdir, "metadata.json"), "w", encoding="utf8") as fh:
    json.dump({"classes": clf.classes_.tolist()}, fh)

print("Saved artifacts to", outdir)

Saved artifacts to model_artifacts
