In [1]:
import os
import fitz
import random
import re
import pickle
import numpy as np
import pandas as pd
import torch

# Turn off tokenizer parallelism; flagged by the author as very important on
# low-RAM devices.
# NOTE(review): set before the sentence_transformers import below, presumably so
# the tokenizer backend sees the value when it is first loaded — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity

print("Libraries Loaded Successfully")


Libraries Loaded Successfully


In [2]:
# Name of the sentence-embedding checkpoint; chosen as the lightweight option
# for low-end devices.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

print("Loading Lightweight Embedding Model...")
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Model Loaded Successfully")


Loading Lightweight Embedding Model...




Model Loaded Successfully


In [25]:
# Root folder holding one sub-folder per job category, each containing PDF resumes.
# Set the RESUME_DATASET_PATH environment variable to run the notebook on another
# machine; the original local path is kept as the default.
DATASET_PATH = os.environ.get(
    "RESUME_DATASET_PATH",
    r"C:\Users\vaibh\Downloads\Resumedata\data\data",
)

# A resume is kept only if its stripped text is longer than this many characters.
MIN_RESUME_CHARS = 200


def extract_pdf_text(filepath):
    """Return the text of every page of the PDF at ``filepath``, concatenated.

    The document is opened in a ``with`` block so it is closed again even when
    text extraction fails part-way through.
    """
    with fitz.open(filepath) as doc:
        return "".join(page.get_text() for page in doc)


print("Starting PDF Extraction...")

data = []
failed_files = []  # (filepath, error) for every PDF that could not be read

# sorted() makes the row order independent of the filesystem's listing order,
# which keeps the later seeded train/test split reproducible across machines.
for category_folder in sorted(os.listdir(DATASET_PATH)):
    folder_path = os.path.join(DATASET_PATH, category_folder)
    if not os.path.isdir(folder_path):
        continue

    print(f"Reading from: {category_folder}")

    for filename in sorted(os.listdir(folder_path)):
        # Accept ".PDF" as well as ".pdf".
        if not filename.lower().endswith(".pdf"):
            continue

        filepath = os.path.join(folder_path, filename)
        try:
            text = extract_pdf_text(filepath)
        except Exception as exc:
            # Best effort: one unreadable file must not stop the run, but it is
            # recorded instead of being silently dropped. Catching Exception
            # (not a bare except) lets KeyboardInterrupt still abort the loop.
            failed_files.append((filepath, repr(exc)))
            continue

        if len(text.strip()) > MIN_RESUME_CHARS:   # remove very short resumes
            data.append({
                "Category": category_folder,
                "Resume": text
            })

# Explicit columns keep the frame usable by later cells even if nothing was extracted.
df = pd.DataFrame(data, columns=["Category", "Resume"])
print("Total Resumes Extracted:", len(df))
if failed_files:
    print("Unreadable PDFs skipped:", len(failed_files))
df.head()


Starting PDF Extraction...
Reading from: ACCOUNTANT
Reading from: ADVOCATE
Reading from: AGRICULTURE
Reading from: APPAREL
Reading from: ARTS
Reading from: AUTOMOBILE
Reading from: AVIATION
Reading from: BANKING
Reading from: BPO
Reading from: BUSINESS-DEVELOPMENT
Reading from: CHEF
Reading from: CONSTRUCTION
Reading from: CONSULTANT
Reading from: DESIGNER
Reading from: DIGITAL-MEDIA
Reading from: ENGINEERING
Reading from: FINANCE
Reading from: FITNESS
Reading from: HEALTHCARE
Reading from: HR
Reading from: INFORMATION-TECHNOLOGY
Reading from: PUBLIC-RELATIONS
Reading from: SALES
Reading from: TEACHER
Total Resumes Extracted: 2483


Unnamed: 0,Category,Resume
0,ACCOUNTANT,ACCOUNTANT\nSummary\nFinancial Accountant spec...
1,ACCOUNTANT,STAFF ACCOUNTANT\nSummary\nHighly analytical a...
2,ACCOUNTANT,ACCOUNTANT\nProfessional Summary\nTo obtain a ...
3,ACCOUNTANT,SENIOR ACCOUNTANT\nExperience\nCompany Name Ju...
4,ACCOUNTANT,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...


In [26]:
# Patterns are compiled once and reused for every resume.
_URL_PATTERN = re.compile(r"http\S+")
_WHITESPACE_PATTERN = re.compile(r"\s+")


def clean_text(text):
    """Normalise raw text before it is embedded.

    Steps, in order: lower-case the text, replace every run of non-whitespace
    characters starting with "http" by a space, collapse each whitespace run
    into a single space, and trim both ends.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, single-line string.
    """
    lowered = text.lower()
    without_urls = _URL_PATTERN.sub(" ", lowered)
    collapsed = _WHITESPACE_PATTERN.sub(" ", without_urls)
    return collapsed.strip()

# Run clean_text over each raw resume and store the result as a new column.
df["Cleaned_Resume"] = df["Resume"].map(clean_text)

print("Cleaning Complete")


Cleaning Complete


In [27]:
print("Generating Resume Embeddings...")

# A small batch size is used deliberately to keep memory use low on low-RAM machines.
# NOTE(review): the model may truncate inputs beyond its maximum sequence length,
# so long resumes might only be partially embedded — confirm.
resume_texts = df["Cleaned_Resume"].tolist()
encode_options = {
    "batch_size": 8,
    "show_progress_bar": True,
    "convert_to_numpy": True,
}
resume_embeddings = model.encode(resume_texts, **encode_options)

print("Embedding Shape:", resume_embeddings.shape)


Generating Resume Embeddings...


Batches:   0%|          | 0/311 [00:00<?, ?it/s]

Embedding Shape: (2483, 384)


In [28]:
# Fit the encoder on the category names, then turn them into integer class ids.
# label_encoder.classes_ holds the names in id order.
label_encoder = LabelEncoder().fit(df["Category"])
y = label_encoder.transform(df["Category"])

print("Categories:", label_encoder.classes_)


Categories: ['ACCOUNTANT' 'ADVOCATE' 'AGRICULTURE' 'APPAREL' 'ARTS' 'AUTOMOBILE'
 'AVIATION' 'BANKING' 'BPO' 'BUSINESS-DEVELOPMENT' 'CHEF' 'CONSTRUCTION'
 'CONSULTANT' 'DESIGNER' 'DIGITAL-MEDIA' 'ENGINEERING' 'FINANCE' 'FITNESS'
 'HEALTHCARE' 'HR' 'INFORMATION-TECHNOLOGY' 'PUBLIC-RELATIONS' 'SALES'
 'TEACHER']


In [29]:
# Hold out 20% of the resumes for evaluation; the split is seeded and stratified on y.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(
    resume_embeddings, y, **split_options
)

print("Training Classifier...")

# Logistic regression on the embeddings, with balanced class weights.
classifier_options = dict(
    max_iter=4000,
    C=3,
    class_weight="balanced",
    n_jobs=-1,
)
clf = LogisticRegression(**classifier_options).fit(X_train, y_train)

print("Training Complete")


Training Classifier...
Training Complete


In [30]:
# Predict on the held-out split and report overall accuracy as a percentage.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Final Accuracy:", round(accuracy * 100, 2), "%")

# Per-category precision / recall / F1, labelled with the original category names.
report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(report)


Final Accuracy: 73.04 %
                        precision    recall  f1-score   support

            ACCOUNTANT       0.81      0.92      0.86        24
              ADVOCATE       0.74      0.58      0.65        24
           AGRICULTURE       0.69      0.69      0.69        13
               APPAREL       0.43      0.32      0.36        19
                  ARTS       0.77      0.48      0.59        21
            AUTOMOBILE       0.22      0.29      0.25         7
              AVIATION       0.63      0.71      0.67        24
               BANKING       0.72      0.57      0.63        23
                   BPO       0.12      0.25      0.17         4
  BUSINESS-DEVELOPMENT       0.77      1.00      0.87        24
                  CHEF       0.95      0.83      0.89        24
          CONSTRUCTION       1.00      0.73      0.84        22
            CONSULTANT       0.71      0.74      0.72        23
              DESIGNER       0.86      0.86      0.86        21
         DIGITA

In [33]:
def rank_resumes(job_description, top_k=10):
    """Rank the extracted resumes by similarity to a job description.

    The job description is cleaned with ``clean_text``, embedded with the same
    ``model`` that produced ``resume_embeddings``, and compared with every
    resume embedding using cosine similarity.

    Args:
        job_description: Free-text description of the role.
        top_k: Number of best-matching resumes to return (default 10).

    Returns:
        A DataFrame with "Category" and "Similarity" columns, sorted by
        similarity in descending order and limited to ``top_k`` rows. The
        index values are the row labels of the global ``df``.
    """
    jd_embedding = model.encode([clean_text(job_description)], convert_to_numpy=True)

    # A single job description is encoded, so the similarity matrix has one row.
    similarities = cosine_similarity(jd_embedding, resume_embeddings)[0]

    # Attach the scores to a derived frame rather than writing a "Similarity"
    # column into the shared global df: a ranking call should not change what
    # every other cell sees.
    ranked_df = (
        df[["Category"]]
        .assign(Similarity=similarities)
        .sort_values(by="Similarity", ascending=False)
    )

    return ranked_df.head(top_k)


In [34]:
# Example query used to try out the ranking.
job_description = """
Looking for a Python Developer with experience in machine learning,
Django, REST APIs, and SQL.
"""

# Show the five closest resumes.
rank_resumes(job_description, top_k=5)


Unnamed: 0,Category,Similarity
291,AGRICULTURE,0.560035
2079,INFORMATION-TECHNOLOGY,0.504187
548,AVIATION,0.486463
2147,INFORMATION-TECHNOLOGY,0.472099
2101,INFORMATION-TECHNOLOGY,0.454375


In [35]:
# Save the embedding model to its own directory.
model.save("smart_hire_embedding_model")

# Pickle the fitted classifier and label encoder, one file each.
pickle_targets = {
    "resume_classifier.pkl": clf,
    "label_encoder.pkl": label_encoder,
}
for pickle_path, fitted_object in pickle_targets.items():
    with open(pickle_path, "wb") as f:
        pickle.dump(fitted_object, f)

print("All Models Saved Successfully")


All Models Saved Successfully
