In [3]:
import os
import fitz
import random
import re
import pickle
import numpy as np
import pandas as pd
import torch

from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity

print("Libraries Loaded Successfully")


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
  from .autonotebook import tqdm as notebook_tqdm


Libraries Loaded Successfully


In [4]:
# Name of the pretrained sentence-transformers checkpoint used as the base encoder.
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"

print("Loading Embedding Model...")
model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Model Loaded Successfully")


Loading Embedding Model...




Model Loaded Successfully


In [5]:
# Root folder of the dataset: one sub-folder per job category, each holding PDF
# resumes. Set the RESUME_DATASET_PATH environment variable to point elsewhere
# without editing the notebook; the original local path is kept as the default.
DATASET_PATH = os.environ.get(
    "RESUME_DATASET_PATH",
    r"C:\Users\vaibh\Downloads\Resumedata\data\data",
)

# Resumes whose stripped text has this many characters or fewer are dropped.
MIN_RESUME_CHARS = 200

print("Starting PDF Extraction...")

data = []
failed_files = []  # (path, error) for every PDF that could not be opened/read

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore any seeded split performed on df later) reproducible.
for category_folder in sorted(os.listdir(DATASET_PATH)):
    folder_path = os.path.join(DATASET_PATH, category_folder)

    if not os.path.isdir(folder_path):
        continue
    print(f"Reading from: {category_folder}")

    for filename in sorted(os.listdir(folder_path)):
        # Case-insensitive so that files named "*.PDF" are not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        filepath = os.path.join(folder_path, filename)
        try:
            # The context manager closes the document even if a page fails.
            with fitz.open(filepath) as doc:
                text = "".join(page.get_text() for page in doc)
        except Exception as exc:
            # Best-effort extraction: keep going, but remember what was skipped
            # instead of swallowing the error silently.
            failed_files.append((filepath, repr(exc)))
            continue

        if len(text.strip()) > MIN_RESUME_CHARS:   # remove very short resumes
            data.append({
                "Category": category_folder,
                "Resume": text
            })

# Explicit columns so the frame has the expected schema even when nothing was read.
df = pd.DataFrame(data, columns=["Category", "Resume"])
print("Total Resumes Extracted:", len(df))

if failed_files:
    print(f"Skipped {len(failed_files)} unreadable PDF(s):")
    for failed_path, failed_error in failed_files:
        print(f"  {failed_path}: {failed_error}")

df.head()


Starting PDF Extraction...
Reading from: ACCOUNTANT
Reading from: ADVOCATE
Reading from: AGRICULTURE
Reading from: APPAREL
Reading from: ARTS
Reading from: AUTOMOBILE
Reading from: AVIATION
Reading from: BANKING
Reading from: BPO
Reading from: BUSINESS-DEVELOPMENT
Reading from: CHEF
Reading from: CONSTRUCTION
Reading from: CONSULTANT
Reading from: DESIGNER
Reading from: DIGITAL-MEDIA
Reading from: ENGINEERING
Reading from: FINANCE
Reading from: FITNESS
Reading from: HEALTHCARE
Reading from: HR
Reading from: INFORMATION-TECHNOLOGY
Reading from: PUBLIC-RELATIONS
Reading from: SALES
Reading from: TEACHER
Total Resumes Extracted: 2483


Unnamed: 0,Category,Resume
0,ACCOUNTANT,ACCOUNTANT\nSummary\nFinancial Accountant spec...
1,ACCOUNTANT,STAFF ACCOUNTANT\nSummary\nHighly analytical a...
2,ACCOUNTANT,ACCOUNTANT\nProfessional Summary\nTo obtain a ...
3,ACCOUNTANT,SENIOR ACCOUNTANT\nExperience\nCompany Name Ju...
4,ACCOUNTANT,SENIOR ACCOUNTANT\nProfessional Summary\nSenio...


In [6]:
def clean_text(text):
    """Normalise raw resume text for embedding.

    Lower-cases the input, then replaces (in this order) every ``http...``
    token, every non-ASCII character and every run of whitespace with a single
    space, and finally strips leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, single-spaced, lower-case text.
    """
    # Order matters: whitespace is collapsed last so that the gaps left by the
    # two earlier substitutions are squeezed into one space.
    patterns = (r"http\S+", r"[^\x00-\x7f]", r"\s+")

    cleaned = text.lower()
    for pattern in patterns:
        cleaned = re.sub(pattern, " ", cleaned)
    return cleaned.strip()

# Store the normalised text next to the raw text; "Resume" itself is left untouched.
raw_resumes = df["Resume"]
df["Cleaned_Resume"] = raw_resumes.map(clean_text)

print("Cleaning Complete")


Cleaning Complete


In [None]:
print("Generating Resume Embeddings...")

# Encode every cleaned resume with the (not yet fine-tuned) model.
# The result is returned as a NumPy array because convert_to_numpy=True.
cleaned_resume_texts = df["Cleaned_Resume"].tolist()
encode_options = dict(batch_size=32, show_progress_bar=True, convert_to_numpy=True)
resume_embeddings = model.encode(cleaned_resume_texts, **encode_options)

print("Embedding Shape:", resume_embeddings.shape)


Generating Resume Embeddings...


Batches:  12%|████████▎                                                               | 9/78 [02:40<20:10, 17.54s/it]

In [13]:
# Map category names to integer class ids; the fitted encoder is kept so the
# ids can be translated back to names when reporting.
category_names = df["Category"]
label_encoder = LabelEncoder().fit(category_names)
y = label_encoder.transform(category_names)

print("Categories:", label_encoder.classes_)


Categories: ['ACCOUNTANT' 'ADVOCATE' 'AGRICULTURE' 'APPAREL' 'ARTS' 'AUTOMOBILE'
 'AVIATION' 'BANKING' 'BPO' 'BUSINESS-DEVELOPMENT' 'CHEF' 'CONSTRUCTION'
 'CONSULTANT' 'DESIGNER' 'DIGITAL-MEDIA' 'ENGINEERING' 'FINANCE' 'FITNESS'
 'HEALTHCARE' 'HR' 'INFORMATION-TECHNOLOGY' 'PUBLIC-RELATIONS' 'SALES'
 'TEACHER']


In [14]:
# Stratified 80/20 hold-out split of the baseline embeddings (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    resume_embeddings,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

print("Training Logistic Regression...")

# fit() returns the estimator itself, so construction and training can be chained.
clf = LogisticRegression(max_iter=3000).fit(X_train, y_train)

print("Training Complete")


Training Logistic Regression...
Training Complete


In [15]:
# Evaluate the baseline classifier on the held-out split.
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Before Fine Tuning:", round(accuracy*100, 2), "%")

# labels= pins the report rows to every encoded class so target_names always
# lines up, even if some class is missing from y_test/y_pred.
# zero_division=0 reports 0.0 for classes that were never predicted (same value
# as before) without emitting an UndefinedMetricWarning per metric.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0,
))


Accuracy Before Fine Tuning: 72.03 %
                        precision    recall  f1-score   support

            ACCOUNTANT       0.82      0.96      0.88        24
              ADVOCATE       0.62      0.62      0.62        24
           AGRICULTURE       1.00      0.54      0.70        13
               APPAREL       0.50      0.32      0.39        19
                  ARTS       0.71      0.24      0.36        21
            AUTOMOBILE       0.00      0.00      0.00         7
              AVIATION       0.67      0.75      0.71        24
               BANKING       0.62      0.57      0.59        23
                   BPO       0.00      0.00      0.00         4
  BUSINESS-DEVELOPMENT       0.92      0.96      0.94        24
                  CHEF       0.91      0.88      0.89        24
          CONSTRUCTION       0.81      0.77      0.79        22
            CONSULTANT       0.70      0.70      0.70        23
              DESIGNER       0.76      0.76      0.76        21
  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [1]:
print("Preparing Fine-Tuning Data...")

# Seed every RNG involved (triplet sampling, DataLoader shuffling) so that the
# fine-tuning run can be reproduced.
FINE_TUNE_SEED = 42
random.seed(FINE_TUNE_SEED)
np.random.seed(FINE_TUNE_SEED)
torch.manual_seed(FINE_TUNE_SEED)

# Build triplets from the training rows only. The split uses the same
# test_size / random_state / stratification as the classifier evaluation, so
# the rows held out for testing never influence the fine-tuned encoder
# (otherwise the "after fine tuning" accuracy is inflated by leakage).
fine_tune_idx, _ = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42,
    stratify=df["Category"],
)
fine_tune_df = df.iloc[fine_tune_idx]

# category -> list of cleaned resumes, in order of first appearance.
resumes_by_category = {
    category: group.tolist()
    for category, group in fine_tune_df.groupby("Category", sort=False)["Cleaned_Resume"]
}

train_examples = []

for category, category_resumes in resumes_by_category.items():
    other_resumes = [
        resume
        for other_category, group in resumes_by_category.items()
        if other_category != category
        for resume in group
    ]

    # A triplet needs a *different* same-category resume and at least one
    # resume from another category.
    if len(category_resumes) < 2 or not other_resumes:
        continue

    for anchor_pos, resume in enumerate(category_resumes):
        # Draw the positive from the other members of the category; picking
        # the anchor itself would give a zero-distance, uninformative pair.
        positive_pos = random.randrange(len(category_resumes) - 1)
        if positive_pos >= anchor_pos:
            positive_pos += 1
        positive = category_resumes[positive_pos]
        negative = random.choice(other_resumes)

        train_examples.append(
            InputExample(texts=[resume, positive, negative])
        )

if not train_examples:
    raise ValueError(
        "No triplets could be built: need at least two categories and a "
        "category with two or more resumes."
    )

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

train_loss = losses.TripletLoss(model)

print("Starting Fine Tuning...")

# Updates `model` in place.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=3,
    warmup_steps=100,
    show_progress_bar=True
)

print("Fine Tuning Complete")


Preparing Fine-Tuning Data...


NameError: name 'df' is not defined

In [None]:
print("Generating New Embeddings...")

# Re-encode the same cleaned resumes, this time with the current state of `model`.
texts_to_encode = df["Cleaned_Resume"].tolist()
resume_embeddings_ft = model.encode(
    texts_to_encode,
    convert_to_numpy=True,
    show_progress_bar=True,
    batch_size=32,
)


In [None]:
# Same stratified 80/20 split (same seed) as the baseline run, applied to the
# embeddings produced after fine-tuning, so both accuracies refer to the same rows.
# NOTE(review): the comparison is only fair if the fine-tuning step did not
# train on the rows that end up in X_test - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    resume_embeddings_ft, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

clf = LogisticRegression(max_iter=3000)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy After Fine Tuning:", round(accuracy*100, 2), "%")

# labels= keeps the report rows aligned with target_names for every encoded
# class; zero_division=0 reports 0.0 for never-predicted classes without
# raising UndefinedMetricWarning.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0,
))


In [None]:
def rank_resumes(job_description, top_k=10):
    """Rank the resumes in ``df`` by similarity to a job description.

    The job description is cleaned with ``clean_text``, embedded with ``model``
    and compared (cosine similarity) against ``resume_embeddings_ft``.

    Parameters
    ----------
    job_description : str
        Free-text description of the role.
    top_k : int, default 10
        Number of best-matching resumes to return.

    Returns
    -------
    pandas.DataFrame
        The ``top_k`` rows with the highest similarity, with columns
        ``Category`` and ``Similarity``, sorted from most to least similar.
        The index identifies the corresponding row of ``df``.
    """
    jd_embedding = model.encode([clean_text(job_description)], convert_to_numpy=True)

    # One query row against all resumes -> take the single row of scores.
    similarities = cosine_similarity(jd_embedding, resume_embeddings_ft)[0]

    # Build the ranking on a fresh frame instead of writing a "Similarity"
    # column into the global df: repeated calls no longer mutate shared state.
    scored = df[["Category"]].assign(Similarity=similarities)

    ranked_df = scored.sort_values(by="Similarity", ascending=False)

    return ranked_df[["Category", "Similarity"]].head(top_k)

# Example
job_description = """
Looking for a Python Developer with experience in machine learning,
Django, REST APIs, and SQL.
"""

rank_resumes(job_description, top_k=5)


In [None]:
# Persist the encoder (as a directory) and the two scikit-learn objects (as pickles).
model.save("smart_hire_finetuned_model")

pickle_targets = (
    ("resume_classifier.pkl", clf),
    ("label_encoder.pkl", label_encoder),
)
for pickle_path, artifact in pickle_targets:
    with open(pickle_path, "wb") as handle:
        pickle.dump(artifact, handle)

print("Model Saved Successfully")
