In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# 📘 Step 2: Load Cleaned Resumes
# Read the cleaned resume CSV and discard rows that are missing either the resume
# text or its category label. The load and the filter are chained into a single
# expression (instead of an in-place dropna) so re-running this cell always rebuilds
# `df` from the file rather than mutating whatever `df` is left in the kernel.
resumes_path = '../data/cleaned_resumes.csv'
df = pd.read_csv(resumes_path).dropna(subset=['cleaned_resume', 'Category'])
print(f"Total resumes loaded: {len(df)}")

Total resumes loaded: 2481


In [None]:
# 📘 Step 2b: Keep Only the First 100 Resumes
# NOTE: `head` takes the first rows in file order; it does not rank anything.
# The explicit .copy() detaches the subset from the full frame, so adding columns to
# `df` later does not write into a slice (pandas' SettingWithCopyWarning case).
N_RESUMES = 100  # number of resumes to embed; raise it to score more of the dataset
df = df.head(N_RESUMES).copy()
print(f"Total resumes loaded: {len(df)}")  # prints 100 when at least N_RESUMES rows exist

# 📘 Step 3: Load SBERT Model
model = SentenceTransformer('all-MiniLM-L6-v2')

# 📘 Step 4: Encode the Selected Resumes
# One embedding per row of `df`, in the same order as the rows.
print(f"Encoding top {len(df)} resumes...")
resume_texts = df['cleaned_resume'].tolist()
resume_embeddings = model.encode(resume_texts, show_progress_bar=True)

# 📘 Step 5: Input a Sample JD
# Free-text job description that the resumes are compared against in the next cell.
sample_jd = """
Looking for a Data Scientist with strong Python, machine learning, and NLP skills.
Experience with resume parsing, vector embeddings, and model deployment is a plus.
"""

Total resumes loaded: 100
Encoding top 100 resumes...


Batches: 100%|██████████| 4/4 [00:10<00:00,  2.59s/it]


In [12]:

# Embed the job description. It is passed as a one-element list so the result is a
# single-row batch that can be compared against the resume embeddings.
jd_embedding = model.encode([sample_jd])

# 📘 Step 6: Compute Cosine Similarities
# cosine_similarity returns one row per JD; there is a single JD, so take row 0 to
# get one score per resume.
cosine_scores = cosine_similarity(jd_embedding, resume_embeddings)[0]

# 📘 Step 7: Rank Resumes
# assign() builds a new frame carrying the score column instead of writing into `df`
# in place, which avoids chained-assignment warnings when `df` is a slice and keeps
# the cell safe to re-run. It raises ValueError if the score count != row count.
df = df.assign(similarity_score=cosine_scores)
top_matches = df.sort_values(by='similarity_score', ascending=False).head(10)

# 📘 Step 8: Save Results
# Create the output directory first; to_csv fails with OSError if it is missing.
output_path = Path('../output/top_resume_matches.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
top_matches.to_csv(output_path, index=False)
print("Top matching resumes saved to 'output/top_resume_matches.csv' ✅")

Top matching resumes saved to 'output/top_resume_matches.csv' ✅
