In [1]:
import os

import pandas as pd

In [37]:
# Location of the resume dataset. Set the RESUME_CSV environment variable to
# point the notebook at another copy of the file; when it is unset the
# original local path is used, so existing runs behave as before.
resume_csv_path = os.environ.get(
    "RESUME_CSV", r"C:\Users\Sundharesan.sk\Desktop\CVM\data\resume.csv"
)
df = pd.read_csv(resume_csv_path)
df.head()

Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [40]:
# Number of rows that are not flagged as a repeat of an earlier row.
int((~df.duplicated()).sum())

166

In [41]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [7]:
import spacy
import re
# spaCy pipeline used by `preprocess` for tokenisation and lemmatisation.
nlp = spacy.load('en_core_web_sm')

def preprocess(text):
    """Normalise raw text into a space-separated string of lower-cased lemmas.

    The input is coerced to ``str`` before anything else, so a non-string
    cell (for example a NaN, which becomes the text ``"nan"``) no longer makes
    ``re.sub`` raise ``TypeError``. Characters that are neither word
    characters nor whitespace are removed, the remainder is run through the
    spaCy pipeline ``nlp``, and tokens flagged as punctuation, number-like or
    whitespace are dropped.

    Args:
        text: Value to clean; anything accepted by ``str()``.

    Returns:
        str: Lemmas of the kept tokens, lower-cased, stripped and joined by
        single spaces.
    """
    # Coerce first: the previous code only called str() after re.sub, which
    # was too late to protect the regex from non-string input.
    cleaned = re.sub(r'[^\w\s]', '', str(text))
    doc = nlp(cleaned)
    return ' '.join(
        token.lemma_.lower().strip()
        for token in doc
        if not (token.is_punct or token.like_num or token.is_space)
    )

In [8]:
# Run every resume through `preprocess`, one row at a time.
df['Processed'] = df['Resume'].map(preprocess)

In [9]:
# Preview the first five rows, now including the 'Processed' column.
df.head(n=5)

Unnamed: 0,Category,Resume,Processed
0,Data Science,Skills * Programming Languages: Python (pandas...,skill programming languages python panda numpy...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...,education details may to may be uitrgpv data s...
2,Data Science,"Areas of Interest Deep Learning, Control Syste...",area of interest deep learning control system ...
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...,skill â r â python â sap hana â tableau â sap ...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab...",education details mca ymcaust faridabad haryan...


In [10]:
from sentence_transformers import SentenceTransformer
encoder = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the corpus once, in a single batched call, instead of once per row
# and then again for the whole column. A plain list is passed because `encode`
# looks its inputs up by integer position, which is unreliable for a Series
# whose index labels have gaps (as left behind by drop_duplicates).
vectors = encoder.encode(df['Processed'].tolist())

# One embedding per row. The copy keeps this column independent of `vectors`,
# which faiss.normalize_L2 later rescales in place.
df['Embedding'] = list(vectors.copy())

  from tqdm.autonotebook import tqdm, trange
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [16]:
import faiss

# Embedding width: the size of the second axis of `vectors`.
vector_dimension = vectors.shape[1]
# faiss IndexFlatL2 index sized for vectors of that width.
index = faiss.IndexFlatL2(vector_dimension)

In [17]:
# Scale every embedding to unit length (in place) before indexing.
faiss.normalize_L2(vectors)
# Empty the index first so that re-running this cell does not store every
# vector a second time.
index.reset()
index.add(vectors)

In [18]:
import numpy as np

search_text = 'Give the top 10 resumes for the python developer'
search_vector = encoder.encode(search_text)
# Build a separate one-row query matrix so the in-place normalisation below
# does not modify `search_vector` itself.
_vector = np.stack([search_vector])
faiss.normalize_L2(_vector)

In [19]:
# Number of neighbours to retrieve. Previously `k` was assigned but never
# used (the search hard-coded 15); it is now the value actually passed, capped
# at the index size so the search never asks for more hits than are stored.
k = min(15, index.ntotal)
distances, ann = index.search(_vector, k=k)

In [20]:
# Tabulate the hits for the single query: row 0 of each search output.
results = pd.DataFrame(dict(distances=distances[0], ann=ann[0]))
results

Unnamed: 0,distances,ann
0,0.968674,552
1,0.968674,558
2,0.968674,564
3,0.968674,570
4,0.968674,576
...,...,...
957,1.980501,270
958,1.980501,276
959,1.980501,282
960,1.980501,288


In [42]:
import re

def preprocess_text(text):
    """Strip punctuation from ``text`` and lower-case what is left.

    Every character that is neither a word character nor whitespace is
    removed; whitespace (including line breaks) is kept as-is.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()

In [43]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model shared by every `generate_embedding` call.
model = SentenceTransformer('all-MiniLM-L6-v2')

def generate_embedding(text):
    """Encode ``text`` with the module-level ``model`` and return the result."""
    embedding = model.encode(text)
    return embedding



In [44]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(job_description_embedding, resume_embeddings):
    """Cosine similarity between one job description and each resume.

    The job-description embedding is wrapped in a list to form a one-row
    query; the first row of the resulting similarity matrix is returned.
    """
    query_rows = [job_description_embedding]
    similarity_matrix = cosine_similarity(query_rows, resume_embeddings)
    return similarity_matrix[0]

In [45]:
def rank_resumes(resumes, scores):
    """Pair each resume with its score and order the pairs best-first.

    Returns a list of ``(resume, score)`` tuples sorted by score, highest
    first; pairs with equal scores keep their input order.
    """
    scored_pairs = list(zip(resumes, scores))
    scored_pairs.sort(key=lambda pair: pair[1], reverse=True)
    return scored_pairs

In [46]:
def get_top_n_resumes(ranked_resumes, n=10):
    """Return the leading ``n`` entries of an already ranked sequence."""
    leading = slice(None, n)
    return ranked_resumes[leading]

In [51]:
preprocessed_resumes = df['Resume'].apply(preprocess_text)
# Encode every resume in one batched call rather than one model call per row.
# list(...) splits the resulting 2-D array back into one vector per resume,
# in the same order as the rows of `df`.
resume_embeddings = list(generate_embedding(preprocessed_resumes.tolist()))

In [52]:
def matcher(resume_embeddings, job_description_embedding, top_n=10, resumes=None):
    """Return the resumes that best match a job description.

    Args:
        resume_embeddings: One embedding per resume, in the same order as
            ``resumes``.
        job_description_embedding: Embedding of the job description.
        top_n: Maximum number of matches to return.
        resumes: Resume texts to rank. Defaults to the notebook's global
            ``df['Resume']`` column, which is what earlier versions always
            used.

    Returns:
        list: ``(resume, score)`` tuples ordered by cosine similarity,
        highest first, truncated to ``top_n`` entries.

    Raises:
        ValueError: If ``resumes`` and ``resume_embeddings`` differ in length.
    """
    if resumes is None:
        # Backward-compatible default: rank the global resume column.
        resumes = df['Resume']

    # zip() would silently truncate to the shorter input and could pair texts
    # with the wrong scores, so fail loudly when the two are out of sync.
    if len(resumes) != len(resume_embeddings):
        raise ValueError(
            f"Got {len(resumes)} resumes but {len(resume_embeddings)} embeddings; "
            "recompute the embeddings for the current resumes."
        )

    scores = calculate_similarity(job_description_embedding, resume_embeddings)
    ranked_resumes = rank_resumes(resumes, scores)
    return get_top_n_resumes(ranked_resumes, top_n)

In [53]:
# Clean the job description the same way as the resumes, then embed it.
job_description_text = preprocess_text(
    "python developer with knowledge on web applications and problem solving"
)
job_description_embedding = generate_embedding(job_description_text)

In [54]:
# Show the best-matching resumes for the job description.
matcher(
    resume_embeddings=resume_embeddings,
    job_description_embedding=job_description_embedding,
)

[('Education Details \r\nJune 2013 to June 2016 Diploma Computer science Pune, Maharashtra Aissms\r\nJune 2016 BE pursuing Computer science Pune, Maharashtra Anantrao pawar college of Engineering & Research centre\r\nPython Developer \r\n\r\n\r\nSkill Details \r\nCompany Details \r\ncompany - Cybage Software Pvt. Ltd\r\ndescription - I want to work in organisation as a python developer to utilize my knowledge & To gain more knowledge with our organisation.',
  0.576438),
 ('TECHNICAL PROFICIENCIES Platform: Ubuntu/Fedora/Cent OS/Windows Database: MySQL Languages: Python, Tensorflow, Numpy, C, C++ Education Details \r\nJanuary 2016 ME Computer Engineering Pune, Maharashtra Savitribai Phule Pune University\r\nJanuary 2014 B.E Computer Engineering Pune, Maharashtra Savitribai Phule Pune University\r\nJanuary 2010    RYK Science College, Maharashtra state board\r\nJanuary 2008    Maharashtra state board\r\nPython developer \r\n\r\nPython Developer\r\nSkill Details \r\nC++- Exprience - 6 mo