In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from gensim.models.callbacks import CallbackAny2Vec
import tqdm

In [3]:
nltk.download('stopwords')
nltk.download('punkt')

df = pd.read_csv('data.csv')
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,resume_text,job_description_text,label
0,SummaryHighly motivated Sales Associate with e...,Net2Source Inc. is an award-winning total work...,No Fit
1,Professional SummaryCurrently working with Cat...,At Salas OBrien we tell our clients that were ...,No Fit
2,SummaryI started my construction career in Jun...,Schweitzer Engineering Laboratories (SEL) Infr...,No Fit
3,SummaryCertified Electrical Foremanwith thirte...,"Mizick Miller & Company, Inc. is looking for a...",No Fit
4,SummaryWith extensive experience in business/r...,Life at Capgemini\nCapgemini supports all aspe...,No Fit


In [4]:
class ProgressCallback(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0
    
    def on_epoch_end(self, model):
        print(f'Epoch {self.epoch} ended. Model trained on {len(model.wv)} words.')
        self.epoch += 1

stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if isinstance(text, str):
        words = word_tokenize(text)
        words = [word.lower() for word in words if word.isalnum()]
        words = [word for word in words if word not in stop_words]
        return words
    else:
        return []

# Preprocess resume and job postings data with progress tracking
def preprocess_dataframe(df, column_name):
    processed_texts = []
    total = len(df)
    for i, text in tqdm.tqdm(enumerate(df[column_name]), total=total, desc=f'Preprocessing {column_name}'):
        processed_texts.append(preprocess_text(text))
    return processed_texts

df['processed_resume'] = preprocess_dataframe(df, 'resume_text')
df['processed_description'] = preprocess_dataframe(df, 'job_description_text')

all_sentences = df['processed_resume'].tolist() + df['processed_description'].tolist()

# Train Word2Vec model
model = Word2Vec(sentences=all_sentences, vector_size=100, window=5, min_count=1, sg=0, epochs=10, callbacks=[ProgressCallback()])

# Vectorize function
def vectorize_text(text, model):
    vectors = [model.wv[word] for word in text if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Load and preprocess the comparison data
df['job_description_text'] = df['job_description_text'].apply(preprocess_text)
df['resume_text'] = df['resume_text'].apply(preprocess_text)

# Extract job description and resumes from comparison data
job_description_vectors = np.array([vectorize_text(job_description, model) for job_description in df['job_description_text']])
resume_vectors = np.array([vectorize_text(resume, model) for resume in df['resume_text']])

cosine_similarities = [cosine_similarity(job_desc.reshape(1, -1), resume.reshape(1, -1))[0][0] for job_desc, resume in zip(job_description_vectors, resume_vectors)]

# Add cosine similarity to the DataFrame
df['cosine_similarity'] = cosine_similarities


Preprocessing resume_text: 100%|██████████| 6241/6241 [00:18<00:00, 337.22it/s]
Preprocessing job_description_text: 100%|██████████| 6241/6241 [00:12<00:00, 499.42it/s]


Epoch 0 ended. Model trained on 29806 words.
Epoch 1 ended. Model trained on 29806 words.
Epoch 2 ended. Model trained on 29806 words.
Epoch 3 ended. Model trained on 29806 words.
Epoch 4 ended. Model trained on 29806 words.
Epoch 5 ended. Model trained on 29806 words.
Epoch 6 ended. Model trained on 29806 words.
Epoch 7 ended. Model trained on 29806 words.
Epoch 8 ended. Model trained on 29806 words.
Epoch 9 ended. Model trained on 29806 words.


In [5]:
df.head()

Unnamed: 0,resume_text,job_description_text,label,processed_resume,processed_description,cosine_similarity
0,"[summaryhighly, motivated, sales, associate, e...","[net2source, total, workforce, solutions, comp...",No Fit,"[summaryhighly, motivated, sales, associate, e...","[net2source, total, workforce, solutions, comp...",0.13896
1,"[professional, summarycurrently, working, cate...","[salas, obrien, tell, clients, engineered, imp...",No Fit,"[professional, summarycurrently, working, cate...","[salas, obrien, tell, clients, engineered, imp...",0.372345
2,"[summaryi, started, construction, career, june...","[schweitzer, engineering, laboratories, sel, i...",No Fit,"[summaryi, started, construction, career, june...","[schweitzer, engineering, laboratories, sel, i...",0.35452
3,"[summarycertified, electrical, foremanwith, th...","[mizick, miller, company, looking, dynamic, in...",No Fit,"[summarycertified, electrical, foremanwith, th...","[mizick, miller, company, looking, dynamic, in...",0.136201
4,"[summarywith, extensive, experience, analysis,...","[life, capgemini, capgemini, supports, aspects...",No Fit,"[summarywith, extensive, experience, analysis,...","[life, capgemini, capgemini, supports, aspects...",0.05316


In [None]:
from matplotlib import pyplot as plt

plt.figure(figsize=(10, 6))
plt.scatter(df['cosine_similarity'], df['label'])
plt.xlabel('Cosine Similarity')
plt.ylabel('Label')
plt.title('Cosine Similarity vs Fit Label')
plt.grid(True)
plt.show()