In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [11]:
# Fetch the NLTK data used by the cells below. word_tokenize loads the
# 'punkt_tab' tables on recent NLTK releases (3.9+) and the older 'punkt'
# pickles on earlier ones, so both are requested to keep the notebook
# runnable from a fresh environment on either.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# The dataset is semicolon-separated.
df = pd.read_csv('../data/resumes_job_postings.csv', sep=';')
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,job_description,resume,expected_place
0,Location: Remote Company: XYZ Solutions Inc.Ab...,"John Doe123 Main Street | Anytown, USA, 12345 ...",1
1,Location: RemoteCompany: XYZ Solutions Inc.Abo...,"John Doe123 Main Street | Anytown, USA, 12345 ...",2
2,Location: RemoteCompany: XYZ Solutions Inc.Abo...,"John Doe123 Main Street | Anytown, USA, 12345 ...",3
3,Location: RemoteCompany: XYZ Solutions Inc.Abo...,"John Doe123 Main Street | Anytown, USA, 12345 ...",4
4,Location: RemoteCompany: XYZ Solutions Inc.Abo...,"Jane Smith123 Elm Street | Springfield, IL 627...",5


In [12]:
# English stop words held in a set so per-token membership checks stay cheap.
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Tokenize ``text`` and drop punctuation and English stop words.

    Args:
        text: Raw text. Anything that is not a ``str`` (for example the
            ``NaN`` pandas uses for an empty CSV cell, or ``None``) is
            treated as missing.

    Returns:
        The tokens that are purely alphanumeric and not in ``stop_words``,
        joined by single spaces and kept in their original case. An empty
        string when the input is missing.
    """
    # word_tokenize raises on non-string input, so short-circuit missing
    # values instead of letting one empty cell abort the whole ``apply``.
    if not isinstance(text, str):
        return ''

    return ' '.join(
        token
        for token in word_tokenize(text)
        # isalnum() rejects punctuation and any token containing a
        # non-alphanumeric character; the token is lower-cased only for the
        # stop-word lookup, not in the output.
        if token.isalnum() and token.lower() not in stop_words
    )

# Clean both text columns in place using preprocess_text.
for column_name in ('job_description', 'resume'):
    df[column_name] = df[column_name].apply(preprocess_text)

# The posting in the first row is the query every resume is scored against.
job_description = df['job_description'].iloc[0]

# Fit a single TF-IDF vocabulary over the posting plus all resumes so the
# vectors share one feature space. Row 0 of the matrix is the posting and the
# remaining rows are the resumes, in DataFrame order.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([job_description, *df['resume']])

job_description_tfidf, resume_tfidf = tfidf_matrix[0], tfidf_matrix[1:]

# One posting against every resume; flatten the result to one score per resume.
similarity_scores = cosine_similarity(
    job_description_tfidf,
    resume_tfidf,
).flatten()

# resume_number is the positional row index of each resume in df.
result_df = pd.DataFrame(
    {
        'resume_number': range(len(df)),
        'similarity_score': similarity_scores,
    }
)

print(result_df)

   resume_number  similarity_score
0              0          0.646133
1              1          0.612038
2              2          0.490929
3              3          0.276778
4              4          0.209474
5              5          0.120061
6              6          0.051464
