# Tfidf Model

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [2]:
# Load the job postings and drop the columns flagged as unnecessary in one chained step.
jobs = (
    pd.read_csv('../data/job_postings.csv')
    .drop(columns=['date_added', 'organization', 'skills_len', 'job_type'])
)

In [3]:
# Missing values become empty strings; reassigning (rather than mutating in place)
# keeps the cell safe to re-run.
jobs = jobs.fillna('')

In [4]:
# Build a single 'text' column: the job description followed by the skills,
# separated by one space.
jobs['text'] = jobs['job_description'].str.cat(jobs['skills'], sep=' ')

In [5]:
# Read in Zach's LinkedIn profile data.
profile_data_zach = pd.read_csv('../data/linkedin/test-output/Zach_LinkedInData_12-16-2020.csv')

# Build one 'text' column from titles, skills, summary, and education, joined by spaces.
# Missing fields are replaced with '' first: with plain `+` concatenation a single NaN
# would turn the whole text into NaN, which the vectorizer cannot transform.
profile_data_zach['text'] = (
    profile_data_zach[['Titles', 'Skills', 'Summary', 'Education']]
    .fillna('')
    .agg(' '.join, axis=1)
)
profile_data_zach

Unnamed: 0,Name,Titles,Skills,Summary,Education,Certifications,text
0,Zachary Brown,"Data Science Fellow, Python Developer, Health ...","Data Analysis, Python (Programming Language), ...",I bridge the gap between data and climate poli...,"Data Science Intensive, Bachelor's of Science",Microsoft Certified: Azure Data Scientist Asso...,"Data Science Fellow, Python Developer, Health ..."


In [6]:
# Read in Nolan's LinkedIn profile data.
profile_data_nolan = pd.read_csv('../data/linkedin/test-output/Nolan_LinkedInData_12-16-2020.csv')

# Build one 'text' column from titles, skills, summary, and education, joined by spaces.
# Missing fields are replaced with '' first: with plain `+` concatenation a single NaN
# would turn the whole text into NaN, which the vectorizer cannot transform.
profile_data_nolan['text'] = (
    profile_data_nolan[['Titles', 'Skills', 'Summary', 'Education']]
    .fillna('')
    .agg(' '.join, axis=1)
)
profile_data_nolan

Unnamed: 0,Name,Titles,Skills,Summary,Education,text
0,Nolan Arendt,"Data Science Fellow, Painter","Data Science, Python, Data Analysis, Data Mana...",An innovative Data Scientist who is passionate...,"Bachelor's degree, Software Boot Camp Certificate","Data Science Fellow, Painter Data Science, Pyt..."


In [7]:
# Instantiate a TF-IDF vectorizer that ignores English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

# Learn the vocabulary/IDF weights from the job text and vectorize every posting.
tfidf_jobtext = tfidf_vectorizer.fit_transform(jobs['text'])

# Vectorize Zach's profile text with the vocabulary learned from the job postings.
user_tfidf = tfidf_vectorizer.transform(profile_data_zach['text'])

# Score the profile against every posting in a single vectorized call.
# cosine_similarity returns an (n_profiles, n_jobs) matrix; the profile frame holds a
# single row, so flattening yields one float score per job posting.
results = cosine_similarity(user_tfidf, tfidf_jobtext).ravel()

# Pair each score with its job title (positionally), then summarize per title:
# number of postings and mean similarity, best-matching titles first.
new_df = (
    pd.DataFrame({'sim_score': results, 'job_title': jobs['job_title'].to_numpy()})
    .groupby('job_title')['sim_score']
    .agg(['count', 'mean'])
    .sort_values('mean', ascending=False)
)

# Top 10 job titles by mean similarity.
new_df[0:10]

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,282,0.079
Analyst,1892,0.053481
Architect,828,0.047609
Director,146,0.038432
Developer,5234,0.037122
Programmer,321,0.036333
Engineer,4061,0.032168
Administrator,898,0.028251
Manager,1389,0.026169
Support,344,0.024517


In [8]:
# `tfidf_vectorizer` and `tfidf_jobtext` were fitted on jobs['text'] in the previous
# cell; refitting an identical vectorizer on the same column would reproduce the same
# matrix, so they are reused here.

# Vectorize Nolan's profile text with the vocabulary learned from the job postings.
user_tfidf = tfidf_vectorizer.transform(profile_data_nolan['text'])

# Score the profile against every posting in a single vectorized call.
# cosine_similarity returns an (n_profiles, n_jobs) matrix; the profile frame holds a
# single row, so flattening yields one float score per job posting.
results = cosine_similarity(user_tfidf, tfidf_jobtext).ravel()

# Pair each score with its job title (positionally), then summarize per title:
# number of postings and mean similarity, best-matching titles first.
new_df = (
    pd.DataFrame({'sim_score': results, 'job_title': jobs['job_title'].to_numpy()})
    .groupby('job_title')['sim_score']
    .agg(['count', 'mean'])
    .sort_values('mean', ascending=False)
)

# Top 10 job titles by mean similarity.
new_df[0:10]

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,282,0.086271
Analyst,1892,0.057645
Architect,828,0.053957
Developer,5234,0.050391
Programmer,321,0.049618
Director,146,0.049342
Engineer,4061,0.043498
Administrator,898,0.041735
Manager,1389,0.039175
Designer,221,0.033307
