# Tfidf Model

In [45]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from IPython.display import display_html

## Jobs data

In [46]:
jobs = pd.read_csv('../data/job_postings.csv')
jobs = jobs.drop(columns=['date_added', 'organization', 'skills_len', 'job_type'])
jobs.fillna('', inplace=True)
jobs['text'] = jobs['job_description'] + ' ' + jobs['skills']

## User data

In [47]:
def gather_profile_data(file_path):
    profile_data = pd.read_csv(file_path)
    profile_data['text'] = profile_data['Titles'] + ' ' \
                            + profile_data['Skills'] + ' ' \
                            + profile_data['Summary'] + ' ' \
                            + profile_data['Education']
    try: profile_data['text'] += ' ' + profile_data['Certifications']
    except: pass
    
    try: profile_data['text'] += ' ' + profile_data['Projects']
    except: pass
    
    return profile_data

In [48]:
# Reading in linkedin profile data.
profile_data_zach = gather_profile_data('../data/linkedin/test-output/Zach_LinkedInData_12-16-2020.csv')
profile_data_nolan = gather_profile_data('../data/linkedin/test-output/Nolan_LinkedInData_12-16-2020.csv')
profile_data_albert = gather_profile_data('../data/linkedin/test-output/Albert_LinkedInData.csv')
profile_data_ye = gather_profile_data('../data/linkedin/test-output/Ye_LinkedInData.csv')

## Make recommendations

In [49]:
def get_recommendations(vectorizer, user_data):
    # Fit transform on text data
    cvec_jobtext = vectorizer.fit_transform(jobs['text'])

    # Transforming user profile text
    user_cvec = vectorizer.transform(user_data['text'])

    # Calculating cosine similarity between users profile and job text
    cos_similar_user = map(lambda x: cosine_similarity(user_cvec, x), cvec_jobtext)

    # Results are a list of our cosine similarity scores
    results = list(cos_similar_user)

    # Create a dataframe using our results and job_titles
    new_df = pd.DataFrame(data = [results, jobs['job_title']]).T
    new_df = new_df.rename(columns = {0: "sim_score", 1 : "job_title"})

    # Column sim_score as float so that we can use groupby
    new_df['sim_score'] = new_df['sim_score'].astype(float)

    # Group by job_title, using count and mean to sort values
    new_df = new_df.groupby('job_title').mean().sort_values('sim_score', ascending=False)

    return new_df

In [50]:
# Instantiating vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words = "english")

In [51]:
# Calculate similarity scores
zachs_recommendations = get_recommendations(tfidf_vectorizer, profile_data_zach)
nolans_recommendations = get_recommendations(tfidf_vectorizer, profile_data_nolan)
alberts_recommendations = get_recommendations(tfidf_vectorizer, profile_data_albert)
yes_recommendations = get_recommendations(tfidf_vectorizer, profile_data_ye)

In [52]:
# Credit for notebook styling: https://blog.softhints.com/display-two-pandas-dataframes-side-by-side-jupyter-notebook/
df1_styler = zachs_recommendations.style.set_table_attributes("style='display:inline'").set_caption('Zach')
df2_styler = nolans_recommendations.style.set_table_attributes("style='display:inline'").set_caption('Nolan')
df3_styler = alberts_recommendations.style.set_table_attributes("style='display:inline'").set_caption('Albert')
df4_styler = yes_recommendations.style.set_table_attributes("style='display:inline'").set_caption('Ye')

space = "\xa0" * 5
display_html(df1_styler._repr_html_() + space + df2_styler._repr_html_() + space + df3_styler._repr_html_() + space + df4_styler._repr_html_(), raw=True)

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Data Position,0.083202
Analyst,0.053053
Architect,0.048627
Director,0.039053
Developer,0.03682
Programmer,0.035724
Engineer,0.031839
Administrator,0.028969
Manager,0.026283
Consulting,0.024734

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Data Position,0.086439
Analyst,0.057396
Architect,0.054187
Developer,0.05164
Programmer,0.050246
Director,0.049474
Engineer,0.043642
Administrator,0.041633
Manager,0.039171
Designer,0.03435

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Data Position,0.063702
Analyst,0.049534
Director,0.043379
Architect,0.043119
Manager,0.039721
Programmer,0.036334
Developer,0.035869
Designer,0.034215
Engineer,0.034108
Administrator,0.029137

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Director,0.016316
Data Position,0.013855
Analyst,0.013509
Manager,0.013388
Engineer,0.013238
Programmer,0.011853
Support,0.011755
Administrator,0.011392
Developer,0.01121
Architect,0.010951


In [53]:
# Instantiating vectorizer with min_df
tfidf_vectorizer = TfidfVectorizer(stop_words = "english", min_df = 0.001)

# Calculate similarity scores
zachs_recommendations_min_df = get_recommendations(tfidf_vectorizer, profile_data_zach)
nolans_recommendations_min_df = get_recommendations(tfidf_vectorizer, profile_data_nolan)
alberts_recommendations_min_df = get_recommendations(tfidf_vectorizer, profile_data_albert)
yes_recommendations = get_recommendations(tfidf_vectorizer, profile_data_ye)

# Credit for notebook styling: https://blog.softhints.com/display-two-pandas-dataframes-side-by-side-jupyter-notebook/
df1_styler_min_df = zachs_recommendations_min_df.style.set_table_attributes("style='display:inline'").set_caption('Zach')
df2_styler_min_df = nolans_recommendations_min_df.style.set_table_attributes("style='display:inline'").set_caption('Nolan')
df3_styler_min_df = alberts_recommendations_min_df.style.set_table_attributes("style='display:inline'").set_caption('Albert')
df4_styler = yes_recommendations.style.set_table_attributes("style='display:inline'").set_caption('Ye')


space = "\xa0" * 5
display_html(df1_styler._repr_html_() + space + df2_styler._repr_html_() + space + df3_styler._repr_html_() + space + df4_styler._repr_html_(), raw=True)

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Data Position,0.083202
Analyst,0.053053
Architect,0.048627
Director,0.039053
Developer,0.03682
Programmer,0.035724
Engineer,0.031839
Administrator,0.028969
Manager,0.026283
Consulting,0.024734

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Data Position,0.086439
Analyst,0.057396
Architect,0.054187
Developer,0.05164
Programmer,0.050246
Director,0.049474
Engineer,0.043642
Administrator,0.041633
Manager,0.039171
Designer,0.03435

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Data Position,0.063702
Analyst,0.049534
Director,0.043379
Architect,0.043119
Manager,0.039721
Programmer,0.036334
Developer,0.035869
Designer,0.034215
Engineer,0.034108
Administrator,0.029137

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Director,0.024351
Data Position,0.020356
Analyst,0.020182
Manager,0.019918
Engineer,0.019598
Programmer,0.017641
Support,0.017464
Administrator,0.017008
Developer,0.01666
Designer,0.015944
