# CountVectorizer Model

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from IPython.display import display_html

## Jobs data

In [2]:
# Load the postings, discard the columns the recommender never reads, and
# replace missing values with empty strings so the text concatenation below
# cannot produce NaN.
jobs = (
    pd.read_csv('../data/job_postings.csv')
    .drop(columns=['date_added', 'organization', 'skills_len', 'job_type'])
    .fillna('')
)

# Single free-text field per posting: description followed by the skills list.
jobs['text'] = jobs['job_description'] + ' ' + jobs['skills']

## User data

In [3]:
def gather_profile_data(file_path):
    """Read a profile CSV and add a combined free-text ``text`` column.

    The ``text`` column is the space-separated concatenation of the
    ``Titles``, ``Skills``, ``Summary`` and ``Education`` columns, followed by
    ``Certifications`` and ``Projects`` when those columns exist in the file.
    Missing cells contribute an empty string, so ``text`` never contains NaN.
    The source columns themselves are left untouched.

    Parameters
    ----------
    file_path : str or path-like
        Location of the profile CSV, passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus the ``text`` column.

    Raises
    ------
    KeyError
        If any of ``Titles``, ``Skills``, ``Summary`` or ``Education`` is
        missing from the file.
    """
    required_columns = ['Titles', 'Skills', 'Summary', 'Education']
    optional_columns = ['Certifications', 'Projects']

    profile_data = pd.read_csv(file_path)

    # Optional sections are included only when the export actually has them;
    # an explicit membership test replaces the old catch-everything handler.
    text_columns = required_columns + [
        column for column in optional_columns if column in profile_data.columns
    ]

    def as_text(column_name):
        # An empty CSV cell is read as NaN; without this, a single blank
        # section would turn the whole row's text into NaN (or raise when the
        # column is entirely empty and therefore parsed as float).
        return profile_data[column_name].map(
            lambda value: '' if pd.isna(value) else str(value)
        )

    # Indexing a missing required column raises KeyError here.
    combined_text = as_text(text_columns[0])
    for column_name in text_columns[1:]:
        combined_text = combined_text + ' ' + as_text(column_name)

    profile_data['text'] = combined_text
    return profile_data

In [4]:
# Load each exported LinkedIn profile and build its combined text column.
(profile_data_zach,
 profile_data_nolan,
 profile_data_albert,
 profile_data_ye) = map(gather_profile_data, (
    '../data/linkedin/test-output/Zach_LinkedInData_12-16-2020.csv',
    '../data/linkedin/test-output/Nolan_LinkedInData_12-16-2020.csv',
    '../data/linkedin/test-output/Albert_LinkedInData.csv',
    '../data/linkedin/test-output/Ye_LinkedInData.csv',
))

## Make recommendations

In [8]:
def get_recommendations(vectorizer, user_data, job_data=None):
    """Rank job titles by mean cosine similarity to a user's profile text.

    The vectorizer is fitted on the job postings' ``text`` column, the user's
    ``text`` is projected into the same vocabulary, and every posting is
    scored against the profile. Scores are then averaged per ``job_title``.

    Note that the vectorizer is re-fitted on every call.

    Parameters
    ----------
    vectorizer : object
        Anything exposing ``fit_transform`` and ``transform`` (for example a
        ``CountVectorizer``). It is fitted in place.
    user_data : pandas.DataFrame
        Exactly one row with a ``text`` column holding the profile text.
    job_data : pandas.DataFrame, optional
        Postings with ``text`` and ``job_title`` columns. Defaults to the
        notebook-level ``jobs`` frame.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``job_title`` with a single ``sim_score`` column, sorted
        from most to least similar.

    Raises
    ------
    ValueError
        If ``user_data`` does not contain exactly one row.
    """
    if job_data is None:
        job_data = jobs

    # Learn the vocabulary from the postings, then express the profile in it.
    job_vectors = vectorizer.fit_transform(job_data['text'])
    user_vectors = vectorizer.transform(user_data['text'])

    if user_vectors.shape[0] != 1:
        # One score per posting is only well defined for a single profile.
        raise ValueError(
            "user_data must contain exactly one profile row, got %d"
            % user_vectors.shape[0]
        )

    # One vectorised call yields a (1, n_jobs) matrix; row 0 holds the score
    # of every posting against the profile. This replaces one
    # cosine_similarity call per posting and avoids relying on 1x1 arrays
    # being coerced to floats.
    sim_scores = cosine_similarity(user_vectors, job_vectors)[0]

    scored_jobs = pd.DataFrame({
        'sim_score': sim_scores,
        # to_numpy() pairs titles with scores by position, not index label.
        'job_title': job_data['job_title'].to_numpy(),
    })

    # Average the scores of all postings sharing a title, best match first.
    return (
        scored_jobs
        .groupby('job_title')[['sim_score']]
        .mean()
        .sort_values('sim_score', ascending=False)
    )

In [9]:
# Baseline vectorizer: default settings, no stop-word list.
count_vectorizer = CountVectorizer()

# Filtered vectorizer: English stop words removed and min_df set to 10.
count_vectorizer_stopwords = CountVectorizer(min_df=10, stop_words="english")

In [10]:
# Recommendations from the baseline vectorizer (all words kept).
(zachs_recommendations,
 nolans_recommendations,
 alberts_recommendations,
 yes_recommendations) = [
    get_recommendations(count_vectorizer, profile)
    for profile in (profile_data_zach, profile_data_nolan,
                    profile_data_albert, profile_data_ye)
]

# Recommendations from the filtered vectorizer (stop words removed, min_df = 10).
(zachs_recommendations_stopwords,
 nolans_recommendations_stopwords,
 alberts_recommendations_stopwords,
 yes_recommendations_stopwords) = [
    get_recommendations(count_vectorizer_stopwords, profile)
    for profile in (profile_data_zach, profile_data_nolan,
                    profile_data_albert, profile_data_ye)
]

### Zach

In [11]:
# Credit for notebook styling: https://blog.softhints.com/display-two-pandas-dataframes-side-by-side-jupyter-notebook/
# Render both rankings inline so they sit side by side, separated by
# non-breaking spaces.
zach_stylers = [
    zachs_recommendations.style
    .set_table_attributes("style='display:inline'")
    .set_caption('All Words'),
    zachs_recommendations_stopwords.style
    .set_table_attributes("style='display:inline'")
    .set_caption('Stopwords Removed'),
]

spacer = "\xa0" * 10
display_html(spacer.join(styler._repr_html_() for styler in zach_stylers), raw=True)

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Data Position,0.351783
Analyst,0.322048
Director,0.31041
Architect,0.309148
Manager,0.288559
Engineer,0.287443
Administrator,0.286771
Technician,0.284725
Support,0.277533
Developer,0.277097

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Data Position,0.178998
Architect,0.126449
Analyst,0.124752
Developer,0.092764
Director,0.09108
Programmer,0.088899
Engineer,0.076362
Administrator,0.073022
Consulting,0.071229
Manager,0.062797


### Nolan

In [12]:
# Render both rankings inline so they sit side by side, separated by
# non-breaking spaces.
nolan_stylers = [
    nolans_recommendations.style
    .set_table_attributes("style='display:inline'")
    .set_caption('All Words'),
    nolans_recommendations_stopwords.style
    .set_table_attributes("style='display:inline'")
    .set_caption('Stopwords Removed'),
]

spacer = "\xa0" * 10
display_html(spacer.join(styler._repr_html_() for styler in nolan_stylers), raw=True)

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Data Position,0.193067
Analyst,0.166393
Architect,0.162124
Developer,0.150255
Director,0.146972
Programmer,0.146852
Engineer,0.13927
Administrator,0.137523
Manager,0.136588
Consulting,0.129598

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Data Position,0.205967
Analyst,0.156924
Architect,0.147545
Director,0.130016
Developer,0.127768
Programmer,0.124049
Administrator,0.1128
Manager,0.108675
Engineer,0.107925
Consulting,0.104279


### Albert

In [13]:
# Render both rankings inline so they sit side by side, separated by
# non-breaking spaces.
albert_stylers = [
    alberts_recommendations.style
    .set_table_attributes("style='display:inline'")
    .set_caption('All Words'),
    alberts_recommendations_stopwords.style
    .set_table_attributes("style='display:inline'")
    .set_caption('Stopwords Removed'),
]

spacer = "\xa0" * 10
display_html(spacer.join(styler._repr_html_() for styler in albert_stylers), raw=True)

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Director,0.499885
Data Position,0.493522
Manager,0.482534
Analyst,0.479695
Architect,0.46414
Engineer,0.464071
Technician,0.456273
Support,0.448927
Designer,0.442908
Programmer,0.441415

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Data Position,0.19138
Analyst,0.16125
Architect,0.144769
Director,0.131209
Manager,0.124166
Programmer,0.115866
Developer,0.114011
Consulting,0.106573
Engineer,0.105706
Administrator,0.098912


### Ye

In [14]:
# Render both rankings inline so they sit side by side, separated by
# non-breaking spaces.
ye_stylers = [
    yes_recommendations.style
    .set_table_attributes("style='display:inline'")
    .set_caption('All Words'),
    yes_recommendations_stopwords.style
    .set_table_attributes("style='display:inline'")
    .set_caption('Stopwords Removed'),
]

spacer = "\xa0" * 10
display_html(spacer.join(styler._repr_html_() for styler in ye_stylers), raw=True)

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Director,0.227829
Data Position,0.219652
Manager,0.219126
Analyst,0.217274
Engineer,0.214543
Technician,0.211993
Support,0.21044
Designer,0.206233
Architect,0.204803
Administrator,0.204063

Unnamed: 0_level_0,sim_score
job_title,Unnamed: 1_level_1
Director,0.036633
Analyst,0.036214
Data Position,0.033121
Manager,0.03047
Engineer,0.030076
Programmer,0.028149
Administrator,0.028072
Developer,0.026906
Architect,0.026468
Support,0.02588
