# CountVectorizer Model

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from IPython.display import display_html

## Jobs data

In [2]:
# Load the job postings, drop four columns, and replace every missing value
# with an empty string so the string concatenation below never yields NaN.
jobs = (
    pd.read_csv('../data/job_postings.csv')
    .drop(columns=['date_added', 'organization', 'skills_len', 'job_type'])
    .fillna('')
    # Combined text per posting: the description followed by the skills.
    .assign(text=lambda postings: postings['job_description'] + ' ' + postings['skills'])
)

## User data

In [3]:
def gather_profile_data(file_path):
    """Read a profile CSV and add a combined ``text`` column.

    The ``text`` column is the space-separated concatenation of the
    ``Titles``, ``Skills``, ``Summary`` and ``Education`` columns, in that
    order.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; it must contain the four columns above.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with an extra ``text`` column. The four source
        columns are returned exactly as read (missing values stay missing).

    Raises
    ------
    KeyError
        If one of the four source columns is absent from the file.
    """
    profile_data = pd.read_csv(file_path)

    # read_csv turns empty cells into NaN, and ``str + NaN`` is NaN, so a
    # single blank section would turn the whole ``text`` value into NaN.
    # Missing sections are treated as empty strings for the concatenation only.
    sections = [
        profile_data[column].fillna('')
        for column in ('Titles', 'Skills', 'Summary', 'Education')
    ]

    combined = sections[0]
    for section in sections[1:]:
        combined = combined + ' ' + section

    profile_data['text'] = combined
    return profile_data

In [4]:
# Load Zach's LinkedIn profile export and show the resulting frame.
profile_data_zach = gather_profile_data(
    file_path='../data/linkedin/test-output/Zach_LinkedInData_12-16-2020.csv'
)
profile_data_zach

Unnamed: 0,Name,Titles,Skills,Summary,Education,Certifications,text
0,zachary brown,data science fellow python developer health ...,data analysis python (programming language) ...,i bridge the gap between data and climate poli...,data science intensive bachelor's of science,microsoft certified: azure data scientist asso...,data science fellow python developer health ...


In [5]:
# Load Nolan's LinkedIn profile export and show the resulting frame.
profile_data_nolan = gather_profile_data(
    file_path='../data/linkedin/test-output/Nolan_LinkedInData_12-16-2020.csv'
)
profile_data_nolan

Unnamed: 0,Name,Titles,Skills,Summary,Education,text
0,nolan arendt,data science fellow painter,data science python data analysis data mana...,an innovative data scientist who is passionate...,bachelor's degree software boot camp certificate,data science fellow painter data science pyt...


In [6]:
# Load Albert's LinkedIn profile export and show the resulting frame.
profile_data_albert = gather_profile_data(
    file_path='../data/linkedin/test-output/Albert_LinkedInData.csv'
)
profile_data_albert

Unnamed: 0,Name,Titles,Skills,Summary,Education,Projects,Certifications,text
0,albert frantz,data science fellow assistant teacher classr...,r python data analysis econometrics projec...,i am a detail-oriented data scientist that use...,bachelor of arts - ba nan nan,identifying the relationship between bike lane...,tableau a-z: hands-on tableau training fo...,data science fellow assistant teacher classr...


## Make recommendations

In [7]:
def get_recommendations(vectorizer, user_data, job_data=None):
    """Rank job titles by how closely their postings match one user profile.

    The vectorizer is fitted on the ``text`` column of the job postings, the
    profile ``text`` is transformed with the same vocabulary, and the cosine
    similarity between the profile and every posting is averaged per
    ``job_title``.

    Parameters
    ----------
    vectorizer : scikit-learn text vectorizer (e.g. ``CountVectorizer``)
        Must provide ``fit_transform`` and ``transform``. It is refitted on
        the job text on every call.
    user_data : pandas.DataFrame
        Exactly one profile row, with a ``text`` column.
    job_data : pandas.DataFrame, optional
        Postings with ``text`` and ``job_title`` columns. When omitted, the
        notebook-level ``jobs`` frame is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``job_title`` with two columns: ``count`` (number of
        scored postings carrying that title) and ``mean`` (their mean
        similarity to the profile), sorted by ``mean`` in descending order.

    Raises
    ------
    ValueError
        If ``user_data`` does not contain exactly one row.
    """
    if job_data is None:
        # Fall back to the postings loaded earlier in the notebook.
        job_data = jobs

    if len(user_data) != 1:
        raise ValueError(
            "user_data must contain exactly one profile row, got %d" % len(user_data)
        )

    # Learn the vocabulary from the postings, then express the profile in it.
    job_matrix = vectorizer.fit_transform(job_data['text'])
    user_matrix = vectorizer.transform(user_data['text'])

    # A single call scores the profile against every posting at once; the
    # result has shape (1, n_postings), flattened here to one score per posting.
    sim_scores = cosine_similarity(user_matrix, job_matrix).ravel()

    # Pair scores with titles by position; building from a dict keeps
    # ``sim_score`` as a float column, so no later cast is needed.
    scored = pd.DataFrame({
        'sim_score': sim_scores,
        'job_title': job_data['job_title'].to_numpy(),
    })

    # Per-title posting count and mean similarity, best match first.
    return (
        scored.groupby('job_title')['sim_score']
        .agg(['count', 'mean'])
        .sort_values('mean', ascending=False)
    )

In [8]:
# Baseline vectorizer with default settings (no stop-word list, no min_df cut-off).
count_vectorizer = CountVectorizer()

# Variant that removes English stop words and is configured with min_df = 10.
count_vectorizer_stopwords = CountVectorizer(
    min_df=10,
    stop_words="english",
)

In [9]:
# Recommendations using the vectorizer that keeps all words.
(
    zachs_recommendations,
    nolans_recommendations,
    alberts_recommendations,
) = [
    get_recommendations(count_vectorizer, profile)
    for profile in (profile_data_zach, profile_data_nolan, profile_data_albert)
]

# Same profiles, using the vectorizer with stop words removed and min_df = 10.
(
    zachs_recommendations_stopwords,
    nolans_recommendations_stopwords,
    alberts_recommendations_stopwords,
) = [
    get_recommendations(count_vectorizer_stopwords, profile)
    for profile in (profile_data_zach, profile_data_nolan, profile_data_albert)
]

### Zach

In [10]:
# Credit for notebook styling: https://blog.softhints.com/display-two-pandas-dataframes-side-by-side-jupyter-notebook/
# Style both of Zach's rankings as inline tables so they render side by side.
df1_styler = (
    zachs_recommendations.style
    .set_table_attributes("style='display:inline'")
    .set_caption('All Words')
)
df2_styler = (
    zachs_recommendations_stopwords.style
    .set_table_attributes("style='display:inline'")
    .set_caption('Stopwords Removed')
)

# Ten non-breaking spaces act as a gutter between the two tables.
space = "\xa0" * 10
display_html(space.join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.367674
Analyst,1884,0.339164
Director,144,0.328041
Architect,823,0.323953
Manager,1379,0.305963
Engineer,4044,0.304451
Administrator,896,0.303012
Technician,342,0.301906
Support,342,0.294115
Developer,5179,0.293118

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.179976
Analyst,1884,0.127517
Architect,823,0.127388
Developer,5179,0.096288
Director,144,0.092779
Programmer,321,0.091975
Engineer,4044,0.078736
Administrator,896,0.073874
Consulting,578,0.072279
Manager,1379,0.064556


### Nolan

In [11]:
# Style both of Nolan's rankings as inline tables so they render side by side.
df1_styler = (
    nolans_recommendations.style
    .set_table_attributes("style='display:inline'")
    .set_caption('All Words')
)
df2_styler = (
    nolans_recommendations_stopwords.style
    .set_table_attributes("style='display:inline'")
    .set_caption('Stopwords Removed')
)

# Ten non-breaking spaces act as a gutter between the two tables.
space = "\xa0" * 10
display_html(space.join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.193067
Analyst,1884,0.166393
Architect,823,0.162124
Developer,5179,0.150255
Director,144,0.146972
Programmer,321,0.146852
Engineer,4044,0.13927
Administrator,896,0.137523
Manager,1379,0.136588
Consulting,578,0.129598

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.205967
Analyst,1884,0.156924
Architect,823,0.147545
Director,144,0.130016
Developer,5179,0.127768
Programmer,321,0.124049
Administrator,896,0.1128
Manager,1379,0.108675
Engineer,4044,0.107925
Consulting,578,0.104279


### Albert

In [12]:
# Style both of Albert's rankings as inline tables so they render side by side.
df1_styler = (
    alberts_recommendations.style
    .set_table_attributes("style='display:inline'")
    .set_caption('All Words')
)
df2_styler = (
    alberts_recommendations_stopwords.style
    .set_table_attributes("style='display:inline'")
    .set_caption('Stopwords Removed')
)

# Ten non-breaking spaces act as a gutter between the two tables.
space = "\xa0" * 10
display_html(space.join([df1_styler._repr_html_(), df2_styler._repr_html_()]), raw=True)

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.397163
Director,144,0.367056
Analyst,1884,0.366224
Architect,823,0.359621
Manager,1379,0.35418
Engineer,4044,0.343097
Programmer,321,0.332967
Developer,5179,0.331946
Technician,342,0.327459
Designer,220,0.326477

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.21595
Architect,823,0.161085
Analyst,1884,0.160216
Director,144,0.138401
Manager,1379,0.126647
Developer,5179,0.123453
Programmer,321,0.121567
Consulting,578,0.111729
Engineer,4044,0.111438
Administrator,896,0.10485
