# CountVectorizer Model

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

## Jobs data

In [2]:
# Load the job postings, discard columns the recommender never reads, blank out
# missing values, and build one free-text field per posting.
jobs = (
    pd.read_csv('../data/job_postings.csv')
    .drop(columns=['date_added', 'organization', 'skills_len', 'job_type'])
    .fillna('')
    .assign(text=lambda postings: postings['job_description'] + ' ' + postings['skills'])
)

## User data

In [13]:
def gather_profile_data(file_path):
    """Read a profile CSV and add a combined free-text column.

    Parameters
    ----------
    file_path : str or path-like
        Location of a CSV that contains at least the columns
        'Titles', 'Skills', 'Summary' and 'Education'.

    Returns
    -------
    pandas.DataFrame
        The CSV contents plus a 'text' column holding the four columns above
        joined by single spaces, in that order.

    Notes
    -----
    A missing value in any of the four columns is treated as an empty string
    when building 'text'. Without this, one empty cell would turn the whole
    'text' value into NaN. The source columns themselves are left untouched.
    """
    profile_data = pd.read_csv(file_path)

    text_columns = ['Titles', 'Skills', 'Summary', 'Education']
    # Blank out missing cells (and coerce to str) before concatenating so that
    # NaN cannot propagate into the combined text.
    text_parts = [profile_data[column].fillna('').astype(str) for column in text_columns]
    profile_data['text'] = text_parts[0].str.cat(text_parts[1:], sep=' ')

    return profile_data

In [16]:
# Load Zach's LinkedIn profile export and preview the resulting frame.
profile_data_zach = gather_profile_data(
    file_path='../data/linkedin/test-output/Zach_LinkedInData_12-16-2020.csv'
)
profile_data_zach

Unnamed: 0,Name,Titles,Skills,Summary,Education,Certifications,text
0,Zachary Brown,"Data Science Fellow, Python Developer, Health ...","Data Analysis, Python (Programming Language), ...",I bridge the gap between data and climate poli...,"Data Science Intensive, Bachelor's of Science",Microsoft Certified: Azure Data Scientist Asso...,"Data Science Fellow, Python Developer, Health ..."


In [17]:
# Load Nolan's LinkedIn profile export and preview the resulting frame.
profile_data_nolan = gather_profile_data(
    file_path='../data/linkedin/test-output/Nolan_LinkedInData_12-16-2020.csv'
)
profile_data_nolan

Unnamed: 0,Name,Titles,Skills,Summary,Education,text
0,Nolan Arendt,"Data Science Fellow, Painter","Data Science, Python, Data Analysis, Data Mana...",An innovative Data Scientist who is passionate...,"Bachelor's degree, Software Boot Camp Certificate","Data Science Fellow, Painter Data Science, Pyt..."


## Make recommendations

In [19]:
def get_recommendations(vectorizer, user_data, jobs_df=None):
    """Rank job titles by their mean cosine similarity to one user profile.

    Parameters
    ----------
    vectorizer : object
        A text vectorizer exposing ``fit_transform`` and ``transform``
        (e.g. ``CountVectorizer``). It is (re)fitted on the job text, so the
        instance passed in is modified by this call.
    user_data : pandas.DataFrame
        Frame with a 'text' column containing exactly one profile row.
    jobs_df : pandas.DataFrame, optional
        Postings with 'text' and 'job_title' columns. Defaults to the
        notebook-level ``jobs`` frame when omitted.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'job_title' with columns 'count' (number of postings) and
        'mean' (mean similarity score), sorted by 'mean' in descending order.

    Raises
    ------
    ValueError
        If ``user_data`` does not contain exactly one row; the similarity
        score is defined against a single profile.
    """
    if jobs_df is None:
        # Preserve the original behaviour of reading the notebook-level frame.
        jobs_df = jobs

    # Learn the vocabulary from the postings, then project the profile onto it.
    job_matrix = vectorizer.fit_transform(jobs_df['text'])
    user_matrix = vectorizer.transform(user_data['text'])

    if user_matrix.shape[0] != 1:
        raise ValueError(
            "user_data must contain exactly one profile row, got "
            f"{user_matrix.shape[0]}"
        )

    # One vectorised call yields an (n_jobs, 1) matrix; flatten it to a plain
    # float vector instead of collecting one 1x1 array per posting.
    sim_scores = np.asarray(
        cosine_similarity(job_matrix, user_matrix), dtype=float
    ).ravel()

    # Pair scores with titles positionally (to_numpy drops the index) so a
    # non-default index on jobs_df cannot misalign the two columns.
    scored_jobs = pd.DataFrame({
        'sim_score': sim_scores,
        'job_title': jobs_df['job_title'].to_numpy(),
    })

    # Aggregate per title and rank once; a stable sort keeps tied titles in a
    # deterministic (alphabetical) order.
    return (
        scored_jobs
        .groupby('job_title')['sim_score']
        .agg(['count', 'mean'])
        .sort_values('mean', ascending=False, kind='stable')
    )

In [20]:
# Baseline run for Zach: CountVectorizer with its default settings.
count_vectorizer = CountVectorizer()

zachs_recommendations = get_recommendations(
    vectorizer=count_vectorizer,
    user_data=profile_data_zach,
)

In [21]:
# Zach's ranking: one row per job title with its posting count and mean similarity.
zachs_recommendations

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.365817
Analyst,1884,0.340209
Director,144,0.333934
Architect,823,0.325763
Engineer,4045,0.311441
Manager,1379,0.308945
Administrator,896,0.304271
Technician,342,0.30279
Developer,5181,0.297937
Programmer,321,0.297358


In [22]:
# Baseline run for Nolan: a fresh CountVectorizer with its default settings.
count_vectorizer = CountVectorizer()

nolans_recommendations = get_recommendations(
    vectorizer=count_vectorizer,
    user_data=profile_data_nolan,
)

In [23]:
# Nolan's ranking: one row per job title with its posting count and mean similarity.
nolans_recommendations

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.190303
Analyst,1884,0.167011
Architect,823,0.162032
Developer,5181,0.149351
Director,144,0.14833
Programmer,321,0.146484
Engineer,4045,0.14002
Administrator,896,0.138011
Manager,1379,0.137698
Consulting,578,0.130275


In [24]:
# Variant run for Zach: English stop words removed and min_df set to 10.
count_vectorizer = CountVectorizer(stop_words="english", min_df=10)

zachs_recommendations_stopwords = get_recommendations(
    vectorizer=count_vectorizer,
    user_data=profile_data_zach,
)

In [25]:
# Zach's ranking from the stop-word / min_df vectorizer, for comparison with the baseline run.
zachs_recommendations_stopwords

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.175788
Analyst,1884,0.127041
Architect,823,0.126426
Developer,5181,0.095018
Director,144,0.092491
Programmer,321,0.091021
Engineer,4045,0.07772
Administrator,896,0.073461
Consulting,578,0.072279
Manager,1379,0.064677


In [26]:
# Variant run for Nolan: English stop words removed and min_df set to 10.
count_vectorizer = CountVectorizer(stop_words="english", min_df=10)

nolans_recommendations_stopwords = get_recommendations(
    vectorizer=count_vectorizer,
    user_data=profile_data_nolan,
)

In [27]:
# Nolan's ranking from the stop-word / min_df vectorizer, for comparison with the baseline run.
nolans_recommendations_stopwords

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.202853
Analyst,1884,0.156946
Architect,823,0.146167
Director,144,0.128633
Developer,5181,0.123517
Programmer,321,0.119675
Administrator,896,0.112249
Manager,1379,0.108413
Engineer,4045,0.104781
Consulting,578,0.104426
