# Tfidf Model

In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

## Jobs data

In [2]:
# Load the postings, discard the columns this notebook does not use, and
# replace missing values with empty strings in a single non-mutating chain.
jobs = (
    pd.read_csv('../data/job_postings.csv')
    .drop(columns=['date_added', 'organization', 'skills_len', 'job_type'])
    .fillna('')
)

# One free-text field per posting: the description followed by the skills.
jobs['text'] = jobs['job_description'] + ' ' + jobs['skills']

## User data

In [3]:
def gather_profile_data(file_path):
    """Read a profile CSV and add a combined free-text ``text`` column.

    The ``text`` column is the space-separated concatenation of the
    ``Titles``, ``Skills``, ``Summary`` and ``Education`` columns, in that
    order.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; it must contain the four columns above.

    Returns
    -------
    pandas.DataFrame
        The frame as read from disk plus the new ``text`` column. The four
        source columns are returned unmodified (missing values stay missing).

    Raises
    ------
    KeyError
        If any of the four source columns is absent from the file.
    """
    text_columns = ['Titles', 'Skills', 'Summary', 'Education']

    profile_data = pd.read_csv(file_path)

    # A missing value in any part would turn the whole concatenation into NaN
    # (str + NaN -> NaN). Blank the gaps on a copy of the text columns only,
    # so the columns in the returned frame keep their original values.
    parts = profile_data[text_columns].fillna('').astype(str)

    first, *rest = (parts[column] for column in text_columns)
    profile_data['text'] = first.str.cat(rest, sep=' ')
    return profile_data

In [4]:
# Load Zach's LinkedIn export (with the combined `text` column added) and
# show the resulting frame.
zach_profile_csv = '../data/linkedin/test-output/Zach_LinkedInData_12-16-2020.csv'
profile_data_zach = gather_profile_data(zach_profile_csv)
profile_data_zach

Unnamed: 0,Name,Titles,Skills,Summary,Education,Certifications,text
0,Zachary Brown,"Data Science Fellow, Python Developer, Health ...","Data Analysis, Python (Programming Language), ...",I bridge the gap between data and climate poli...,"Data Science Intensive, Bachelor's of Science",Microsoft Certified: Azure Data Scientist Asso...,"Data Science Fellow, Python Developer, Health ..."


In [5]:
# Load Nolan's LinkedIn export (with the combined `text` column added) and
# show the resulting frame.
nolan_profile_csv = '../data/linkedin/test-output/Nolan_LinkedInData_12-16-2020.csv'
profile_data_nolan = gather_profile_data(nolan_profile_csv)
profile_data_nolan

Unnamed: 0,Name,Titles,Skills,Summary,Education,text
0,Nolan Arendt,"Data Science Fellow, Painter","Data Science, Python, Data Analysis, Data Mana...",An innovative Data Scientist who is passionate...,"Bachelor's degree, Software Boot Camp Certificate","Data Science Fellow, Painter Data Science, Pyt..."


In [6]:
def get_recommendations(vectorizer, user_data, jobs_data=None):
    """Rank job titles by how closely their postings match one user profile.

    The vectorizer is fitted on the postings' ``text`` column, the profile's
    ``text`` is projected into the same vocabulary, and the cosine similarity
    between the profile and every posting is aggregated per ``job_title``.

    Parameters
    ----------
    vectorizer : object
        A text vectorizer exposing ``fit_transform`` and ``transform``
        (the notebook passes a ``TfidfVectorizer``). It is (re)fitted on the
        job text by this call.
    user_data : pandas.DataFrame
        Exactly one profile row with a ``text`` column.
    jobs_data : pandas.DataFrame, optional
        Postings with ``text`` and ``job_title`` columns. When omitted, the
        notebook-level ``jobs`` frame is used, as before.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``job_title`` with columns ``count`` (number of postings)
        and ``mean`` (average similarity), sorted by ``mean`` descending.

    Raises
    ------
    ValueError
        If ``user_data`` does not hold exactly one profile row.
    """
    if jobs_data is None:
        jobs_data = jobs

    # Fit the vocabulary on the postings, then express the profile in it.
    job_matrix = vectorizer.fit_transform(jobs_data['text'])
    user_matrix = vectorizer.transform(user_data['text'])

    if user_matrix.shape[0] != 1:
        raise ValueError(
            "user_data must contain exactly one profile row, got "
            f"{user_matrix.shape[0]}"
        )

    # A single vectorised call returns a (1, n_postings) array. Flattening it
    # gives plain floats, so no per-posting Python loop and no reliance on
    # converting 1x1 arrays to scalars (deprecated in recent NumPy).
    sim_scores = cosine_similarity(user_matrix, job_matrix).ravel()

    # Pair scores with titles positionally, regardless of the jobs index.
    scored_jobs = pd.DataFrame({
        'sim_score': sim_scores,
        'job_title': jobs_data['job_title'].to_numpy(),
    })

    # Posting count and mean similarity per title, best match first.
    return (
        scored_jobs
        .groupby('job_title')['sim_score']
        .agg(['count', 'mean'])
        .sort_values('mean', ascending=False)
    )

In [7]:
# Build a TF-IDF vectorizer that ignores English stop words and use it to
# score every job posting against Zach's profile.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
zachs_recommendations = get_recommendations(
    vectorizer=tfidf_vectorizer,
    user_data=profile_data_zach,
)

In [8]:
# Per-title posting count and mean similarity to Zach's profile, best match first.
zachs_recommendations

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.079419
Analyst,1884,0.053555
Architect,823,0.047552
Director,144,0.038748
Developer,5181,0.037282
Programmer,321,0.036279
Engineer,4045,0.032124
Administrator,896,0.028264
Manager,1379,0.026298
Consulting,578,0.024505


In [9]:
# Build a fresh TF-IDF vectorizer that ignores English stop words and use it
# to score every job posting against Nolan's profile.
tfidf_vectorizer = TfidfVectorizer(stop_words="english")
nolans_recommendations = get_recommendations(
    vectorizer=tfidf_vectorizer,
    user_data=profile_data_nolan,
)

In [10]:
# Per-title posting count and mean similarity to Nolan's profile, best match first.
nolans_recommendations

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Data Position,280,0.086699
Analyst,1884,0.057694
Architect,823,0.053962
Developer,5181,0.050618
Director,144,0.049599
Programmer,321,0.049511
Engineer,4045,0.043423
Administrator,896,0.04172
Manager,1379,0.039304
Designer,220,0.033327
