# Spacy

In [20]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import spacy
from IPython.display import display_html

---

In [3]:
# Load the job postings and discard the columns this notebook never reads.
jobs = (
    pd.read_csv('../data/job_postings.csv')
      .drop(columns=['date_added', 'organization', 'skills_len', 'job_type'])
)

In [4]:
# Peek at the first five postings (head() defaults to n=5).
jobs.head()

Unnamed: 0,job_description,job_title,location,skills
0,an EDI Analyst with experience please read on...,Analyst,Northeast United States,edi trustedlink as van
1,Informatica ETL DeveloperSt Petersburg FL Only...,Developer,Southern United States,etl informatica b data exchange netezza oracle...
2,This nationally recognized Microsoft Gold Part...,Manager,Western United States,microsoft dynamics ax project manager - toront...
3,a .NET Developer with experience please read ...,Developer,Northeast United States,c asp.net sql javascript mvc
4,Hatstand a global financial consultancy is see...,Developer,Northeast United States,java linux unix sdlc; multi-threaded or concur...


In [5]:
# Count the missing values in each column.
jobs.isnull().sum()

job_description      0
job_title            0
location             0
skills             266
dtype: int64

In [6]:
# Replace NaNs with empty strings so concatenating string columns never
# produces NaN. Reassign rather than using inplace=True so the cell states
# explicitly which frame it produces.
jobs = jobs.fillna('')

In [7]:
# Join each posting's description and skills, separated by a single space,
# into one 'text' column.
jobs['text'] = jobs['job_description'].str.cat(jobs['skills'], sep=' ')

# spaCy
Reference: https://spacy.io/usage/spacy-101

In [8]:
# Load one of the larger models for a better similarity score.
# `nlp` is reused by every later cell that parses text or compares documents.
nlp = spacy.load("en_core_web_lg")

In [13]:
# Demonstrate .similarity: score every ordered pair of tokens, including each
# token against itself.
tokens = nlp("dog cat banana")

for left, right in ((a, b) for a in tokens for b in tokens):
    print(left.text, right.text, left.similarity(right))

dog dog 1.0
dog cat 0.8016855
dog banana 0.24327648
cat dog 0.8016855
cat cat 1.0
cat banana 0.28154367
banana dog 0.24327648
banana cat 0.28154367
banana banana 1.0


In [14]:
# More examples: compare a freshly parsed "dog" doc against three one-word
# docs, printing one score per line.
print(
    *(nlp("dog").similarity(nlp(other)) for other in ("dog", "cat", "banana")),
    sep="\n",
)

1.0
0.8016855517329495
0.24327647954195658


In [15]:
# Show a few per-token attributes: the text, whether it has a vector, the
# vector norm, and whether it is out of vocabulary.
tokens = nlp("dog cat banana afskfsd")

for tok in tokens:
    print(*(getattr(tok, field) for field in ("text", "has_vector", "vector_norm", "is_oov")))

dog True 7.0336733 False
cat True 6.6808186 False
banana True 6.700014 False
afskfsd False 0.0 True


---
## Using our jobs data

In [16]:
# Custom stop words (currently disabled)
# customize_stop_words = [
#     'attach'
# ]
#
# # Mark them as stop words
# for w in customize_stop_words:
#     nlp.vocab[w].is_stop = True

In [17]:
# nlp.vocab['aaaaaaaaaaaa'].is_stop

In [9]:
# Parse every posting once up front; the resulting Doc objects are reused for
# each profile comparison. nlp.pipe streams the texts through the pipeline in
# batches and yields the docs in input order, which avoids the per-call
# overhead of running nlp on one text at a time.
titles_and_docs = jobs[['job_title']].copy()
titles_and_docs['doc'] = list(nlp.pipe(jobs['text']))

In [10]:
# Display the job titles alongside their parsed docs.
titles_and_docs

Unnamed: 0,job_title,doc
0,Analyst,"( , an, EDI, Analyst, with, experience, please..."
1,Developer,"(Informatica, ETL, DeveloperSt, Petersburg, FL..."
2,Manager,"(This, nationally, recognized, Microsoft, Gold..."
3,Developer,"( , a, .NET, Developer, with, experience, plea..."
4,Developer,"(Hatstand, a, global, financial, consultancy, ..."
...,...,...
16431,Developer,"( , JPMorgan, Chase, &, Co., (, NYSE, :, JPM,..."
16432,Administrator,"(Seeking, Jr., Systems, Administrators, with, ..."
16433,Developer,"( , a, Senior, Lead, Devops, Engineer, with, a..."
16434,Developer,"(Headquartered, in, downtown, San, Francisco, ..."


In [11]:
def gather_profile_data(file_path):
    """Read a profile CSV and add a combined 'text' column.

    Parameters
    ----------
    file_path : str or path-like
        CSV file containing at least the columns 'Titles', 'Skills',
        'Summary' and 'Education'.

    Returns
    -------
    pandas.DataFrame
        The CSV contents, unchanged, plus a 'text' column holding the four
        fields above joined with single spaces. A missing field contributes
        an empty string.
    """
    profile_data = pd.read_csv(file_path)

    # Missing cells are read as NaN, and str + NaN is NaN, so one empty field
    # would otherwise wipe out the whole combined text for that row. Fill only
    # a working copy so the original columns keep their NaN markers.
    parts = profile_data[['Titles', 'Skills', 'Summary', 'Education']].fillna('')
    profile_data['text'] = (
        parts['Titles'] + ' '
        + parts['Skills'] + ' '
        + parts['Summary'] + ' '
        + parts['Education']
    )
    return profile_data

In [17]:
def get_recommendations(profile_data):
    """Score every job posting against a single profile.

    Parameters
    ----------
    profile_data : pandas.DataFrame
        Frame with a 'text' column. Only its first row is used.

    Returns
    -------
    pandas.DataFrame
        One row per posting in the module-level ``titles_and_docs`` frame,
        with the posting's 'job_title' and a 'sim_score' column holding the
        similarity between the posting's doc and the profile's doc.
    """
    # Take the first row by position so the lookup does not depend on the
    # frame happening to carry the index label 0.
    profile_text = profile_data['text'].iloc[0]
    profile_doc = nlp(profile_text)

    # Take titles and docs from the same frame so the two are always aligned.
    scores = titles_and_docs[['job_title']].copy()
    scores['sim_score'] = titles_and_docs['doc'].map(
        lambda job_doc: job_doc.similarity(profile_doc)
    )

    return scores

In [13]:
# Read Zach's LinkedIn profile data (gather_profile_data also adds the
# combined 'text' column), then display the frame.
profile_data_zach = gather_profile_data('../data/linkedin/test-output/Zach_LinkedInData_12-16-2020.csv')
profile_data_zach

Unnamed: 0,Name,Titles,Skills,Summary,Education,Certifications,text
0,Zachary Brown,"Data Science Fellow, Python Developer, Health ...","Data Analysis, Python (Programming Language), ...",I bridge the gap between data and climate poli...,"Data Science Intensive, Bachelor's of Science",Microsoft Certified: Azure Data Scientist Asso...,"Data Science Fellow, Python Developer, Health ..."


In [14]:
# Read Nolan's LinkedIn profile data (gather_profile_data also adds the
# combined 'text' column), then display the frame.
profile_data_nolan = gather_profile_data('../data/linkedin/test-output/Nolan_LinkedInData_12-16-2020.csv')
profile_data_nolan

Unnamed: 0,Name,Titles,Skills,Summary,Education,text
0,Nolan Arendt,"Data Science Fellow, Painter","Data Science, Python, Data Analysis, Data Mana...",An innovative Data Scientist who is passionate...,"Bachelor's degree, Software Boot Camp Certificate","Data Science Fellow, Painter Data Science, Pyt..."


In [15]:
# Read Albert's LinkedIn profile data (gather_profile_data also adds the
# combined 'text' column), then display the frame.
profile_data_albert = gather_profile_data('../data/linkedin/test-output/Albert_LinkedInData.csv')
profile_data_albert

Unnamed: 0,Name,Titles,Skills,Summary,Education,Projects,Certifications,text
0,Albert Frantz,"Data Science Fellow, Assistant Teacher, Classr...","R, Python, Data Analysis, Econometrics, Projec...",I am a detail-oriented data scientist that use...,"Bachelor of Arts - BA, nan, nan",Identifying the Relationship Between Bike Lane...,Tableau 2020 A-Z: Hands-On Tableau Training fo...,"Data Science Fellow, Assistant Teacher, Classr..."


In [18]:
# Score all postings against each profile, in the order Zach, Nolan, Albert.
zach_scores, nolan_scores, albert_scores = (
    get_recommendations(profile)
    for profile in (profile_data_zach, profile_data_nolan, profile_data_albert)
)

In [21]:
# For each person, summarise similarity per job title (number of postings and
# mean score) and rank the titles from highest to lowest mean.
zachs_recommendations, nolans_recommendations, alberts_recommendations = (
    scores.groupby('job_title')['sim_score']
          .agg(['count', 'mean'])
          .sort_values('mean', ascending=False)
    for scores in (zach_scores, nolan_scores, albert_scores)
)

In [22]:
# Credit for notebook styling: https://blog.softhints.com/display-two-pandas-dataframes-side-by-side-jupyter-notebook/
# Give each table an inline display style and a caption so the three render
# next to each other in a single output.
df1_styler, df2_styler, df3_styler = (
    table.style.set_table_attributes("style='display:inline'").set_caption(owner)
    for table, owner in (
        (zachs_recommendations, 'Zach'),
        (nolans_recommendations, 'Nolan'),
        (alberts_recommendations, 'Albert'),
    )
)

# Non-breaking spaces keep a visible gap between adjacent tables.
space = "\xa0" * 10
display_html(
    space.join(styler._repr_html_() for styler in (df1_styler, df2_styler, df3_styler)),
    raw=True,
)

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Analyst,1884,0.906737
Data Position,280,0.90667
Architect,823,0.903956
Engineer,4045,0.90356
Programmer,321,0.902418
Director,144,0.901784
Developer,5181,0.901217
Manager,1379,0.900325
Support,343,0.900125
Administrator,896,0.899479

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Architect,823,0.82777
Administrator,896,0.826981
Developer,5181,0.824243
Data Position,280,0.823465
Programmer,321,0.82329
Consulting,578,0.822425
Analyst,1884,0.820208
Engineer,4045,0.815983
Designer,220,0.8125
Support,343,0.812417

Unnamed: 0_level_0,count,mean
job_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Designer,220,0.917509
Engineer,4045,0.915938
Programmer,321,0.915708
Data Position,280,0.914672
Developer,5181,0.914564
Analyst,1884,0.914427
Director,144,0.913369
Architect,823,0.913088
Support,343,0.911667
Manager,1379,0.911522
