In [89]:
import random

import pandas as pd
import numpy as np
import nltk
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.externals import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Explore the input data

In [90]:
opportunity = {
    'industry': 'Internet',
    'min_years_experience': 3,
    'location': ['San Francisco', 'CA'],
    'highestLevelOfEducation': 'Bachelor',
    'blurb': 'I need a software engineer that has experience in pytorch, python, aws, spark, and airflow. They should want to work for a startup that has less than 500 employees. We do not want new graduates.',
    'title': 'Seeking Software Engineer (Full-Time)'
}

In [91]:
industrys = ['Internet', 'Technology', 'Enterprise Software']
min_years_experiences = [1, 2, 3, 4, 5, 6, 7]
locations = [['San Francisco', 'CA'], ['New York City', 'NY'], ['Los Angeles', 'CA']]
highestLevelOfEducations = ['High School', 'Bachelor', 'Master', 'Doctorate']
blurbs = ['I need a software engineer that has experience in pytorch, python, aws, spark, and airflow. They should want to work for a startup that has less than 500 employees. We do not want new graduates.',
          'We need a full stack machine learning engineer that knows REST APIs, flask, and tensorflow.',
          'Find me data scientist that knows SQL, tableu, excel, machine learning.', 
          'We just need a developer that has six or 7 years of experience']
titles = ['Software Engineer', 'Data Scientist', 'Machine Learning Engineer', 'Software Engineer (Machine Learning)']

df_test = pd.DataFrame()

num_opportunities = 5
rows = []

for i in range(num_opportunities):
    row_ele = {
    'industry': random.choice(industrys),
    'min_years_experience': random.choice(min_years_experiences),
    'location': random.choice(locations),
    'highestLevelOfEducation': random.choice(highestLevelOfEducations),
    'blurb': random.choice(blurbs),
    'title': random.choice(titles)
    }
    rows.append(row_ele)


## Construct Dummy Table - Opportunity Submission Table

#### This is a table with queries that a poster might use to find a candidate in their network

In [92]:
opportunities_df = pd.DataFrame(rows)
opportunities_df.head(10)

Unnamed: 0,blurb,highestLevelOfEducation,industry,location,min_years_experience,title
0,"Find me data scientist that knows SQL, tableu,...",Bachelor,Technology,"[Los Angeles, CA]",6,Software Engineer (Machine Learning)
1,We just need a developer that has six or 7 yea...,Doctorate,Internet,"[Los Angeles, CA]",5,Software Engineer (Machine Learning)
2,We just need a developer that has six or 7 yea...,Doctorate,Internet,"[Los Angeles, CA]",4,Data Scientist
3,"Find me data scientist that knows SQL, tableu,...",High School,Internet,"[San Francisco, CA]",1,Machine Learning Engineer
4,We need a full stack machine learning engineer...,Doctorate,Enterprise Software,"[San Francisco, CA]",5,Machine Learning Engineer


## Construct Dummy Table - Candidate Table

Candidate dictionary
{
    id: int
    email: 'str'
    name: 'str'
    location: list['city', 'state']
    current_position: 'str',
    years_of_experience: int
    industry: 'str'
    degree_connection: str
    degree = str
}

In [93]:
candidates_total = 10000
emails = []
names = []

for i in range(candidates_total):
    email = ('{}'+'xxxxxxx@gmail.com').format(i)
    emails.append(email)
    
for i in range(candidates_total):
    name = ('name + {}'.format(i))
    names.append(name)

In [94]:
ids = [i for i in range(candidates_total)]
emails = emails
names = names
locations = [['San Francisco', 'CA'], ['New York City', 'NY'], ['Los Angeles', 'CA'], ['Miami', 'FL']]
current_positions = ['Software Engineer', 'Data Scientist', 'Machine Learning Engineer', 'Bioengineer']
years_of_experiences = [1, 2, 3, 4, 5, 6, 7]
industrys = ['Internet', 'Technology', 'Enterprise Software', 'Biotech']
degree_connections = ['1st', '2nd']
highestLevelOfEducations = ['High School', 'Bachelor', 'Master', 'Doctorate']


candidate_rows = []

for i in range(candidates_total):
    row_ele = {
    'id': random.choice(ids),
    'email': random.choice(emails),
    'name': random.choice(names),
    'location': random.choice(locations),
    'current_position': random.choice(current_positions),
    'years_of_experience': random.choice(years_of_experiences),
    'industry': random.choice(industrys),
    'degree_connection': random.choice(degree_connections),
    'highestLevelOfEducation': random.choice(highestLevelOfEducations)
    }
    candidate_rows.append(row_ele)

In [95]:
df_candidates = pd.DataFrame(candidate_rows)
df_candidates

Unnamed: 0,current_position,degree_connection,email,highestLevelOfEducation,id,industry,location,name,years_of_experience
0,Bioengineer,1st,2347xxxxxxx@gmail.com,Master,6684,Biotech,"[San Francisco, CA]",name + 1674,6
1,Software Engineer,2nd,7421xxxxxxx@gmail.com,High School,4697,Internet,"[New York City, NY]",name + 8319,6
2,Software Engineer,1st,9351xxxxxxx@gmail.com,Master,1896,Enterprise Software,"[Miami, FL]",name + 7532,3
3,Machine Learning Engineer,2nd,4979xxxxxxx@gmail.com,High School,5326,Enterprise Software,"[San Francisco, CA]",name + 5533,4
4,Software Engineer,1st,1662xxxxxxx@gmail.com,Master,3999,Internet,"[Los Angeles, CA]",name + 7849,7
5,Software Engineer,2nd,1297xxxxxxx@gmail.com,High School,5844,Technology,"[Los Angeles, CA]",name + 5673,5
6,Machine Learning Engineer,2nd,2868xxxxxxx@gmail.com,Master,6942,Internet,"[Miami, FL]",name + 2346,7
7,Machine Learning Engineer,1st,6836xxxxxxx@gmail.com,High School,7922,Biotech,"[Miami, FL]",name + 8246,5
8,Software Engineer,1st,1270xxxxxxx@gmail.com,High School,1093,Enterprise Software,"[Los Angeles, CA]",name + 6135,1
9,Machine Learning Engineer,2nd,2016xxxxxxx@gmail.com,Bachelor,4533,Internet,"[Miami, FL]",name + 9446,3


### Phase 1: Knowledge Based Heuristic ###

1. Filter 1: Filter by location
2. Filter 2: Filter by >= years of experience
3. Filter 3: Filter subset by title embeddings
4. Filter 4: Filter by degrees
5. Filter 5: Filter by industry
6. Filter 6: Filter by blurb

Create score column in candidate table that assigns reward for each applicable filter

In [96]:
opportunity_sample_request = opportunities_df.iloc[0]
opportunity_sample_request.head() #pandas series

blurb                      Find me data scientist that knows SQL, tableu,...
highestLevelOfEducation                                             Bachelor
industry                                                          Technology
location                                                   [Los Angeles, CA]
min_years_experience                                                       6
Name: 0, dtype: object

In [97]:
opportunity_sample_request['location']

['Los Angeles', 'CA']

In [98]:
opportunity_sample_request['min_years_experience']

6

In [99]:
(opportunity_sample_request['location'])

['Los Angeles', 'CA']

## Filter based on location 

df_candidates[(df_candidates['city'] == opportunity_sample_request['location'][0]) & ( 
              df_candidates['state'] == opportunity_sample_request['location'][1])]

## Filter based on current_position - (TBD)

opportunity_sample_request['current_position']

df_candidates[df_candidates['current_position'] == str(opportunity_sample_request['current_position'])].head()

## Filter based on Degree

## Filter based on degree major

In [100]:
df_candidates[df_candidates['highestLevelOfEducation'] == str(opportunity_sample_request['highestLevelOfEducation'])].head()

Unnamed: 0,current_position,degree_connection,email,highestLevelOfEducation,id,industry,location,name,years_of_experience
9,Machine Learning Engineer,2nd,2016xxxxxxx@gmail.com,Bachelor,4533,Internet,"[Miami, FL]",name + 9446,3
10,Software Engineer,1st,3765xxxxxxx@gmail.com,Bachelor,7025,Internet,"[Los Angeles, CA]",name + 5752,7
21,Machine Learning Engineer,2nd,1972xxxxxxx@gmail.com,Bachelor,4030,Enterprise Software,"[New York City, NY]",name + 6720,6
27,Software Engineer,2nd,765xxxxxxx@gmail.com,Bachelor,4361,Internet,"[San Francisco, CA]",name + 5717,3
29,Software Engineer,2nd,6385xxxxxxx@gmail.com,Bachelor,9889,Biotech,"[San Francisco, CA]",name + 3189,7


## Sort by degree connection 

In [101]:
df_candidates.sort_values(by = 'degree_connection', ascending = True).head()

Unnamed: 0,current_position,degree_connection,email,highestLevelOfEducation,id,industry,location,name,years_of_experience
0,Bioengineer,1st,2347xxxxxxx@gmail.com,Master,6684,Biotech,"[San Francisco, CA]",name + 1674,6
5822,Data Scientist,1st,9513xxxxxxx@gmail.com,Master,9193,Enterprise Software,"[Los Angeles, CA]",name + 3005,2
8262,Data Scientist,1st,4672xxxxxxx@gmail.com,High School,2050,Internet,"[New York City, NY]",name + 9276,1
5823,Software Engineer,1st,1366xxxxxxx@gmail.com,Master,781,Technology,"[Los Angeles, CA]",name + 1098,6
5824,Data Scientist,1st,8342xxxxxxx@gmail.com,Bachelor,1253,Enterprise Software,"[San Francisco, CA]",name + 2136,6


## Filter by years experience

In [102]:
df_candidates[df_candidates['years_of_experience'] >= int(opportunity_sample_request['min_years_experience'])].head()






Unnamed: 0,current_position,degree_connection,email,highestLevelOfEducation,id,industry,location,name,years_of_experience
0,Bioengineer,1st,2347xxxxxxx@gmail.com,Master,6684,Biotech,"[San Francisco, CA]",name + 1674,6
1,Software Engineer,2nd,7421xxxxxxx@gmail.com,High School,4697,Internet,"[New York City, NY]",name + 8319,6
4,Software Engineer,1st,1662xxxxxxx@gmail.com,Master,3999,Internet,"[Los Angeles, CA]",name + 7849,7
6,Machine Learning Engineer,2nd,2868xxxxxxx@gmail.com,Master,6942,Internet,"[Miami, FL]",name + 2346,7
10,Software Engineer,1st,3765xxxxxxx@gmail.com,Bachelor,7025,Internet,"[Los Angeles, CA]",name + 5752,7


# Heuristic 1: Knowledge Based Recommendation System


In [103]:
def heuristic_one(opportunity_request, df_candidates):
    '''Arguments:
            opportunity: a dictionary from a poster seeking candidates
            df_candidates: total pool of 1st and 2nd degree connections from the opportunity poster
            
        OUTPUTS:
        ranked dataframe based on simple heuristic of filters
    '''
    blurb = opportunity_request['blurb']
    highestLevelOfEducation = opportunity_request['highestLevelOfEducation']
    industry = opportunity_request['industry']
    years_exp = opportunity_sample_request['min_years_experience']
    df_candidates['city'] = df_candidates['location'].str[0]
    df_candidates['state'] = df_candidates['location'].str[1]
    
    #rank based on blurb relevance score 
    
    #filter based on years experience 
    ranked_df = df_candidates[df_candidates['years_of_experience'] >= int(opportunity_sample_request['min_years_experience'])].head()

    #filter based on location
    ranked_df = ranked_df[(ranked_df['city'] == opportunity_sample_request['location'][0]) & ( 
              ranked_df['state'] == opportunity_sample_request['location'][1])]
    
    #filter based on match to degree
   # ranked_df = ranked_df[ranked_df['highestLevelOfEducation'] == str(opportunity_sample_request['highestLevelOfEducation'])].head()
    
    ranked_df = ranked_df.sort_values(by = 'degree_connection', ascending = True).reset_index().drop(['index'], axis = True)
    return ranked_df


In [104]:
opportunity_sample_request = opportunities_df.iloc[1]

heuristic_one(opportunity_request=opportunity_sample_request, df_candidates=df_candidates)

Unnamed: 0,current_position,degree_connection,email,highestLevelOfEducation,id,industry,location,name,years_of_experience,city,state
0,Software Engineer,1st,1662xxxxxxx@gmail.com,Master,3999,Internet,"[Los Angeles, CA]",name + 7849,7,Los Angeles,CA
1,Software Engineer,2nd,1297xxxxxxx@gmail.com,High School,5844,Technology,"[Los Angeles, CA]",name + 5673,5,Los Angeles,CA


# Heuristic Two: NLP Driven Score Metric for Relevance + No Filters (All Score Based) - Neighborhood Based Collaborative Filtering


In [116]:
opportunity_sample_request

blurb                      We just need a developer that has six or 7 yea...
highestLevelOfEducation                                            Doctorate
industry                                                            Internet
location                                                   [Los Angeles, CA]
min_years_experience                                                       5
title                                   Software Engineer (Machine Learning)
Name: 1, dtype: object

### Opportunity title to current position - similarity score


In [164]:
tokens = word_tokenize(opportunity_sample_request['title'])
tokens

cleaned_tokens = []

for tok in tokens:
    cleaned_tokens.append(tok.lower())

cleaned_tokens

['software', 'engineer', '(', 'machine', 'learning', ')']

### Blurb embeddeding score to text info

### Location score function - similarity score

Distance based similarily score 

In [127]:
def location_point_converter(location):
    if location == opportunity_sample_request['location']:
        return -1
    else:
        return 0
    

df_candidates['location_eucld_score'] = df_candidates['location'].apply(location_point_converter)

### Years of Experience Score - similarity score 

In [128]:
df_candidates['years_of_experience_eucld_dist'] = df_candidates['years_of_experience'].apply(lambda x: np.linalg.norm(x-opportunity_sample_request['min_years_experience']))


### Euclidean Distance for Education Level

In [129]:
df_candidates['highestLevelOfEducation_eucld_dist'] = 0

In [130]:
def educonverter(education_level):
    if education_level == 'High School':
        return 1
    elif education_level == 'Bachelor':
        return 2
    elif education_level == 'Master':
        return 3
    elif education_level == 'Doctorate':
        return 4

In [131]:
df_candidates['edunum'] = df_candidates['highestLevelOfEducation'].apply(educonverter)

In [132]:
df_candidates['highestLevelOfEducation_eucld_dist'] = df_candidates['edunum'].apply(lambda x: np.linalg.norm(x-educonverter(opportunity_sample_request['highestLevelOfEducation'])))

### Calculate Total Similarity Score

In [133]:
df_candidates['eucld_dist'] = df_candidates['highestLevelOfEducation_eucld_dist'] + df_candidates['years_of_experience_eucld_dist'] + df_candidates['location_eucld_score']



#### Sort by Euclidean Distance - Less means they are closer in similarity

In [138]:
n_top = 5000
ranked_df = df_candidates.sort_values(by = 'eucld_dist', ascending = True).reset_index().drop(['index', 'edunum'], axis = 1)
ranked_df.iloc[0:n_top].sort_values(by = 'degree_connection', ascending = True)

Unnamed: 0,current_position,degree_connection,email,highestLevelOfEducation,id,industry,location,name,years_of_experience,city,state,highestLevelOfEducation_eucld_dist,location_eucld_score,years_of_experience_eucld_dist,eucld_dist
0,Data Scientist,1st,8623xxxxxxx@gmail.com,Doctorate,2075,Biotech,"[Los Angeles, CA]",name + 2812,5,Los Angeles,CA,0.0,-1,0.0,-1.0
2800,Machine Learning Engineer,1st,3480xxxxxxx@gmail.com,Doctorate,5943,Enterprise Software,"[San Francisco, CA]",name + 1280,7,San Francisco,CA,0.0,0,2.0,2.0
2802,Machine Learning Engineer,1st,1384xxxxxxx@gmail.com,Master,6306,Internet,"[Miami, FL]",name + 6583,4,Miami,FL,1.0,0,1.0,2.0
2803,Machine Learning Engineer,1st,1194xxxxxxx@gmail.com,Bachelor,1250,Internet,"[Los Angeles, CA]",name + 8319,6,Los Angeles,CA,2.0,-1,1.0,2.0
2804,Software Engineer,1st,6683xxxxxxx@gmail.com,Master,263,Technology,"[New York City, NY]",name + 8318,6,New York City,NY,1.0,0,1.0,2.0
2805,Data Scientist,1st,5501xxxxxxx@gmail.com,Master,8065,Internet,"[Miami, FL]",name + 1934,4,Miami,FL,1.0,0,1.0,2.0
2806,Bioengineer,1st,8722xxxxxxx@gmail.com,Bachelor,2953,Enterprise Software,"[Miami, FL]",name + 7053,5,Miami,FL,2.0,0,0.0,2.0
2808,Software Engineer,1st,3511xxxxxxx@gmail.com,Bachelor,1648,Internet,"[Miami, FL]",name + 8663,5,Miami,FL,2.0,0,0.0,2.0
2810,Machine Learning Engineer,1st,3716xxxxxxx@gmail.com,Bachelor,6614,Technology,"[San Francisco, CA]",name + 5637,5,San Francisco,CA,2.0,0,0.0,2.0
2812,Machine Learning Engineer,1st,8948xxxxxxx@gmail.com,Bachelor,110,Biotech,"[San Francisco, CA]",name + 4011,5,San Francisco,CA,2.0,0,0.0,2.0


In [139]:
def location_point_converter(location):
    if location == opportunity_sample_request['location']:
        return -1
    else:
        return 0

def educonverter(education_level):
    if education_level == 'High School':
        return 1
    elif education_level == 'Bachelor':
        return 2
    elif education_level == 'Master':
        return 3
    elif education_level == 'Doctorate':
        return 4

In [152]:
def heuristic_two(opp_request, df_candidates, n_top = 50): 
    '''Arguments:
            opportunity: a dictionary from a poster seeking candidates
            df_candidates: total pool of 1st and 2nd degree connections from the opportunity poster
    Returns:
        ranked dataframe of users and similarity score sorted by degree connection
    '''
    
    def location_point_converter(location):
        if location == opp_request['location']:
            return -1
        else:
            return 0
    
    #Calculate Location Similarity Score 
    df_candidates['location_eucld_score'] = df_candidates['location'].apply(location_point_converter)

    #Calculate Years of Experience Similarity Score
    df_candidates['years_of_experience_eucld_dist'] = df_candidates['years_of_experience'].apply(lambda x: np.linalg.norm(x-opp_request['min_years_experience']))

    #Calculate Level of Education Similarity Score
    df_candidates['edunum'] = df_candidates['highestLevelOfEducation'].apply(educonverter)
    df_candidates['highestLevelOfEducation_eucld_dist'] = df_candidates['edunum'].apply(lambda x: np.linalg.norm(x-educonverter(opp_request['highestLevelOfEducation'])))
    
    #Calculate Total Similarity Score
    df_candidates['eucld_dist'] = df_candidates['highestLevelOfEducation_eucld_dist'] + df_candidates['years_of_experience_eucld_dist'] + df_candidates['location_eucld_score']
    
    
    print(opp_request)
    print('')
    ranked_df = df_candidates.sort_values(by = ['eucld_dist', 'degree_connection'], ascending = True).reset_index().drop(['index', 'edunum'], axis = 1)
    
    
    return ranked_df.iloc[0:n_top] #ranked_df.iloc[0:n_top].sort_values(by = 'degree_connection', ascending = True)



In [153]:
heuristic_two(opportunity_sample_request, df_candidates)

blurb                      We just need a developer that has six or 7 yea...
highestLevelOfEducation                                            Doctorate
industry                                                            Internet
location                                                   [Los Angeles, CA]
min_years_experience                                                       5
title                                   Software Engineer (Machine Learning)
Name: 1, dtype: object



Unnamed: 0,current_position,degree_connection,email,highestLevelOfEducation,id,industry,location,name,years_of_experience,city,state,highestLevelOfEducation_eucld_dist,location_eucld_score,years_of_experience_eucld_dist,eucld_dist
0,Software Engineer,1st,1078xxxxxxx@gmail.com,Doctorate,6828,Technology,"[Los Angeles, CA]",name + 7413,5,Los Angeles,CA,0.0,-1,0.0,-1.0
1,Bioengineer,1st,3537xxxxxxx@gmail.com,Doctorate,7397,Technology,"[Los Angeles, CA]",name + 7565,5,Los Angeles,CA,0.0,-1,0.0,-1.0
2,Bioengineer,1st,9832xxxxxxx@gmail.com,Doctorate,1490,Technology,"[Los Angeles, CA]",name + 205,5,Los Angeles,CA,0.0,-1,0.0,-1.0
3,Software Engineer,1st,6207xxxxxxx@gmail.com,Doctorate,320,Enterprise Software,"[Los Angeles, CA]",name + 7359,5,Los Angeles,CA,0.0,-1,0.0,-1.0
4,Machine Learning Engineer,1st,5373xxxxxxx@gmail.com,Doctorate,9255,Enterprise Software,"[Los Angeles, CA]",name + 9670,5,Los Angeles,CA,0.0,-1,0.0,-1.0
5,Data Scientist,1st,8000xxxxxxx@gmail.com,Doctorate,4670,Internet,"[Los Angeles, CA]",name + 394,5,Los Angeles,CA,0.0,-1,0.0,-1.0
6,Data Scientist,1st,4063xxxxxxx@gmail.com,Doctorate,5323,Enterprise Software,"[Los Angeles, CA]",name + 6778,5,Los Angeles,CA,0.0,-1,0.0,-1.0
7,Machine Learning Engineer,1st,4472xxxxxxx@gmail.com,Doctorate,9089,Biotech,"[Los Angeles, CA]",name + 9456,5,Los Angeles,CA,0.0,-1,0.0,-1.0
8,Machine Learning Engineer,1st,3857xxxxxxx@gmail.com,Doctorate,5169,Biotech,"[Los Angeles, CA]",name + 3228,5,Los Angeles,CA,0.0,-1,0.0,-1.0
9,Software Engineer,1st,3459xxxxxxx@gmail.com,Doctorate,1133,Biotech,"[Los Angeles, CA]",name + 9272,5,Los Angeles,CA,0.0,-1,0.0,-1.0


# Heuristic Three: MORE NLP Driven Score Metric for Relevance

In [88]:
def heuristic_three(df):
    '''Arguments:
            opportunity: a dictionary from a poster seeking candidates
            df_candidates: total pool of 1st and 2nd degree connections from the opportunity poster
    '''
    pass