In [63]:
import random

import pandas as pd
import numpy as np
import nltk
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.externals import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Explore the input data

In [117]:
opportunity = {
    'industry': 'Internet',
    'min_years_experience': 3,
    'city': 'San Francisco',
    'highestLevelOfEducation': 'bachelor',
    'blurb': 'I need a software engineer that has experience in pytorch, python, aws, spark, and airflow. They should want to work for a startup that has less than 500 employees. We do not want new graduates.',
    'title': 'Seeking Software Engineer (Full-Time)'
}

In [118]:
industrys = ['Internet', 'Technology', 'Enterprise Software']
min_years_experiences = [1, 2, 3, 4, 5, 6, 7]
citys = ['San Francisco', 'New York City', 'Los Angeles']
highestLevelOfEducations = ['high_school', 'associate', 'bachelor', 'master', 'phd']
blurbs = ['I need a software engineer that has experience in pytorch, python, aws, spark, and airflow. They should want to work for a startup that has less than 500 employees. We do not want new graduates.',
          'We need a full stack machine learning engineer that knows REST APIs, flask, and tensorflow.',
          'Find me data scientist that knows SQL, tableu, excel, machine learning.', 
          'We just need a developer that has six or 7 years of experience']
titles = ['Software Engineer', 'Data Scientist', 'Machine Learning Engineer', 'Software Engineer (Machine Learning)']

df_test = pd.DataFrame()

num_opportunities = 5
rows = []

for i in range(num_opportunities):
    row_ele = {
    'industry': random.choice(industrys),
    'min_years_experience': random.choice(min_years_experiences),
    'city': random.choice(citys),
    'highestLevelOfEducation': random.choice(highestLevelOfEducations),
    'blurb': random.choice(blurbs),
    'title': random.choice(titles)
    }
    rows.append(row_ele)


## Construct Dummy Table - Opportunity Submission Table

#### This is a table with queries that a poster might use to find a candidate in their network

In [119]:
opportunities_df = pd.DataFrame(rows)
opportunities_df.head(10)

Unnamed: 0,blurb,city,highestLevelOfEducation,industry,min_years_experience,title
0,We need a full stack machine learning engineer...,Los Angeles,bachelor,Enterprise Software,6,Data Scientist
1,I need a software engineer that has experience...,New York City,master,Technology,3,Machine Learning Engineer
2,We just need a developer that has six or 7 yea...,New York City,high_school,Internet,5,Data Scientist
3,I need a software engineer that has experience...,Los Angeles,associate,Internet,3,Data Scientist
4,We just need a developer that has six or 7 yea...,New York City,associate,Technology,2,Software Engineer (Machine Learning)


## Construct Dummy Table - Candidate Table

Candidate dictionary
{
    id: int
    email: 'str'
    name: 'str'
    location: list['city', 'state']
    current_position: 'str',
    years_of_experience: int
    industry: 'str'
    degree_connection: str
    degree = str
}

In [159]:
candidates_total = 10000
emails = []
names = []

for i in range(candidates_total):
    email = ('{}'+'xxxxxxx@gmail.com').format(i)
    emails.append(email)
    
for i in range(candidates_total):
    name = ('name + {}'.format(i))
    names.append(name)

ids = [i for i in range(candidates_total)]
emails = emails
names = names
citys = ['San Francisco', 'New York City', 'Los Angeles']
#state = ['California', 'New York', 'Florida']
current_positions = ['Software Engineer', 'Data Scientist', 'Machine Learning Engineer', 'Bioengineer']
industrys = ['Internet', 'Technology', 'Enterprise Software', 'Biotech']
years_of_experiences = [1, 2, 3, 4, 5, 6, 7]
linkedin_urls = ['https://www.linkedin.com/adrianlievano']
image_urls = ['https://www.linkedin.com/adrianlievano/profile_pic.png']
degrees = ['1st', '2nd']
highestLevelOfEducations = ['high_school', 'associate', 'bachelor', 'master', 'phd']

candidate_rows = []

for i in range(candidates_total):
    row_ele = {
    'id': random.choice(ids),
    'email': random.choice(emails),
    'name': random.choice(names),
    'phone_number': '786-515-4282',
    'city': random.choice(citys),
    #'location': random.choice(locations),
    'current_position': random.choice(current_positions),
    'industry': random.choice(industrys),
    'years_of_experience': random.choice(years_of_experiences),
    'degree': random.choice(degrees),
    'linkedin_url': random.choice(linkedin_urls),
    'image_url': random.choice(image_urls),
    'highestLevelOfEducation': random.choice(highestLevelOfEducations)
    }
    candidate_rows.append(row_ele)

In [162]:
can`didate_rows

[{'city': 'Los Angeles',
  'current_position': 'Machine Learning Engineer',
  'degree': '1st',
  'email': '5770xxxxxxx@gmail.com',
  'highestLevelOfEducation': 'associate',
  'id': 6676,
  'image_url': 'https://www.linkedin.com/adrianlievano/profile_pic.png',
  'industry': 'Enterprise Software',
  'linkedin_url': 'https://www.linkedin.com/adrianlievano',
  'name': 'name + 6914',
  'phone_number': '786-515-4282',
  'years_of_experience': 2},
 {'city': 'Los Angeles',
  'current_position': 'Bioengineer',
  'degree': '2nd',
  'email': '799xxxxxxx@gmail.com',
  'highestLevelOfEducation': 'phd',
  'id': 5302,
  'image_url': 'https://www.linkedin.com/adrianlievano/profile_pic.png',
  'industry': 'Technology',
  'linkedin_url': 'https://www.linkedin.com/adrianlievano',
  'name': 'name + 79',
  'phone_number': '786-515-4282',
  'years_of_experience': 1},
 {'city': 'New York City',
  'current_position': 'Bioengineer',
  'degree': '1st',
  'email': '630xxxxxxx@gmail.com',
  'highestLevelOfEducati

In [160]:
df_candidates = pd.DataFrame(candidate_rows)
df_candidates

Unnamed: 0,city,current_position,degree,email,highestLevelOfEducation,id,image_url,industry,linkedin_url,name,phone_number,years_of_experience
0,Los Angeles,Machine Learning Engineer,1st,5770xxxxxxx@gmail.com,associate,6676,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 6914,786-515-4282,2
1,Los Angeles,Bioengineer,2nd,799xxxxxxx@gmail.com,phd,5302,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 79,786-515-4282,1
2,New York City,Bioengineer,1st,630xxxxxxx@gmail.com,associate,231,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 8170,786-515-4282,4
3,Los Angeles,Bioengineer,1st,5655xxxxxxx@gmail.com,bachelor,1118,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 5327,786-515-4282,2
4,New York City,Machine Learning Engineer,2nd,7750xxxxxxx@gmail.com,phd,9486,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 2336,786-515-4282,5
5,San Francisco,Data Scientist,2nd,4601xxxxxxx@gmail.com,high_school,1833,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 8338,786-515-4282,2
6,Los Angeles,Machine Learning Engineer,2nd,3306xxxxxxx@gmail.com,phd,2419,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 8978,786-515-4282,4
7,San Francisco,Machine Learning Engineer,1st,4195xxxxxxx@gmail.com,master,3560,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 7493,786-515-4282,7
8,New York City,Data Scientist,1st,1191xxxxxxx@gmail.com,associate,4093,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 5862,786-515-4282,2
9,San Francisco,Software Engineer,1st,2423xxxxxxx@gmail.com,master,417,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 340,786-515-4282,2


### Phase 1: Knowledge Based Heuristic ###

1. Filter 1: Filter by location
2. Filter 2: Filter by >= years of experience
3. Filter 3: Filter subset by title embeddings
4. Filter 4: Filter by degrees
5. Filter 5: Filter by industry
6. Filter 6: Filter by blurb

Create score column in candidate table that assigns reward for each applicable filter

In [167]:
opportunity_sample_request = dict(opportunities_df.iloc[0])
opportunity_sample_request #pandas series

opportunity_sample_request['industry']

'Enterprise Software'

In [126]:
opportunity_sample_request['city']

'Los Angeles'

In [127]:
opportunity_sample_request['min_years_experience']

6

## Filter based on location 

df_candidates[(df_candidates['city'] == opportunity_sample_request['location'][0]) & ( 
              df_candidates['state'] == opportunity_sample_request['location'][1])]

## Filter based on current_position - (TBD)

opportunity_sample_request['current_position']

df_candidates[df_candidates['current_position'] == str(opportunity_sample_request['current_position'])].head()

## Filter based on degree major

In [129]:
df_candidates[df_candidates['highestLevelOfEducation'] == str(opportunity_sample_request['highestLevelOfEducation'])].head()

Unnamed: 0,city,current_position,degree,email,highestLevelOfEducation,id,image_url,industry,linkedin_url,name,phone_number,years_of_experience
3,Los Angeles,Software Engineer,2nd,8899xxxxxxx@gmail.com,bachelor,5552,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 7827,786-515-4282,4
5,San Francisco,Software Engineer,2nd,2257xxxxxxx@gmail.com,bachelor,4277,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 8063,786-515-4282,1
23,San Francisco,Data Scientist,1st,7762xxxxxxx@gmail.com,bachelor,5741,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 8014,786-515-4282,3
25,New York City,Software Engineer,2nd,5481xxxxxxx@gmail.com,bachelor,5729,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 8086,786-515-4282,7
44,Los Angeles,Machine Learning Engineer,1st,7137xxxxxxx@gmail.com,bachelor,7098,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 8203,786-515-4282,6


## Sort by degree connection 

In [131]:
df_candidates.sort_values(by = 'degree', ascending = True).head()

Unnamed: 0,city,current_position,degree,email,highestLevelOfEducation,id,image_url,industry,linkedin_url,name,phone_number,years_of_experience
4999,New York City,Machine Learning Engineer,1st,9974xxxxxxx@gmail.com,associate,4355,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 5890,786-515-4282,2
4163,New York City,Machine Learning Engineer,1st,7978xxxxxxx@gmail.com,phd,5457,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 6676,786-515-4282,5
7916,San Francisco,Bioengineer,1st,2906xxxxxxx@gmail.com,bachelor,7421,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 865,786-515-4282,2
4161,New York City,Software Engineer,1st,344xxxxxxx@gmail.com,associate,3906,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 4789,786-515-4282,3
7917,Los Angeles,Software Engineer,1st,7690xxxxxxx@gmail.com,associate,6827,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 1023,786-515-4282,6


## Filter by years experience

In [132]:
df_candidates[df_candidates['years_of_experience'] >= int(opportunity_sample_request['min_years_experience'])].head()

Unnamed: 0,city,current_position,degree,email,highestLevelOfEducation,id,image_url,industry,linkedin_url,name,phone_number,years_of_experience
0,New York City,Machine Learning Engineer,2nd,6712xxxxxxx@gmail.com,phd,7780,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 5368,786-515-4282,6
4,Los Angeles,Data Scientist,1st,7200xxxxxxx@gmail.com,master,6351,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 1951,786-515-4282,6
9,Los Angeles,Software Engineer,2nd,1415xxxxxxx@gmail.com,associate,3329,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 5945,786-515-4282,6
12,San Francisco,Software Engineer,2nd,9215xxxxxxx@gmail.com,phd,700,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 1584,786-515-4282,6
13,New York City,Software Engineer,1st,6289xxxxxxx@gmail.com,master,9598,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 6765,786-515-4282,6


# Heuristic 1: Knowledge Based Recommendation System


In [137]:
def heuristic_one(opportunity_request, df_candidates):
    '''Arguments:
            opportunity: a dictionary from a poster seeking candidates
            df_candidates: total pool of 1st and 2nd degree connections from the opportunity poster
            
        OUTPUTS:
        ranked dataframe based on simple heuristic of filters
    '''
    blurb = opportunity_request['blurb']
    highestLevelOfEducation = opportunity_request['highestLevelOfEducation']
    industry = opportunity_request['industry']
    years_exp = opportunity_sample_request['min_years_experience']
    df_candidates['city'] = df_candidates['city']
    #df_candidates['state'] = df_candidates['location'].str[1]
    
    #rank based on blurb relevance score 
    
    #filter based on years experience 
    ranked_df = df_candidates[df_candidates['years_of_experience'] >= int(opportunity_sample_request['min_years_experience'])].head()

    #filter based on location
    ranked_df = ranked_df[(ranked_df['city'] == opportunity_sample_request['city'])]
    
    #filter based on match to degree
   # ranked_df = ranked_df[ranked_df['highestLevelOfEducation'] == str(opportunity_sample_request['highestLevelOfEducation'])].head()
    
    ranked_df = ranked_df.sort_values(by = 'degree', ascending = True).reset_index().drop(['index'], axis = True)
    return ranked_df


In [138]:
opportunity_sample_request = opportunities_df.iloc[1]

heuristic_one(opportunity_request=opportunity_sample_request, df_candidates=df_candidates)

Unnamed: 0,city,current_position,degree,email,highestLevelOfEducation,id,image_url,industry,linkedin_url,name,phone_number,years_of_experience
0,New York City,Machine Learning Engineer,2nd,6712xxxxxxx@gmail.com,phd,7780,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 5368,786-515-4282,6


# Heuristic Two: NLP Driven Score Metric for Relevance + No Filters (All Score Based) - Neighborhood Based Collaborative Filtering


In [139]:
opportunity_sample_request

blurb                      I need a software engineer that has experience...
city                                                           New York City
highestLevelOfEducation                                               master
industry                                                          Technology
min_years_experience                                                       3
title                                              Machine Learning Engineer
Name: 1, dtype: object

### Opportunity title to current position - similarity score


In [140]:
import gensim 
from gensim.models import Word2Vec 

In [141]:
##helper function to tokenize words
def tokenize(tokens):
    tokens = word_tokenize(opportunity_sample_request['title'])
    stop_words = stopwords.words('english')
    tokens = [word for word in tokens if word not in stop_words]
    cleaned_tokens = []
    lemmatizer = WordNetLemmatizer()

    for tok in tokens:
        cleaned_tokens.append(lemmatizer.lemmatize(tok).lower().strip())

    return cleaned_tokens

In [142]:
tokens = word_tokenize(opportunity_sample_request['title'])
stop_words = stopwords.words('english')
tokens = [word for word in tokens if word not in stop_words]
cleaned_tokens = []
lemmatizer = WordNetLemmatizer()

for tok in tokens:
    cleaned_tokens.append(lemmatizer.lemmatize(tok).lower().strip())


In [143]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier())
])

In [144]:
## count vectorize & td-idf represents word in a frequency vector. They do not capture syntatical information

In [145]:

model_job = gensim.models.Word2Vec(cleaned_tokens, min_count = 1,  
                              size = 100, window = 5)



model_job.similarity('software', 'engineer')

### Blurb embeddeding score to text info

### Location score function - similarity score

Distance based similarily score 

In [146]:
def city_point_converter(city):
    if city == opportunity_sample_request['city']:
        return -1
    else:
        return 0
    

df_candidates['city_eucld_score'] = df_candidates['city'].apply(city_point_converter)

### Years of Experience Score - similarity score 

In [147]:
df_candidates['years_of_experience_eucld_dist'] = df_candidates['years_of_experience'].apply(lambda x: np.linalg.norm(x-opportunity_sample_request['min_years_experience']))


### Euclidean Distance for Education Level

In [148]:
df_candidates['highestLevelOfEducation_eucld_dist'] = 0

In [149]:
np.linalg.norm(4-6)/6

0.3333333333333333

In [150]:
def educonverter(education_level):
    if education_level == 'high_school':
        return 1
    elif education_level == 'associate':
        return 2
    elif education_level == 'bachelor':
        return 3
    elif education_level == 'master':
        return 4
    elif education_level == 'phd':
        return 5

In [151]:
df_candidates['edunum'] = df_candidates['highestLevelOfEducation'].apply(educonverter)

In [152]:
mag_edu = educonverter(opportunity_sample_request['highestLevelOfEducation'])

In [153]:
df_candidates['highestLevelOfEducation_eucld_dist'] = df_candidates['edunum'].apply(lambda x: np.linalg.norm((x-mag_edu)/mag_edu))
                                                                                    
                                                                                    
                                                                                    
                                                                                    
                                                                                    

### Calculate Total Similarity Score

In [154]:
df_candidates['eucld_dist'] = df_candidates['highestLevelOfEducation_eucld_dist'] + df_candidates['years_of_experience_eucld_dist'] + df_candidates['city_eucld_score']



#### Sort by Euclidean Distance - Less means they are closer in similarity

In [155]:
n_top = 5000
ranked_df = df_candidates.sort_values(by = 'eucld_dist', ascending = True).reset_index().drop(['index', 'edunum'], axis = 1)
ranked_df.iloc[0:n_top].sort_values(by = 'degree', ascending = True)

Unnamed: 0,city,current_position,degree,email,highestLevelOfEducation,id,image_url,industry,linkedin_url,name,phone_number,years_of_experience,city_eucld_score,years_of_experience_eucld_dist,highestLevelOfEducation_eucld_dist,eucld_dist
0,New York City,Data Scientist,1st,6448xxxxxxx@gmail.com,master,8049,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 7827,786-515-4282,3,-1,0.0,0.00,-1.00
4007,San Francisco,Software Engineer,1st,1921xxxxxxx@gmail.com,phd,997,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 5706,786-515-4282,2,0,1.0,0.25,1.25
2035,San Francisco,Bioengineer,1st,8013xxxxxxx@gmail.com,high_school,96,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 2356,786-515-4282,3,0,0.0,0.75,0.75
4005,San Francisco,Bioengineer,1st,8176xxxxxxx@gmail.com,bachelor,8730,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 3456,786-515-4282,4,0,1.0,0.25,1.25
2038,New York City,Data Scientist,1st,1229xxxxxxx@gmail.com,high_school,6731,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 9626,786-515-4282,4,-1,1.0,0.75,0.75
4004,New York City,Bioengineer,1st,8632xxxxxxx@gmail.com,phd,9173,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 4858,786-515-4282,5,-1,2.0,0.25,1.25
2040,New York City,Bioengineer,1st,6561xxxxxxx@gmail.com,high_school,4107,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 7746,786-515-4282,2,-1,1.0,0.75,0.75
2041,Los Angeles,Data Scientist,1st,6558xxxxxxx@gmail.com,high_school,9159,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 6425,786-515-4282,3,0,0.0,0.75,0.75
2042,New York City,Software Engineer,1st,6907xxxxxxx@gmail.com,high_school,9729,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 6207,786-515-4282,4,-1,1.0,0.75,0.75
2033,New York City,Bioengineer,1st,3853xxxxxxx@gmail.com,high_school,5306,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 8964,786-515-4282,2,-1,1.0,0.75,0.75


### Generate list of candidates

In [None]:
candidates_total = 10000
emails = []
names = []

for i in range(candidates_total):
    email = ('{}'+'xxxxxxx@gmail.com').format(i)
    emails.append(email)
    
for i in range(candidates_total):
    name = ('name + {}'.format(i))
    names.append(name)

ids = [i for i in range(candidates_total)]
emails = emails
names = names
citys = ['San Francisco', 'New York City', 'Los Angeles']
#state = ['California', 'New York', 'Florida']
current_positions = ['Software Engineer', 'Data Scientist', 'Machine Learning Engineer', 'Bioengineer']
industrys = ['Internet', 'Technology', 'Enterprise Software', 'Biotech']
years_of_experiences = [1, 2, 3, 4, 5, 6, 7]
linkedin_urls = ['https://www.linkedin.com/adrianlievano']
image_urls = ['https://www.linkedin.com/adrianlievano/profile_pic.png']
degrees = ['1st', '2nd']
highestLevelOfEducations = ['high_school', 'associate', 'bachelor', 'master', 'phd']

candidate_rows = []

for i in range(candidates_total):
    row_ele = {
    'id': random.choice(ids),
    'email': random.choice(emails),
    'name': random.choice(names),
    'phone_number': '786-515-4282',
    'city': random.choice(citys),
    #'location': random.choice(locations),
    'current_position': random.choice(current_positions),
    'industry': random.choice(industrys),
    'years_of_experience': random.choice(years_of_experiences),
    'degree': random.choice(degrees),
    'linkedin_url': random.choice(linkedin_urls),
    'image_url': random.choice(image_urls),
    'highestLevelOfEducation': random.choice(highestLevelOfEducations)
    }
    candidate_rows.append(row_ele)

In [156]:
def city_point_converter(city):
    if city == opportunity_sample_request['city']:
        return -1
    else:
        return 0

def educonverter(education_level):
    if education_level == 'high_school':
        return 1
    elif education_level == 'associate':
        return 2
    elif education_level == 'bachelor':
        return 3
    elif education_level == 'master':
        return 4
    elif education_level == 'phd':
        return 5

In [168]:
sample_candidate = {
    'id': random.choice(ids),
    'email': random.choice(emails),
    'name': random.choice(names),
    'phone_number': '786-515-4282',
    'city': random.choice(citys),
    #'location': random.choice(locations),
    'current_position': random.choice(current_positions),
    'industry': random.choice(industrys),
    'years_of_experience': random.choice(years_of_experiences),
    'degree': random.choice(degrees),
    'linkedin_url': random.choice(linkedin_urls),
    'image_url': random.choice(image_urls),
    'highestLevelOfEducation': random.choice(highestLevelOfEducations)
    }

In [173]:
pd.DataFrame.from_dict(sample_candidate)
pd.DataFrame(list(sample_candidate.items()), columns=['id', 'email', 'name', 'phone_number', 'city', 
                                                     'current_position', 'industry', 'years_of_experience',
                                                     'degree', 'linkedin_url', 'image_url', 'highestLevelOfEducation'], typ='series')

ValueError: If using all scalar values, you must pass an index

In [174]:
from app.models import User

ModuleNotFoundError: No module named 'app'

In [175]:
class User: 
    id = db.Column(db.Integer, primary_key=True)
    email = db.Column(db.String(120), index=True, unique=True)
    name = db.Column(db.String(64))
    phone_number = db.Column(db.String(20))

    city = db.Column(db.String(64))
    current_position = db.Column(db.String(64))
    industry = db.Column(db.String(64))
    years_of_experience = db.Column(db.Integer)
    degree = db.Column(db.Enum(Degree))
    linkedin_url = db.Column(db.String(64))
    image_url = db.Column(db.String(64))

NameError: name 'db' is not defined

In [157]:
def heuristic_two(candidates, opp_request, n_top = 50): 
    '''Arguments:
            candidates: list of Users
            opp_request: a dictionary from a poster seeking candidates

       Returns:
        ranked dataframe of users and similarity score sorted by degree connection
    '''
    
    
    df_candidates = pd.DataFrame(candidates)
    
    def city_point_converter(city):
        if city == opportunity_sample_request['city']:
            return -1
        else:
            return 0
    
    #Calculate Location Similarity Score 
    df_candidates['city_eucld_score'] = df_candidates['city'].apply(city_point_converter)

    #Calculate Years of Experience Similarity Score
    df_candidates['years_of_experience_eucld_dist'] = df_candidates['years_of_experience'].apply(lambda x: np.linalg.norm(x-opp_request['min_years_experience']))

    #Calculate Level of Education Similarity Score
    df_candidates['edunum'] = df_candidates['highestLevelOfEducation'].apply(educonverter)
    df_candidates['highestLevelOfEducation_eucld_dist'] = df_candidates['edunum'].apply(lambda x: np.linalg.norm(x-educonverter(opp_request['highestLevelOfEducation'])))
    
    #Calculate Total Similarity Score
    df_candidates['eucld_dist'] = df_candidates['highestLevelOfEducation_eucld_dist'] + df_candidates['years_of_experience_eucld_dist'] + df_candidates['city_eucld_score']
    
    
    print(opp_request)
    print('')
    ranked_df = df_candidates.sort_values(by = ['eucld_dist', 'degree_connection'], ascending = True).reset_index().drop(['index', 'edunum'], axis = 1)
    
    
    return ranked_df.iloc[0:n_top]


In [164]:
opportunity_sample_request?

In [32]:
heuristic_two(df_candidates, opportunity_sample_request,)

blurb                      I need a software engineer that has experience...
highestLevelOfEducation                                          High School
industry                                                 Enterprise Software
location                                                 [New York City, NY]
min_years_experience                                                       5
title                                              Machine Learning Engineer
Name: 1, dtype: object



Unnamed: 0,current_position,degree_connection,email,highestLevelOfEducation,id,industry,location,name,years_of_experience,city,state,location_eucld_score,years_of_experience_eucld_dist,highestLevelOfEducation_eucld_dist,eucld_dist
0,Machine Learning Engineer,1st,7841xxxxxxx@gmail.com,High School,6489,Technology,"[New York City, NY]",name + 4258,5,New York City,NY,-1,0.0,0.0,-1.0
1,Data Scientist,1st,9059xxxxxxx@gmail.com,High School,1386,Internet,"[New York City, NY]",name + 9723,5,New York City,NY,-1,0.0,0.0,-1.0
2,Software Engineer,1st,9407xxxxxxx@gmail.com,High School,577,Biotech,"[New York City, NY]",name + 324,5,New York City,NY,-1,0.0,0.0,-1.0
3,Bioengineer,1st,9456xxxxxxx@gmail.com,High School,4462,Biotech,"[New York City, NY]",name + 5024,5,New York City,NY,-1,0.0,0.0,-1.0
4,Bioengineer,1st,3268xxxxxxx@gmail.com,High School,4890,Internet,"[New York City, NY]",name + 6370,5,New York City,NY,-1,0.0,0.0,-1.0
5,Software Engineer,1st,1135xxxxxxx@gmail.com,High School,3097,Biotech,"[New York City, NY]",name + 4264,5,New York City,NY,-1,0.0,0.0,-1.0
6,Bioengineer,1st,5016xxxxxxx@gmail.com,High School,1929,Internet,"[New York City, NY]",name + 3357,5,New York City,NY,-1,0.0,0.0,-1.0
7,Machine Learning Engineer,1st,4543xxxxxxx@gmail.com,High School,6627,Biotech,"[New York City, NY]",name + 4253,5,New York City,NY,-1,0.0,0.0,-1.0
8,Data Scientist,1st,3363xxxxxxx@gmail.com,High School,2729,Internet,"[New York City, NY]",name + 447,5,New York City,NY,-1,0.0,0.0,-1.0
9,Data Scientist,1st,4554xxxxxxx@gmail.com,High School,8778,Biotech,"[New York City, NY]",name + 1696,5,New York City,NY,-1,0.0,0.0,-1.0


# Heuristic Three: MORE NLP Driven Score Metric for Relevance

In [88]:
def heuristic_three(df):
    '''Arguments:
            opportunity: a dictionary from a poster seeking candidates
            df_candidates: total pool of 1st and 2nd degree connections from the opportunity poster
    '''
    pass