In [94]:
import random

import pandas as pd
import nltk
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.externals import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Explore the input data

In [95]:
opportunity = {
    'industry': 'Internet',
    'min_years_experience': 3,
    'location': ['San Francisco', 'CA'],
    'degrees': 'Computer Science',
    'blurb': 'I need a software engineer that has experience in pytorch, python, aws, spark, and airflow. They should want to work for a startup that has less than 500 employees. We do not want new graduates.',
    'title': 'Seeking Software Engineer (Full-Time)'
}

In [99]:
industrys = ['Internet', 'Technology', 'Enterprise Software']
min_years_experiences = [1, 2, 3, 4, 5, 6, 7]
locations = [['San Francisco', 'CA'], ['New York City', 'NY'], ['Los Angeles', 'CA']]
degrees = ['Engineering', 'Math', 'Economics']
blurbs = ['I need a software engineer that has experience in pytorch, python, aws, spark, and airflow. They should want to work for a startup that has less than 500 employees. We do not want new graduates.',
          'We need a full stack machine learning engineer that knows REST APIs, flask, and tensorflow.',
          'Find me data scientist that knows SQL, tableu, excel, machine learning.', 
          'We just need a developer that has six or 7 years of experience']
titles = ['Software Engineer', 'Data Scientist', 'Machine Learning Engineer', 'Software Engineer (Machine Learning)']

df_test = pd.DataFrame()

num_opportunities = 5
rows = []

for i in range(num_opportunities):
    row_ele = {
    'industry': random.choice(industrys),
    'min_years_experience': random.choice(min_years_experiences),
    'location': random.choice(locations),
    'degrees': random.choice(degrees),
    'blurb': random.choice(blurbs),
    'title': random.choice(titles)
    }
    rows.append(row_ele)


## Construct Dummy Table - Opportunity Submission Table

#### This is a table with queries that a poster might use to find a candidate in their network

In [100]:
opportunities_df = pd.DataFrame(rows)
opportunities_df.head(10)

Unnamed: 0,blurb,degrees,industry,location,min_years_experience,title
0,I need a software engineer that has experience...,Economics,Enterprise Software,"[New York City, NY]",7,Machine Learning Engineer
1,We need a full stack machine learning engineer...,Economics,Technology,"[San Francisco, CA]",4,Software Engineer
2,"Find me data scientist that knows SQL, tableu,...",Math,Internet,"[Los Angeles, CA]",2,Software Engineer
3,We need a full stack machine learning engineer...,Engineering,Internet,"[San Francisco, CA]",4,Software Engineer
4,We just need a developer that has six or 7 yea...,Math,Enterprise Software,"[New York City, NY]",2,Software Engineer (Machine Learning)


## Construct Dummy Table - Candidate Table

Candidate dictionary
{
    id: int
    email: 'str'
    name: 'str'
    location: list['city', 'state']
    current_position: 'str',
    years_of_experience: int
    industry: 'str'
    degree_connection: str
    degree = str
}

In [101]:
candidates_total = 10000
emails = []
names = []

for i in range(candidates_total):
    email = ('{}'+'xxxxxxx@gmail.com').format(i)
    emails.append(email)
    
for i in range(candidates_total):
    name = ('name + {}'.format(i))
    names.append(name)

In [102]:
ids = [i for i in range(candidates_total)]
emails = emails
names = names
locations = [['San Francisco', 'CA'], ['New York City', 'NY'], ['Los Angeles', 'CA'], ['Miami', 'FL']]
current_positions = ['Software Engineer', 'Data Scientist', 'Machine Learning Engineer', 'Bioengineer']
years_of_experiences = [1, 2, 3, 4, 5, 6, 7]
industrys = ['Internet', 'Technology', 'Enterprise Software', 'Biotech']
degree_connections = ['1st', '2nd']
degrees = ['Engineering', 'Math', 'Economics', 'Biology', 'Chemistry', 'English']


candidate_rows = []

for i in range(candidates_total):
    row_ele = {
    'id': random.choice(ids),
    'email': random.choice(emails),
    'name': random.choice(names),
    'location': random.choice(locations),
    'current_position': random.choice(current_positions),
    'years_of_experience': random.choice(years_of_experiences),
    'industry': random.choice(industrys),
    'degree_connection': random.choice(degree_connections),
    'degree': random.choice(degrees)
    }
    candidate_rows.append(row_ele)

In [103]:
df_candidates = pd.DataFrame(candidate_rows)
df_candidates

Unnamed: 0,current_position,degree,degree_connection,email,id,industry,location,name,years_of_experience
0,Machine Learning Engineer,Math,2nd,6210xxxxxxx@gmail.com,9289,Biotech,"[Miami, FL]",name + 792,2
1,Bioengineer,Math,2nd,5940xxxxxxx@gmail.com,1393,Biotech,"[Miami, FL]",name + 8998,6
2,Bioengineer,Chemistry,1st,9503xxxxxxx@gmail.com,5473,Enterprise Software,"[Miami, FL]",name + 7114,5
3,Machine Learning Engineer,Economics,1st,6964xxxxxxx@gmail.com,8539,Biotech,"[New York City, NY]",name + 8009,5
4,Data Scientist,Economics,1st,7550xxxxxxx@gmail.com,1712,Internet,"[San Francisco, CA]",name + 6795,4
5,Machine Learning Engineer,Economics,2nd,3694xxxxxxx@gmail.com,7862,Technology,"[Los Angeles, CA]",name + 538,6
6,Bioengineer,Economics,2nd,4674xxxxxxx@gmail.com,2618,Internet,"[New York City, NY]",name + 6798,5
7,Software Engineer,Engineering,2nd,1712xxxxxxx@gmail.com,1191,Enterprise Software,"[Los Angeles, CA]",name + 876,4
8,Bioengineer,Chemistry,2nd,4423xxxxxxx@gmail.com,3918,Technology,"[Miami, FL]",name + 96,6
9,Bioengineer,Math,1st,3727xxxxxxx@gmail.com,5150,Technology,"[San Francisco, CA]",name + 8956,6


### Phase 1: Knowledge Based Heuristic ###

1. Filter 1: Filter by location
2. Filter 2: Filter by >= years of experience
3. Filter 3: Filter subset by title embeddings
4. Filter 4: Filter by degrees
5. Filter 5: Filter by industry
6. Filter 6: Filter by blurb

Create score column in candidate table that assigns reward for each applicable filter

In [149]:
opportunity_sample_request = opportunities_df.iloc[0]
opportunity_sample_request.head() #pandas series

blurb                   I need a software engineer that has experience...
degrees                                                         Economics
industry                                              Enterprise Software
location                                              [New York City, NY]
min_years_experience                                                    7
Name: 0, dtype: object

In [152]:
opportunity_sample_request['location']

['New York City', 'NY']

In [154]:
opportunity_sample_request['min_years_experience']

7

In [128]:
(opportunity_sample_request['location'])

['New York City', 'NY']

## Filter based on location 

In [None]:
df_candidates[df_candidates['location'] == list(opportunity_sample_request['location'])].head()

## Filter based on current_position - (TBD)

opportunity_sample_request['current_position']

df_candidates[df_candidates['current_position'] == str(opportunity_sample_request['current_position'])].head()

## Filter based on Degree

## Filter based on degree major

In [145]:
df_candidates[df_candidates['degree'] == str(opportunity_sample_request['degrees'])].head()

Unnamed: 0,current_position,degree,degree_connection,email,id,industry,location,name,years_of_experience
3,Machine Learning Engineer,Economics,1st,6964xxxxxxx@gmail.com,8539,Biotech,"[New York City, NY]",name + 8009,5
4,Data Scientist,Economics,1st,7550xxxxxxx@gmail.com,1712,Internet,"[San Francisco, CA]",name + 6795,4
5,Machine Learning Engineer,Economics,2nd,3694xxxxxxx@gmail.com,7862,Technology,"[Los Angeles, CA]",name + 538,6
6,Bioengineer,Economics,2nd,4674xxxxxxx@gmail.com,2618,Internet,"[New York City, NY]",name + 6798,5
11,Data Scientist,Economics,1st,7300xxxxxxx@gmail.com,7369,Internet,"[New York City, NY]",name + 2125,5


## Sort by degree connection 

In [147]:
df_candidates.sort_values(by = 'degree_connection', ascending = True).head()

Unnamed: 0,current_position,degree,degree_connection,email,id,industry,location,name,years_of_experience
8347,Data Scientist,Engineering,1st,1406xxxxxxx@gmail.com,7900,Internet,"[Los Angeles, CA]",name + 736,4
8245,Machine Learning Engineer,English,1st,2663xxxxxxx@gmail.com,354,Internet,"[San Francisco, CA]",name + 791,1
8246,Data Scientist,Economics,1st,281xxxxxxx@gmail.com,7978,Internet,"[Miami, FL]",name + 5033,2
3533,Data Scientist,Economics,1st,4877xxxxxxx@gmail.com,6443,Biotech,"[Miami, FL]",name + 8377,2
3532,Bioengineer,Economics,1st,3793xxxxxxx@gmail.com,2383,Biotech,"[New York City, NY]",name + 2939,1


## Filter by years experience

In [155]:
df_candidates[df_candidates['years_of_experience'] >= int(opportunity_sample_request['min_years_experience'])].head()






Unnamed: 0,current_position,degree,degree_connection,email,id,industry,location,name,years_of_experience
14,Data Scientist,Economics,2nd,6264xxxxxxx@gmail.com,5429,Technology,"[New York City, NY]",name + 7797,7
16,Machine Learning Engineer,Chemistry,1st,6330xxxxxxx@gmail.com,8316,Technology,"[San Francisco, CA]",name + 7460,7
27,Software Engineer,Math,2nd,6108xxxxxxx@gmail.com,888,Biotech,"[New York City, NY]",name + 4758,7
36,Bioengineer,Engineering,1st,8547xxxxxxx@gmail.com,716,Biotech,"[Miami, FL]",name + 5187,7
38,Data Scientist,Engineering,2nd,2467xxxxxxx@gmail.com,5221,Biotech,"[New York City, NY]",name + 6410,7


## Come up with Relevance Score when user makes a query for less clear categories

filter location
score = score_location + score

In [172]:
def heuristic_one(opportunity_request, df_candidates):
    '''Arguments:
            opportunity: a dictionary from a poster seeking candidates
            df_candidates: total pool of 1st and 2nd degree connections from the opportunity poster
            
        OUTPUTS:
        ranked dataframe based on simple heuristic of filters
    '''
    blurb = opportunity_request['blurb']
    degrees = opportunity_request['degrees']
    industry = opportunity_request['industry']
    years_exp = opportunity_sample_request['min_years_experience']
    
    
    #rank based on blurb relevance score 
    
    #filter based on years experience 
    ranked_df = df_candidates[df_candidates['years_of_experience'] >= int(opportunity_sample_request['min_years_experience'])].head()

    #filter based on location
    #ranked_df
    
    #filter based on match to degree
    ranked_df = ranked_df[ranked_df['degree'] == str(opportunity_sample_request['degrees'])].head()
    
    ranked_df = ranked_df.sort_values(by = 'degree_connection', ascending = True).reset_index().drop(['index'], axis = True)
    return ranked_df


In [180]:
opportunity_sample_request = opportunities_df.iloc[1]
opportunity_sample_request

blurb                   We need a full stack machine learning engineer...
degrees                                                         Economics
industry                                                       Technology
location                                              [San Francisco, CA]
min_years_experience                                                    4
title                                                   Software Engineer
Name: 1, dtype: object

In [179]:
opportunity_sample_request = opportunities_df.iloc[1]
heuristic_one(opportunity_request=opportunity_sample_request, df_candidates=df_candidates)

Unnamed: 0,current_position,degree,degree_connection,email,id,industry,location,name,years_of_experience
0,Machine Learning Engineer,Economics,1st,6964xxxxxxx@gmail.com,8539,Biotech,"[New York City, NY]",name + 8009,5
1,Data Scientist,Economics,1st,7550xxxxxxx@gmail.com,1712,Internet,"[San Francisco, CA]",name + 6795,4
2,Machine Learning Engineer,Economics,2nd,3694xxxxxxx@gmail.com,7862,Technology,"[Los Angeles, CA]",name + 538,6


# Heuristic Two: NLP Driven Score Metric for Relevance

In [176]:
def heuristic_two(df): 
    '''Arguments:
            opportunity: a dictionary from a poster seeking candidates
            df_candidates: total pool of 1st and 2nd degree connections from the opportunity poster
    '''
    pass



# Heuristic Three: MORE NLP Driven Score Metric for Relevance

In [177]:
def heuristic_three(df):
    '''Arguments:
            opportunity: a dictionary from a poster seeking candidates
            df_candidates: total pool of 1st and 2nd degree connections from the opportunity poster
    '''
    pass