In [1]:
import random

import pandas as pd
import numpy as np
import nltk
from sqlalchemy import create_engine
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report
from sklearn.externals import joblib

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adrianlievano/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


  from numpy.core.umath_tests import inner1d


## Explore the input data

In [2]:
opportunity = {
    'industry': 'Internet',
    'min_years_experience': 3,
    'city': 'San Francisco',
    'highestLevelOfEducation': 'bachelor',
    'blurb': 'I need a software engineer that has experience in pytorch, python, aws, spark, and airflow. They should want to work for a startup that has less than 500 employees. We do not want new graduates.',
    'title': 'Seeking Software Engineer (Full-Time)'
}

In [3]:
industrys = ['Internet', 'Technology', 'Enterprise Software']
min_years_experiences = [1, 2, 3, 4, 5, 6, 7]
citys = ['San Francisco', 'New York City', 'Los Angeles']
highestLevelOfEducations = ['high_school', 'associate', 'bachelor', 'master', 'phd']
blurbs = ['I need a software engineer that has experience in pytorch, python, aws, spark, and airflow. They should want to work for a startup that has less than 500 employees. We do not want new graduates.',
          'We need a full stack machine learning engineer that knows REST APIs, flask, and tensorflow.',
          'Find me data scientist that knows SQL, tableu, excel, machine learning.', 
          'We just need a developer that has six or 7 years of experience']
titles = ['Software Engineer', 'Data Scientist', 'Machine Learning Engineer', 'Software Engineer (Machine Learning)']

df_test = pd.DataFrame()

num_opportunities = 5
rows = []

for i in range(num_opportunities):
    row_ele = {
    'industry': random.choice(industrys),
    'min_years_experience': random.choice(min_years_experiences),
    'city': random.choice(citys),
    'highestLevelOfEducation': random.choice(highestLevelOfEducations),
    'blurb': random.choice(blurbs),
    'title': random.choice(titles)
    }
    rows.append(row_ele)


## Construct Dummy Table - Opportunity Submission Table

#### This is a table with queries that a poster might use to find a candidate in their network

In [4]:
opportunities_df = pd.DataFrame(rows)
opportunities_df.head(10)

Unnamed: 0,blurb,city,highestLevelOfEducation,industry,min_years_experience,title
0,We just need a developer that has six or 7 yea...,New York City,master,Internet,2,Software Engineer (Machine Learning)
1,We just need a developer that has six or 7 yea...,San Francisco,bachelor,Enterprise Software,7,Machine Learning Engineer
2,"Find me data scientist that knows SQL, tableu,...",New York City,master,Internet,6,Software Engineer
3,We just need a developer that has six or 7 yea...,San Francisco,associate,Technology,5,Software Engineer (Machine Learning)
4,We just need a developer that has six or 7 yea...,New York City,phd,Enterprise Software,5,Software Engineer


## Construct Dummy Table - Candidate Table

Candidate dictionary
{
    id: int
    email: 'str'
    name: 'str'
    location: list['city', 'state']
    current_position: 'str',
    years_of_experience: int
    industry: 'str'
    degree_connection: str
    degree = str
}

In [5]:
candidates_total = 10000
emails = []
names = []

for i in range(candidates_total):
    email = ('{}'+'xxxxxxx@gmail.com').format(i)
    emails.append(email)
    
for i in range(candidates_total):
    name = ('name + {}'.format(i))
    names.append(name)

ids = [i for i in range(candidates_total)]
emails = emails
names = names
citys = ['San Francisco', 'New York City', 'Los Angeles']
#state = ['California', 'New York', 'Florida']
current_positions = ['Software Engineer', 'Data Scientist', 'Machine Learning Engineer', 'Bioengineer']
industrys = ['Internet', 'Technology', 'Enterprise Software', 'Biotech']
years_of_experiences = [1, 2, 3, 4, 5, 6, 7]
linkedin_urls = ['https://www.linkedin.com/adrianlievano']
image_urls = ['https://www.linkedin.com/adrianlievano/profile_pic.png']
degrees = ['1st', '2nd']
highestLevelOfEducations = ['high_school', 'associate', 'bachelor', 'master', 'phd']

candidate_rows = []

for i in range(candidates_total):
    row_ele = {
    'id': random.choice(ids),
    'email': random.choice(emails),
    'name': random.choice(names),
    'phone_number': '786-515-4282',
    'city': random.choice(citys),
    #'location': random.choice(locations),
    'current_position': random.choice(current_positions),
    'industry': random.choice(industrys),
    'years_of_experience': random.choice(years_of_experiences),
    'degree': random.choice(degrees),
    'linkedin_url': random.choice(linkedin_urls),
    'image_url': random.choice(image_urls),
    'highestLevelOfEducation': random.choice(highestLevelOfEducations)
    }
    candidate_rows.append(row_ele)

In [6]:
can`didate_rows

SyntaxError: invalid syntax (<ipython-input-6-80b8204725b9>, line 1)

In [None]:
df_candidates = pd.DataFrame(candidate_rows)
df_candidates

### Phase 1: Knowledge Based Heuristic ###

1. Filter 1: Filter by location
2. Filter 2: Filter by >= years of experience
3. Filter 3: Filter subset by title embeddings
4. Filter 4: Filter by degrees
5. Filter 5: Filter by industry
6. Filter 6: Filter by blurb

Create score column in candidate table that assigns reward for each applicable filter

In [None]:
opportunity_sample_request = dict(opportunities_df.iloc[0])
opportunity_sample_request #pandas series

opportunity_sample_request['industry']

In [None]:
opportunity_sample_request['city']

In [None]:
opportunity_sample_request['min_years_experience']

## Filter based on location 

df_candidates[(df_candidates['city'] == opportunity_sample_request['location'][0]) & ( 
              df_candidates['state'] == opportunity_sample_request['location'][1])]

## Filter based on current_position - (TBD)

opportunity_sample_request['current_position']

df_candidates[df_candidates['current_position'] == str(opportunity_sample_request['current_position'])].head()

## Filter based on degree major

In [11]:
df_candidates = pd.DataFrame(candidate_rows)

In [12]:
df_candidates[df_candidates['highestLevelOfEducation'] == str(opportunity_sample_request['highestLevelOfEducation'])].head()

Unnamed: 0,city,current_position,degree,email,highestLevelOfEducation,id,image_url,industry,linkedin_url,name,phone_number,years_of_experience
0,New York City,Machine Learning Engineer,2nd,6287xxxxxxx@gmail.com,bachelor,2701,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 1163,786-515-4282,6
2,Los Angeles,Data Scientist,1st,3503xxxxxxx@gmail.com,bachelor,2834,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 4863,786-515-4282,4
8,Los Angeles,Machine Learning Engineer,2nd,34xxxxxxx@gmail.com,bachelor,5740,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 859,786-515-4282,3
14,New York City,Machine Learning Engineer,1st,4399xxxxxxx@gmail.com,bachelor,991,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 1149,786-515-4282,6
17,New York City,Bioengineer,2nd,6853xxxxxxx@gmail.com,bachelor,31,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 4969,786-515-4282,1


## Sort by degree connection 

In [13]:
df_candidates.sort_values(by = 'degree', ascending = True).head()

Unnamed: 0,city,current_position,degree,email,highestLevelOfEducation,id,image_url,industry,linkedin_url,name,phone_number,years_of_experience
4999,New York City,Data Scientist,1st,8493xxxxxxx@gmail.com,associate,904,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 4731,786-515-4282,5
7956,New York City,Data Scientist,1st,2751xxxxxxx@gmail.com,master,3739,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 2521,786-515-4282,4
4152,Los Angeles,Bioengineer,1st,7105xxxxxxx@gmail.com,bachelor,6979,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 1668,786-515-4282,6
4149,San Francisco,Machine Learning Engineer,1st,9723xxxxxxx@gmail.com,high_school,8280,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 4844,786-515-4282,2
4147,San Francisco,Software Engineer,1st,4780xxxxxxx@gmail.com,bachelor,7187,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 7471,786-515-4282,3


## Filter by years experience

In [14]:
df_candidates[df_candidates['years_of_experience'] >= int(opportunity_sample_request['min_years_experience'])].head()

Unnamed: 0,city,current_position,degree,email,highestLevelOfEducation,id,image_url,industry,linkedin_url,name,phone_number,years_of_experience
4,Los Angeles,Data Scientist,2nd,3034xxxxxxx@gmail.com,phd,8464,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 7636,786-515-4282,7
28,Los Angeles,Machine Learning Engineer,2nd,321xxxxxxx@gmail.com,master,7477,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 9206,786-515-4282,7
35,New York City,Machine Learning Engineer,1st,30xxxxxxx@gmail.com,bachelor,7187,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 449,786-515-4282,7
42,Los Angeles,Data Scientist,2nd,6622xxxxxxx@gmail.com,bachelor,9567,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 156,786-515-4282,7
44,San Francisco,Software Engineer,2nd,2948xxxxxxx@gmail.com,master,1559,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 8893,786-515-4282,7


# Heuristic 1: Knowledge Based Recommendation System


In [15]:
def heuristic_one(opportunity_request, df_candidates):
    '''Arguments:
            opportunity: a dictionary from a poster seeking candidates
            df_candidates: total pool of 1st and 2nd degree connections from the opportunity poster
            
        OUTPUTS:
        ranked dataframe based on simple heuristic of filters
    '''
    blurb = opportunity_request['blurb']
    highestLevelOfEducation = opportunity_request['highestLevelOfEducation']
    industry = opportunity_request['industry']
    years_exp = opportunity_sample_request['min_years_experience']
    df_candidates['city'] = df_candidates['city']
    #df_candidates['state'] = df_candidates['location'].str[1]
    
    #rank based on blurb relevance score 
    
    #filter based on years experience 
    ranked_df = df_candidates[df_candidates['years_of_experience'] >= int(opportunity_sample_request['min_years_experience'])].head()

    #filter based on location
    ranked_df = ranked_df[(ranked_df['city'] == opportunity_sample_request['city'])]
    
    #filter based on match to degree
   # ranked_df = ranked_df[ranked_df['highestLevelOfEducation'] == str(opportunity_sample_request['highestLevelOfEducation'])].head()
    
    ranked_df = ranked_df.sort_values(by = 'degree', ascending = True).reset_index().drop(['index'], axis = True)
    return ranked_df


In [16]:
opportunity_sample_request = opportunities_df.iloc[1]

heuristic_one(opportunity_request=opportunity_sample_request, df_candidates=df_candidates)

Unnamed: 0,city,current_position,degree,email,highestLevelOfEducation,id,image_url,industry,linkedin_url,name,phone_number,years_of_experience
0,San Francisco,Software Engineer,2nd,2948xxxxxxx@gmail.com,master,1559,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 8893,786-515-4282,7


# Heuristic Two: NLP Driven Score Metric for Relevance + No Filters (All Score Based) - Neighborhood Based Collaborative Filtering


In [17]:
opportunity_sample_request

blurb                      We just need a developer that has six or 7 yea...
city                                                           San Francisco
highestLevelOfEducation                                             bachelor
industry                                                 Enterprise Software
min_years_experience                                                       7
title                                              Machine Learning Engineer
Name: 1, dtype: object

### Opportunity title to current position - similarity score


In [18]:
import gensim 
from gensim.models import Word2Vec 

In [19]:
##helper function to tokenize words
def tokenize(tokens):
    tokens = word_tokenize(opportunity_sample_request['title'])
    stop_words = stopwords.words('english')
    tokens = [word for word in tokens if word not in stop_words]
    cleaned_tokens = []
    lemmatizer = WordNetLemmatizer()

    for tok in tokens:
        cleaned_tokens.append(lemmatizer.lemmatize(tok).lower().strip())

    return cleaned_tokens

In [20]:
tokens = word_tokenize(opportunity_sample_request['title'])
stop_words = stopwords.words('english')
tokens = [word for word in tokens if word not in stop_words]
cleaned_tokens = []
lemmatizer = WordNetLemmatizer()

for tok in tokens:
    cleaned_tokens.append(lemmatizer.lemmatize(tok).lower().strip())


In [21]:
pipeline = Pipeline([
    ('vect', CountVectorizer(tokenizer=tokenize)),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier())
])

In [22]:
## count vectorize & td-idf represents word in a frequency vector. They do not capture syntatical information

In [23]:

model_job = gensim.models.Word2Vec(cleaned_tokens, min_count = 1,  
                              size = 100, window = 5)



model_job.similarity('software', 'engineer')

### Blurb embeddeding score to text info

### Location score function - similarity score

Distance based similarily score 

In [24]:
def city_point_converter(city):
    if city == opportunity_sample_request['city']:
        return -1
    else:
        return 0
    

df_candidates['city_eucld_score'] = df_candidates['city'].apply(city_point_converter)

### Years of Experience Score - similarity score 

In [25]:
df_candidates['years_of_experience_eucld_dist'] = df_candidates['years_of_experience'].apply(lambda x: np.linalg.norm(x-opportunity_sample_request['min_years_experience']))


### Euclidean Distance for Education Level

In [26]:
df_candidates['highestLevelOfEducation_eucld_dist'] = 0

In [27]:
np.linalg.norm(4-6)/6

0.3333333333333333

In [28]:
def educonverter(education_level):
    if education_level == 'high_school':
        return 1
    elif education_level == 'associate':
        return 2
    elif education_level == 'bachelor':
        return 3
    elif education_level == 'master':
        return 4
    elif education_level == 'phd':
        return 5

In [29]:
df_candidates['edunum'] = df_candidates['highestLevelOfEducation'].apply(educonverter)

In [30]:
mag_edu = educonverter(opportunity_sample_request['highestLevelOfEducation'])

In [31]:
df_candidates['highestLevelOfEducation_eucld_dist'] = df_candidates['edunum'].apply(lambda x: np.linalg.norm((x-mag_edu)/mag_edu))
                                                                                    
                                                                                    
                                                                                    
                                                                                    
                                                                                    

### Calculate Total Similarity Score

In [32]:
df_candidates['eucld_dist'] = df_candidates['highestLevelOfEducation_eucld_dist'] + df_candidates['years_of_experience_eucld_dist'] + df_candidates['city_eucld_score']



#### Sort by Euclidean Distance - Less means they are closer in similarity

In [33]:
n_top = 5000
ranked_df = df_candidates.sort_values(by = 'eucld_dist', ascending = True).reset_index().drop(['index', 'edunum'], axis = 1)
ranked_df.iloc[0:n_top].sort_values(by = 'degree', ascending = True)

Unnamed: 0,city,current_position,degree,email,highestLevelOfEducation,id,image_url,industry,linkedin_url,name,phone_number,years_of_experience,city_eucld_score,years_of_experience_eucld_dist,highestLevelOfEducation_eucld_dist,eucld_dist
3330,New York City,Bioengineer,1st,8345xxxxxxx@gmail.com,high_school,5477,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 9969,786-515-4282,6,0,1.0,0.666667,1.666667
2897,Los Angeles,Data Scientist,1st,9150xxxxxxx@gmail.com,high_school,5459,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 2458,786-515-4282,6,0,1.0,0.666667,1.666667
1759,Los Angeles,Software Engineer,1st,7656xxxxxxx@gmail.com,phd,2668,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 3754,786-515-4282,7,0,0.0,0.666667,0.666667
3557,New York City,Bioengineer,1st,4617xxxxxxx@gmail.com,bachelor,8408,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 144,786-515-4282,5,0,2.0,0.000000,2.000000
1761,Los Angeles,Machine Learning Engineer,1st,6346xxxxxxx@gmail.com,high_school,7254,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 4648,786-515-4282,7,0,0.0,0.666667,0.666667
2894,Los Angeles,Machine Learning Engineer,1st,8010xxxxxxx@gmail.com,phd,3508,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 2874,786-515-4282,6,0,1.0,0.666667,1.666667
1763,New York City,Machine Learning Engineer,1st,6328xxxxxxx@gmail.com,phd,8509,https://www.linkedin.com/adrianlievano/profile...,Enterprise Software,https://www.linkedin.com/adrianlievano,name + 5844,786-515-4282,7,0,0.0,0.666667,0.666667
1764,New York City,Bioengineer,1st,86xxxxxxx@gmail.com,high_school,9642,https://www.linkedin.com/adrianlievano/profile...,Technology,https://www.linkedin.com/adrianlievano,name + 645,786-515-4282,7,0,0.0,0.666667,0.666667
1756,Los Angeles,Data Scientist,1st,7083xxxxxxx@gmail.com,phd,6663,https://www.linkedin.com/adrianlievano/profile...,Biotech,https://www.linkedin.com/adrianlievano,name + 9333,786-515-4282,7,0,0.0,0.666667,0.666667
1765,New York City,Data Scientist,1st,1001xxxxxxx@gmail.com,high_school,4897,https://www.linkedin.com/adrianlievano/profile...,Internet,https://www.linkedin.com/adrianlievano,name + 7897,786-515-4282,7,0,0.0,0.666667,0.666667


### Generate list of candidates

In [81]:
candidates_total = 10000
emails = []
names = []

for i in range(candidates_total):
    email = ('{}'+'xxxxxxx@gmail.com').format(i)
    emails.append(email)
    
for i in range(candidates_total):
    name = ('name + {}'.format(i))
    names.append(name)

ids = [i for i in range(candidates_total)]
emails = emails
names = names
citys = ['San Francisco', 'New York City', 'Los Angeles']
#state = ['California', 'New York', 'Florida']
current_positions = ['Software Engineer', 'Data Scientist', 'Machine Learning Engineer', 'Bioengineer']
industrys = ['Internet', 'Technology', 'Enterprise Software', 'Biotech']
years_of_experiences = [1, 2, 3, 4, 5, 6, 7]
linkedin_urls = ['https://www.linkedin.com/adrianlievano']
image_urls = ['https://www.linkedin.com/adrianlievano/profile_pic.png']
degrees = ['1st', '2nd']
highestLevelOfEducations = ['high_school', 'associate', 'bachelor', 'master', 'phd']

candidate_rows = []

for i in range(candidates_total):
    row_ele = {
    'id': random.choice(ids),
    'email': random.choice(emails),
    'name': random.choice(names),
    'phone_number': '786-515-4282',
    'city': random.choice(citys),
    #'location': random.choice(locations),
    'current_position': random.choice(current_positions),
    'industry': random.choice(industrys),
    'years_of_experience': random.choice(years_of_experiences),
    'degree': random.choice(degrees),
    'linkedin_url': random.choice(linkedin_urls),
    'image_url': random.choice(image_urls),
    'highestLevelOfEducation': random.choice(highestLevelOfEducations)
    }
    candidate_rows.append(row_ele)

In [84]:
candidate_rows

[{'city': 'New York City',
  'current_position': 'Software Engineer',
  'degree': '2nd',
  'email': '7747xxxxxxx@gmail.com',
  'highestLevelOfEducation': 'bachelor',
  'id': 2586,
  'image_url': 'https://www.linkedin.com/adrianlievano/profile_pic.png',
  'industry': 'Enterprise Software',
  'linkedin_url': 'https://www.linkedin.com/adrianlievano',
  'name': 'name + 170',
  'phone_number': '786-515-4282',
  'years_of_experience': 6},
 {'city': 'Los Angeles',
  'current_position': 'Machine Learning Engineer',
  'degree': '2nd',
  'email': '8108xxxxxxx@gmail.com',
  'highestLevelOfEducation': 'bachelor',
  'id': 619,
  'image_url': 'https://www.linkedin.com/adrianlievano/profile_pic.png',
  'industry': 'Internet',
  'linkedin_url': 'https://www.linkedin.com/adrianlievano',
  'name': 'name + 2698',
  'phone_number': '786-515-4282',
  'years_of_experience': 2},
 {'city': 'San Francisco',
  'current_position': 'Bioengineer',
  'degree': '1st',
  'email': '6081xxxxxxx@gmail.com',
  'highestLe

In [82]:
candidate_rows[0]

{'city': 'New York City',
 'current_position': 'Software Engineer',
 'degree': '2nd',
 'email': '7747xxxxxxx@gmail.com',
 'highestLevelOfEducation': 'bachelor',
 'id': 2586,
 'image_url': 'https://www.linkedin.com/adrianlievano/profile_pic.png',
 'industry': 'Enterprise Software',
 'linkedin_url': 'https://www.linkedin.com/adrianlievano',
 'name': 'name + 170',
 'phone_number': '786-515-4282',
 'years_of_experience': 6}

In [50]:
opportunity_sample_request

blurb                      We just need a developer that has six or 7 yea...
city                                                           San Francisco
highestLevelOfEducation                                             bachelor
industry                                                 Enterprise Software
min_years_experience                                                       7
title                                              Machine Learning Engineer
Name: 1, dtype: object

In [156]:
def city_point_converter(city):
    if city == opportunity_sample_request['city']:
        return -1
    else:
        return 0

def educonverter(education_level):
    if education_level == 'high_school':
        return 1
    elif education_level == 'associate':
        return 2
    elif education_level == 'bachelor':
        return 3
    elif education_level == 'master':
        return 4
    elif education_level == 'phd':
        return 5

In [156]:
sample_candidate = [candidate_rows[0]]
sample_candidate

[{'city': 'New York City',
  'current_position': 'Software Engineer',
  'degree': '2nd',
  'email': '7747xxxxxxx@gmail.com',
  'highestLevelOfEducation': 'bachelor',
  'id': 2586,
  'image_url': 'https://www.linkedin.com/adrianlievano/profile_pic.png',
  'industry': 'Enterprise Software',
  'linkedin_url': 'https://www.linkedin.com/adrianlievano',
  'name': 'name + 170',
  'phone_number': '786-515-4282',
  'years_of_experience': 6}]

In [157]:
df = pd.DataFrame(sample_candidate)
df['city']

0    New York City
Name: city, dtype: object

pd.DataFrame.from_dict(sample_candidate)
pd.DataFrame(list(sample_candidate.items()), columns=['id', 'email', 'name', 'phone_number', 'city', 
                                                     'current_position', 'industry', 'years_of_experience',
                                                     'degree', 'linkedin_url', 'image_url', 'highestLevelOfEducation'], typ='series')

In [213]:
def heuristic_two(candidate, opp_request, n_top = 50): 
    '''Arguments:
            candidate: list of User info
            opp_request: a dictionary from a poster seeking candidates

       Returns:
        ranked dataframe of users and similarity score sorted by degree connection
    '''
    
    candidates_list = candidate
    df_candidates = pd.DataFrame(candidates_list)
    
    def city_point_converter(city):
        if city == opportunity_request['city']:
            return -1
        else:
            return 0
    
    #Calculate Location Similarity Score 
   
    #df_candidates['city_eucld_score'] = df_candidates['city'].apply(city_point_converter)
    df_candidates['city_eucld_score'] = 0
    
    #Calculate Years of Experience Similarity Score
    df_candidates['years_of_experience_eucld_dist'] = df_candidates['years_of_experience'].apply(lambda x: np.linalg.norm(x-opp_request['min_years_experience']))

    #Calculate Level of Education Similarity Score
    df_candidates['edunum'] = df_candidates['highestLevelOfEducation'].apply(educonverter)
    df_candidates['highestLevelOfEducation_eucld_dist'] = df_candidates['edunum'].apply(lambda x: np.linalg.norm(x-educonverter(opp_request['highestLevelOfEducation'])))
    
    #Calculate Total Similarity Score
    df_candidates['eucld_dist'] = df_candidates['highestLevelOfEducation_eucld_dist'] + df_candidates['years_of_experience_eucld_dist'] + df_candidates['city_eucld_score']
    
    print(opp_request)
    print('')
    
    df_candidates = df_candidates.sort_values(by = ['eucld_dist', 'degree'], ascending = True).reset_index().drop(['index', 'edunum'], axis = 1)
    tupp = (candidate[0], df_candidates['eucld_dist'][0])
    return tupp


In [214]:
opportunity_sample_request

blurb                      We just need a developer that has six or 7 yea...
city                                                           San Francisco
highestLevelOfEducation                                             bachelor
industry                                                 Enterprise Software
min_years_experience                                                       7
title                                              Machine Learning Engineer
Name: 1, dtype: object

In [215]:
heuristic_two(sample_candidate, opportunity_sample_request)

blurb                      We just need a developer that has six or 7 yea...
city                                                           San Francisco
highestLevelOfEducation                                             bachelor
industry                                                 Enterprise Software
min_years_experience                                                       7
title                                              Machine Learning Engineer
Name: 1, dtype: object



({'city': 'New York City',
  'current_position': 'Software Engineer',
  'degree': '2nd',
  'email': '7747xxxxxxx@gmail.com',
  'highestLevelOfEducation': 'bachelor',
  'id': 2586,
  'image_url': 'https://www.linkedin.com/adrianlievano/profile_pic.png',
  'industry': 'Enterprise Software',
  'linkedin_url': 'https://www.linkedin.com/adrianlievano',
  'name': 'name + 170',
  'phone_number': '786-515-4282',
  'years_of_experience': 6},
 1.0)