In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('../data/external/potential-talents.csv').set_index('id')
data.shape

(104, 4)

In [3]:
data.drop_duplicates(inplace = True)
data.shape

(53, 4)

In [4]:
data.head()

Unnamed: 0_level_0,job_title,location,connection,fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
4,People Development Coordinator at Ryan,"Denton, Texas",500+,
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [5]:
data.drop('fit', axis=1, inplace=True)

In [6]:
df_v1 = data.copy()
print(df_v1.shape)
df_v1.head()

(53, 3)


Unnamed: 0_level_0,job_title,location,connection
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
2,Native English Teacher at EPIK (English Progra...,Kanada,500+
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
4,People Development Coordinator at Ryan,"Denton, Texas",500+
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+


### Bag of words

In [7]:
import math
import re
from collections import Counter

In [8]:
WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator/denominator)
    
def text_to_vector(text):
    return Counter(WORD.findall(text))

In [9]:
keywords = 'Aspiring Human Resources'

job_title_vectors = [text_to_vector(text) for text in df_v1['job_title']]
keyword_vectors = text_to_vector(keywords)

In [10]:
df_v1['bow_fit'] = [get_cosine(keyword_vectors, title_vector) for title_vector in job_title_vectors]
df_bow = df_v1.sort_values('bow_fit', ascending=False)
df_bow.head(10)

Unnamed: 0_level_0,job_title,location,connection,bow_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.866025
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.866025
6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.866025
73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.800641
74,Human Resources Professional,Greater Boston Area,16,0.666667
79,Liberal Arts Major. Aspiring Human Resources A...,"Baton Rouge, Louisiana Area",7,0.654654
100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,0.629941
27,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,0.612372
72,Business Management Major and Aspiring Human R...,"Monroe, Louisiana Area",5,0.612372
88,Human Resources Management Major,"Milpitas, California",18,0.57735


### TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [12]:
df_v2 = data.copy()

In [13]:
documents = df_v2['job_title'].tolist()
keywords = 'Aspiring Human Resources'

In [14]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
keyword_vector = vectorizer.transform([keywords])
similarities = [cosine_similarity(keyword_vector, text_vector) for text_vector in tfidf_matrix]
similarities_list = []
for i in similarities:
    similarities_list.append(i.item())
df_v2['tfidf_fit'] = similarities_list
df_tfidf = df_v2.sort_values('tfidf_fit', ascending=False)
df_tfidf

Unnamed: 0_level_0,job_title,location,connection,tfidf_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.761756
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.761756
6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.682828
73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.576029
72,Business Management Major and Aspiring Human R...,"Monroe, Louisiana Area",5,0.415676
74,Human Resources Professional,Greater Boston Area,16,0.412834
27,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,0.39881
66,Experienced Retail Manager and aspiring Human ...,"Austin, Texas Area",57,0.391409
7,Student at Humber College and Aspiring Human R...,Kanada,61,0.370897
79,Liberal Arts Major. Aspiring Human Resources A...,"Baton Rouge, Louisiana Area",7,0.356918


### GloVe

In [15]:
import torchtext

In [16]:
df_v3 = data.copy()
df_v3.shape

(53, 3)

In [17]:
## getting nan values for embedding when simple clearning is not done for a few job_titles
def simple_cleaning(text):
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()
df_v3['job_title'] = df_v3['job_title'].apply(simple_cleaning)
df_v3.head()

Unnamed: 0_level_0,job_title,location,connection
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2019 c.t. bauer college of business graduate (...,"Houston, Texas",85
2,native english teacher at epik (english progra...,Kanada,500+
3,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44
4,people development coordinator at ryan,"Denton, Texas",500+
5,advisory board member at celal bayar university,"İzmir, Türkiye",500+


In [18]:
glove = torchtext.vocab.GloVe(name='6B', dim=100)

In [19]:
def string_to_glove_embedding(string):
    tokens = string.split()
    indices = [glove.stoi[token] for token in tokens if token in glove.stoi]
    vectors = glove.vectors[indices]
    vectors_array = vectors.numpy()
    embedding = vectors_array.mean(axis=0)
    return embedding

In [20]:
keywords = 'aspiring human resources'
embeddings = df_v3['job_title'].apply(string_to_glove_embedding)
text_embedding_list = [embedding for embedding in embeddings]
keyword_array = string_to_glove_embedding(keywords)

similarity_scores = [cosine_similarity(array.reshape(1, -1), keyword_array.reshape(1, -1))[0, 0] for array in text_embedding_list]

df_v3['gloVe_fit'] = similarity_scores
df_gloVe = df_v3.sort_values('gloVe_fit', ascending=False)
df_gloVe.head()

Unnamed: 0_level_0,job_title,location,connection,gloVe_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,aspiring human resources specialist,Greater New York City Area,1,0.953001
97,aspiring human resources professional,"Kokomo, Indiana Area",71,0.948721
3,aspiring human resources professional,"Raleigh-Durham, North Carolina Area",44,0.948721
74,human resources professional,Greater Boston Area,16,0.922117
73,"aspiring human resources manager, seeking inte...","Houston, Texas Area",7,0.920849


### Word2Vec

In [21]:
df_v4 = data.copy()
df_v4.head()

Unnamed: 0_level_0,job_title,location,connection
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85
2,Native English Teacher at EPIK (English Progra...,Kanada,500+
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44
4,People Development Coordinator at Ryan,"Denton, Texas",500+
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+


In [22]:
def tokenized_corpus(text):
    tokens = text.split()
    return tokens

toks = df_v4['job_title'].apply(tokenized_corpus)
tokenized_list = toks.tolist()

In [23]:
from gensim.models import Word2Vec

In [24]:
model = Word2Vec(tokenized_list, vector_size=100, window=5, min_count=2, workers=4)
model.save('word2vec.model')

In [25]:
model = Word2Vec.load("word2vec.model")

In [26]:
def string_to_word2vec_embedding(string):
    tokens = string.split()
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    embedding = np.mean(vectors, axis=0)
    return embedding

In [27]:
keywords = 'aspiring human resources'
embeddings_w2v = df_v4['job_title'].apply(string_to_word2vec_embedding)
text_embedding_list_w2v = [embedding for embedding in embeddings_w2v]
keyword_array_w2v = string_to_word2vec_embedding(keywords)

similarity_scores_w2v = [cosine_similarity(array.reshape(1, -1), keyword_array_w2v.reshape(1, -1))[0, 0] for array in text_embedding_list_w2v]

df_v4['w2v_fit'] = similarity_scores
df__w2v = df_v4.sort_values('w2v_fit', ascending=False)
df__w2v.head()

Unnamed: 0_level_0,job_title,location,connection,w2v_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.953001
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.948721
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.948721
74,Human Resources Professional,Greater Boston Area,16,0.922117
73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.920849


### BERT

In [28]:
import transformers
import torch

In [29]:
model = transformers.BertModel.from_pretrained('bert-base-uncased')
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [30]:
def string_to_bert_embedding(string):
    input_ids = tokenizer.encode_plus(string, add_special_tokens=True, return_tensors='pt')
    output = model(**input_ids)
    embedding = torch.mean(output.last_hidden_state, dim=1)
    return embedding

In [31]:
df_v5 = data.copy()

In [32]:
keywords = 'aspiring human resources'
txt_emb_bert = [embedding.detach().numpy() for embedding in df_v5['job_title'].apply(string_to_bert_embedding)]
kw_e_bert = string_to_bert_embedding(keywords)
kw_emb_bert = kw_e_bert.detach().numpy()   
df_v5['bert_fit'] = [cosine_similarity(txt_emb, kw_emb_bert).item() for txt_emb in txt_emb_bert]
df_bert = df_v5.sort_values('bert_fit', ascending=False)
df_bert

Unnamed: 0_level_0,job_title,location,connection,bert_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90548
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.902632
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.902632
28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.823453
99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.81541
74,Human Resources Professional,Greater Boston Area,16,0.814312
8,HR Senior Specialist,San Francisco Bay Area,500+,0.80481
66,Experienced Retail Manager and aspiring Human ...,"Austin, Texas Area",57,0.766706
27,Aspiring Human Resources Management student se...,"Houston, Texas Area",500+,0.756199
4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.752288


### SBERT

In [33]:
model_sbert = transformers.AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
tokenizer_sbert = transformers.AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

def string_to_sbert_embedding(string):
    input_ids = tokenizer_sbert.encode_plus(string, add_special_tokens=True, return_tensors='pt')
    output = model_sbert(**input_ids)
    embedding = torch.mean(output.last_hidden_state, dim=1)
    return embedding


In [34]:
df_v6 = data.copy()

In [35]:
keywords = 'aspiring human resources'
txt_emb_sbert = [embedding.detach().numpy() for embedding in df_v6['job_title'].apply(string_to_sbert_embedding)]
kw_e_sbert = string_to_sbert_embedding(keywords)
kw_emb_sbert = kw_e_sbert.detach().numpy()   
df_v6['sbert_fit'] = [cosine_similarity(txt_emb, kw_emb_sbert).item() for txt_emb in txt_emb_sbert]
df_sbert = df_v6.sort_values('sbert_fit', ascending=False)
df_sbert

Unnamed: 0_level_0,job_title,location,connection,sbert_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.949807
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807
6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.928035
99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.808784
28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.799642
74,Human Resources Professional,Greater Boston Area,16,0.794648
82,Aspiring Human Resources Professional | An ene...,"Austin, Texas Area",174,0.784048
100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,0.760215
7,Student at Humber College and Aspiring Human R...,Kanada,61,0.75781
76,Aspiring Human Resources Professional | Passio...,"New York, New York",212,0.749486


## Conclusion

All the models have performed reasonably well.  The models are tested on a small sample and it does not seem to demand semantic capabilities provided by BERT and SBERT which are usually better models.

### Reranking

In [36]:
###  function for adding starred candidates
def add_starred_candidate_to_keywords(keywords, star_candidates_id_list, df):
    for i in star_candidates_id_list:
        keywords_list = (keywords.lower()).split()
        words_list = (df['job_title'][i].lower()).split()
        for word in words_list:
            if word not in keywords_list:
                keywords_list.append(word)
                keywords = ' '.join(keywords_list)
    return keywords

##### Re-ranking with SBERT

In [37]:
df_v7 = df_v6.copy()
df_v7.head()

Unnamed: 0_level_0,job_title,location,connection,sbert_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.573268
2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.239483
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807
4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.380222
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.230512


In [38]:
keywords = 'aspirinG Human resouRces'
starred_candidates = [78]
new_keywords = add_starred_candidate_to_keywords(keywords, starred_candidates, df_v7)
txt_emb_sbert = [embedding.detach().numpy() for embedding in df_v7['job_title'].apply(string_to_sbert_embedding)]
kw_e_sbert = string_to_sbert_embedding(new_keywords)
kw_emb_sbert = kw_e_sbert.detach().numpy()   
df_v7['sbert_re_ranked_fit'] = [cosine_similarity(txt_emb, kw_emb_sbert).item() for txt_emb in txt_emb_sbert]
df_sbert_re_ranked = df_v7.sort_values('sbert_re_ranked_fit', ascending=False)
df_sbert_re_ranked

Unnamed: 0_level_0,job_title,location,connection,sbert_fit,sbert_re_ranked_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,0.623912,0.943331
6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.928035,0.733155
7,Student at Humber College and Aspiring Human R...,Kanada,61,0.75781,0.727135
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.949807,0.720409
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807,0.720409
10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.744711,0.708912
99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.808784,0.677796
28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.799642,0.653626
71,"Human Resources Generalist at ScottMadden, Inc.","Raleigh-Durham, North Carolina Area",500+,0.581877,0.653439
74,Human Resources Professional,Greater Boston Area,16,0.794648,0.641738


In [39]:
keywords = 'aspirinG Human resouRces'
starred_candidates = [78, 95]
new_keywords = add_starred_candidate_to_keywords(keywords, starred_candidates, df_v7)
txt_emb_sbert = [embedding.detach().numpy() for embedding in df_v7['job_title'].apply(string_to_sbert_embedding)]
kw_e_sbert = string_to_sbert_embedding(new_keywords)
kw_emb_sbert = kw_e_sbert.detach().numpy()   
df_v7['sbert_re_ranked_fit'] = [cosine_similarity(txt_emb, kw_emb_sbert).item() for txt_emb in txt_emb_sbert]
df_sbert_re_ranked = df_v7.sort_values('sbert_re_ranked_fit', ascending=False)
df_sbert_re_ranked

Unnamed: 0_level_0,job_title,location,connection,sbert_fit,sbert_re_ranked_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,Student at Humber College and Aspiring Human R...,Kanada,61,0.75781,0.818832
78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,0.623912,0.801851
6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.928035,0.783806
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807,0.773108
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.949807,0.773108
10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.744711,0.732977
99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.808784,0.696664
82,Aspiring Human Resources Professional | An ene...,"Austin, Texas Area",174,0.784048,0.688154
28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.799642,0.676506
100,Aspiring Human Resources Manager | Graduating ...,"Cape Girardeau, Missouri",103,0.760215,0.672918


##### Re-ranking with BERT

In [40]:
df_v8 = df_v5.copy()
df_v8.head()

Unnamed: 0_level_0,job_title,location,connection,bert_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.587713
2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.5468
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.902632
4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.752288
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.430944


In [41]:
keywords = 'seeking HUMAN resources'
starred_candidates = [78]
new_keywords = add_starred_candidate_to_keywords(keywords, starred_candidates, df_v8)
txt_emb_bert = [embedding.detach().numpy() for embedding in df_v8['job_title'].apply(string_to_bert_embedding)]
kw_e_bert = string_to_bert_embedding(new_keywords)
kw_emb_bert = kw_e_bert.detach().numpy()   
df_v8['bert_re_ranking_fit'] = [cosine_similarity(txt_emb, kw_emb_bert).item() for txt_emb in txt_emb_bert]
df_bert_re_ranked = df_v8.sort_values('bert_re_ranking_fit', ascending=False)
df_bert_re_ranked

Unnamed: 0_level_0,job_title,location,connection,bert_fit,bert_re_ranking_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,0.640211,0.963228
101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.681012,0.83594
71,"Human Resources Generalist at ScottMadden, Inc.","Raleigh-Durham, North Carolina Area",500+,0.662071,0.80427
81,Senior Human Resources Business Partner at Hei...,"Chattanooga, Tennessee Area",455,0.663948,0.802039
68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,0.749704,0.799098
10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.730277,0.781569
73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.748516,0.744745
4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.752288,0.741612
84,Human Resources professional for the world lea...,"Highland, California",50,0.629707,0.740578
13,Human Resources Coordinator at InterContinenta...,"Atlanta, Georgia",500+,0.619063,0.738123


In [42]:
keywords = 'aspirinG Human resouRces'
starred_candidates = [78, 95]
new_keywords = add_starred_candidate_to_keywords(keywords, starred_candidates, df_v8)
txt_emb_bert = [embedding.detach().numpy() for embedding in df_v8['job_title'].apply(string_to_bert_embedding)]
kw_e_bert = string_to_bert_embedding(new_keywords)
kw_emb_bert = kw_e_bert.detach().numpy()   
df_v8['bert_re_ranking_fit'] = [cosine_similarity(txt_emb, kw_emb_bert).item() for txt_emb in txt_emb_bert]
df_bert_re_ranked = df_v8.sort_values('bert_re_ranking_fit', ascending=False)
df_bert_re_ranked

Unnamed: 0_level_0,job_title,location,connection,bert_fit,bert_re_ranking_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,0.640211,0.828577
101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.681012,0.77796
72,Business Management Major and Aspiring Human R...,"Monroe, Louisiana Area",5,0.729307,0.762198
7,Student at Humber College and Aspiring Human R...,Kanada,61,0.686413,0.761635
71,"Human Resources Generalist at ScottMadden, Inc.","Raleigh-Durham, North Carolina Area",500+,0.662071,0.76009
96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.52857,0.755744
95,Student at Westfield State University,"Bridgewater, Massachusetts",57,0.526098,0.753532
73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.748516,0.747952
79,Liberal Arts Major. Aspiring Human Resources A...,"Baton Rouge, Louisiana Area",7,0.706893,0.743681
68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,0.749704,0.739608


#### Re-Ranking - another approach..

In [43]:
def get_starred_candidates_keywords(star_candidates_id_list, df):
    keywords_list = []
    for i in star_candidates_id_list:
        candidates_list = (df['job_title'][i].lower()).split()
        for w in candidates_list:
            if w not in keywords_list:
                keywords_list.append(w)
    keywords = ' '.join(keywords_list)
    return keywords

In [44]:
def get_avg_embeddings_bert(keywords, starred_candidates):
    kw_e_bert = string_to_bert_embedding(keywords)
    sc_e_bert = string_to_bert_embedding(starred_candidates)
    return (kw_e_bert + sc_e_bert)/2

In [45]:
def get_avg_embeddings_sbert(keywords, starred_candidates):
    kw_e_sbert = string_to_sbert_embedding(keywords)
    sc_e_sbert = string_to_sbert_embedding(starred_candidates)
    return (kw_e_sbert + sc_e_sbert)/2

##### BERT

In [46]:
df_v9 = df_v5.copy()
df_v9.head()

Unnamed: 0_level_0,job_title,location,connection,bert_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.587713
2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.5468
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.902632
4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.752288
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.430944


In [47]:
keywords = 'aspiring human resources'
starred_candidates = [78]
star_candidates_kw = get_starred_candidates_keywords(starred_candidates, df_v9)
txt_emb_bert = [embedding.detach().numpy() for embedding in df_v9['job_title'].apply(string_to_bert_embedding)]
kw_e_bert = get_avg_embeddings_bert(keywords, star_candidates_kw)
kw_emb_bert = kw_e_bert.detach().numpy()   
df_v9['bert_re_ranking_fit_avg_method'] = [cosine_similarity(txt_emb, kw_emb_bert).item() for txt_emb in txt_emb_bert]
df_bert_re_ranked_avg_method = df_v9.sort_values('bert_re_ranking_fit_avg_method', ascending=False)
df_bert_re_ranked_avg_method

Unnamed: 0_level_0,job_title,location,connection,bert_fit,bert_re_ranking_fit_avg_method
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,0.640211,0.909415
6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.90548,0.877775
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.902632,0.874501
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.902632,0.874501
68,Human Resources Specialist at Luxottica,Greater New York City Area,500+,0.749704,0.863066
101,Human Resources Generalist at Loparex,"Raleigh-Durham, North Carolina Area",500+,0.681012,0.849107
73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.748516,0.828342
89,Director Human Resources at EY,Greater Atlanta Area,349,0.745304,0.827598
71,"Human Resources Generalist at ScottMadden, Inc.","Raleigh-Durham, North Carolina Area",500+,0.662071,0.826175
74,Human Resources Professional,Greater Boston Area,16,0.814312,0.823679


##### SBERT

In [48]:
df_v10 = df_v6.copy()
df_v10.head()

Unnamed: 0_level_0,job_title,location,connection,sbert_fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.573268
2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.239483
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807
4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.380222
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.230512


In [49]:
keywords = 'aspiring human resources'
starred_candidates = [78]
star_candidates_kw = get_starred_candidates_keywords(starred_candidates, df_v10)
txt_emb_sbert = [embedding.detach().numpy() for embedding in df_v10['job_title'].apply(string_to_sbert_embedding)]
kw_e_sbert = get_avg_embeddings_sbert(keywords, star_candidates_kw)
kw_emb_sbert = kw_e_sbert.detach().numpy()   
df_v10['sbert_re_ranked_fit_avg_method'] = [cosine_similarity(txt_emb, kw_emb_sbert).item() for txt_emb in txt_emb_sbert]
df_sbert_re_ranked_avg_method = df_v10.sort_values('sbert_re_ranked_fit_avg_method', ascending=False)
df_sbert_re_ranked_avg_method

Unnamed: 0_level_0,job_title,location,connection,sbert_fit,sbert_re_ranked_fit_avg_method
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.949807,0.892753
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.949807,0.892753
6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.928035,0.883845
78,Human Resources Generalist at Schwan's,Amerika Birleşik Devletleri,500+,0.623912,0.865722
99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.808784,0.812383
74,Human Resources Professional,Greater Boston Area,16,0.794648,0.800659
10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.744711,0.792237
28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.799642,0.778038
7,Student at Humber College and Aspiring Human R...,Kanada,61,0.75781,0.767962
67,"Human Resources, Staffing and Recruiting Profe...","Jackson, Mississippi Area",500+,0.745484,0.762507


### Conclusion:
As we can see, the re-ranking is working well for both the approaches.  We would now be able to star as many candidates as we need and re-rank successfully using BERT and SBERT models