<a href="https://colab.research.google.com/github/trtrgfh/GlVYfAbQjtuwUHlZ/blob/main/notebook/Potential_Talents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("/content/potential-talents.csv")
print(df.shape)
df.head()

(104, 5)


Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [3]:
job_titles = df["job_title"].to_list()
job_titles = [string.lower() for string in job_titles]
keywords = ["aspiring human resources", "seeking human resources"]

combined_list = keywords + job_titles

In [4]:
def df_fit(df, cos_similarity1, cos_similarity2):
  res_df1 = df.copy()
  res_df2 = df.copy()

  res_df1['fit'] = res_df1['fit'].fillna(pd.Series(cos_similarity1))
  res_df2['fit'] = res_df2['fit'].fillna(pd.Series(cos_similarity2))
  res_df1 = res_df1.sort_values(by=['fit'], ascending=False)
  res_df2 = res_df2.sort_values(by=['fit'], ascending=False)

  return res_df1, res_df2

# Bag-of-words

In [5]:
CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')
# fit and transform
Count_data = CountVec.fit_transform(combined_list)

cv_keyword_vec = Count_data[:len(keywords)].toarray()
cv_title_vec = Count_data[len(keywords):].toarray()
print("cv_keyword_vec shape: {}, cv_title_vec shape: {}".format(cv_keyword_vec.shape, cv_title_vec.shape))

cv_keyword_vec shape: (2, 181), cv_title_vec shape: (104, 181)


In [6]:
# Compute cosine similarity between the keyword vectors and job title vectors
cv_cs = cosine_similarity(cv_keyword_vec, cv_title_vec)

In [7]:
cv_df1, cv_df2 = df_fit(df, cv_cs[0], cv_cs[1])

In [8]:
cv_df1

Unnamed: 0,id,job_title,location,connection,fit
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.866025
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.866025
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.866025
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.866025
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.866025
...,...,...,...,...,...
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000
47,48,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000


In [9]:
cv_df2

Unnamed: 0,id,job_title,location,connection,fit
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.866025
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.866025
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.866025
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.833333
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.707107
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


### Re-ranking with a Starred Candidate

#### Approach 1

In [10]:
# Star the 7th candiate
star_key1 = keywords[0] + " " + cv_df1.iloc[6]["job_title"].lower()
star_key2 = keywords[1] + " " + cv_df2.iloc[6]["job_title"].lower()
star_keywords = [star_key1, star_key2]
star_keywords

['aspiring human resources aspiring human resources professional',
 'seeking human resources seeking human resources hris and generalist positions']

In [11]:
# Get the vector of new keywords
star_keyword_vec = CountVec.transform(star_keywords)
star_cs = cosine_similarity(star_keyword_vec, cv_title_vec)
star_cs.shape

(2, 104)

In [12]:
star_df1, star_df2 = df_fit(df, star_cs[0], star_cs[1])

In [13]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.970725
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


In [14]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.774597
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


#### Approach 2

In [15]:
# Star the 7th candiate
star_key1 = cv_df1.iloc[6]["job_title"].lower()
star_key2 = cv_df2.iloc[6]["job_title"].lower()
star_keywords = [star_key1, star_key2]
star_keywords

['aspiring human resources professional',
 'seeking human resources hris and generalist positions']

In [16]:
# Get the vector of new keywords
star_keyword_vec = CountVec.transform(star_keywords)
star_cs = cosine_similarity(star_keyword_vec, cv_title_vec)

In [17]:
# Take the average of the starred keywords and original keywords
star_cs = (star_cs + cv_cs) / 2

In [18]:
star_df1, star_df2 = df_fit(df, star_cs[0], star_cs[1])

In [19]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.933013
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


In [20]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.739199
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


# Tf-Idf vectorizer

In [21]:
tfidf_vec = TfidfVectorizer(use_idf=True,
                        smooth_idf=True,
                        ngram_range=(1,1),stop_words='english')

tfidf_data = tfidf_vec.fit_transform(combined_list)

tfidf_keyword_vec = tfidf_data[:len(keywords)].toarray()
tfidf_title_vec = tfidf_data[len(keywords):].toarray()
print("tfidf_keyword_vec shape: {}, tfidf_title_vec shape: {}".format(cv_keyword_vec.shape, cv_title_vec.shape))

tfidf_keyword_vec shape: (2, 181), tfidf_title_vec shape: (104, 181)


In [22]:
tfidf_cs = cosine_similarity(tfidf_keyword_vec, tfidf_title_vec)

In [23]:
tfidf_df1, tfidf_df2 = df_fit(df, tfidf_cs[0], tfidf_cs[1])

In [24]:
tfidf_df1

Unnamed: 0,id,job_title,location,connection,fit
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.749144
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.749144
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.749144
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.749144
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.749144
...,...,...,...,...,...
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
46,47,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
17,18,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000


In [25]:
tfidf_df2

Unnamed: 0,id,job_title,location,connection,fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.658595
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.658595
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.638511
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.615664
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.481908
...,...,...,...,...,...
31,32,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000


### Re-ranking with a Starred Candidate

#### Approach 1

In [26]:
# Star the 7th candiate
star_key1 = keywords[0] + " " + tfidf_df1.iloc[6]["job_title"].lower()
star_key2 = keywords[1] + " " + tfidf_df2.iloc[6]["job_title"].lower()
star_keywords = [star_key1, star_key2]
star_keywords

['aspiring human resources aspiring human resources professional',
 'seeking human resources seeking human resources hris and generalist positions']

In [27]:
# Get the vector of new keywords
star_keyword_vec = tfidf_vec.transform(star_keywords)
star_cs = cosine_similarity(star_keyword_vec, cv_title_vec)

In [28]:
star_df1, star_df2 = df_fit(df, star_cs[0], star_cs[1])

In [29]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.985222
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.985222
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.985222
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.985222
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.985222
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


In [30]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.969447
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.969447
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.969447
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.969447
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.610397
...,...,...,...,...,...
31,32,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000


#### Approach 2

In [31]:
# Star the 7th candiate
star_key1 = tfidf_df1.iloc[6]["job_title"].lower()
star_key2 = tfidf_df2.iloc[6]["job_title"].lower()
star_keywords = [star_key1, star_key2]
star_keywords

['aspiring human resources professional',
 'seeking human resources hris and generalist positions']

In [32]:
# Get the vector of new keywords
star_keyword_vec = CountVec.transform(star_keywords)
star_cs = cosine_similarity(star_keyword_vec, cv_title_vec)

In [33]:
# Take the average of the starred keywords and original keywords
star_cs = (star_cs + tfidf_cs) / 2

In [34]:
star_df1, star_df2 = df_fit(df, star_cs[0], star_cs[1])

In [35]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.874572
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.874572
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.874572
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.874572
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.874572
...,...,...,...,...,...
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
50,51,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


In [36]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.740954
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.740954
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.740954
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.740954
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.635484
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


# Word2vec

In [37]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.models import Word2Vec as w2v
from sklearn.decomposition import PCA

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
sw = stopwords.words('english')

In [39]:
# Tokenize each job title
word2vec_tokens = [word_tokenize(job_title) for job_title in job_titles]

In [40]:
# Remove stop words and puntuations
word2vec_clean = []

for line in word2vec_tokens:
  tokens = []
  for word in line:
    if (word not in sw and word not in string.punctuation):
        tokens.append(word)
  word2vec_clean.append(tokens)

In [41]:
w = w2v(
    word2vec_clean,
    min_count=1,
    sg = 1,
    window=5
)

# Create word embedding
word2vec_df = (
    pd.DataFrame(
        [w.wv.get_vector(str(n)) for n in w.wv.key_to_index],
        index = w.wv.key_to_index
    )
)
print(word2vec_df.shape)
word2vec_df.head()

(185, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
human,-0.000721,0.001137,0.004927,0.009055,-0.008244,-0.008943,0.007436,0.010913,-0.005582,-0.004577,...,0.003089,0.001556,0.003665,0.001883,0.011729,0.006008,-0.008459,-0.00742,0.001335,0.005793
resources,-0.008932,0.004368,0.005016,0.005808,0.008211,-0.007896,0.001784,0.007625,-0.003179,-0.006778,...,0.002101,-0.000378,0.00267,-0.006512,-0.000931,0.003528,0.005826,-0.002526,-0.009151,0.004093
aspiring,0.000145,0.003494,-0.007041,-0.001435,0.008183,0.006441,-0.003321,0.003717,-0.008738,0.005776,...,-0.003684,0.006493,0.009174,-0.00318,0.008934,0.005942,0.006044,0.000199,0.00857,-0.007339
professional,-0.008349,0.009518,-0.000155,-0.001794,0.0049,-0.004827,0.003044,0.007707,0.0059,-0.007954,...,-0.006945,-0.000452,-0.000588,-0.001859,0.010418,-4.3e-05,0.006028,-0.00758,-0.002253,-0.005721
student,-0.007151,0.001511,-0.007245,-0.002277,0.003972,0.005233,0.001529,0.002735,-0.004286,0.006964,...,0.003588,-0.004248,0.005417,-0.003617,0.003296,-0.007846,0.006414,0.004797,0.000978,0.002826


In [42]:
# Get document embedding
word2vec_keyword_vec1 = np.zeros((1, 100))
word2vec_keyword_vec2 = np.zeros((1, 100))
word2vec_title_vec = np.zeros((104, 100))

for keyword in ["aspiring", "human", "resources"]:
  word2vec_keyword_vec1 += w.wv.get_vector(keyword)

for keyword in ["seeking", "human", "resources"]:
  word2vec_keyword_vec2 += w.wv.get_vector(keyword)

for i in range(len(word2vec_clean)):
  for word in word2vec_clean[i]:
    word2vec_title_vec[i] += w.wv.get_vector(word)

In [43]:
word2vec_cs1 = cosine_similarity(word2vec_keyword_vec1, word2vec_title_vec)
word2vec_cs2 = cosine_similarity(word2vec_keyword_vec2, word2vec_title_vec)
print("word2vec_cs shape: {}".format(word2vec_cs1.shape))

word2vec_cs shape: (1, 104)


In [44]:
word2vec_df1, word2vec_df2 = df_fit(df, word2vec_cs1[0], word2vec_cs2[0])

In [45]:
word2vec_df1

Unnamed: 0,id,job_title,location,connection,fit
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.898957
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.898957
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.898957
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.898957
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.898957
...,...,...,...,...,...
17,18,People Development Coordinator at Ryan,"Denton, Texas",500+,0.029081
58,59,People Development Coordinator at Ryan,"Denton, Texas",500+,0.029081
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.029081
91,92,Seeking employment opportunities within Custom...,"Torrance, California",64,0.027599


In [46]:
word2vec_df2

Unnamed: 0,id,job_title,location,connection,fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.901433
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.877797
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.877797
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.843673
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.752123
...,...,...,...,...,...
7,8,HR Senior Specialist,San Francisco Bay Area,500+,0.045769
60,61,HR Senior Specialist,San Francisco Bay Area,500+,0.045769
50,51,HR Senior Specialist,San Francisco Bay Area,500+,0.045769
103,104,Director Of Administration at Excellence Logging,"Katy, Texas",500+,0.019210


### Re-ranking with a Starred Candidate

#### Approach 1

In [47]:
# Star the 7th candiate
star_key1 = keywords[0] + " " + word2vec_df1.iloc[6]["job_title"].lower()
star_key2 = keywords[1] + " " + word2vec_df2.iloc[6]["job_title"].lower()
print(star_key1)
print(star_key2)
star_key1 = star_key1.split()
star_key2 = star_key2.split()

aspiring human resources aspiring human resources professional
seeking human resources seeking human resources hris and generalist positions


In [48]:
# Get document embedding
star_keyword_vec1 = np.zeros((1, 100))
star_keyword_vec2 = np.zeros((1, 100))

for keyword in star_key1:
  if keyword not in sw:
    star_keyword_vec1 += w.wv.get_vector(keyword)

for keyword in star_key2:
  if keyword not in sw:
    star_keyword_vec2 += w.wv.get_vector(keyword)

star_cs1 = cosine_similarity(star_keyword_vec1, word2vec_title_vec)
star_cs2 = cosine_similarity(star_keyword_vec2, word2vec_title_vec)

In [49]:
star_df1, star_df2 = df_fit(df, star_cs1[0], star_cs2[0])

In [50]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.972292
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.972292
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.972292
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.972292
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.972292
...,...,...,...,...,...
17,18,People Development Coordinator at Ryan,"Denton, Texas",500+,0.036807
46,47,People Development Coordinator at Ryan,"Denton, Texas",500+,0.036807
58,59,People Development Coordinator at Ryan,"Denton, Texas",500+,0.036807
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.036807


In [51]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.956578
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.956578
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.956578
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.956578
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.818093
...,...,...,...,...,...
37,38,HR Senior Specialist,San Francisco Bay Area,500+,0.033467
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.033467
102,103,Always set them up for Success,Greater Los Angeles Area,500+,0.031630
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,0.016665


#### Approach 2

In [52]:
# Star the 7th candiate
star_key1 = word2vec_df1.iloc[6]["job_title"].lower()
star_key2 = word2vec_df2.iloc[6]["job_title"].lower()
print(star_key1)
print(star_key2)
star_key1 = star_key1.split()
star_key2 = star_key2.split()

aspiring human resources professional
seeking human resources hris and generalist positions


In [53]:
# Get document embedding
star_keyword_vec1 = np.zeros((1, 100))
star_keyword_vec2 = np.zeros((1, 100))

for keyword in star_key1:
  if keyword not in sw:
    star_keyword_vec1 += w.wv.get_vector(keyword)

for keyword in star_key2:
  if keyword not in sw:
    star_keyword_vec2 += w.wv.get_vector(keyword)

star_cs1 = cosine_similarity(star_keyword_vec1, word2vec_title_vec)
star_cs2 = cosine_similarity(star_keyword_vec2, word2vec_title_vec)

In [54]:
# Take the average of the starred keywords and original keywords
star_cs1 = (star_cs1 + word2vec_cs1) / 2
star_cs2 = (star_cs2 + word2vec_cs2) / 2

In [55]:
star_df1, star_df2 = df_fit(df, star_cs1[0], star_cs2[0])

In [56]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.937604
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.937604
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.937604
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.937604
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.937604
...,...,...,...,...,...
17,18,People Development Coordinator at Ryan,"Denton, Texas",500+,0.035250
46,47,People Development Coordinator at Ryan,"Denton, Texas",500+,0.035250
58,59,People Development Coordinator at Ryan,"Denton, Texas",500+,0.035250
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.035250


In [57]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.876061
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.876061
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.876061
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.876061
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.783377
...,...,...,...,...,...
7,8,HR Senior Specialist,San Francisco Bay Area,500+,0.033482
60,61,HR Senior Specialist,San Francisco Bay Area,500+,0.033482
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.033482
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,0.020200


## Glove

In [58]:
glove_embeddings = {}
# Download embeddings from https://nlp.stanford.edu/projects/glove/
# Load the pre-trained GloVe word vectors
with open("/content/glove.6B.100d.txt", 'r', encoding="utf-8") as f:
  for line in f:
      values = line.split()
      word = values[0]
      vector = np.asarray(values[1:], "float32")
      glove_embeddings[word] = vector

In [59]:
sw = stopwords.words('english')
# Tokenize each job title
glove_tokens = [word_tokenize(job_title) for job_title in job_titles]
# Remove stop words and puntuations
glove_clean = []

for line in word2vec_tokens:
  tokens = []
  for word in line:
    if (word not in sw and word not in string.punctuation):
        tokens.append(word)
  glove_clean.append(tokens)

In [60]:
glove_embeddings['human'].shape

(100,)

In [61]:
# Get document embedding
glove_keyword_vec1 = np.zeros((1, 100))
glove_keyword_vec2 = np.zeros((1, 100))
glove_title_vec = np.zeros((104, 100))

for keyword in ["aspiring", "human", "resources"]:
  if keyword in glove_embeddings:
    glove_keyword_vec1 += glove_embeddings[keyword]

for keyword in ["seeking", "human", "resources"]:
  if keyword in glove_embeddings:
    glove_keyword_vec2 += glove_embeddings[keyword]

for i in range(len(glove_clean)):
  for word in glove_clean[i]:
    if word in glove_embeddings:
      glove_title_vec[i] += glove_embeddings[word]

In [62]:
glove_cs1 = cosine_similarity(glove_keyword_vec1, glove_title_vec)
glove_cs2 = cosine_similarity(glove_keyword_vec2, glove_title_vec)
print("glove_cs shape: {}".format(glove_cs1.shape))

glove_cs shape: (1, 104)


In [63]:
glove_df1, glove_df2 = df_fit(df, glove_cs1[0], glove_cs2[0])

In [64]:
glove_df1

Unnamed: 0,id,job_title,location,connection,fit
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.953001
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.953001
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.953001
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.953001
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.953001
...,...,...,...,...,...
10,11,Student at Chapman University,"Lake Forest, California",2,0.459638
47,48,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.437611
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.437611
34,35,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.437611


In [65]:
glove_df2

Unnamed: 0,id,job_title,location,connection,fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.967874
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.967874
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.962921
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.939056
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.922638
...,...,...,...,...,...
86,87,Bachelor of Science in Biology from Victoria U...,"Baltimore, Maryland",40,0.430679
53,54,Student at Chapman University,"Lake Forest, California",2,0.416568
10,11,Student at Chapman University,"Lake Forest, California",2,0.416568
40,41,Student at Chapman University,"Lake Forest, California",2,0.416568


### Re-ranking with a Starred Candidate

#### Approach 1

In [66]:
# Star the 7th candiate
star_key1 = keywords[0] + " " + glove_df1.iloc[6]["job_title"].lower()
star_key2 = keywords[1] + " " + glove_df2.iloc[6]["job_title"].lower()
print(star_key1)
print(star_key2)
star_key1 = star_key1.split()
star_key2 = star_key2.split()

aspiring human resources aspiring human resources professional
seeking human resources seeking human resources hris and generalist positions


In [67]:
# Get document embedding
star_keyword_vec1 = np.zeros((1, 100))
star_keyword_vec2 = np.zeros((1, 100))

for keyword in star_key1:
  if keyword in glove_embeddings:
    star_keyword_vec1 += glove_embeddings[keyword]

for keyword in star_key2:
  if keyword in glove_embeddings:
    star_keyword_vec2 += glove_embeddings[keyword]

star_cs1 = cosine_similarity(star_keyword_vec1, glove_title_vec)
star_cs2 = cosine_similarity(star_keyword_vec2, glove_title_vec)

In [68]:
star_df1, star_df2 = df_fit(df, star_cs1[0], star_cs2[0])

In [69]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.990475
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.990475
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.990475
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.990475
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.990475
...,...,...,...,...,...
84,85,RRP Brand Portfolio Executive at JTI (Japan To...,Greater Philadelphia Area,500+,0.513644
34,35,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.474955
47,48,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.474955
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.474955


In [70]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.975976
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.975976
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.975976
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.975976
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.971772
...,...,...,...,...,...
86,87,Bachelor of Science in Biology from Victoria U...,"Baltimore, Maryland",40,0.460053
40,41,Student at Chapman University,"Lake Forest, California",2,0.450510
10,11,Student at Chapman University,"Lake Forest, California",2,0.450510
53,54,Student at Chapman University,"Lake Forest, California",2,0.450510


#### Approach 2

In [71]:
# Star the 7th candiate
star_key1 = word2vec_df1.iloc[6]["job_title"].lower()
star_key2 = word2vec_df2.iloc[6]["job_title"].lower()
print(star_key1)
print(star_key2)
star_key1 = star_key1.split()
star_key2 = star_key2.split()

aspiring human resources professional
seeking human resources hris and generalist positions


In [72]:
# Get document embedding
star_keyword_vec1 = np.zeros((1, 100))
star_keyword_vec2 = np.zeros((1, 100))

for keyword in star_key1:
  if keyword in glove_embeddings:
    star_keyword_vec1 += glove_embeddings[keyword]

for keyword in star_key2:
  if keyword in glove_embeddings:
    star_keyword_vec2 += glove_embeddings[keyword]

star_cs1 = cosine_similarity(star_keyword_vec1, glove_title_vec)
star_cs2 = cosine_similarity(star_keyword_vec2, glove_title_vec)

In [73]:
# Take the average of the starred keywords and original keywords
star_cs1 = (star_cs1 + glove_cs1) / 2
star_cs2 = (star_cs2 + glove_cs2) / 2

In [74]:
star_df1, star_df2 = df_fit(df, star_cs1[0], star_cs2[0])

In [75]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.974360
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.974360
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.974360
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.974360
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.974360
...,...,...,...,...,...
84,85,RRP Brand Portfolio Executive at JTI (Japan To...,Greater Philadelphia Area,500+,0.503249
47,48,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.465103
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.465103
34,35,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.465103


In [76]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.955459
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.951585
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.951585
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.951585
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.951585
...,...,...,...,...,...
86,87,Bachelor of Science in Biology from Victoria U...,"Baltimore, Maryland",40,0.447793
40,41,Student at Chapman University,"Lake Forest, California",2,0.437572
10,11,Student at Chapman University,"Lake Forest, California",2,0.437572
53,54,Student at Chapman University,"Lake Forest, California",2,0.437572


## Bert

In [77]:
# !pip install transformers

In [78]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [79]:
def get_bert_embeddings(text):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)

    # Convert tokens to token IDs
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Convert token IDs to tensors
    input_tensor = torch.tensor([token_ids])

    # Get the BERT model outputs
    with torch.no_grad():
        outputs = model(input_tensor)

    # Get the word embeddings
    embeddings = outputs.last_hidden_state.squeeze()

    return embeddings

In [80]:
# Get keyword embeddings
keyword1 = "aspiring human resources"
keyword2 = "seeking human resources"

bert_key_embeddings1 = get_bert_embeddings(keyword1).sum(axis=0).unsqueeze(0)
bert_key_embeddings2 = get_bert_embeddings(keyword2).sum(axis=0).unsqueeze(0)

# Print the shape of the word embeddings
print(bert_key_embeddings1.shape)
print(bert_key_embeddings2.shape)

torch.Size([1, 768])
torch.Size([1, 768])


In [81]:
# Get document embeddings
bert_embeddings = []

# Iterate over each string
for title in job_titles:
  title_emb = get_bert_embeddings(title)
  if title_emb.ndim != 2:
    title_emb = title_emb.reshape(1, -1)
    # Append word embeddings
  bert_embeddings.append(np.array(title_emb.sum(axis=0)))

In [82]:
bert_cs1 = cosine_similarity(np.array(bert_key_embeddings1), np.array(bert_embeddings))
bert_cs2 = cosine_similarity(np.array(bert_key_embeddings2), np.array(bert_embeddings))
print("bert_cs1 shape: {}".format(bert_cs1.shape))

bert_cs1 shape: (1, 104)


In [83]:
bert_df1, bert_df2 = df_fit(df, bert_cs1[0], bert_cs2[0])

In [84]:
bert_df1

Unnamed: 0,id,job_title,location,connection,fit
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.722275
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.722275
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.722275
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.722275
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.722275
...,...,...,...,...,...
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.330464
93,94,Seeking Human Resources Opportunities. Open t...,Amerika Birleşik Devletleri,415,0.328400
95,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.303623
71,72,Business Management Major and Aspiring Human R...,"Monroe, Louisiana Area",5,0.291840


In [85]:
bert_df2

Unnamed: 0,id,job_title,location,connection,fit
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.681607
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.681607
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.681607
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.681607
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.681607
...,...,...,...,...,...
73,74,Human Resources Professional,Greater Boston Area,16,0.354283
95,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.342466
68,69,"Director of Human Resources North America, Gro...","Greater Grand Rapids, Michigan Area",500+,0.329675
71,72,Business Management Major and Aspiring Human R...,"Monroe, Louisiana Area",5,0.329672


### Re-ranking with a Starred Candidate

#### Approach 1

In [86]:
# Star the 7th candiate
star_key1 = keywords[0] + " " + bert_df1.iloc[6]["job_title"].lower()
star_key2 = keywords[1] + " " + bert_df2.iloc[6]["job_title"].lower()
print(star_key1)
print(star_key2)

aspiring human resources aspiring human resources professional
seeking human resources seeking human resources opportunities


In [87]:
# Get keyword embeddings
star_keyword_vec1 = get_bert_embeddings(star_key1).sum(axis=0).unsqueeze(0)
star_keyword_vec2 = get_bert_embeddings(star_key2).sum(axis=0).unsqueeze(0)

star_cs1 = cosine_similarity(np.array(star_keyword_vec1), bert_embeddings)
star_cs2 = cosine_similarity(np.array(star_keyword_vec2), bert_embeddings)

In [88]:
star_df1, star_df2 = df_fit(df, star_cs1[0], star_cs2[0])

In [89]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.761423
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.723223
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.723223
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.723223
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.723223
...,...,...,...,...,...
34,35,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.448244
47,48,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.448244
94,95,Student at Westfield State University,"Bridgewater, Massachusetts",57,0.417569
102,103,Always set them up for Success,Greater Los Angeles Area,500+,0.361383


In [90]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.836801
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.836801
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.732915
91,92,Seeking employment opportunities within Custom...,"Torrance, California",64,0.670953
75,76,Aspiring Human Resources Professional | Passio...,"New York, New York",212,0.663514
...,...,...,...,...,...
47,48,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.431958
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.431958
102,103,Always set them up for Success,Greater Los Angeles Area,500+,0.397575
70,71,"Human Resources Generalist at ScottMadden, Inc.","Raleigh-Durham, North Carolina Area",500+,0.391027


#### Approach 2

In [91]:
# Star the 7th candiate
star_key1 = bert_df1.iloc[6]["job_title"].lower()
star_key2 = bert_df2.iloc[6]["job_title"].lower()
print(star_key1)
print(star_key2)

aspiring human resources professional
seeking human resources opportunities


In [92]:
# Get keyword embeddings
star_keyword_vec1 = get_bert_embeddings(star_key1).sum(axis=0).unsqueeze(0)
star_keyword_vec2 = get_bert_embeddings(star_key2).sum(axis=0).unsqueeze(0)

star_cs1 = cosine_similarity(np.array(star_keyword_vec1), bert_embeddings)
star_cs2 = cosine_similarity(np.array(star_keyword_vec2), bert_embeddings)

In [93]:
# Take the average of the starred keywords and original keywords
star_cs1 = (star_cs1 + bert_cs1) / 2
star_cs2 = (star_cs2 + bert_cs2) / 2

In [94]:
star_df1, star_df2 = df_fit(df, star_cs1[0], star_cs2[0])

In [95]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.851590
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.851590
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.851590
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.851590
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.851590
...,...,...,...,...,...
68,69,"Director of Human Resources North America, Gro...","Greater Grand Rapids, Michigan Area",500+,0.356263
97,98,Student,"Houston, Texas Area",4,0.349707
93,94,Seeking Human Resources Opportunities. Open t...,Amerika Birleşik Devletleri,415,0.347315
95,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.342649


In [96]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.827114
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.827114
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.673240
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.665870
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.665870
...,...,...,...,...,...
8,9,Student at Humber College and Aspiring Human R...,Kanada,61,0.383760
24,25,Student at Humber College and Aspiring Human R...,Kanada,61,0.383760
78,79,Liberal Arts Major. Aspiring Human Resources A...,"Baton Rouge, Louisiana Area",7,0.383357
95,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.370209


## Sbert

In [97]:
# !pip install sentence-transformers

In [98]:
from sentence_transformers import SentenceTransformer

# Load the SBERT model
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode the sentences to obtain embeddings
sbert_embeddings = sbert_model.encode(job_titles)
print(sbert_embeddings.shape)

(104, 768)


In [99]:
sbert_keyword_vec1 = sbert_model.encode("aspiring human resources").reshape(1, -1)
sbert_keyword_vec2 = sbert_model.encode("seeking human resources").reshape(1, -1)
sbert_cs1 = cosine_similarity(sbert_keyword_vec1, sbert_embeddings)
sbert_cs2 = cosine_similarity(sbert_keyword_vec2, sbert_embeddings)

In [100]:
sbert_df1, sbert_df2 = df_fit(df, sbert_cs1[0], sbert_cs2[0])

In [101]:
sbert_df1

Unnamed: 0,id,job_title,location,connection,fit
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
...,...,...,...,...,...
11,12,"SVP, CHRO, Marketing & Communications, CSR Off...","Houston, Texas Area",500+,0.299499
84,85,RRP Brand Portfolio Executive at JTI (Japan To...,Greater Philadelphia Area,500+,0.259422
95,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.252835
92,93,Admissions Representative at Community medical...,"Long Beach, California",9,0.164934


In [102]:
sbert_df2

Unnamed: 0,id,job_title,location,connection,fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.969809
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.969809
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.964130
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.843657
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.843657
...,...,...,...,...,...
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.220571
84,85,RRP Brand Portfolio Executive at JTI (Japan To...,Greater Philadelphia Area,500+,0.197445
92,93,Admissions Representative at Community medical...,"Long Beach, California",9,0.186940
95,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.126020


### Re-ranking with a Starred Candidate

#### Approach 1

In [103]:
# Star the 7th candiate
star_key1 = keywords[0] + " " + sbert_df1.iloc[6]["job_title"].lower()
star_key2 = keywords[1] + " " + sbert_df2.iloc[6]["job_title"].lower()
print(star_key1)
print(star_key2)

aspiring human resources aspiring human resources professional
seeking human resources seeking human resources hris and generalist positions


In [104]:
# Get keyword embeddings
star_keyword_vec1 = sbert_model.encode(star_key1).reshape(1, -1)
star_keyword_vec2 = sbert_model.encode(star_key2).reshape(1, -1)

star_cs1 = cosine_similarity(sbert_keyword_vec1, sbert_embeddings)
star_cs2 = cosine_similarity(sbert_keyword_vec2, sbert_embeddings)

In [105]:
star_df1, star_df2 = df_fit(df, star_cs1[0], star_cs2[0])

In [106]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
...,...,...,...,...,...
11,12,"SVP, CHRO, Marketing & Communications, CSR Off...","Houston, Texas Area",500+,0.299499
84,85,RRP Brand Portfolio Executive at JTI (Japan To...,Greater Philadelphia Area,500+,0.259422
95,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.252835
92,93,Admissions Representative at Community medical...,"Long Beach, California",9,0.164934


In [107]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.969809
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.969809
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.964130
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.843657
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.843657
...,...,...,...,...,...
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.220571
84,85,RRP Brand Portfolio Executive at JTI (Japan To...,Greater Philadelphia Area,500+,0.197445
92,93,Admissions Representative at Community medical...,"Long Beach, California",9,0.186940
95,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.126020


#### Approach 2

In [108]:
# Star the 7th candiate
star_key1 = sbert_df1.iloc[6]["job_title"].lower()
star_key2 = sbert_df2.iloc[6]["job_title"].lower()
print(star_key1)
print(star_key2)

aspiring human resources professional
seeking human resources hris and generalist positions


In [109]:
# Get keyword embeddings
star_keyword_vec1 = sbert_model.encode(star_key1).reshape(1, -1)
star_keyword_vec2 = sbert_model.encode(star_key2).reshape(1, -1)

star_cs1 = cosine_similarity(sbert_keyword_vec1, sbert_embeddings)
star_cs2 = cosine_similarity(sbert_keyword_vec2, sbert_embeddings)

In [110]:
# Take the average of the starred keywords and original keywords
star_cs1 = (star_cs1 + sbert_cs1) / 2
star_cs2 = (star_cs2 + sbert_cs2) / 2

In [111]:
star_df1, star_df2 = df_fit(df, star_cs1[0], star_cs2[0])

In [112]:
star_df1

Unnamed: 0,id,job_title,location,connection,fit
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.942610
...,...,...,...,...,...
11,12,"SVP, CHRO, Marketing & Communications, CSR Off...","Houston, Texas Area",500+,0.299499
84,85,RRP Brand Portfolio Executive at JTI (Japan To...,Greater Philadelphia Area,500+,0.259422
95,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.252835
92,93,Admissions Representative at Community medical...,"Long Beach, California",9,0.164934


In [113]:
star_df2

Unnamed: 0,id,job_title,location,connection,fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.969809
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.969809
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.964130
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.843657
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.843657
...,...,...,...,...,...
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,0.220571
84,85,RRP Brand Portfolio Executive at JTI (Japan To...,Greater Philadelphia Area,500+,0.197445
92,93,Admissions Representative at Community medical...,"Long Beach, California",9,0.186940
95,96,Student at Indiana University Kokomo - Busines...,"Lafayette, Indiana",19,0.126020
