<a href="https://colab.research.google.com/github/trtrgfh/GlVYfAbQjtuwUHlZ/blob/main/Potential_Talents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("/content/potential-talents.csv")
print(df.shape)
df.head()

(104, 5)


Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [3]:
job_titles = df["job_title"].to_list()
job_titles = [string.lower() for string in job_titles]
keywords = ["aspiring human resources", "seeking human resources"]

combined_list = keywords + job_titles

# Bag-of-words

In [4]:
from nltk.stem import PorterStemmer

In [5]:
CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')
# fit and transform
Count_data = CountVec.fit_transform(combined_list)

cv_keyword_vec = Count_data[:len(keywords)].toarray()
cv_title_vec = Count_data[len(keywords):].toarray()
print("cv_keyword_vec shape: {}, cv_title_vec shape: {}".format(cv_keyword_vec.shape, cv_title_vec.shape))

cv_keyword_vec shape: (2, 181), cv_title_vec shape: (104, 181)


In [6]:
# Compute cosine similarity between the keyword vectors and job title vectors
cv_cs = cosine_similarity(cv_keyword_vec, cv_title_vec)

In [7]:
# Get the cos similarity of each candidata with reaspect to candidata id
cv_res = pd.DataFrame(cv_cs, columns=df['id'])
cv_res

id,1,2,3,4,5,6,7,8,9,10,...,95,96,97,98,99,100,101,102,103,104
0,0.5,0.0,0.866025,0.0,0.0,0.866025,0.654654,0.0,0.654654,0.471405,...,0.0,0.0,0.866025,0.0,0.57735,0.680414,0.57735,0.0,0.0,0.0
1,0.333333,0.0,0.57735,0.0,0.0,0.57735,0.436436,0.0,0.436436,0.707107,...,0.0,0.0,0.57735,0.0,0.866025,0.680414,0.57735,0.0,0.0,0.0


In [8]:
cv_res_key1 = cv_res.iloc[0].reset_index(drop=True)
cv_res_key2 = cv_res.iloc[1].reset_index(drop=True)

# Create copies and fill "fit" with the cos similarity
cv_df1 = df.copy()
cv_df2 = df.copy()
cv_df1['fit'] = cv_df1['fit'].fillna(cv_res_key1)
cv_df2['fit'] = cv_df2['fit'].fillna(cv_res_key2)

In [9]:
cv_df1 = cv_df1.sort_values(by=['fit'], ascending=False)
cv_df1

Unnamed: 0,id,job_title,location,connection,fit
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.866025
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.866025
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.866025
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.866025
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.866025
...,...,...,...,...,...
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000
47,48,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000


In [10]:
cv_df2 = cv_df2.sort_values(by=['fit'], ascending=False)
cv_df2

Unnamed: 0,id,job_title,location,connection,fit
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.866025
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.866025
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.866025
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.833333
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.707107
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


### Re-ranking with a Starred Candidate

#### Approach 1

In [11]:
# Star the 7th candiate
star_key1 = keywords[0] + " " + cv_df1.iloc[6]["job_title"].lower()
star_key2 = keywords[1] + " " + cv_df2.iloc[6]["job_title"].lower()
star_keywords = [star_key1, star_key2]
star_keywords

['aspiring human resources aspiring human resources professional',
 'seeking human resources seeking human resources hris and generalist positions']

In [12]:
# Get the vector of new keywords
star_keyword_vec = CountVec.transform(star_keywords)
star_cs = cosine_similarity(star_keyword_vec, cv_title_vec)

In [13]:
star_res = pd.DataFrame(star_cs, columns=df['id'])
star_res

id,1,2,3,4,5,6,7,8,9,10,...,95,96,97,98,99,100,101,102,103,104
0,0.560449,0.0,0.970725,0.0,0.0,0.83205,0.628971,0.0,0.628971,0.452911,...,0.0,0.0,0.970725,0.0,0.5547,0.65372,0.5547,0.0,0.0,0.0
1,0.298142,0.0,0.516398,0.0,0.0,0.516398,0.48795,0.0,0.48795,0.948683,...,0.0,0.0,0.516398,0.0,0.774597,0.608581,0.645497,0.0,0.0,0.0


In [14]:
star_res_key1 = star_res.iloc[0].reset_index(drop=True)
star_res_key2 = star_res.iloc[1].reset_index(drop=True)

# Create copies and fill "fit" with the cos similarity
star_df1 = df.copy()
star_df2 = df.copy()
star_df1['fit'] = star_df1['fit'].fillna(star_res_key1)
star_df2['fit'] = star_df2['fit'].fillna(star_res_key2)

In [15]:
star_df1 = star_df1.sort_values(by=['fit'], ascending=False)
star_df1

Unnamed: 0,id,job_title,location,connection,fit
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.970725
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


In [16]:
star_df2 = star_df2.sort_values(by=['fit'], ascending=False)
star_df2

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.774597
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


#### Approach 2

In [17]:
# Star the 7th candiate
star_key1 = cv_df1.iloc[6]["job_title"].lower()
star_key2 = cv_df2.iloc[6]["job_title"].lower()
star_keywords = [star_key1, star_key2]
star_keywords

['aspiring human resources professional',
 'seeking human resources hris and generalist positions']

In [18]:
# Get the vector of new keywords
star_keyword_vec = CountVec.transform(star_keywords)
star_cv_cs = cosine_similarity(star_keyword_vec, cv_title_vec)

In [19]:
# Take the average of the starred keywords and original keywords
star_cv_cs = (star_cv_cs + cv_cs) / 2

In [20]:
star_res = pd.DataFrame(star_cs, columns=df['id'])
star_res

id,1,2,3,4,5,6,7,8,9,10,...,95,96,97,98,99,100,101,102,103,104
0,0.560449,0.0,0.970725,0.0,0.0,0.83205,0.628971,0.0,0.628971,0.452911,...,0.0,0.0,0.970725,0.0,0.5547,0.65372,0.5547,0.0,0.0,0.0
1,0.298142,0.0,0.516398,0.0,0.0,0.516398,0.48795,0.0,0.48795,0.948683,...,0.0,0.0,0.516398,0.0,0.774597,0.608581,0.645497,0.0,0.0,0.0


In [21]:
star_res_key1 = star_res.iloc[0].reset_index(drop=True)
star_res_key2 = star_res.iloc[1].reset_index(drop=True)

# Create copies and fill "fit" with the cos similarity
star_df1 = df.copy()
star_df2 = df.copy()
star_df1['fit'] = star_df1['fit'].fillna(star_res_key1)
star_df2['fit'] = star_df2['fit'].fillna(star_res_key2)

In [22]:
star_df1 = star_df1.sort_values(by=['fit'], ascending=False)
star_df1

Unnamed: 0,id,job_title,location,connection,fit
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.970725
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.970725
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


In [23]:
star_df2 = star_df2.sort_values(by=['fit'], ascending=False)
star_df2

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.948683
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.774597
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


# Tf-Idf vectorizer

In [24]:
tfidf_vec = TfidfVectorizer(use_idf=True,
                        smooth_idf=True,
                        ngram_range=(1,1),stop_words='english')

tfidf_data = tfidf_vec.fit_transform(combined_list)

tfidf_keyword_vec = tfidf_data[:len(keywords)].toarray()
tfidf_title_vec = tfidf_data[len(keywords):].toarray()
print("tfidf_keyword_vec shape: {}, tfidf_title_vec shape: {}".format(cv_keyword_vec.shape, cv_title_vec.shape))

tfidf_keyword_vec shape: (2, 181), tfidf_title_vec shape: (104, 181)


In [25]:
tfidf_cs = cosine_similarity(tfidf_keyword_vec, tfidf_title_vec)

In [26]:
tfidf_res = pd.DataFrame(tfidf_cs,columns=df['id'])

In [27]:
tfidf_res_key1 = tfidf_res.iloc[0].reset_index(drop=True)
tfidf_res_key2 = tfidf_res.iloc[1].reset_index(drop=True)

# Create copies and fill "fit" with the cos similarity
tfidf_df1 = df.copy()
tfidf_df2 = df.copy()
tfidf_df1['fit'] = tfidf_df1['fit'].fillna(tfidf_res_key1)
tfidf_df2['fit'] = tfidf_df2['fit'].fillna(tfidf_res_key2)

In [28]:
tfidf_df1 = tfidf_df1.sort_values(by=['fit'], ascending=False)
tfidf_df1

Unnamed: 0,id,job_title,location,connection,fit
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.749144
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.749144
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.749144
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.749144
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.749144
...,...,...,...,...,...
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
46,47,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
17,18,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000


In [29]:
tfidf_df2 = tfidf_df2.sort_values(by=['fit'], ascending=False)
tfidf_df2

Unnamed: 0,id,job_title,location,connection,fit
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.658595
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.658595
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.638511
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.615664
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.481908
...,...,...,...,...,...
31,32,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000


### Re-ranking with a Starred Candidate

#### Approach 1

In [30]:
# Star the 7th candiate
star_key1 = keywords[0] + " " + tfidf_df1.iloc[6]["job_title"].lower()
star_key2 = keywords[1] + " " + tfidf_df2.iloc[6]["job_title"].lower()
star_keywords = [star_key1, star_key2]
star_keywords

['aspiring human resources aspiring human resources professional',
 'seeking human resources seeking human resources hris and generalist positions']

In [31]:
# Get the vector of new keywords
star_tfidf_keyword_vec = tfidf_vec.transform(star_keywords)
star_tfidf_cs = cosine_similarity(star_tfidf_keyword_vec, cv_title_vec)

In [32]:
star_tfidf_res = pd.DataFrame(star_tfidf_cs, columns=df['id'])
star_tfidf_res

id,1,2,3,4,5,6,7,8,9,10,...,95,96,97,98,99,100,101,102,103,104
0,0.568818,0.0,0.985222,0.0,0.0,0.783045,0.591926,0.0,0.591926,0.380347,...,0.0,0.0,0.985222,0.0,0.465829,0.588725,0.465829,0.0,0.0,0.0
1,0.181861,0.0,0.314992,0.0,0.0,0.314992,0.354686,0.0,0.354686,0.969447,...,0.0,0.0,0.314992,0.0,0.610397,0.436233,0.469205,0.0,0.0,0.0


In [33]:
star_tfidf_res_key1 = star_tfidf_res.iloc[0].reset_index(drop=True)
star_tfidf_res_key2 = star_tfidf_res.iloc[1].reset_index(drop=True)

# Create copies and fill "fit" with the cos similarity
star_tfidf_df1 = df.copy()
star_tfidf_df2 = df.copy()
star_tfidf_df1['fit'] = star_tfidf_df1['fit'].fillna(star_tfidf_res_key1)
star_tfidf_df2['fit'] = star_tfidf_df2['fit'].fillna(star_tfidf_res_key2)

In [34]:
star_tfidf_df1 = star_tfidf_df1.sort_values(by=['fit'], ascending=False)
star_tfidf_df1

Unnamed: 0,id,job_title,location,connection,fit
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.985222
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.985222
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.985222
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.985222
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.985222
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


In [35]:
star_tfidf_df2 = star_tfidf_df2.sort_values(by=['fit'], ascending=False)
star_tfidf_df2

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.969447
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.969447
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.969447
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.969447
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.610397
...,...,...,...,...,...
31,32,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000


#### Approach 2

In [36]:
# Star the 7th candiate
star_key3 = tfidf_df1.iloc[6]["job_title"].lower()
star_key4 = tfidf_df2.iloc[6]["job_title"].lower()
star_keywords2 = [star_key3, star_key4]
star_keywords2

['aspiring human resources professional',
 'seeking human resources hris and generalist positions']

In [37]:
# Get the vector of new keywords
star_tfidf_keyword_vec2 = CountVec.transform(star_keywords2)
star_tfidf_cs2 = cosine_similarity(star_tfidf_keyword_vec2, cv_title_vec)

In [38]:
# Take the average of the starred keywords and original keywords
star_tfidf_cs2 = (star_tfidf_cs2 + cv_cs) / 2

In [39]:
star_tfidf_res2 = pd.DataFrame(star_tfidf_cs2, columns=df['id'])
star_tfidf_res2

id,1,2,3,4,5,6,7,8,9,10,...,95,96,97,98,99,100,101,102,103,104
0,0.538675,0.0,0.933013,0.0,0.0,0.808013,0.6108,0.0,0.6108,0.439826,...,0.0,0.0,0.933013,0.0,0.538675,0.634835,0.538675,0.0,0.0,0.0
1,0.284518,0.0,0.492799,0.0,0.0,0.492799,0.449673,0.0,0.449673,0.853553,...,0.0,0.0,0.492799,0.0,0.739199,0.58077,0.594861,0.0,0.0,0.0


In [40]:
star_tfidf_res_key3 = star_tfidf_res2.iloc[0].reset_index(drop=True)
star_tfidf_res_key4 = star_tfidf_res2.iloc[1].reset_index(drop=True)

# Create copies and fill "fit" with the cos similarity
star_tfidf_df3 = df.copy()
star_tfidf_df4 = df.copy()
star_tfidf_df3['fit'] = star_tfidf_df3['fit'].fillna(star_tfidf_res_key3)
star_tfidf_df4['fit'] = star_tfidf_df4['fit'].fillna(star_tfidf_res_key4)

In [41]:
star_tfidf_df3 = star_tfidf_df3.sort_values(by=['fit'], ascending=False)
star_tfidf_df3

Unnamed: 0,id,job_title,location,connection,fit
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.933013
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


In [42]:
star_tfidf_df4 = star_tfidf_df4.sort_values(by=['fit'], ascending=False)
star_tfidf_df4

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.739199
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


# Word2vec

In [43]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.models import Word2Vec as w2v
from sklearn.decomposition import PCA

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [44]:
sw = stopwords.words('english')

In [45]:
# Tokenize each job title
word2vec_tokens = [word_tokenize(job_title) for job_title in job_titles]

In [46]:
# Remove stop words and puntuations
word2vec_clean = []

for line in word2vec_tokens:
  tokens = []
  for word in line:
    if (word not in sw and word not in string.punctuation):
        tokens.append(word)
  word2vec_clean.append(tokens)

In [47]:
w = w2v(
    word2vec_clean,
    min_count=1,
    sg = 1,
    window=5
)

# Create word embedding
word2vec_df = (
    pd.DataFrame(
        [w.wv.get_vector(str(n)) for n in w.wv.key_to_index],
        index = w.wv.key_to_index
    )
)
print(word2vec_df.shape)
word2vec_df.head()

(185, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
human,-0.000721,0.001137,0.004927,0.009055,-0.008244,-0.008943,0.007436,0.010913,-0.005582,-0.004577,...,0.003089,0.001556,0.003665,0.001883,0.011729,0.006008,-0.008459,-0.00742,0.001335,0.005793
resources,-0.008932,0.004368,0.005016,0.005808,0.008211,-0.007896,0.001784,0.007625,-0.003179,-0.006778,...,0.002101,-0.000378,0.00267,-0.006512,-0.000931,0.003528,0.005826,-0.002526,-0.009151,0.004093
aspiring,0.000145,0.003494,-0.007041,-0.001435,0.008183,0.006441,-0.003321,0.003717,-0.008738,0.005776,...,-0.003684,0.006493,0.009174,-0.00318,0.008934,0.005942,0.006044,0.000199,0.00857,-0.007339
professional,-0.008349,0.009518,-0.000155,-0.001794,0.0049,-0.004827,0.003044,0.007707,0.0059,-0.007954,...,-0.006945,-0.000452,-0.000588,-0.001859,0.010418,-4.3e-05,0.006028,-0.00758,-0.002253,-0.005721
student,-0.007151,0.001511,-0.007245,-0.002277,0.003972,0.005233,0.001529,0.002735,-0.004286,0.006964,...,0.003588,-0.004248,0.005417,-0.003617,0.003296,-0.007846,0.006414,0.004797,0.000978,0.002826


In [48]:
# Get document embedding
word2vec_keyword_vec1 = np.zeros((1, 100))
word2vec_keyword_vec2 = np.zeros((1, 100))
word2vec_title_vec = np.zeros((104, 100))

for keyword in ["aspiring", "human", "resources"]:
  word2vec_keyword_vec1 += w.wv.get_vector(keyword)

for keyword in ["seeking", "human", "resources"]:
  word2vec_keyword_vec2 += w.wv.get_vector(keyword)

for i in range(len(word2vec_clean)):
  for word in word2vec_clean[i]:
    word2vec_title_vec[i] += w.wv.get_vector(word)

In [49]:
word2vec_cs1 = cosine_similarity(word2vec_keyword_vec1, word2vec_title_vec)
word2vec_cs2 = cosine_similarity(word2vec_keyword_vec2, word2vec_title_vec)
print("word2vec_cs shape: {}".format(word2vec_cs1.shape))

word2vec_cs shape: (1, 104)


In [50]:
# Create copies and fill "fit" with the cos similarity
word2vec_df1 = df.copy()
word2vec_df2 = df.copy()
word2vec_df1['fit'] = word2vec_df1['fit'].fillna(pd.Series(word2vec_cs1[0]))
word2vec_df2['fit'] = word2vec_df2['fit'].fillna(pd.Series(word2vec_cs2[0]))

In [51]:
word2vec_df1 = word2vec_df1.sort_values(by=['fit'], ascending=False)
word2vec_df1

Unnamed: 0,id,job_title,location,connection,fit
59,60,Aspiring Human Resources Specialist,Greater New York City Area,1,0.898957
35,36,Aspiring Human Resources Specialist,Greater New York City Area,1,0.898957
23,24,Aspiring Human Resources Specialist,Greater New York City Area,1,0.898957
5,6,Aspiring Human Resources Specialist,Greater New York City Area,1,0.898957
48,49,Aspiring Human Resources Specialist,Greater New York City Area,1,0.898957
...,...,...,...,...,...
17,18,People Development Coordinator at Ryan,"Denton, Texas",500+,0.029081
58,59,People Development Coordinator at Ryan,"Denton, Texas",500+,0.029081
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.029081
91,92,Seeking employment opportunities within Custom...,"Torrance, California",64,0.027599


In [52]:
word2vec_df2 = word2vec_df2.sort_values(by=['fit'], ascending=False)
word2vec_df2

Unnamed: 0,id,job_title,location,connection,fit
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.901433
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.877797
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.877797
72,73,"Aspiring Human Resources Manager, seeking inte...","Houston, Texas Area",7,0.843673
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.752123
...,...,...,...,...,...
7,8,HR Senior Specialist,San Francisco Bay Area,500+,0.045769
60,61,HR Senior Specialist,San Francisco Bay Area,500+,0.045769
50,51,HR Senior Specialist,San Francisco Bay Area,500+,0.045769
103,104,Director Of Administration at Excellence Logging,"Katy, Texas",500+,0.019210


### Re-ranking with a Starred Candidate

#### Approach 1

In [53]:
# Star the 7th candiate
star_key1 = keywords[0] + " " + word2vec_df1.iloc[6]["job_title"].lower()
star_key2 = keywords[1] + " " + word2vec_df2.iloc[6]["job_title"].lower()
print(star_key1)
print(star_key2)
star_key1 = star_key1.split()
star_key2 = star_key2.split()

aspiring human resources aspiring human resources professional
seeking human resources seeking human resources hris and generalist positions


In [54]:
# Get document embedding
star_keyword_vec1 = np.zeros((1, 100))
star_keyword_vec2 = np.zeros((1, 100))

for keyword in star_key1:
  if keyword not in sw:
    star_keyword_vec1 += w.wv.get_vector(keyword)

for keyword in star_key2:
  if keyword not in sw:
    star_keyword_vec2 += w.wv.get_vector(keyword)

star_cs1 = cosine_similarity(star_keyword_vec1, word2vec_title_vec)
star_cs2 = cosine_similarity(star_keyword_vec2, word2vec_title_vec)
print("word2vec_cs shape: {}".format(word2vec_cs1.shape))

word2vec_cs shape: (1, 104)


In [55]:
# Create copies and fill "fit" with the cos similarity
star_df1 = df.copy()
star_df2 = df.copy()
star_df1['fit'] = star_df1['fit'].fillna(pd.Series(star_cs1[0]))
star_df2['fit'] = star_df2['fit'].fillna(pd.Series(star_cs2[0]))

In [56]:
star_df1 = star_df1.sort_values(by=['fit'], ascending=False)
star_df1

Unnamed: 0,id,job_title,location,connection,fit
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.972292
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.972292
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.972292
16,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.972292
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.972292
...,...,...,...,...,...
17,18,People Development Coordinator at Ryan,"Denton, Texas",500+,0.036807
46,47,People Development Coordinator at Ryan,"Denton, Texas",500+,0.036807
58,59,People Development Coordinator at Ryan,"Denton, Texas",500+,0.036807
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,0.036807


In [57]:
star_df2 = star_df2.sort_values(by=['fit'], ascending=False)
star_df2

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.956578
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.956578
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.956578
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.956578
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,0.818093
...,...,...,...,...,...
37,38,HR Senior Specialist,San Francisco Bay Area,500+,0.033467
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.033467
102,103,Always set them up for Success,Greater Los Angeles Area,500+,0.031630
101,102,Business Intelligence and Analytics at Travelers,Greater New York City Area,49,0.016665


#### Approach 2

In [58]:
# Star the 7th candiate
star_key3 = cv_df1.iloc[6]["job_title"].lower()
star_key4 = cv_df2.iloc[6]["job_title"].lower()
star_keywords2 = [star_key3, star_key4]
star_keywords2

['aspiring human resources professional',
 'seeking human resources hris and generalist positions']

In [59]:
# Get the vector of new keywords
star_cv_keyword_vec2 = CountVec.transform(star_keywords2)
star_cv_cs2 = cosine_similarity(star_cv_keyword_vec2, cv_title_vec)

In [60]:
# Take the average of the starred keywords and original keywords
star_cv_cs2 = (star_cv_cs2 + cv_cs) / 2

In [61]:
star_cv_res2 = pd.DataFrame(star_cv_cs2, columns=df['id'])
star_cv_res2

id,1,2,3,4,5,6,7,8,9,10,...,95,96,97,98,99,100,101,102,103,104
0,0.538675,0.0,0.933013,0.0,0.0,0.808013,0.6108,0.0,0.6108,0.439826,...,0.0,0.0,0.933013,0.0,0.538675,0.634835,0.538675,0.0,0.0,0.0
1,0.284518,0.0,0.492799,0.0,0.0,0.492799,0.449673,0.0,0.449673,0.853553,...,0.0,0.0,0.492799,0.0,0.739199,0.58077,0.594861,0.0,0.0,0.0


In [62]:
star_cv_res_key3 = star_cv_res2.iloc[0].reset_index(drop=True)
star_cv_res_key4 = star_cv_res2.iloc[1].reset_index(drop=True)

# Create copies and fill "fit" with the cos similarity
star_cv_df3 = df.copy()
star_cv_df4 = df.copy()
star_cv_df3['fit'] = star_cv_df3['fit'].fillna(star_cv_res_key3)
star_cv_df4['fit'] = star_cv_df4['fit'].fillna(star_cv_res_key4)

In [63]:
star_cv_df3 = star_cv_df3.sort_values(by=['fit'], ascending=False)
star_cv_df3

Unnamed: 0,id,job_title,location,connection,fit
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
20,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
57,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,0.933013
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,0.933013
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


In [64]:
star_cv_df4 = star_cv_df4.sort_values(by=['fit'], ascending=False)
star_cv_df4

Unnamed: 0,id,job_title,location,connection,fit
52,53,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
61,62,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
39,40,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
9,10,Seeking Human Resources HRIS and Generalist Po...,Greater Philadelphia Area,500+,0.853553
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,0.739199
...,...,...,...,...,...
25,26,HR Senior Specialist,San Francisco Bay Area,500+,0.000000
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.000000
21,22,People Development Coordinator at Ryan,"Denton, Texas",500+,0.000000
19,20,Native English Teacher at EPIK (English Progra...,Kanada,500+,0.000000


## Glove

In [65]:
from gensim.models import KeyedVectors

glove_embeddings = {}
# Load the pre-trained GloVe word vectors
with open("/content/glove.6B.100d.txt", 'r', encoding="utf-8") as f:
  for line in f:
      values = line.split()
      word = values[0]
      vector = np.asarray(values[1:], "float32")
      glove_embeddings[word] = vector

In [66]:
sw = stopwords.words('english')
# Tokenize each job title
glove_tokens = [word_tokenize(job_title) for job_title in job_titles]
# Remove stop words and puntuations
glove_clean = []

for line in word2vec_tokens:
  tokens = []
  for word in line:
    if (word not in sw and word not in string.punctuation):
        tokens.append(word)
  glove_clean.append(tokens)

In [67]:
glove_embeddings['human'].shape

(100,)

In [68]:
# Get document embedding
glove_keyword_vec1 = np.zeros((1, 100))
glove_keyword_vec2 = np.zeros((1, 100))
glove_title_vec = np.zeros((104, 100))

for keyword in ["aspiring", "human", "resources"]:
  if keyword in glove_embeddings:
    glove_keyword_vec1 += glove_embeddings[keyword]

for keyword in ["seeking", "human", "resources"]:
  if keyword in glove_embeddings:
    glove_keyword_vec2 += glove_embeddings[keyword]

for i in range(len(glove_clean)):
  for word in glove_clean[i]:
    if word in glove_embeddings:
      glove_title_vec[i] += glove_embeddings[word]

In [69]:
glove_cs1 = cosine_similarity(glove_keyword_vec1, glove_title_vec)
glove_cs2 = cosine_similarity(glove_keyword_vec2, glove_title_vec)
print("glove_cs shape: {}".format(glove_cs1.shape))

glove_cs shape: (1, 104)


In [70]:
glove_res1 = pd.DataFrame(glove_cs1[0], index=df['id'])
glove_res2 = pd.DataFrame(glove_cs2[0], index=df['id'])

In [71]:
glove_res1

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,0.614252
2,0.577611
3,0.948721
4,0.723682
5,0.437611
...,...
100,0.829207
101,0.879105
102,0.561562
103,0.621513


In [72]:
glove_res2

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,0.526134
2,0.530875
3,0.852185
4,0.713481
5,0.433207
...,...
100,0.837412
101,0.860519
102,0.600009
103,0.651057


## Bert

In [73]:
# !pip install transformers

In [74]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

ModuleNotFoundError: ignored

In [None]:
def get_bert_embeddings(text):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)

    # Convert tokens to token IDs
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Convert token IDs to tensors
    input_tensor = torch.tensor([token_ids])

    # Get the BERT model outputs
    with torch.no_grad():
        outputs = model(input_tensor)

    # Get the word embeddings
    embeddings = outputs.last_hidden_state.squeeze()

    return embeddings

In [None]:
# Get keyword embeddings
keyword1 = "aspiring human resources"
keyword2 = "seeking human resources"

bert_key_embeddings1 = get_bert_embeddings(keyword1).sum(axis=0).unsqueeze(0)
bert_key_embeddings2 = get_bert_embeddings(keyword2).sum(axis=0).unsqueeze(0)

# Print the shape of the word embeddings
print(bert_key_embeddings1.shape)
print(bert_key_embeddings2.shape)

In [None]:
# Get document embeddings
bert_embeddings = []

# Iterate over each string
for title in job_titles:
  title_emb = get_bert_embeddings(title)
  if title_emb.ndim != 2:
    title_emb = title_emb.reshape(1, -1)
    # Append word embeddings
  bert_embeddings.append(np.array(title_emb.sum(axis=0)))

In [None]:
bert_cs1 = cosine_similarity(np.array(bert_key_embeddings1), np.array(bert_embeddings))
bert_cs2 = cosine_similarity(np.array(bert_key_embeddings2), np.array(bert_embeddings))
print("bert_cs1 shape: {}".format(bert_cs1.shape))

In [None]:
bert_res1 = pd.DataFrame(bert_cs1[0], index=df['id'])
bert_res2 = pd.DataFrame(bert_cs2[0], index=df['id'])

In [None]:
bert_res1

In [None]:
bert_res2

## Sbert

In [None]:
# !pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

# Load the SBERT model
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode the sentences to obtain embeddings
sbert_embeddings = sbert_model.encode(job_titles)
print(sbert_embeddings.shape)

In [None]:
sbert_keyword_vec1 = sbert_model.encode("aspiring human resources").reshape(1, -1)
sbert_keyword_vec2 = sbert_model.encode("seeking human resources").reshape(1, -1)
sbert_cs1 = cosine_similarity(sbert_keyword_vec1, sbert_embeddings)
sbert_cs2 = cosine_similarity(sbert_keyword_vec2, sbert_embeddings)

In [None]:
sbert_res1 = pd.DataFrame(sbert_cs1[0], index=df['id'])
sbert_res2 = pd.DataFrame(sbert_cs2[0], index=df['id'])

In [None]:
sbert_res1

In [None]:
sbert_res2