<a href="https://colab.research.google.com/github/trtrgfh/GlVYfAbQjtuwUHlZ/blob/main/Potential_Talents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("/content/potential-talents.csv")
print(df.shape)
df.head()

(104, 5)


Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [3]:
job_titles = df["job_title"].to_list()
job_titles = [string.lower() for string in job_titles]
keywords = ["aspiring human resources", "seeking human resources"]

combined_list = keywords + job_titles

## Preprocessing

## Bag-of-words

In [4]:
from nltk.stem import PorterStemmer

In [5]:
CountVec = CountVectorizer(ngram_range=(1,1), # to use bigrams ngram_range=(2,2)
                           stop_words='english')
# fit and transform
Count_data = CountVec.fit_transform(combined_list)

cv_keyword_vec = Count_data[:len(keywords)].toarray()
cv_title_vec = Count_data[len(keywords):].toarray()
print("cv_keyword_vec shape: {}, cv_title_vec shape: {}".format(cv_keyword_vec.shape, cv_title_vec.shape))

cv_keyword_vec shape: (2, 181), cv_title_vec shape: (104, 181)


In [6]:
# Compute cosine similarity between the keyword vectors and job title vectors
cv_cs = cosine_similarity(cv_keyword_vec, cv_title_vec)

In [7]:
# Get the cos similarity of each candidata with reaspect to candidata id
cv_res = pd.DataFrame(cv_cs,columns=df['id'])

In [8]:
cv_res_key1 = cv_res.iloc[0]
cv_res_key2 = cv_res.iloc[1]
tfidf_res1 = pd.DataFrame(cv_res_key1, index=df['id'])
tfidf_res2 = pd.DataFrame(cv_res_key2, index=df['id'])
print(cv_res.shape)

(2, 104)


In [9]:
cv_res_key1

id
1      0.500000
2      0.000000
3      0.866025
4      0.000000
5      0.000000
         ...   
100    0.680414
101    0.577350
102    0.000000
103    0.000000
104    0.000000
Name: 0, Length: 104, dtype: float64

In [10]:
cv_res_key2

id
1      0.333333
2      0.000000
3      0.577350
4      0.000000
5      0.000000
         ...   
100    0.680414
101    0.577350
102    0.000000
103    0.000000
104    0.000000
Name: 1, Length: 104, dtype: float64

## Tf-Idf vectorizer

In [11]:
tfidf_vec = TfidfVectorizer(use_idf=True,
                        smooth_idf=True,
                        ngram_range=(1,1),stop_words='english')

tfidf_data = tfidf_vec.fit_transform(combined_list)

tfidf_keyword_vec = tfidf_data[:len(keywords)].toarray()
tfidf_title_vec = tfidf_data[len(keywords):].toarray()
print("tfidf_keyword_vec shape: {}, tfidf_title_vec shape: {}".format(cv_keyword_vec.shape, cv_title_vec.shape))

tfidf_keyword_vec shape: (2, 181), tfidf_title_vec shape: (104, 181)


In [12]:
tfidf_cs = cosine_similarity(tfidf_keyword_vec, tfidf_title_vec)

In [13]:
tfidf_res = pd.DataFrame(cv_cs,columns=df['id'])

In [14]:
tfidf_res_key1 = tfidf_res.iloc[0]
tfidf_res_key2 = tfidf_res.iloc[1]
tfidf_res1 = pd.DataFrame(tfidf_res_key1, index=df['id'])
tfidf_res2 = pd.DataFrame(tfidf_res_key2, index=df['id'])
print(tfidf_res.shape)

(2, 104)


In [15]:
tfidf_res1

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,0.500000
2,0.000000
3,0.866025
4,0.000000
5,0.000000
...,...
100,0.680414
101,0.577350
102,0.000000
103,0.000000


In [16]:
tfidf_res2

Unnamed: 0_level_0,1
id,Unnamed: 1_level_1
1,0.333333
2,0.000000
3,0.577350
4,0.000000
5,0.000000
...,...
100,0.680414
101,0.577350
102,0.000000
103,0.000000


## Word2vec

In [17]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
from gensim.models import Word2Vec as w2v
from sklearn.decomposition import PCA

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
sw = stopwords.words('english')

In [19]:
# Tokenize each job title
word2vec_tokens = [word_tokenize(job_title) for job_title in job_titles]

In [20]:
# Remove stop words and puntuations
word2vec_clean = []

for line in word2vec_tokens:
  tokens = []
  for word in line:
    if (word not in sw and word not in string.punctuation):
        tokens.append(word)
  word2vec_clean.append(tokens)

In [21]:
w = w2v(
    word2vec_clean,
    min_count=1,
    sg = 1,
    window=5
)

# Create word embedding
word2vec_df = (
    pd.DataFrame(
        [w.wv.get_vector(str(n)) for n in w.wv.key_to_index],
        index = w.wv.key_to_index
    )
)
print(word2vec_df.shape)
word2vec_df.head()

(185, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
human,-0.000721,0.001137,0.004927,0.009055,-0.008244,-0.008943,0.007436,0.010913,-0.005582,-0.004577,...,0.003089,0.001556,0.003665,0.001883,0.011729,0.006008,-0.008459,-0.00742,0.001335,0.005793
resources,-0.008932,0.004368,0.005016,0.005808,0.008211,-0.007896,0.001784,0.007625,-0.003179,-0.006778,...,0.002101,-0.000378,0.00267,-0.006512,-0.000931,0.003528,0.005826,-0.002526,-0.009151,0.004093
aspiring,0.000145,0.003494,-0.007041,-0.001435,0.008183,0.006441,-0.003321,0.003717,-0.008738,0.005776,...,-0.003684,0.006493,0.009174,-0.00318,0.008934,0.005942,0.006044,0.000199,0.00857,-0.007339
professional,-0.008349,0.009518,-0.000155,-0.001794,0.0049,-0.004827,0.003044,0.007707,0.0059,-0.007954,...,-0.006945,-0.000452,-0.000588,-0.001859,0.010418,-4.3e-05,0.006028,-0.00758,-0.002253,-0.005721
student,-0.007151,0.001511,-0.007245,-0.002277,0.003972,0.005233,0.001529,0.002735,-0.004286,0.006964,...,0.003588,-0.004248,0.005417,-0.003617,0.003296,-0.007846,0.006414,0.004797,0.000978,0.002826


In [22]:
# Get document embedding
word2vec_keyword_vec1 = np.zeros((1, 100))
word2vec_keyword_vec2 = np.zeros((1, 100))
word2vec_title_vec = np.zeros((104, 100))

for keyword in ["aspiring", "human", "resources"]:
  word2vec_keyword_vec1 += w.wv.get_vector(keyword)

for keyword in ["seeking", "human", "resources"]:
  word2vec_keyword_vec2 += w.wv.get_vector(keyword)

for i in range(len(word2vec_clean)):
  for word in word2vec_clean[i]:
    word2vec_title_vec[i] += w.wv.get_vector(word)

In [23]:
word2vec_cs1 = cosine_similarity(word2vec_keyword_vec1, word2vec_title_vec)
word2vec_cs2 = cosine_similarity(word2vec_keyword_vec2, word2vec_title_vec)
print("word2vec_cs shape: {}".format(word2vec_cs1.shape))

word2vec_cs shape: (1, 104)


In [24]:
word2vec_res1 = pd.DataFrame(word2vec_cs1[0], index=df['id'])
word2vec_res2 = pd.DataFrame(word2vec_cs2[0], index=df['id'])

In [25]:
word2vec_res1

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,0.611124
2,0.255798
3,0.875207
4,0.029081
5,0.212742
...,...
100,0.744252
101,0.667411
102,-0.016176
103,0.131685


In [26]:
word2vec_res2

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,0.441564
2,0.180267
3,0.652616
4,0.094957
5,0.188060
...,...
100,0.737926
101,0.593961
102,0.047024
103,0.119428


## Glove

In [27]:
from gensim.models import KeyedVectors

glove_embeddings = {}
# Load the pre-trained GloVe word vectors
with open("/content/glove.6B.100d.txt", 'r', encoding="utf-8") as f:
  for line in f:
      values = line.split()
      word = values[0]
      vector = np.asarray(values[1:], "float32")
      glove_embeddings[word] = vector

In [28]:
sw = stopwords.words('english')
# Tokenize each job title
glove_tokens = [word_tokenize(job_title) for job_title in job_titles]
# Remove stop words and puntuations
glove_clean = []

for line in word2vec_tokens:
  tokens = []
  for word in line:
    if (word not in sw and word not in string.punctuation):
        tokens.append(word)
  glove_clean.append(tokens)

In [29]:
glove_embeddings['human'].shape

(100,)

In [30]:
# Get document embedding
glove_keyword_vec1 = np.zeros((1, 100))
glove_keyword_vec2 = np.zeros((1, 100))
glove_title_vec = np.zeros((104, 100))

for keyword in ["aspiring", "human", "resources"]:
  if keyword in glove_embeddings:
    glove_keyword_vec1 += glove_embeddings[keyword]

for keyword in ["seeking", "human", "resources"]:
  if keyword in glove_embeddings:
    glove_keyword_vec2 += glove_embeddings[keyword]

for i in range(len(glove_clean)):
  for word in glove_clean[i]:
    if word in glove_embeddings:
      glove_title_vec[i] += glove_embeddings[word]

In [31]:
glove_cs1 = cosine_similarity(glove_keyword_vec1, glove_title_vec)
glove_cs2 = cosine_similarity(glove_keyword_vec2, glove_title_vec)
print("glove_cs shape: {}".format(glove_cs1.shape))

glove_cs shape: (1, 104)


In [32]:
glove_res1 = pd.DataFrame(glove_cs1[0], index=df['id'])
glove_res2 = pd.DataFrame(glove_cs2[0], index=df['id'])

In [33]:
glove_res1

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,0.614252
2,0.577611
3,0.948721
4,0.723682
5,0.437611
...,...
100,0.829207
101,0.879105
102,0.561562
103,0.621513


In [34]:
glove_res2

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,0.526134
2,0.530875
3,0.852185
4,0.713481
5,0.433207
...,...
100,0.837412
101,0.860519
102,0.600009
103,0.651057


## Bert

In [35]:
# !pip install transformers

In [36]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
def get_bert_embeddings(text):
    # Tokenize the text
    tokens = tokenizer.tokenize(text)

    # Convert tokens to token IDs
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Convert token IDs to tensors
    input_tensor = torch.tensor([token_ids])

    # Get the BERT model outputs
    with torch.no_grad():
        outputs = model(input_tensor)

    # Get the word embeddings
    embeddings = outputs.last_hidden_state.squeeze()

    return embeddings

In [38]:
# Get keyword embeddings
keyword1 = "aspiring human resources"
keyword2 = "seeking human resources"

bert_key_embeddings1 = get_bert_embeddings(keyword1).sum(axis=0).unsqueeze(0)
bert_key_embeddings2 = get_bert_embeddings(keyword2).sum(axis=0).unsqueeze(0)

# Print the shape of the word embeddings
print(bert_key_embeddings1.shape)
print(bert_key_embeddings2.shape)

torch.Size([1, 768])
torch.Size([1, 768])


In [39]:
# Get document embeddings
bert_embeddings = []

# Iterate over each string
for title in job_titles:
  title_emb = get_bert_embeddings(title)
  if title_emb.ndim != 2:
    title_emb = title_emb.reshape(1, -1)
    # Append word embeddings
  bert_embeddings.append(np.array(title_emb.sum(axis=0)))

In [40]:
bert_cs1 = cosine_similarity(np.array(bert_key_embeddings1), np.array(bert_embeddings))
bert_cs2 = cosine_similarity(np.array(bert_key_embeddings2), np.array(bert_embeddings))
print("bert_cs1 shape: {}".format(bert_cs1.shape))

bert_cs1 shape: (1, 104)


In [41]:
bert_res1 = pd.DataFrame(bert_cs1[0], index=df['id'])
bert_res2 = pd.DataFrame(bert_cs2[0], index=df['id'])

In [42]:
bert_res1

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,0.330464
2,0.413803
3,0.703179
4,0.540259
5,0.393270
...,...
100,0.336968
101,0.382154
102,0.358250
103,0.387806


In [43]:
bert_res2

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,0.446525
2,0.466494
3,0.650271
4,0.533576
5,0.439140
...,...
100,0.379781
101,0.393702
102,0.405062
103,0.474416


## Sbert

In [44]:
# !pip install sentence-transformers

In [45]:
from sentence_transformers import SentenceTransformer

# Load the SBERT model
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode the sentences to obtain embeddings
sbert_embeddings = sbert_model.encode(job_titles)
print(sbert_embeddings.shape)

(104, 768)


In [47]:
sbert_keyword_vec1 = sbert_model.encode("aspiring human resources").reshape(1, -1)
sbert_keyword_vec2 = sbert_model.encode("seeking human resources").reshape(1, -1)
sbert_cs1 = cosine_similarity(sbert_keyword_vec1, sbert_embeddings)
sbert_cs2 = cosine_similarity(sbert_keyword_vec2, sbert_embeddings)

In [48]:
sbert_res1 = pd.DataFrame(sbert_cs1[0], index=df['id'])
sbert_res2 = pd.DataFrame(sbert_cs2[0], index=df['id'])

In [49]:
sbert_res1

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,0.497850
2,0.372279
3,0.927213
4,0.705831
5,0.463202
...,...
100,0.500861
101,0.609300
102,0.455116
103,0.618959


In [50]:
sbert_res2

Unnamed: 0_level_0,0
id,Unnamed: 1_level_1
1,0.220571
2,0.324733
3,0.709421
4,0.668772
5,0.468543
...,...
100,0.241682
101,0.650529
102,0.436132
103,0.488246
