# PART IV: CALCULATION OF COSINE SIMILARITY

### 1. Setting up

In [1]:
# Load nessesary libraries.
from transformers import pipeline, BertForSequenceClassification, AutoTokenizer
import pandas as pd
import torch
import torch.nn.functional as F
import time
import warnings
warnings.filterwarnings('ignore')



In [2]:
# Check whether CUDA is accessible.
cuda_available = torch.cuda.is_available()
cuda_device= torch.cuda.get_device_name(0)

if cuda_available == True:
    device = torch.device('cuda')
    print('CUDA was successfully installed and compiled on my device.')
    print('CUDA device name is:', cuda_device)
else:
    print('Cuda in not available')

CUDA was successfully installed and compiled on my device.
CUDA device name is: NVIDIA GeForce GTX 1650


### 2. Testing model and its embedding.

In [3]:
# Initialize a text classification pipeline.
classifier = pipeline('text-classification', model='ft_bert_temuulen2', tokenizer='ft_bert_temuulen_tokenizer2')

In [4]:
# Testing model with random text samples.
random_text = ["I promote health and help people who is sick", 
               "If your outlet isn't working and you don't have a light, you can ask me to fix it.", 
               "All day, I'm sitting in front of the screen, solving problems with my mouse and keyboard", 
               "I am familiar with the process of providing injections to individuals.",
               "If the streetlights fail, I can replace them with new ones.",
               "I am familiar with both Python and R, and I use these tools for my work."]

for x in range(len(random_text)):
    print(classifier(random_text[x]), '=', random_text[x])

[{'label': 'registered_nurse', 'score': 0.9447992444038391}] = I promote health and help people who is sick
[{'label': 'electrician', 'score': 0.4664452373981476}] = If your outlet isn't working and you don't have a light, you can ask me to fix it.
[{'label': 'data_analyst', 'score': 0.9107747673988342}] = All day, I'm sitting in front of the screen, solving problems with my mouse and keyboard
[{'label': 'registered_nurse', 'score': 0.8420090079307556}] = I am familiar with the process of providing injections to individuals.
[{'label': 'electrician', 'score': 0.7360942959785461}] = If the streetlights fail, I can replace them with new ones.
[{'label': 'data_analyst', 'score': 0.9674216508865356}] = I am familiar with both Python and R, and I use these tools for my work.


In [5]:
# Testing model using data from experiment participants.
df_jobseeker = pd.read_csv('data_jobseeker.csv', index_col=None)
df_jobseeker['professional_qualifications'] = df_jobseeker.education + ". " + df_jobseeker.skill + ". " + df_jobseeker.experience + '.'
df_jobseeker.drop(['education', 'skill', 'experience'], axis=1, inplace=True)

for x in range(3):
    print(classifier(df_jobseeker.iat[x, -1]), '=', df_jobseeker.iat[x, -1])

[{'label': 'registered_nurse', 'score': 0.9978272318840027}] = bachelor's degree: critical care nursing. patient care, wound care, medical procedures, adult nursing, infection control, diagnostic, time management, communication skills, attention to detail. registered nurse: 3 years.
[{'label': 'electrician', 'score': 0.9963247179985046}] = high school diploma, vocational electrician certification, construction safety certification. circuit testing, blueprint reading, fault finding, electrical wiring, troubleshooting, equipment inspection, installation, organization, maintenance, diagnostic, independent worker, safety knowledge. residential electrician's helper: 1 year.
[{'label': 'data_analyst', 'score': 0.9970858693122864}] = degree: master of science in data analytics, bachelor of science in business administration; certifications: microsoft certified - azure data scientist associate, google data analytics certificate. python, data mining and extraction, data analytics and visualizat

In [6]:
# Load a tokenizer used for the fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained('ft_bert_temuulen_tokenizer2')

# Load a fine-tuned model with the hidden state output enabled.
model = BertForSequenceClassification.from_pretrained('ft_bert_temuulen2', output_hidden_states=True)

In [7]:
# Assign to a new variable text for testing.
input_text = df_jobseeker.iat[0, -1]

# Tokenize the input text and convert it to PyTorch tensors.
inputs = tokenizer(input_text, return_tensors='pt')

# Perform a forward pass through the model to get the hidden states.
with torch.no_grad():
    outputs = model(**inputs)

# Extract the last hidden states from the model outputs.
last_hidden_states = outputs.hidden_states[-1]

print(last_hidden_states.shape, '\n')
print(last_hidden_states)

torch.Size([1, 44, 768]) 

tensor([[[ 1.1255, -0.5105,  1.0606,  ..., -0.0646, -0.0972,  1.0354],
         [ 1.0868, -0.3361,  0.5461,  ...,  0.2573,  0.0850,  0.3742],
         [ 1.1630, -0.3186,  0.6489,  ...,  0.5629, -0.3385,  0.0202],
         ...,
         [ 0.7319,  0.4765,  1.3012,  ...,  0.1161, -0.2846,  0.3979],
         [ 0.0818, -0.5454,  0.5450,  ...,  0.7430, -0.1035,  0.1977],
         [ 1.1113, -0.2863,  0.7475,  ...,  0.5620, -0.3865,  0.0853]]])


### 3. Ebedding

**job seeker**

In [8]:
# Define a custom function to get embedding for job seekers' data.
def process_text(text):
    
    inputs = tokenizer(text, return_tensors='pt')
    
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.hidden_states[-1]
    
    return last_hidden_states

In [9]:
# Apply function and create a new column with the extracted results.
df_jobseeker['last_layer'] = df_jobseeker.iloc[:, -1].apply(process_text)

print(df_jobseeker.iat[0, -1].shape, '\n')
print(df_jobseeker.iat[0, -1])

torch.Size([1, 44, 768]) 

tensor([[[ 1.1255, -0.5105,  1.0606,  ..., -0.0646, -0.0972,  1.0354],
         [ 1.0868, -0.3361,  0.5461,  ...,  0.2573,  0.0850,  0.3742],
         [ 1.1630, -0.3186,  0.6489,  ...,  0.5629, -0.3385,  0.0202],
         ...,
         [ 0.7319,  0.4765,  1.3012,  ...,  0.1161, -0.2846,  0.3979],
         [ 0.0818, -0.5454,  0.5450,  ...,  0.7430, -0.1035,  0.1977],
         [ 1.1113, -0.2863,  0.7475,  ...,  0.5620, -0.3865,  0.0853]]])


In [10]:
# Check the results.
df_jobseeker

Unnamed: 0,participant,data_collection,date,location,preferred_position,professional_qualifications,last_layer
0,user_1,voice call,2023-12-17 15:30:00,"dublin, ireland",registered nurse,bachelor's degree: critical care nursing. pati...,"[[[tensor(1.1255), tensor(-0.5105), tensor(1.0..."
1,user_2,voice call,2023-12-27 11:50:00,"dublin, ireland",electrician,"high school diploma, vocational electrician ce...","[[[tensor(-0.1251), tensor(-0.3441), tensor(0...."
2,user_3,google form,2023-12-31 13:39:00,"dublin, ireland",data analyst,"degree: master of science in data analytics, b...","[[[tensor(-0.6026), tensor(-0.3314), tensor(-0..."


**job ad**

In [11]:
# Read job ads' dataset.
df_jobads = pd.read_csv('data_jobads_final.csv', index_col=None)

# Apply some data cleaning and preprocessing steps.
df_jobads['job_description'] = df_jobads['job_description'].str.replace('\n', ' ')
df_jobads = df_jobads.dropna()
df_jobads = df_jobads.reset_index(drop=True)

df_jobads.head(2)

Unnamed: 0,title,id,link,date,job_description,label
0,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse
1,clinical nurse manager (cnm),sj_358f1f68cde928c4,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,unknown,create a better future for yourself recruitne...,registered_nurse


In [12]:
# Calculate the word count for each ad and add its values to a new column.
df_jobads['word_count'] = df_jobads['job_description'].apply(lambda x: len(x.split()))

print('The total number of rows with a word count over 510 is:', df_jobads[df_jobads['word_count'] > 510].shape[0])
print('The word count for the longest text is:', df_jobads.iat[303, -1])

df_jobads.head(2)

The total number of rows with a word count over 510 is: 236
The word count for the longest text is: 3145


Unnamed: 0,title,id,link,date,job_description,label,word_count
0,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse,502
1,clinical nurse manager (cnm),sj_358f1f68cde928c4,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,unknown,create a better future for yourself recruitne...,registered_nurse,231


In [13]:
# Define a custom function to encode text with BERT, including additional rules for dealing with long text.
def embed_with_bert(df_column):
    
    embedded_texts = []
    
    # Iterate through each text in the DataFrame column.
    for text in df_column:
        
        # Tokenize each text without adding special tokens and without truncation or padding.
        tokens = tokenizer(text, add_special_tokens=False, return_tensors='pt', truncation=False, padding=False)['input_ids'].squeeze()
        token_length = len(tokens)
        
        # If the token length is less than or equal to 512, process it normally.
        if token_length <= 512:
            inputs = tokenizer(text, return_tensors='pt').to(device)
            with torch.no_grad():
                outputs = model(**inputs)
            last_hidden_states = outputs.hidden_states[-1].cpu()  
            embedded_texts.append(last_hidden_states)
            
        # If the token length is greater than 512, split it into sliding windows withot lapping.
        else:
            max_length = 512
            stride = 0
            tokens = tokenizer(text, add_special_tokens=False, return_tensors='pt', truncation=False, padding=False)['input_ids'].squeeze().to(device)
            token_windows = [tokens[i:i+max_length] for i in range(0, len(tokens), max_length - stride)]
            
            all_hidden_states = []
            
            # Add special tokens (CLS and SEP) and truncate if needed.
            for window in token_windows:
                window = torch.cat([torch.tensor([tokenizer.cls_token_id], device=device), window, torch.tensor([tokenizer.sep_token_id], device=device)])
                if len(window) > max_length:
                    window = torch.cat((window[:max_length-1], torch.tensor([tokenizer.sep_token_id], device=device)))
                inputs = {'input_ids': window.unsqueeze(0)}
                with torch.no_grad():
                    outputs = model(**inputs)
                hidden_states = outputs.hidden_states[-1].cpu()  
                all_hidden_states.append(hidden_states)
            
            # Concatenate all hidden states from each sliding window.
            embedded_texts.append(torch.cat(all_hidden_states, dim=1))
            
    return embedded_texts

In [14]:
# Move the model to the GPU.
model.to(device)

# Apply the 'embed_with_bert' function to each ad.
df_jobads['embedded_job_descriptions'] = df_jobads['job_description'].apply(lambda x: embed_with_bert([x])[0])

# Check the random cell to see the results.
print(df_jobads.iat[0, -1].shape, '\n')
print(df_jobads.iat[0, -1])

Token indices sequence length is longer than the specified maximum sequence length for this model (615 > 512). Running this sequence through the model will result in indexing errors


torch.Size([1, 617, 768]) 

tensor([[[ 1.1675, -0.2930,  1.1019,  ..., -0.1997, -0.1452,  0.9723],
         [ 1.0922,  0.0060,  0.8255,  ..., -0.0950, -0.0742,  0.5525],
         [ 1.0327, -0.2483,  0.9769,  ..., -0.2493, -0.2093,  0.4546],
         ...,
         [ 0.7858, -0.4055,  0.8485,  ...,  0.2169,  0.1360,  1.1505],
         [ 0.6305,  0.2808,  0.7732,  ...,  0.2455, -0.3459,  0.6443],
         [ 1.3295, -0.0883,  0.8348,  ...,  0.4321,  0.0147,  0.1967]]])


In [15]:
# Check the Data Frame.
df_jobads

Unnamed: 0,title,id,link,date,job_description,label,word_count,embedded_job_descriptions
0,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse,502,"[[[tensor(1.1675), tensor(-0.2930), tensor(1.1..."
1,clinical nurse manager (cnm),sj_358f1f68cde928c4,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,unknown,create a better future for yourself recruitne...,registered_nurse,231,"[[[tensor(1.2444), tensor(-0.1842), tensor(1.1..."
2,registered nurse,job_4e16e9830b072344,https://ie.indeed.com/rc/clk?jk=4e16e9830b0723...,"January 10, 2024","access healthcare, one of irelands leading hea...",registered_nurse,182,"[[[tensor(1.2527), tensor(-0.2575), tensor(1.0..."
3,staff nurse,job_25a417a6373967b4,https://ie.indeed.com/rc/clk?jk=25a417a6373967...,"January 10, 2024",are you a dedicated and compassionate staff nu...,registered_nurse,199,"[[[tensor(1.2198), tensor(-0.3225), tensor(1.2..."
4,clinical research nurse - cardiology (cnm2),job_303eee71cce63f3d,https://ie.indeed.com/rc/clk?jk=303eee71cce63f...,"January 10, 2024",clinical research nurse cardiology cnm2 we a...,registered_nurse,180,"[[[tensor(1.2066), tensor(-0.2952), tensor(0.9..."
...,...,...,...,...,...,...,...,...
1161,junior analyst- private debt investments - fix...,job_ce346c7bfdd6dced,https://ie.indeed.com/rc/clk?jk=ce346c7bfdd6dc...,"before December 21, 2023",the successful candidate will have exposure to...,data_analyst,442,"[[[tensor(-0.6880), tensor(-0.2026), tensor(-0..."
1162,strategy analyst - tech,job_18b9089f580d5d70,https://ie.indeed.com/rc/clk?jk=18b9089f580d5d...,"before December 21, 2023",sector: fintech you will be a datadriven indiv...,data_analyst,114,"[[[tensor(-0.6742), tensor(-0.1148), tensor(-0..."
1163,"financial analyst, retail",job_e6ba95f1a8debaa8,https://ie.indeed.com/rc/clk?jk=e6ba95f1a8deba...,"before December 21, 2023",our client are recognised as a market leader a...,data_analyst,187,"[[[tensor(-0.7997), tensor(-0.2032), tensor(-0..."
1164,senior operations analyst (trade support),job_ac9aa90b2b36b178,https://ie.indeed.com/rc/clk?jk=ac9aa90b2b36b1...,"before December 21, 2023",the role our operations analysts are responsib...,data_analyst,191,"[[[tensor(-0.6538), tensor(-0.0521), tensor(-0..."


In [None]:
# Costum function that generates the evarage cosine similarity between the user's tensor and a job ad's tensor.
def calculate_average_similarity(tensor_user, tensor_ad):
    
    # Squeeze dimensions if the tensors have a batch dimension.
    tensor_user = tensor_user.squeeze(0) if tensor_user.dim() == 3 else tensor_user
    tensor_ad = tensor_ad.squeeze(0) if tensor_ad.dim() == 3 else tensor_ad

    tensor_ad = tensor_ad.to(tensor_user.device)

    # Initialize a similarity matrix with zeros.
    similarity_matrix = torch.zeros(tensor_user.size(0), tensor_ad.size(0), device=tensor_user.device)
    
    # Calculate cosine similarity for each pair of vectors.
    for i in range(tensor_user.size(0)):
        for j in range(tensor_ad.size(0)):
            similarity_matrix[i, j] = F.cosine_similarity(tensor_user[i].unsqueeze(0), tensor_ad[j].unsqueeze(0), dim=1)
            
    # Calculate the average similarity and convert it to a Python float.
    average_similarity = torch.mean(similarity_matrix).item()
    
    return average_similarity

In [None]:
# Record the start time.
start = time.time()

# Get the user1's tensor and move it to the GPU.
tensor_user = df_jobseeker.iat[0, -1]
tensor_user = tensor_user.to(device)

# Apply the calculation of average cosine similarity function to each job ad's tensor.
df_jobads['cosine_user1'] = df_jobads.iloc[:, -1].apply(lambda x: calculate_average_similarity(tensor_user, x.to(device)))

# Record the end time.
end = time.time()

print(f'The extraction was completed in: {int((end - start)) // 60} minutes and {int((end - start)) % 60} seconds.')

In [None]:
# Record the start time.
start = time.time()

# Get the user2's tensor and move it to the GPU.
tensor_user = df_jobseeker.iat[1, -1]
tensor_user = tensor_user.to(device)

# Apply the calculation of average cosine similarity function to each job ad's tensor.
df_jobads['cosine_user2'] = df_jobads.iloc[:, -1].apply(lambda x: calculate_average_similarity(tensor_user, x.to(device)))

# Record the end time.
end = time.time()

print(f'The extraction was completed in: {int((end - start)) // 60} minutes and {int((end - start)) % 60} seconds.')

In [None]:
# Record the start time.
start = time.time()

# Get the user3's tensor and move it to the GPU.
tensor_user = df_jobseeker.iat[2, -1]
tensor_user = tensor_user.to(device)

# Apply the calculation of average cosine similarity function to each job ad's tensor.
df_jobads['cosine_user3'] = df_jobads.iloc[:, -1].apply(lambda x: calculate_average_similarity(tensor_user, x.to(device)))

# Record the end time.
end = time.time()

print(f'The extraction was completed in: {int((end - start)) // 60} minutes and {int((end - start)) % 60} seconds.')

In [None]:
# Drop the encoded column from the Data Frame (it takes up too much memory and is no longer needed).
df_jobads = df_jobads.drop(columns=['embedded_job_descriptions'], inplace=True) 

df_jobads.head()

In [None]:
sorted_user1 = df_jobads.sort_values(by='cosine_user1', ascending=False) #.reset_index(drop=True)
sorted_user1.head()

In [None]:
sorted_user2 = df_jobads.sort_values(by='cosine_user1', ascending=False) #.reset_index(drop=True)
sorted_user2.head()

In [None]:
sorted_user3 = df_jobads.sort_values(by='cosine_user1', ascending=False) #.reset_index(drop=True)
sorted_user3.head()