# IV: CALCULATION OF COSINE SIMILARITY

This section focuses entirely on calculating the cosine similarity between the participant data and the job ads data.

## GENERAL

- **load module**

In [2]:
# Load necessary libraries.
import re
import sys
import time
import torch
import psutil
import gpustat
import warnings
import platform
import numpy as np
import pandas as pd
import torch.nn.functional as F
from scipy.sparse import hstack
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity as cos
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer
from transformers import BertForSequenceClassification
warnings.filterwarnings('ignore')

- **check computational environment**

In [3]:
# List the software and hardware configurations used for conducting the experiment.
# platform.platform() reports the operating system of the host running the notebook.
print('WINDOWS VERSION:', platform.platform())
print('PYTHON VERSION:', sys.version)
# logical=False restricts the count to physical CPU cores.
print('CPU CORE:', psutil.cpu_count(logical=False))
print('CPU SPEED:', psutil.cpu_freq())
# NOTE(review): gpus[0] presumably fails on a machine without an NVIDIA GPU - confirm before running elsewhere.
print('GPU:', gpustat.new_query().gpus[0].name)
# Byte totals are divided by 1024 ** 3 before being printed.
print(f'RAM: {psutil.virtual_memory().total/(1024 ** 3):.2f} GB')
print(f"HARD DRIVE: {psutil.disk_usage('/').total/(1024 ** 3):.2f} GB")

WINDOWS VERSION: Windows-10-10.0.22631-SP0
PYTHON VERSION: 3.11.4 | packaged by Anaconda, Inc. | (main, Jul  5 2023, 13:38:37) [MSC v.1916 64 bit (AMD64)]
CPU CORE: 4
CPU SPEED: scpufreq(current=2496.0, min=0.0, max=2496.0)
GPU: NVIDIA GeForce GTX 1650
RAM: 31.87 GB
HARD DRIVE: 237.45 GB


- **load dataset**

*job seekers*

In [4]:
# Load the experiment participants dataset.
df_jobseeker = pd.read_csv('data_jobseeker.csv', index_col=None)
print("The shape of the joob seekers' data frame is:", df_jobseeker.shape)

df_jobseeker.head()

The shape of the joob seekers' data frame is: (3, 8)


Unnamed: 0,participant,data_collection,date,location,preferred_position,education,skill,experience
0,user_1,voice call,2023-12-17 15:30:00,"dublin, ireland",registered nurse,bachelor's degree: critical care nursing,"patient care, wound care, medical procedures, ...",registered nurse: 3 years
1,user_2,voice call,2023-12-27 11:50:00,"dublin, ireland",electrician,"high school diploma, vocational electrician ce...","circuit testing, blueprint reading, fault find...",residential electrician's helper: 1 year
2,user_3,google form,2023-12-31 13:39:00,"dublin, ireland",data analyst,"degree: master of science in data analytics, b...","python, data mining and extraction, data analy...",entry level data analyst: 1 year; data coordin...


The first dataset consists of 3 rows and 8 columns of data collected from experiment participants through interviews. The last three columns in this DataFrame (DF), which contain text data on education, skill, and experience, are intended to be used for analysis. Calculating the cosine score for each column individually is impractical and illogical. Therefore, it is necessary to combine these columns into a single one.

In [5]:
# Apply minor modifications for further use.
df_jobseeker['combined_info'] = df_jobseeker.education + '. ' + df_jobseeker.skill + '. ' + df_jobseeker.experience + '.'
df_jobseeker.drop(['education', 'skill', 'experience'], axis=1, inplace=True)

df_jobseeker.head()

Unnamed: 0,participant,data_collection,date,location,preferred_position,combined_info
0,user_1,voice call,2023-12-17 15:30:00,"dublin, ireland",registered nurse,bachelor's degree: critical care nursing. pati...
1,user_2,voice call,2023-12-27 11:50:00,"dublin, ireland",electrician,"high school diploma, vocational electrician ce..."
2,user_3,google form,2023-12-31 13:39:00,"dublin, ireland",data analyst,"degree: master of science in data analytics, b..."


Having merged the text data into a single column, it is essential to perform a word count. This step will guide us in determining the appropriate approach for processing this text in the subsequent analytical stages.

In [6]:
# Calculate the word count (whitespace-separated tokens) of each job seeker's combined text
# and add its values to a new column.
df_jobseeker['word_count'] = df_jobseeker['combined_info'].apply(lambda x: len(x.split()))

df_jobseeker.head()

Unnamed: 0,participant,data_collection,date,location,preferred_position,combined_info,word_count
0,user_1,voice call,2023-12-17 15:30:00,"dublin, ireland",registered nurse,bachelor's degree: critical care nursing. pati...,27
1,user_2,voice call,2023-12-27 11:50:00,"dublin, ireland",electrician,"high school diploma, vocational electrician ce...",33
2,user_3,google form,2023-12-31 13:39:00,"dublin, ireland",data analyst,"degree: master of science in data analytics, b...",60


*job ads*

In [8]:
# Load the online job ads dataset and apply minor modifications for further use.
df_jobads = pd.read_csv('data_jobads_final.csv', index_col=None)
# Replace newline characters in the descriptions with spaces so each ad is a single line of text.
df_jobads['job_description'] = df_jobads['job_description'].str.replace('\n', ' ')
# Drop every row that has a missing value in any column, then renumber the index from 0.
df_jobads = df_jobads.dropna().reset_index(drop=True)

print("The shape of the joob ads' data frame is:", df_jobads.shape)
df_jobads.head(3)

The shape of the joob ads' data frame is: (1166, 6)


Unnamed: 0,title,id,link,date,job_description,label
0,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse
1,clinical nurse manager (cnm),sj_358f1f68cde928c4,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,unknown,create a better future for yourself recruitne...,registered_nurse
2,registered nurse,job_4e16e9830b072344,https://ie.indeed.com/rc/clk?jk=4e16e9830b0723...,"January 10, 2024","access healthcare, one of irelands leading hea...",registered_nurse


The second dataset consists of 1166 rows and 6 columns of data scraped from Indeed.com. The most essential column in this DF is the one with job descriptions. As with the first DF, the number of words in each row is counted.

In [9]:
df_jobads['word_count'] = df_jobads['job_description'].apply(lambda x: len(x.split()))
df_jobads.head(3)

Unnamed: 0,title,id,link,date,job_description,label,word_count
0,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse,502
1,clinical nurse manager (cnm),sj_358f1f68cde928c4,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,unknown,create a better future for yourself recruitne...,registered_nurse,231
2,registered nurse,job_4e16e9830b072344,https://ie.indeed.com/rc/clk?jk=4e16e9830b0723...,"January 10, 2024","access healthcare, one of irelands leading hea...",registered_nurse,182


All necessary libraries have been imported, and the datasets are also loaded and ready for use.

## 1. COSINE WITH FINE-TUNED BERT

In this sub-section, the text columns from both DFs are fed into Bert's fine-tuned encoding layers, and the resulting text representations from the last hidden layer are collected for cosine similarity computation. For demonstration purposes, only one row value is used to retrieve the final hidden state. The remaining data is processed with a custom function that has been designed to handle different conditions.

- **test**

In [None]:
# Assigning the text for demonstration to a variable.
# iat[0, -2] reads the first row of the second-to-last column, which is 'combined_info'
# at this point because 'word_count' was appended after it.
input_text_test = df_jobseeker.iat[0, -2]

# Initialize a fine-tuned model with the hidden state output enabled.
model = BertForSequenceClassification.from_pretrained('ft_bert_temuulen2', output_hidden_states=True)

# Initialize a tokenizer used for the fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained('ft_bert_temuulen_tokenizer2')

# Tokenize the input text and convert it to PyTorch tensors.
inputs = tokenizer(input_text_test, return_tensors='pt')
print(inputs)

In the previous cell, the test text specified for demonstration purposes was assigned to a variable and tokenized. The results were formatted as tensors to be compatible with our deep learning framework, PyTorch in this instance. The output of the cell shows that the input itself consists of **input_ids** and **attention_mask** values, which are important for further processing, as well as **token_type_ids** values, which are optional for the current context.

In [None]:
# Perform a forward pass through the model to get the hidden states.
with torch.no_grad():
    outputs = model(**inputs)

# Extract the last hidden states from the model outputs.
last_hidden_states = outputs.hidden_states[-1]

print('The size of the last hidden state tensor is:', last_hidden_states.shape, '\n')
print('The data type of the last hidden state tensor is:', type(last_hidden_states), '\n')
print(last_hidden_states)

Following tokenization, the input values were passed forward through the model, resulting in the extraction of a torch tensor representing hidden states with dimensions of ([1, 44, 768]). This tensor will then be used for cosine similarity calculations.

- **implementation**

The demonstration went well and the tensor was successfully extracted. Now let's begin the main implementation for both DataFrames.

In [None]:
# Starting the timer to track the execution duration.
start = time.time()

*initialize the model*

The encoding model has been fine-tuned using the **bert-base-uncased** architecture for text sequence classification and was imported from the personal drive. The tokenizer employed is HuggingFace's AutoTokenizer, which automatically selects and pairs with the most suitable tokenizer for the model. In this instance, it is the **BertTokenizer**.

In [None]:
# Initialize a fine-tuned model with the hidden state output enabled.
model = BertForSequenceClassification.from_pretrained('ft_bert_temuulen2', output_hidden_states=True)

# Initialize a tokenizer used for the fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained('ft_bert_temuulen_tokenizer2')

*load the dataset*

The dataset used in this implementation is a duplicate of the primary DFs containing information about job seekers and job advertisements.

In [None]:
df_bert_js = df_jobseeker.copy()
df_bert_ja = df_jobads.copy()

*initialize the gpu* (optional)

To enhance the effectiveness of managing matrix and tensor operations, the CUDA device was created. This capability represents a key advantage of utilizing the BERT model within the Torch framework.

In [None]:
# Check whether CUDA is accessible and select the compute device accordingly.
cuda_available = torch.cuda.is_available()

if cuda_available:
    device = torch.device('cuda')
    # Query the device name only after confirming CUDA exists; the call raises otherwise.
    cuda_device = torch.cuda.get_device_name(0)
    print('CUDA was successfully installed and compiled on my device.')
    print('CUDA device name is:', cuda_device)
else:
    # Define `device` on CPU-only machines too, so the later `.to(device)` calls keep working.
    device = torch.device('cpu')
    cuda_device = None
    print('Cuda in not available')

Before starting the encoding process, it's essential to check the word count to ensure that it doesn't surpass 510, due to a constraint associated with the BERT model: it accepts at most 512 tokens per input, two of which are reserved for the special [CLS] and [SEP] tokens. If the word count exceeds this threshold, it is necessary to formulate a new strategy for obtaining the encoded value.

In [None]:
# Count the rows of each DataFrame whose whitespace word count is above 510.
print('The total number of rows having word counts greater than 510 in the first DF is:', df_bert_js[df_bert_js['word_count'] > 510].shape[0])
print('The total number of rows having word counts greater than 510 in the second DF is:', df_bert_ja[df_bert_ja['word_count'] > 510].shape[0])
# idxmax() gives the row label of the largest word count; column -1 is 'word_count', the last column added.
print('The word count for the longest text is:', df_bert_ja.iat[df_bert_ja['word_count'].idxmax(), -1])

*create custom function*

From the output observed in the preceding cell, it is clear that the DF for job seekers does not contain entries exceeding the 510-word limit, allowing the definition of a standard custom function for tokenization and extraction of the last hidden state without additional conditions. Conversely, the DF for job advertisements contains 236 entries surpassing the 510-word threshold, with the longest text totaling 3145 words. To process these inputs through the model, a custom function incorporating special conditions must be developed and applied. The upcoming two custom functions are designed specifically for this purpose.

In [None]:
# Define a custom function to extract the final layer encodings from BERT, without conditions.
def process_text(text):
    """Encode one text with the fine-tuned BERT model and return its last hidden states.

    Relies on the notebook-level `tokenizer` and `model` objects. The text is
    tokenized without truncation, so it is expected to fit within the model's
    input limit.

    Args:
        text: The string to encode.

    Returns:
        The last entry of `outputs.hidden_states`, moved to the CPU.
    """
    # Tokenize the input text and place the tensors on the same device as the model,
    # so the function works whether or not the model has already been moved to the GPU.
    inputs = tokenizer(text, return_tensors='pt').to(model.device)

    # Pass the tokenized input through the model without tracking gradients.
    with torch.no_grad():
        outputs = model(**inputs)

    # Retrieve the last hidden states and hand them back on the CPU.
    return outputs.hidden_states[-1].cpu()

In [None]:
# Define a custom function to extract the final layer encodings from BERT, with conditions.
def embed_with_bert(df_column, max_length=512, stride=0):
    """Encode every text in `df_column` with BERT, windowing texts that are too long.

    Each text is tokenized once without special tokens and cut into windows of at
    most `max_length - 2` content tokens, leaving room for [CLS] and [SEP] so that
    no model input exceeds `max_length` and no content token is dropped. A text
    that fits in one window is therefore encoded in a single pass.

    Relies on the notebook-level `tokenizer` and `model` objects; inputs are
    created on the device the model currently lives on.

    Args:
        df_column: Iterable of strings to encode.
        max_length: Maximum number of tokens per model input, special tokens included.
        stride: Number of content tokens shared by consecutive windows
            (0 means the windows do not overlap).

    Returns:
        A list with one CPU tensor per text: the last hidden states of all of
        its windows concatenated along dim=1.

    Raises:
        ValueError: If `max_length` leaves no room for content tokens or
            `stride` is not in the range [0, max_length - 2).
    """
    # Two positions of every window are reserved for the [CLS] and [SEP] tokens.
    window_size = max_length - 2
    if window_size < 1:
        raise ValueError('max_length must be at least 3 to fit [CLS], one token and [SEP].')
    if not 0 <= stride < window_size:
        raise ValueError('stride must satisfy 0 <= stride < max_length - 2.')
    step = window_size - stride

    target_device = model.device
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id

    embedded_texts = []

    # Iterate through each text in the DataFrame column.
    for text in df_column:

        # Tokenize once, as a plain list of ids, without special tokens, truncation or padding.
        token_ids = tokenizer(text, add_special_tokens=False, truncation=False, padding=False)['input_ids']

        window_states = []
        start = 0
        while True:
            # Wrap the current slice of content tokens in [CLS] ... [SEP].
            window_ids = [cls_id] + token_ids[start:start + window_size] + [sep_id]
            input_ids = torch.tensor(window_ids, dtype=torch.long, device=target_device).unsqueeze(0)
            with torch.no_grad():
                outputs = model(input_ids=input_ids)
            window_states.append(outputs.hidden_states[-1].cpu())

            # Stop as soon as the current window has reached the end of the text.
            if start + window_size >= len(token_ids):
                break
            start += step

        # Concatenate the hidden states of all windows along the token dimension.
        embedded_texts.append(torch.cat(window_states, dim=1))

    return embedded_texts

*encode the text*

Using the custom functions created earlier to process each DF and extract the tensor of the final hidden state layer.

In [None]:
# Apply function and create a new column with the extracted results.
# iloc[:, -2] is the second-to-last column, i.e. 'combined_info' ('word_count' was appended after it).
df_bert_js['last_layer'] = df_bert_js.iloc[:, -2].apply(process_text)

# Column -1 is now the freshly added 'last_layer' column.
print('The shape of the first tensor:', df_bert_js.iat[0, -1].shape, '\n')
print('The shape of the second tensor:', df_bert_js.iat[1, -1].shape, '\n')
print('The shape of the third tensor:', df_bert_js.iat[2, -1].shape, '\n')
print(df_bert_js.iat[0, -1], '\n')

# Check the Data Frame.
df_bert_js.head()

In [None]:
# Move the model to the GPU.
model.to(device)

# Apply the 'embed_with_bert' function to each ad.
df_bert_ja['tensors'] = df_bert_ja['job_description'].apply(lambda x: embed_with_bert([x])[0])

# Check the random cell to see the results.
print(df_bert_ja.iat[0, -1].shape, '\n')
print(df_bert_ja.iat[0, -1], '\n')

# Check the Data Frame.
df_bert_ja.head(2)

The results from the previous cells indicate that the tensors generated by processing each text entry from the 'combined_info' column through the encoding layers of the fine-tuned models maintain consistent dimensions in the first and third positions. This consistency is due to the fact that each encoder handles a single sample at a time, with a batch size of one, and represents each token in the text with a 768-feature vector. However, the number of tokens in the second dimension, representing each text, varies and slightly exceeds the actual word count of each text. This variability is because of the WordPiece tokenization approach used by the BERT model, which breaks down words into smaller pieces if they are not present in the tokenizer's lexicon. This approach enables the model to more effectively manage unrecognized words.

*calculate cosine*

In [None]:
# Custom function that generates the average cosine similarity between the user's tensor and a job ad's tensor.
def calculate_average_similarity(tensor_user, tensor_ad):
    """Return the mean cosine similarity over all (user token, ad token) vector pairs.

    Args:
        tensor_user: Tensor of token vectors, shaped (tokens, features) or with a
            leading batch dimension that is squeezed away.
        tensor_ad: Tensor of token vectors in the same layout; it is moved to the
            device of `tensor_user` before the computation.

    Returns:
        The average of the pairwise cosine similarities as a Python float.
    """
    # Squeeze dimensions if the tensors have a batch dimension.
    tensor_user = tensor_user.squeeze(0) if tensor_user.dim() == 3 else tensor_user
    tensor_ad = tensor_ad.squeeze(0) if tensor_ad.dim() == 3 else tensor_ad

    tensor_ad = tensor_ad.to(tensor_user.device)

    # Scale every token vector to unit length (eps matches F.cosine_similarity's default),
    # so a single matrix product yields all pairwise cosine similarities at once
    # instead of one call per pair in a Python double loop.
    user_unit = F.normalize(tensor_user, p=2, dim=1, eps=1e-8)
    ad_unit = F.normalize(tensor_ad, p=2, dim=1, eps=1e-8)
    similarity_matrix = user_unit @ ad_unit.T

    # Calculate the average similarity and convert it to a Python float.
    return torch.mean(similarity_matrix).item()

In [None]:
print('The cosine similarity between the texts from user1 and user2 is:', calculate_average_similarity(df_bert_js.iat[0, -1], df_bert_js.iat[1, -1]))

In [None]:
# Get the user1's tensor and move it to the GPU.
user1_tensor = df_bert_js.iat[0, -1]
user1_tensor = user1_tensor.to(device)

# Get the user2's tensor and move it to the GPU.
user2_tensor = df_bert_js.iat[1, -1]
user2_tensor = user2_tensor.to(device)

# Get the user3's tensor and move it to the GPU.
user3_tensor = df_bert_js.iat[2, -1]
user3_tensor = user3_tensor.to(device)

In [None]:
# Apply the calculation of average cosine similarity function to each job ad's tensor.
# The 'tensors' column is selected by name rather than by position, so the cell still
# works when it is re-run after score columns have been appended to the DataFrame.
df_bert_ja['cosine_user1'] = df_bert_ja['tensors'].apply(lambda x: calculate_average_similarity(user1_tensor, x.to(device)))

torch.cuda.empty_cache()

In [None]:
# Apply the calculation of average cosine similarity function to each job ad's tensor.
# The 'tensors' column is selected by name rather than by position, so the result does
# not depend on how many score columns have already been appended to the DataFrame.
df_bert_ja['cosine_user2'] = df_bert_ja['tensors'].apply(lambda x: calculate_average_similarity(user2_tensor, x.to(device)))

torch.cuda.empty_cache()

In [None]:
# Apply the calculation of average cosine similarity function to each job ad's tensor.
# The 'tensors' column is selected by name rather than by position, so the result does
# not depend on how many score columns have already been appended to the DataFrame.
df_bert_ja['cosine_user3'] = df_bert_ja['tensors'].apply(lambda x: calculate_average_similarity(user3_tensor, x.to(device)))

torch.cuda.empty_cache()

In [None]:
# Drop the encoded column from the Data Frame (it takes up too much memory and is no longer needed).
df_bert_ja = df_bert_ja.drop(columns=['tensors']) 

df_bert_ja.head(2)

In [None]:
df_bert_ja.to_csv('cosine-bert.csv', index=False)

In [None]:
end = time.time()

print(f'The calculation of cosine similarity score using fine-tuned Bert model was completed in: {int((end - start)) // 60} minutes and {int((end - start)) % 60} seconds.')

## 2. COSINE WITH PRE-TRAINED WORD2VEC

In [None]:
# Starting the timer to track the execution duration.
start = time.time()

*initialize the model*

In [None]:
# Load the pre-trained Word2Vec model
word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

*load the dataset*

In [None]:
df_word2vec_js = df_jobseeker.copy()
df_word2vec_ja = df_jobads.copy()

*preprocessing*

In [None]:
# Lowercasing, punctuation removal and tokenizing
def preprocess_text_word2vec(text):
    """Normalise `text` and split it into word tokens.

    The text is lower-cased, every character other than letters, digits and
    whitespace is removed, and the remainder is tokenized with NLTK's
    `word_tokenize`.

    Args:
        text: The raw string to prepare.

    Returns:
        The list of tokens produced by `word_tokenize`.
    """
    # Lower-case first, then strip everything that is not alphanumeric or whitespace.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

    return word_tokenize(cleaned)

In [None]:
df_word2vec_js['processed_ci'] = df_word2vec_js['combined_info'].apply(preprocess_text_word2vec)
df_word2vec_ja['processed_jd'] = df_word2vec_ja['job_description'].apply(preprocess_text_word2vec)

df_word2vec_js.head()

For each entry in the text column, the custom function tokenizes the text into words. It then filters out the words that are not in the Word2Vec vocabulary and generates an embedding for each remaining word. A common approach is to average these word vectors to get a single vector that represents the entire text.

*embedding*

In [None]:
def embed_tokens(tokens_list, model):
    """Aggregate the vectors of the known tokens into a single embedding.

    Args:
        tokens_list: Iterable of tokens to look up.
        model: Object that supports `token in model`, `model[token]` and
            exposes `vector_size`.

    Returns:
        The element-wise mean of the vectors of all tokens found in `model`,
        or a zero vector of length `model.vector_size` when none is found.
    """
    known_vectors = [model[token] for token in tokens_list if token in model]

    # Without a single known token there is nothing to average: fall back to zeros.
    if not known_vectors:
        return np.zeros(model.vector_size)

    # Averaging is the chosen aggregation; another method could be substituted here.
    return np.mean(known_vectors, axis=0)

In [None]:
# Apply the function to embed each row's tokens in the DataFrame
df_word2vec_js['vectors'] = df_word2vec_js['processed_ci'].apply(lambda x: embed_tokens(x, word2vec))
df_word2vec_ja['vectors'] = df_word2vec_ja['processed_jd'].apply(lambda x: embed_tokens(x, word2vec))
# This adds a new column 'vectors' to both DataFrames, where each row contains the aggregated Word2Vec embedding for its tokens

# Column -1 is the freshly added 'vectors' column.
print('The shape of the first tensor:', df_word2vec_js.iat[0, -1].shape, '\n')
print('The shape of the second tensor:', df_word2vec_js.iat[1, -1].shape, '\n')
print(df_word2vec_js.iat[0, -1], '\n')

df_word2vec_ja.head(2)

*cosine similarity*

In [None]:
# Define a function to calculate cosine similarity (normalised dot product).
def cos(vector1, vector2):
    """Compute the cosine similarity between vectors or between rows of matrices.

    This name replaces the `cos` imported from scikit-learn, so it accepts both
    call styles used in the notebook: two 1-D vectors, or 2-D arrays whose rows
    are compared pairwise.

    Args:
        vector1: 1-D vector or 2-D array of shape (n, features).
        vector2: 1-D vector or 2-D array of shape (m, features).

    Returns:
        A scalar when both inputs are 1-D, otherwise an (n, m) array of
        similarities. Zero vectors yield a similarity of 0.
    """
    first = np.asarray(vector1, dtype=float)
    second = np.asarray(vector2, dtype=float)
    both_1d = first.ndim == 1 and second.ndim == 1

    first = np.atleast_2d(first)
    second = np.atleast_2d(second)

    # Divide by the vector lengths; a plain dot product is only a cosine for unit vectors.
    first_norms = np.linalg.norm(first, axis=1, keepdims=True)
    second_norms = np.linalg.norm(second, axis=1, keepdims=True)
    # Leave zero vectors untouched instead of dividing by zero.
    first_norms[first_norms == 0] = 1.0
    second_norms[second_norms == 0] = 1.0

    similarities = (first / first_norms) @ (second / second_norms).T

    return similarities[0, 0] if both_1d else similarities

In [None]:
user1_vector = df_word2vec_js.iat[0, -1].copy()
user2_vector = df_word2vec_js.iat[1, -1].copy()
user3_vector = df_word2vec_js.iat[2, -1].copy()

In [None]:
# Calculate the cosine similarity for each row
df_word2vec_ja['cos_user1'] = df_word2vec_ja['vectors'].apply(lambda x: cos(x, user1_vector))

df_word2vec_ja['cos_user2'] = df_word2vec_ja['vectors'].apply(lambda x: cos(x, user2_vector))

df_word2vec_ja['cos_user3'] = df_word2vec_ja['vectors'].apply(lambda x: cos(x, user3_vector))

In [None]:
df_word2vec_ja.drop(columns=['processed_jd', 'vectors'], inplace=True)

df_word2vec_ja.head(2)

In [None]:
df_word2vec_ja.to_csv('cosine-word2vec.csv', index=False)

In [None]:
end = time.time()

# Compute the elapsed whole seconds once and report them in a single, section-specific message.
elapsed_seconds = int(end - start)
print(f'The calculation of cosine similarity using pretrained word2vec model was completed in: {elapsed_seconds // 60} minutes and {elapsed_seconds % 60} seconds.')

## 3. COSINE WITH TF-IDF AND BOW

In [10]:
# Starting the timer to track the execution duration.
start = time.time()

*initialize the tools*

In [11]:
tfidf_vectorizer = TfidfVectorizer()
bow_vectorizer = CountVectorizer()
lemmatizer = WordNetLemmatizer()

*load the dataset*

In [12]:
df_tfidf_js = df_jobseeker.copy()
df_tfidf_ja = df_jobads.copy()

*preprocessing*

In [13]:
# Build the English stop-word set once; rebuilding it inside the function would
# reload the NLTK word list for every single document.
_TFIDF_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text_tfidf(text):
    """Clean `text` for the TF-IDF / BoW vectorizers.

    The text is lower-cased, stripped of every character other than letters,
    digits and whitespace, tokenized, filtered against the English stop-word
    list and lemmatized with the notebook-level `lemmatizer`.

    Args:
        text: The raw string to prepare.

    Returns:
        The processed tokens re-joined into one space-separated string.
    """
    # Lowercasing
    text = text.lower()
    # Removing punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Removing stopwords and lemmatization
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in _TFIDF_STOP_WORDS]
    # Re-joining tokens
    return ' '.join(processed_tokens)

In [14]:
df_tfidf_js['processed_ci'] = df_tfidf_js['combined_info'].apply(preprocess_text_tfidf)
df_tfidf_ja['processed_jd'] = df_tfidf_ja['job_description'].apply(preprocess_text_tfidf)
df_tfidf_js.head(2)

Unnamed: 0,participant,data_collection,date,location,preferred_position,combined_info,word_count,processed_ci
0,user_1,voice call,2023-12-17 15:30:00,"dublin, ireland",registered nurse,bachelor's degree: critical care nursing. pati...,27,bachelor degree critical care nursing patient ...
1,user_2,voice call,2023-12-27 11:50:00,"dublin, ireland",electrician,"high school diploma, vocational electrician ce...",33,high school diploma vocational electrician cer...


In [15]:
# Build three placeholder rows in which every column holds an empty string, and put them
# in front of the job ads; ignore_index=True renumbers the rows from 0.
empty_rows = pd.DataFrame([[''] * len(df_tfidf_ja.columns)] * 3, columns=df_tfidf_ja.columns)
df_tfidf_ja = pd.concat([empty_rows, df_tfidf_ja], ignore_index=True)

df_tfidf_ja.head(4)

Unnamed: 0,title,id,link,date,job_description,label,word_count,processed_jd
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse,502.0,silver stream healthcare group offer great emp...


In [16]:
# Write the three job seekers' processed texts into the first three rows of 'processed_jd'.
values_to_add = df_tfidf_js['processed_ci'].tolist()[:3]
# A single .loc call assigns on the DataFrame itself; the chained form
# df[col].iloc[:3] = ... may write to a temporary copy and leave the frame unchanged.
df_tfidf_ja.loc[df_tfidf_ja.index[:3], 'processed_jd'] = values_to_add

df_tfidf_ja.head(4)

Unnamed: 0,title,id,link,date,job_description,label,word_count,processed_jd
0,,,,,,,,bachelor degree critical care nursing patient ...
1,,,,,,,,high school diploma vocational electrician cer...
2,,,,,,,,degree master science data analytics bachelor ...
3,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse,502.0,silver stream healthcare group offer great emp...


### 3. Embedding

In [17]:
# Fit both vectorizers on the same column, which holds the job seekers' texts and the job ads.
tfidf_matrix = tfidf_vectorizer.fit_transform(df_tfidf_ja['processed_jd'])
bow_matrix = bow_vectorizer.fit_transform(df_tfidf_ja['processed_jd'])
# Place the TF-IDF features and the BoW counts side by side, one combined row per text.
combined_matrix = hstack([tfidf_matrix, bow_matrix])

# Convert each row of the combined (TF-IDF + BoW) matrix to a dense array and store it in a new DataFrame column
df_tfidf_ja['vectors'] = list(combined_matrix.toarray())
df_tfidf_ja.head(4)

Unnamed: 0,title,id,link,date,job_description,label,word_count,processed_jd,vectors
0,,,,,,,,bachelor degree critical care nursing patient ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,,,,,,,,high school diploma vocational electrician cer...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,,,,,,,,degree master science data analytics bachelor ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse,502.0,silver stream healthcare group offer great emp...,"[0.0, 0.0, 0.04749643991878368, 0.0, 0.0, 0.0,..."


In [18]:
check_df = df_tfidf_ja.iat[0, -1]
print(check_df.shape)
print(type(check_df))

(20162,)
<class 'numpy.ndarray'>


In [19]:
vectors_array = pd.DataFrame(df_tfidf_ja['vectors'].tolist())
vectors_array.head(4)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20152,20153,20154,20155,20156,20157,20158,20159,20160,20161
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.047496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
normalized_vectors = normalize(vectors_array, norm='l2', axis=1)
normalized_vectors

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00959848, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [21]:
df_tfidf_ja['normolized_vec'] = normalized_vectors.tolist()
df_tfidf_ja.head(4)

Unnamed: 0,title,id,link,date,job_description,label,word_count,processed_jd,vectors,normolized_vec
0,,,,,,,,bachelor degree critical care nursing patient ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,,,,,,,,high school diploma vocational electrician cer...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,,,,,,,,degree master science data analytics bachelor ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse,502.0,silver stream healthcare group offer great emp...,"[0.0, 0.0, 0.04749643991878368, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0016647493576457985, 0.0, 0.0, 0...."


### 4. Cosine calculation

In [22]:
vectors_tf = np.array(df_tfidf_ja['normolized_vec'].tolist()).copy()

user1_vector_tf = vectors_tf[0].reshape(1, -1).copy()
user2_vector_tf = vectors_tf[1].reshape(1, -1).copy()
user3_vector_tf = vectors_tf[2].reshape(1, -1).copy()

print(vectors_tf.shape)
print(user1_vector_tf.shape)

(1169, 20162)
(1, 20162)


In [23]:
cosine_similarities = cos(user1_vector_tf, vectors_tf).flatten()
df_tfidf_ja['cos_user1'] = cosine_similarities

cosine_similarities = cos(user2_vector_tf, vectors_tf).flatten()
df_tfidf_ja['cos_user2'] = cosine_similarities

cosine_similarities = cos(user3_vector_tf, vectors_tf).flatten()
df_tfidf_ja['cos_user3'] = cosine_similarities

In [24]:
# Slicing the DataFrame to exclude the first three rows
df_tfidf_ja = df_tfidf_ja.iloc[3:].reset_index(drop=True)

df_tfidf_ja.drop(columns=['processed_jd', 'vectors', 'normolized_vec'], inplace=True)

df_tfidf_ja.head(2)

Unnamed: 0,title,id,link,date,job_description,label,word_count,cos_user1,cos_user2,cos_user3
0,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse,502,0.301457,0.022477,0.033491
1,clinical nurse manager (cnm),sj_358f1f68cde928c4,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,unknown,create a better future for yourself recruitne...,registered_nurse,231,0.301988,0.03765,0.005109


In [25]:
df_tfidf_ja.to_csv('cosine-tfidf.csv', index=False)

In [26]:
end = time.time()

print(f'The calculation of cosine similarity using TF-IDF and BoW was completed in: {int((end - start)) // 60} minutes and {int((end - start)) % 60} seconds.')

The calculation of cosine similarity using TF-IDF and BoW was completed in: 1 minutes and 13 seconds.
