In [48]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np
from sklearn.preprocessing import normalize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
import re
from scipy.sparse import hstack
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Testing model using data from experiment participants.
df_jobads = pd.read_csv('cosine-bert.csv', index_col=None)
df_jobads = df_jobads.iloc[:, :-3]
df_jobads.head(2)

Unnamed: 0,title,id,link,date,job_description,label,word_count
0,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse,502
1,clinical nurse manager (cnm),sj_358f1f68cde928c4,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,unknown,create a better future for yourself recruitne...,registered_nurse,231


In [13]:
# Testing model using data from experiment participants.
df_jobseeker = pd.read_csv('data_jobseeker.csv', index_col=None)
df_jobseeker['professional_qualifications'] = df_jobseeker.education + ". " + df_jobseeker.skill + ". " + df_jobseeker.experience + '.'
df_jobseeker.drop(['education', 'skill', 'experience'], axis=1, inplace=True)
df_jobseeker.head(2)

Unnamed: 0,participant,data_collection,date,location,preferred_position,professional_qualifications
0,user_1,voice call,2023-12-17 15:30:00,"dublin, ireland",registered nurse,bachelor's degree: critical care nursing. pati...
1,user_2,voice call,2023-12-27 11:50:00,"dublin, ireland",electrician,"high school diploma, vocational electrician ce..."


In [18]:
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    # Removing punctuation
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Removing stopwords and lemmatization
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Re-joining tokens
    processed_text = ' '.join(processed_tokens)
    return processed_text

In [22]:
df_jobads['processed_job_description'] = df_jobads['job_description'].apply(preprocess_text)

df_jobads.head(2)

Unnamed: 0,title,id,link,date,job_description,label,word_count,processed_job_description
0,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse,502,silver stream healthcare group offer great emp...
1,clinical nurse manager (cnm),sj_358f1f68cde928c4,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,unknown,create a better future for yourself recruitne...,registered_nurse,231,create better future recruitnet international ...


In [23]:
df_jobseeker['processed_pq'] = df_jobseeker['professional_qualifications'].apply(preprocess_text)

df_jobseeker.head(2)

Unnamed: 0,participant,data_collection,date,location,preferred_position,professional_qualifications,processed_pq
0,user_1,voice call,2023-12-17 15:30:00,"dublin, ireland",registered nurse,bachelor's degree: critical care nursing. pati...,bachelor degree critical care nursing patient ...
1,user_2,voice call,2023-12-27 11:50:00,"dublin, ireland",electrician,"high school diploma, vocational electrician ce...",high school diploma vocational electrician cer...


In [46]:
# Initialize the vectorizers
tfidf_vectorizer = TfidfVectorizer()
bow_vectorizer = CountVectorizer()

# Fit and transform the data
tfidf_matrix = tfidf_vectorizer.fit_transform(df_jobads['processed_job_description'])
bow_matrix = bow_vectorizer.fit_transform(df_jobads['processed_job_description'])

# Combine the TF-IDF and BoW vectors
combined_features = hstack([tfidf_matrix, bow_matrix])

# Convert to a list or another format if needed to add to the DataFrame
# This step depends on how you want to use the vectors. If you need them in a specific format, let me know.
# For demonstration, I'll convert the sparse matrix to a list of lists (which might not be memory efficient for large datasets)
combined_features_list = combined_features.toarray().tolist()

df_jobads['vectors'] = combined_features_list
df_jobads.head(2)

Unnamed: 0,title,id,link,date,job_description,label,word_count,processed_job_description,vectors
0,assistant director of nursing,sj_3c7e64c7996bb9d6,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,"January 10, 2024",silver stream healthcare group offer great emp...,registered_nurse,502,silver stream healthcare group offer great emp...,"[0.0, 0.0, 0.04749600562738871, 0.0, 0.0, 0.0,..."
1,clinical nurse manager (cnm),sj_358f1f68cde928c4,https://ie.indeed.com/pagead/clk?mo=r&ad=-6NYl...,unknown,create a better future for yourself recruitne...,registered_nurse,231,create better future recruitnet international ...,"[0.0, 0.0, 0.0, 0.14492891726001467, 0.0, 0.0,..."


In [47]:
# Fit and transform the data
tfidf_matrix = tfidf_vectorizer.fit_transform(df_jobseeker['processed_pq'])
bow_matrix = bow_vectorizer.fit_transform(df_jobseeker['processed_pq'])

# Combine the TF-IDF and BoW vectors
combined_features = hstack([tfidf_matrix, bow_matrix])

# Convert to a list or another format if needed to add to the DataFrame
# This step depends on how you want to use the vectors. If you need them in a specific format, let me know.
# For demonstration, I'll convert the sparse matrix to a list of lists (which might not be memory efficient for large datasets)
combined_features_list = combined_features.toarray().tolist()

df_jobseeker['vectors'] = combined_features_list
df_jobseeker.head(2)

Unnamed: 0,participant,data_collection,date,location,preferred_position,professional_qualifications,processed_pq,vectors
0,user_1,voice call,2023-12-17 15:30:00,"dublin, ireland",registered nurse,bachelor's degree: critical care nursing. pati...,bachelor degree critical care nursing patient ...,"[0.0, 0.0, 0.1805912687057042, 0.0, 0.0, 0.0, ..."
1,user_2,voice call,2023-12-27 11:50:00,"dublin, ireland",electrician,"high school diploma, vocational electrician ce...",high school diploma vocational electrician cer...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [54]:
vector_to_compare = df_jobseeker.iat[0,-1]
vector_to_compare = vector_to_compare.reshape(-1, 2)

AttributeError: 'list' object has no attribute 'reshape'

In [58]:
single_vector_reshaped = df_jobseeker.iat[0,-1]
single_vector_reshaped = np.array(single_vector_reshaped).reshape(1, -1)

# Initialize a list to store the cosine similarity results
cosine_similarities = []

# Iterate over the DataFrame
for index, row in df_jobads.iterrows():
    # Reshape the vector in the current row to match the expected input shape
    vector_reshaped = np.array(row['vectors']).reshape(1, -1)
    
    # Calculate cosine similarity and append the result to our list
    # cosine_similarity returns a matrix, we get the first element [0][0] as the result
    similarity_score = cosine_similarity(single_vector_reshaped, vector_reshaped)[0][0]
    cosine_similarities.append(similarity_score)

# Add the cosine similarity results to the DataFrame as a new column
df_jobads['cosine_user1'] = cosine_similarities

# Now df contains an additional column 'cosine_similarity' with the similarity scores

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 162 while Y.shape[1] == 20160