In [22]:
#maybe rid profiles where there are more than 5 NaNs, or rid profiles where there are len(profile) > 25
#get rid of NaN relationship role 

df['Relationship Role'].value_counts()

mentee    1343
mentor     550
nan         35
both         2
Name: Relationship Role, dtype: int64

In [9]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
import nltk

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str)
df['Profile'] =  df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)
df['Tokens'] = df['Profile'].apply(word_tokenize)

sentences = df['Tokens'].tolist()
word2vec_model = Word2Vec(sentences, vector_size=200, window=5, min_count=1, workers=4)

def average_word2vec(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

df['Vector'] = df['Tokens'].apply(lambda tokens: average_word2vec(tokens, word2vec_model))

X = np.vstack(df['Vector'].values)

svd = TruncatedSVD(n_components=100)
X_reduced = svd.fit_transform(X)

cos_sim_matrix = cosine_similarity(X_reduced)

k = 5
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X_reduced)

distances, indices = knn.kneighbors(X_reduced)

results = []
for i, profile in enumerate(df['Profile']):
    nearest_neighbors = [(df['Id'][indices[i][j]], round(1 - distances[i][j], 2), df['Relationship Role'][indices[i][j]]) for j in range(1, k)]
    result = {
        'Id': df['Id'][i],
        'Profile': profile,
        'Relationship Role': df['Relationship Role'][i],
        'Nearest Neighbors': nearest_neighbors
    }
    results.append(result)

results_df = pd.DataFrame(results)
#print(results_df.to_string(index=False))

results_df

Unnamed: 0,Id,Profile,Relationship Role,Nearest Neighbors
0,1047644182,I'm in my 3rd year in Cinema Studies at UBC. I...,mentor,"[(1047550122, 1.0, mentor), (1047516499, 1.0, ..."
1,1047643231,nan South Kamloops SS Surgeon is my main goal ...,mentee,"[(1047564281, 1.0, mentee), (1047541071, 1.0, ..."
2,1047643230,"nan South Kamloops SS dermatologist, zoologist...",mentee,"[(1047585080, 0.99, mentee), (1047549757, 0.99..."
3,1047643228,"I am third year Bachelor of Science, biology m...",mentor,"[(1047549455, 1.0, mentor), (1047584290, 1.0, ..."
4,1047641732,"nan South Kamloops SS Photographer, Teacher, P...",mentee,"[(1047627564, 1.0, mentee), (1047514517, 1.0, ..."
...,...,...,...,...
1925,9950,nan nan nan nan Marketing and Communications n...,mentee,"[(1047485501, 1.0, mentor), (1047514780, 1.0, ..."
1926,9941,"nan nan nan nan nan Family, working out, readi...",mentor,"[(15660, 0.99, mentee), (11057, 0.99, mentor),..."
1927,9927,nan nan nan nan Rural Medicine nan nan nan nan...,mentor,"[(10160, 1.0, mentor), (10162, 1.0, mentor), (..."
1928,9923,"nan nan nan nan Nursing nan nan nan nan , 1 , ...",mentee,"[(10150, 1.0, mentor), (10143, 1.0, mentor), (..."


In [13]:
import matplotlib as plt 
import seaborn as sns 

svd = TruncatedSVD(n_components=2, random_state=69)  # must be 2 to vis
X_reduced2 = svd.fit_transform(X.T)

embedding_df = pd.DataFrame(X_reduced2, columns=['Component 1', 'Component 2'])
embedding_df['word'] = X.columns

plt.figure(figsize=(10, 10))
sns.scatterplot(data=embedding_df, x='Component 1', y='Component 2')
for i in range(embedding_df.shape[0]):
    plt.text(embedding_df['Component 1'][i], embedding_df['Component 2'][i], embedding_df['word'][i])
plt.title('Word Embeddings')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
#using bert to tokenize and for word embeddings/vectors 

import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

random_seed = 42
random.seed(random_seed)

#random seed for PyTorch (for GPU as well)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
	torch.cuda.manual_seed_all(random_seed)
	
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Input df
df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str)
df['Profile'] =  df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)

# Tokenize and encode text using batch_encode_plus
# The function returns a dictionary containing the token IDs and attention masks
encoding = tokenizer.batch_encode_plus(
					 # List of input texts
	padding=True,			 # Pad to the maximum sequence length
	truncation=True,		 # Truncate to the maximum sequence length if necessary
	return_tensors='pt',	 # Return PyTorch tensors
	add_special_tokens=True # Add special tokens CLS and SEP
)

input_ids = encoding['input_ids'] # Token IDs

print(f"Input ID: {input_ids}")
attention_mask = encoding['attention_mask'] # Attention mask

print(f"Attention mask: {attention_mask}")

# Generate embeddings using BERT model
with torch.no_grad():
	outputs = model(input_ids, attention_mask=attention_mask)
	word_embeddings = outputs.last_hidden_state # This contains the embeddings

# Output the shape of word embeddings
print(f"Shape of Word Embeddings: {word_embeddings.shape}")



