In [22]:
#maybe rid profiles where there are more than 5 NaNs, or rid profiles where there are len(profile) > 25
#get rid of NaN relationship role 

df['Relationship Role'].value_counts()

mentee    1343
mentor     550
nan         35
both         2
Name: Relationship Role, dtype: int64

In [7]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str).sample(frac=0.1, random_state=42)
df['Profile'] = df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)
df['Tokens'] = df['Profile'].apply(word_tokenize)

sentences = df['Tokens'].tolist()
word2vec_model = Word2Vec(sentences, vector_size=200, window=5, min_count=1, workers=4)

def average_word2vec(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

df['Vector'] = df['Tokens'].apply(lambda tokens: average_word2vec(tokens, word2vec_model))

X = np.vstack(df['Vector'].values)

# svd = TruncatedSVD(n_components=100)
# X_reduced = svd.fit_transform(X)

cos_sim_matrix = cosine_similarity(X)

k = 5
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X_reduced)

distances, indices = knn.kneighbors(X_reduced)

results = []
for i, profile in enumerate(df['Profile']):
    nearest_neighbors = [(df.iloc[indices[i][j]]['Id'], round(1 - distances[i][j], 2), df.iloc[indices[i][j]]['Relationship Role']) for j in range(1, k)]
    result = {
        'Id': df.iloc[i]['Id'],
        'Profile': profile,
        'Relationship Role': df.iloc[i]['Relationship Role'],
        'Nearest Neighbors': nearest_neighbors
    }
    results.append(result)

results_df = pd.DataFrame(results)
results_df


[nltk_data] Downloading package punkt to
[nltk_data]     /zfs/users/asda2/asda2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Id,Profile,Relationship Role,Nearest Neighbors
0,1047538821,nan nan nan nan nan nan nan nan nan nan nan na...,mentee,"[(1047503435, 1.0, mentee), (1047538821, 1.0, ..."
1,1047497874,"nan Alberni Secondary no idea, nan I would li...",mentee,"[(1047513123, 1.0, mentee), (1047593968, 1.0, ..."
2,10698,", 0 nan nan nan pediatrician, veterinarian ...",mentee,"[(10716, 1.0, mentee), (10702, 1.0, mentee), (..."
3,1047549000,nan South Kamloops SS I would start a program ...,mentee,"[(1047585216, 1.0, mentee), (1047588253, 1.0, ..."
4,1047592264,nan nan Psychologist or a Forensic Psychiatris...,mentee,"[(1047503436, 1.0, mentee), (1047513309, 1.0, ..."
...,...,...,...,...
188,1047585221,"nan Hope Secondary - Cattrell astronaut, engin...",mentee,"[(1047551394, 1.0, mentee), (1047584581, 1.0, ..."
189,1047583535,"Douglas College, Bachelor of Arts in Applied C...",mentor,"[(1047502278, 1.0, mentor), (1047513634, 1.0, ..."
190,1047627569,nan Charles Hays Secondary - Ling onboard engi...,mentee,"[(1047541619, 1.0, mentor), (1047549398, 1.0, ..."
191,1047501471,nan nan carpenter nan video games likes games ...,mentee,"[(15660, 1.0, mentee), (10241, 1.0, mentor), (..."


In [11]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

nltk.download('punkt')

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str).sample(frac=0.1, random_state=42)
df['Profile'] = df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)
df['Tokens'] = df['Profile'].apply(word_tokenize)

stop_words = set(stopwords.words('english'))
df['Tokens'] = df['Tokens'].apply(lambda tokens: [word for word in tokens if word.lower() not in stop_words])


sentences = df['Tokens'].tolist()
word2vec_model = Word2Vec(sentences, vector_size=200, window=5, min_count=1, workers=4)

def average_word2vec(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

df['Vector'] = df['Tokens'].apply(lambda tokens: average_word2vec(tokens, word2vec_model))

# embeddings quality
print("Embedding variance: ", np.var(np.vstack(df['Vector'].values), axis=0).mean())

X = np.vstack(df['Vector'].values)

#   SVD
svd = TruncatedSVD(n_components=100)
X_reduced = svd.fit_transform(X)
print("Explained variance ratio: ", svd.explained_variance_ratio_.sum())

# normalize 
X_normalized = normalize(X_reduced)

cos_sim_matrix = cosine_similarity(X_normalized)

k = 5
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X_normalized)

distances, indices = knn.kneighbors(X_normalized)

results = []
for i, profile in enumerate(df['Profile']):
    nearest_neighbors = [(df.iloc[indices[i][j]]['Id'], round(1 - distances[i][j], 2), df.iloc[indices[i][j]]['Relationship Role']) for j in range(1, k)]
    result = {
        'Id': df.iloc[i]['Id'],
        'Profile': profile,
        'Relationship Role': df.iloc[i]['Relationship Role'],
        'Nearest Neighbors': nearest_neighbors
    }
    results.append(result)

results_df = pd.DataFrame(results)
results_df


[nltk_data] Downloading package punkt to
[nltk_data]     /zfs/users/asda2/asda2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Embedding variance:  0.0009967114
Explained variance ratio:  0.999996


Unnamed: 0,Id,Profile,Relationship Role,Nearest Neighbors
0,1047538821,nan nan nan nan nan nan nan nan nan nan nan na...,mentee,"[(1047538821, 1.0, mentee), (1047537906, 1.0, ..."
1,1047497874,"nan Alberni Secondary no idea, nan I would li...",mentee,"[(1047498727, 1.0, mentee), (1047516916, 1.0, ..."
2,10698,", 0 nan nan nan pediatrician, veterinarian ...",mentee,"[(10716, 1.0, mentee), (1047584573, 1.0, mente..."
3,1047549000,nan South Kamloops SS I would start a program ...,mentee,"[(1047513634, 1.0, mentor), (1047594511, 1.0, ..."
4,1047592264,nan nan Psychologist or a Forensic Psychiatris...,mentee,"[(1047541038, 1.0, mentee), (1047514517, 1.0, ..."
...,...,...,...,...
188,1047585221,"nan Hope Secondary - Cattrell astronaut, engin...",mentee,"[(1047496812, 1.0, mentee), (1047499338, 1.0, ..."
189,1047583535,"Douglas College, Bachelor of Arts in Applied C...",mentor,"[(1047549055, 1.0, mentee), (1047549915, 1.0, ..."
190,1047627569,nan Charles Hays Secondary - Ling onboard engi...,mentee,"[(1047514517, 1.0, mentor), (1047541032, 1.0, ..."
191,1047501471,nan nan carpenter nan video games likes games ...,mentee,"[(1047499586, 1.0, mentee), (1047554084, 1.0, ..."


In [14]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
import nltk

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str).sample(frac=0.1, random_state=42)
df['Profile'] =  df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)
df['Tokens'] = df['Profile'].apply(word_tokenize)

sentences = df['Tokens'].tolist()
word2vec_model = Word2Vec(sentences, vector_size=200, window=5, min_count=1, workers=4)

def average_word2vec(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

df['Vector'] = df['Tokens'].apply(lambda tokens: average_word2vec(tokens, word2vec_model))

X = np.vstack(df['Vector'].values)

svd = TruncatedSVD(n_components=100)
X_reduced = svd.fit_transform(X)

cos_sim_matrix = cosine_similarity(X_reduced)

k = 5
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X_reduced)

distances, indices = knn.kneighbors(X_reduced)

results = []
for i, profile in enumerate(df['Profile']):
    nearest_neighbors = [(df['Id'][indices[i][j]], round(1 - distances[i][j], 2), df['Relationship Role'][indices[i][j]]) for j in range(1, k)]
    result = {
        'Id': df['Id'][i],
        'Profile': profile,
        'Relationship Role': df['Relationship Role'][i],
        'Nearest Neighbors': nearest_neighbors
    }
    results.append(result)

results_df = pd.DataFrame(results)
#print(results_df.to_string(index=False))

results_df

KeyError: 134

In [13]:
import matplotlib as plt 
import seaborn as sns 

svd = TruncatedSVD(n_components=2, random_state=69)  # must be 2 to vis
X_reduced2 = svd.fit_transform(X.T)

embedding_df = pd.DataFrame(X_reduced2, columns=['Component 1', 'Component 2'])
embedding_df['word'] = X.columns

plt.figure(figsize=(10, 10))
sns.scatterplot(data=embedding_df, x='Component 1', y='Component 2')
for i in range(embedding_df.shape[0]):
    plt.text(embedding_df['Component 1'][i], embedding_df['Component 2'][i], embedding_df['word'][i])
plt.title('Word Embeddings')
plt.xlabel('Component 1')
plt.ylabel('Component 2')
plt.show()

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [1]:
#using bert to tokenize and for word embeddings/vectors 

import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

random_seed = 42
random.seed(random_seed)

#random seed for PyTorch (for GPU as well)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
	torch.cuda.manual_seed_all(random_seed)
	
# Load BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Input df
df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str)
df['Profile'] =  df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)

# Tokenize and encode text using batch_encode_plus
# The function returns a dictionary containing the token IDs and attention masks
encoding = tokenizer.batch_encode_plus(
					 # List of input texts
	padding=True,			 # Pad to the maximum sequence length
	truncation=True,		 # Truncate to the maximum sequence length if necessary
	return_tensors='pt',	 # Return PyTorch tensors
	add_special_tokens=True # Add special tokens CLS and SEP
)

input_ids = encoding['input_ids'] # Token IDs

print(f"Input ID: {input_ids}")
attention_mask = encoding['attention_mask'] # Attention mask

print(f"Attention mask: {attention_mask}")

# Generate embeddings using BERT model
with torch.no_grad():
	outputs = model(input_ids, attention_mask=attention_mask)
	word_embeddings = outputs.last_hidden_state # This contains the embeddings

# Output the shape of word embeddings
print(f"Shape of Word Embeddings: {word_embeddings.shape}")





NameError: name 'pd' is not defined

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
import nltk
from transformers import BertTokenizer, BertModel
import torch

nltk.download('punkt')

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str)
df['Profile'] =  df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)

df['Tokens'] = df['Profile'].apply(word_tokenize)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def embed_bert(tokens, tokenizer, model):
    inputs = tokenizer(tokens, return_tensors='pt', padding=True, truncation=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

df['Vector'] = df['Tokens'].apply(lambda tokens: embed_bert(tokens, tokenizer, model))

X = np.vstack(df['Vector'].values)

svd = TruncatedSVD(n_components=100)
X_reduced = svd.fit_transform(X)

cos_sim_matrix = cosine_similarity(X_reduced)

k = 5
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X_reduced)

distances, indices = knn.kneighbors(X_reduced)

results = []
for i, profile in enumerate(df['Profile']):
    nearest_neighbors = [(df['Id'][indices[i][j]], round(1 - distances[i][j], 2), df['Relationship Role'][indices[i][j]]) for j in range(1, k)]
    result = {
        'Id': df['Id'][i],
        'Profile': profile,
        'Relationship Role': df['Relationship Role'][i],
        'Nearest Neighbors': nearest_neighbors
    }
    results.append(result)

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))


[nltk_data] Downloading package punkt to
[nltk_data]     /zfs/users/asda2/asda2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


MemoryError: Unable to allocate 269. GiB for an array with shape (268737, 268737) and data type float32

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
import nltk
from transformers import BertTokenizer, BertModel
import torch

nltk.download('punkt')

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str)
df['Profile'] =  df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)

#df['Tokens'] = df['Profile'].apply(word_tokenize)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def embed_bert(tokens, tokenizer, model):
    inputs = tokenizer(tokens, return_tensors='pt', padding=True, truncation=True)
    outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()

df['Vector'] = df['Tokens'].apply(lambda tokens: embed_bert(tokens, tokenizer, model))

X = np.vstack(df['Vector'].values)

svd = TruncatedSVD(n_components=100)
X_reduced = svd.fit_transform(X)

cos_sim_matrix = cosine_similarity(X_reduced)

k = 5
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X_reduced)

distances, indices = knn.kneighbors(X_reduced)

results = []
for i, profile in enumerate(df['Profile']):
    nearest_neighbors = [(df['Id'][indices[i][j]], round(1 - distances[i][j], 2), df['Relationship Role'][indices[i][j]]) for j in range(1, k)]
    result = {
        'Id': df['Id'][i],
        'Profile': profile,
        'Relationship Role': df['Relationship Role'][i],
        'Nearest Neighbors': nearest_neighbors
    }
    results.append(result)

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))
