In [4]:
#TODO add weights to the profile columns, compare the results of the embeddings (eg. confirm similarity score between ID is legit)


from transformers import BertModel, BertTokenizer
import pandas as pd
import torch
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str)
df = df.sample(frac=0.1, random_state=42)
df['Profile'] = df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)

inputs = df['Profile'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=512))

inputs = inputs.tolist()
max_len = max(len(seq) for seq in inputs)
padded_inputs = [seq + [0] * (max_len - len(seq)) for seq in inputs]

input_ids = torch.tensor(padded_inputs)

with torch.no_grad():
    outputs = model(input_ids)
    embeddings = outputs.last_hidden_state

embeddings = embeddings.mean(dim=1).numpy()

svd = TruncatedSVD(n_components=100)
X_reduced = svd.fit_transform(embeddings)

cos_sim_matrix = cosine_similarity(X_reduced)

k = 5
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X_reduced)

distances, indices = knn.kneighbors(X_reduced)

results = []
for i, profile in enumerate(df['Profile']):
    nearest_neighbors = [(df.iloc[indices[i][j]]['Id'], round(1 - distances[i][j], 2), df.iloc[indices[i][j]]['Relationship Role']) for j in range(1, k)]
    result = {
        'Id': df.iloc[i]['Id'],
        #'Profile': profile,
        'Relationship Role': df.iloc[i]['Relationship Role'],
        'Nearest Neighbors': nearest_neighbors
    }
    results.append(result)

results_df = pd.DataFrame(results)

print(results_df)

#results_df.to_csv("results_rec.csv", index=False)

#print(embeddings)


             Id Relationship Role  \
0    1047538821            mentee   
1    1047497874            mentee   
2         10698            mentee   
3    1047549000            mentee   
4    1047592264            mentee   
..          ...               ...   
188  1047585221            mentee   
189  1047583535            mentor   
190  1047627569            mentee   
191  1047501471            mentee   
192       10988            mentor   

                                     Nearest Neighbors  
0    [(1047516619, 1.0, mentee), (1047503059, 1.0, ...  
1    [(1047498727, 1.0, mentee), (1047501363, 1.0, ...  
2    [(10988, 1.0, mentor), (10716, 1.0, mentee), (...  
3    [(1047554104, 1.0, mentee), (1047588327, 1.0, ...  
4    [(1047593954, 1.0, mentee), (1047541619, 1.0, ...  
..                                                 ...  
188  [(1047593954, 1.0, mentee), (1047541038, 1.0, ...  
189  [(1047549915, 0.98, mentor), (1047567699, 0.98...  
190  [(1047541032, 1.0, mentee), (10475541

In [1]:
from transformers import BertModel, BertTokenizer
import pandas as pd
import torch

#embeddings 

model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str)
df = df.sample(frac=0.1, random_state=42)
df['Profile'] = df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)

inputs = df['Profile'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=512))

inputs = inputs.tolist()
max_len = max(len(seq) for seq in inputs)
padded_inputs = [seq + [0] * (max_len - len(seq)) for seq in inputs]

input_ids = torch.tensor(padded_inputs)

with torch.no_grad():
    outputs = model(input_ids)
    embeddings = outputs.last_hidden_state

print(embeddings)


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[[-0.2471,  0.5581,  0.4856,  ..., -0.3383,  0.4712, -0.5496],
         [-0.9043,  0.1023,  0.3905,  ..., -0.2531,  0.3107,  0.1530],
         [-0.9974,  0.0594,  0.3350,  ..., -0.2259, -0.1157,  0.2946],
         ...,
         [ 0.4776,  0.3691,  0.5112,  ..., -0.4778,  0.2328, -1.2041],
         [ 0.3487,  0.4768,  0.5018,  ..., -0.5214,  0.1758, -1.1651],
         [ 0.1418,  0.6547,  0.6327,  ..., -0.4767,  0.1373, -1.2286]],

        [[-0.2218,  0.5102,  0.0278,  ..., -0.5992,  0.4269, -0.7260],
         [-0.2835,  1.0011,  0.5905,  ..., -0.7353,  0.4000,  0.2702],
         [-0.7124,  0.0316,  0.9288,  ..., -0.0165,  0.2927,  0.9949],
         ...,
         [ 0.2611,  0.2666,  0.3623,  ..., -0.6006, -0.2367, -1.1620],
         [ 0.4256,  0.1841,  0.4608,  ..., -0.6418, -0.1705, -1.2568],
         [ 0.1303,  0.5525,  0.5782,  ..., -0.9839, -0.1581, -1.4536]],

        [[-0.1543,  0.5035,  0.0646,  ..., -0.4356,  0.2956, -0.9883],
         [-0.5467,  0.1428,  0.2016,  ..., -0

In [2]:
from transformers import BertModel, BertTokenizer
import pandas as pd
import torch
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

#embeddings + svd + cosine matching 

model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str)
df['Profile'] = df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)

inputs = df['Profile'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=512))

inputs = inputs.tolist()
max_len = max(len(seq) for seq in inputs)
padded_inputs = [seq + [0] * (max_len - len(seq)) for seq in inputs]

input_ids = torch.tensor(padded_inputs)
attention_mask = inputs["attention_mask"]

with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    embeddings = outputs.last_hidden_state
    

embeddings = embeddings.mean(dim=1).numpy()

svd = TruncatedSVD(n_components=100)
X_reduced = svd.fit_transform(embeddings)

cos_sim_matrix = cosine_similarity(X_reduced)

k = 5
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X_reduced)

distances, indices = knn.kneighbors(X_reduced)

results = []
for i, profile in enumerate(df['Profile']):
    nearest_neighbors = [(df['Id'][indices[i][j]], round(1 - distances[i][j], 2), df['Relationship Role'][indices[i][j]]) for j in range(1, k)]
    result = {
        'Id': df['Id'][i],
        'Profile': profile,
        'Relationship Role': df['Relationship Role'][i],
        'Nearest Neighbors': nearest_neighbors
    }
    results.append(result)

results_df = pd.DataFrame(results)

print(results_df)

#results_df.to_csv("results_rec.csv", index=False)


TypeError: list indices must be integers or slices, not str