In [8]:
from transformers import BertModel, BertTokenizer
import pandas as pd
import torch

#embeddings 

model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str)
df['Profile'] = df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)

inputs = df['Profile'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=512))

inputs = inputs.tolist()
max_len = max(len(seq) for seq in inputs)
padded_inputs = [seq + [0] * (max_len - len(seq)) for seq in inputs]

input_ids = torch.tensor(padded_inputs)

with torch.no_grad():
    outputs = model(input_ids)
    embeddings = outputs.last_hidden_state

print(embeddings)


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


tensor([[[-3.3356e-02, -5.5299e-02, -3.8083e-01,  ...,  3.2576e-01,
           6.1554e-01, -1.1003e-01],
         [-4.9277e-02, -2.4514e-01, -2.5094e-01,  ...,  5.4711e-01,
           7.8107e-01, -4.8744e-02],
         [ 2.6528e-01, -3.5830e-01,  1.9189e+00,  ..., -8.6398e-02,
           2.6493e-01, -4.8579e-01],
         ...,
         [ 2.8758e-01,  4.9665e-01,  1.7169e-01,  ..., -3.0395e-01,
          -6.1941e-01, -1.8973e-01],
         [-1.7996e-01, -1.7997e-01, -1.6888e-01,  ...,  5.8168e-01,
           4.1518e-02, -8.2469e-01],
         [-1.2731e-01,  1.7953e-01,  1.0966e-01,  ...,  9.6392e-01,
          -2.0503e-01, -4.1640e-01]],

        [[-3.5382e-01,  5.7421e-01, -4.6681e-01,  ..., -5.4893e-01,
           5.6156e-01, -7.3049e-01],
         [-1.2480e-01,  5.9617e-01,  2.8719e-01,  ..., -9.1282e-01,
           4.0866e-01,  3.9394e-01],
         [-5.5533e-01,  2.3028e-01,  4.4106e-01,  ..., -7.2912e-01,
          -2.5920e-01,  1.1557e-01],
         ...,
         [ 2.6458e-01,  2

In [1]:
from transformers import BertModel, BertTokenizer
import pandas as pd
import torch
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

#embeddings + svd + cosine matching 

model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.astype(str)
df['Profile'] = df.drop(columns=['Id', 'Created at', 'Relationship Role', 'Total Mentees', 'Number of Messages Sent', 'Resource Clicks', 'Courses Clicks']).agg(' '.join, axis=1)

inputs = df['Profile'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, truncation=True, max_length=512))

inputs = inputs.tolist()
max_len = max(len(seq) for seq in inputs)
padded_inputs = [seq + [0] * (max_len - len(seq)) for seq in inputs]

input_ids = torch.tensor(padded_inputs)
attention_mask = inputs["attention_mask"]

with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)
    embeddings = outputs.last_hidden_state
    

embeddings = embeddings.mean(dim=1).numpy()

svd = TruncatedSVD(n_components=100)
X_reduced = svd.fit_transform(embeddings)

cos_sim_matrix = cosine_similarity(X_reduced)

k = 5
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X_reduced)

distances, indices = knn.kneighbors(X_reduced)

results = []
for i, profile in enumerate(df['Profile']):
    nearest_neighbors = [(df['Id'][indices[i][j]], round(1 - distances[i][j], 2), df['Relationship Role'][indices[i][j]]) for j in range(1, k)]
    result = {
        'Id': df['Id'][i],
        'Profile': profile,
        'Relationship Role': df['Relationship Role'][i],
        'Nearest Neighbors': nearest_neighbors
    }
    results.append(result)

results_df = pd.DataFrame(results)

print(results_df)

#results_df.to_csv("results_rec.csv", index=False)


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


              Id                                            Profile  \
0     1047644182  I'm in my 3rd year in Cinema Studies at UBC. I...   
1     1047643231  nan South Kamloops SS Surgeon is my main goal ...   
2     1047643230  nan South Kamloops SS dermatologist, zoologist...   
3     1047643228  I am third year Bachelor of Science, biology m...   
4     1047641732  nan South Kamloops SS Photographer, Teacher, P...   
...          ...                                                ...   
1925        9950  nan nan nan nan Marketing and Communications n...   
1926        9941  nan nan nan nan nan Family, working out, readi...   
1927        9927  nan nan nan nan Rural Medicine nan nan nan nan...   
1928        9923  nan nan nan nan Nursing nan nan nan nan , 1 , ...   
1929        9664  , 2 nan nan nan Indigenous Education nan nan n...   

     Relationship Role                                  Nearest Neighbors  
0               mentor  [(1047637319, 0.97, mentor), (1047551740, 0.97.