In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from nltk.tokenize import word_tokenize
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
import spacy
from gensim.utils import simple_preprocess

# Tokenize with NLTK, then vectorize and learn embeddings using Word2Vec

[nltk_data] Downloading package stopwords to
[nltk_data]     /zfs/users/asda2/asda2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /zfs/users/asda2/asda2/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:

# Location of the cleaned profile export, relative to this notebook.
profiles_csv_path = "../../../clean_data/profiles.csv"

# Decode as UTF-8 and silently drop any bytes that are not valid UTF-8,
# then let pandas parse the already-decoded text stream.
with open(profiles_csv_path, encoding='utf-8', errors='ignore') as f:
    df = pd.read_csv(f)


In [3]:


#tokenization
def tokenize_text(texts):
    """Lower-case every text and split it into tokens with ``word_tokenize``.

    Args:
        texts: Iterable of strings.

    Returns:
        A list with one token list per input text, in input order.
    """
    tokenized = []
    for raw_text in texts:
        lowered = raw_text.lower()
        tokenized.append(word_tokenize(lowered))
    return tokenized

#stopwords (words like: the, is, in, it, they)
def remove_stopwords(texts, stopwords):
    """Drop every token that appears in ``stopwords``.

    Args:
        texts: Iterable of token sequences.
        stopwords: Container supporting ``in`` (e.g. a set of words to remove).
            Note: inside this function the parameter shadows the
            ``nltk.corpus.stopwords`` module imported at the top of the notebook.

    Returns:
        A list of token lists, one per input sequence, with the stop words
        removed and the remaining tokens in their original order.
    """
    filtered_texts = []
    for tokens in texts:
        kept = []
        for token in tokens:
            if token in stopwords:
                continue
            kept.append(token)
        filtered_texts.append(kept)
    return filtered_texts

# Lemmatization (turn words into their base form, e.g. eating -> eat)
def lemmatization(texts, allowed_postags=["NOUN", "ADJ", "VERB", "ADV"]):
    """Lemmatize token lists, keeping only tokens with an allowed POS tag.

    Each token list is re-joined with single spaces and run through the
    spaCy ``en_core_web_sm`` pipeline (parser and NER disabled).

    Args:
        texts: Iterable of token sequences (each an iterable of strings).
        allowed_postags: Coarse POS tags (``token.pos_``) whose lemmas are kept.

    Returns:
        A list with one list of lemmas per input token sequence.
    """
    # The pipeline is loaded on every call.
    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

    def _kept_lemmas(tokens):
        # spaCy works on plain strings, so rebuild the text from its tokens.
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_kept_lemmas(tokens) for tokens in texts]


# Preprocess the text data with the functions defined above
def preprocess_text(data):
    """Run the full text-cleaning pipeline over ``data``.

    Steps, in order: tokenize (lower-cased), remove English stop words,
    then lemmatize with the default POS filter of ``lemmatization``.

    Args:
        data: Iterable of strings.

    Returns:
        A list of lemma lists, one per input text.
    """
    tokens_per_text = tokenize_text(data)

    # A set gives constant-time membership checks during filtering.
    english_stop_words = set(stopwords.words('english'))
    content_tokens = remove_stopwords(tokens_per_text, english_stop_words)

    return lemmatization(content_tokens)

# Iterating over a DataFrame yields its column labels, not its rows, so
# passing `df` straight to preprocess_text() would only preprocess the header
# names. Build one text per row instead: all columns except the identifier,
# missing values as empty strings, joined with single spaces.
row_texts = df.drop(columns=['Id']).fillna('').astype(str).agg(' '.join, axis=1)

# One list of lemmas per profile row.
processed_texts = preprocess_text(row_texts)



In [4]:
# Treat every field as text; missing values become empty strings so they
# contribute nothing to the joined profile.
df = df.fillna('').astype(str)

# Concatenate all descriptive columns into a single free-text profile.
# 'Id' is an identifier, not content. 'Profile' and 'Vector' are derived
# columns created by this notebook; excluding them (errors='ignore' makes this
# a no-op on a fresh run) keeps the cell idempotent when it is re-executed.
df['Profile'] = (
    df.drop(columns=['Id', 'Profile', 'Vector'], errors='ignore')
      .agg(' '.join, axis=1)
)

In [5]:
# nltk.download('punkt')
# df['Tokens'] = df['Profile'].apply(word_tokenize)

In [6]:
sentences = df['Profile'].tolist()
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)


In [8]:
def average_word2vec(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

def apply_average_word2vec(row):
    return average_word2vec(row['Profile'], word2vec_model)

df['Vector'] = df.apply(apply_average_word2vec, axis=1)


In [9]:
# Stack the per-profile vectors into one matrix with a row per profile.
X = np.vstack(df['Vector'].to_numpy())

# Full pairwise cosine-similarity matrix between all profiles.
cos_sim_matrix = cosine_similarity(X)

# Number of neighbours requested per profile. The query set is the training
# set itself, so each profile's own row is normally among its neighbours.
k = 10
knn = NearestNeighbors(n_neighbors=k, metric='cosine').fit(X)

# distances[i][j] is the cosine distance from profile i to its j-th neighbour;
# indices[i][j] is that neighbour's row position in X.
distances, indices = knn.kneighbors(X)


In [10]:
# `indices` holds row *positions* in X, so resolve them against plain arrays
# rather than label-based Series lookups (df['Id'][n] looks up the index label
# n, which only coincides with position n for a default RangeIndex).
profile_ids = df['Id'].to_numpy()
profile_roles = df['Relationship Role'].to_numpy()

results = []
for i, profile in enumerate(df['Profile']):
    # Exclude the query profile by position instead of assuming it is always
    # the first hit: with duplicate (or all-zero) vectors another row can tie
    # at distance 0 and be returned ahead of it. Keep at most k - 1 neighbours,
    # matching the original output size. Similarity = 1 - cosine distance.
    nearest = [
        (profile_ids[neighbor_pos], round(1 - neighbor_dist, 3))
        for neighbor_pos, neighbor_dist in zip(indices[i], distances[i])
        if neighbor_pos != i
    ][:k - 1]

    result = {
        'Id': profile_ids[i],
        'Profile': profile,
        'Relationship Role': profile_roles[i],
        'Nearest Neighbors': nearest
    }
    results.append(result)

In [11]:
# Turn the list of per-profile dicts into a DataFrame (one row per profile).
# NOTE(review): this rebinds `results` from a list to a DataFrame.
results = pd.DataFrame(results)

# Bare expression: rendered as the cell's output.
results

# Optional export of the recommendations, currently disabled.
#results.to_csv("results_rec.csv")

Unnamed: 0,Id,Profile,Relationship Role,Nearest Neighbors
0,1047644182,5/7/2024 9:22 mentor 0 0 I'm in my 3rd year ...,mentor,"[(1047512490, 0.999), (1047517758, 0.999), (10..."
1,1047643231,4/15/2024 10:08 mentee 0 0 South Kamloops S...,mentee,"[(1047538891, 0.998), (1047584028, 0.998), (10..."
2,1047643230,4/15/2024 10:08 mentee 0 0 Name: Mentee Respon...,mentee,"[(1047582592, 0.995), (1047551508, 0.994), (10..."
3,1047643228,4/15/2024 10:04 mentor 1 2 Name: Meet your men...,mentor,"[(1047550122, 0.998), (1047583343, 0.998), (10..."
4,1047641732,3/15/2024 9:58 mentee 0 6 Name: Adulting Lesso...,mentee,"[(1047630611, 0.997), (1047627554, 0.997), (10..."
...,...,...,...,...
1925,9950,12/9/2016 2:56 mentee 0 1 Marketing and ...,mentee,"[(1047499328, 0.952), (1047499343, 0.951), (10..."
1926,9941,4/2/2013 13:06 mentor 2 30 Name: Ways of Know...,mentor,"[(1047551102, 0.996), (1047514780, 0.996), (10..."
1927,9927,12/6/2016 13:12 mentor 0 0 Rural Medicin...,mentor,"[(10160, 0.994), (10155, 0.994), (13683, 0.991..."
1928,9923,"12/6/2016 11:38 mentee 0 1 Nursing ,...",mentee,"[(10289, 0.995), (10324, 0.993), (10322, 0.991..."


In [12]:

# Keep the first result row (as a one-row DataFrame) for ad-hoc inspection.
match1 = results[:1]
# NOTE(review): not the last expression of the cell, so presumably nothing is rendered for this line.
results[:1]

# Last expression: show the neighbour list of every profile.
results['Nearest Neighbors']

0       [(1047512490, 0.999), (1047517758, 0.999), (10...
1       [(1047538891, 0.998), (1047584028, 0.998), (10...
2       [(1047582592, 0.995), (1047551508, 0.994), (10...
3       [(1047550122, 0.998), (1047583343, 0.998), (10...
4       [(1047630611, 0.997), (1047627554, 0.997), (10...
                              ...                        
1925    [(1047499328, 0.952), (1047499343, 0.951), (10...
1926    [(1047551102, 0.996), (1047514780, 0.996), (10...
1927    [(10160, 0.994), (10155, 0.994), (13683, 0.991...
1928    [(10289, 0.995), (10324, 0.993), (10322, 0.991...
1929    [(12537, 0.98), (15680, 0.976), (15665, 0.974)...
Name: Nearest Neighbors, Length: 1930, dtype: object

In [13]:
# Look up the full profile row for one recommended neighbour Id.
# 'Id' is cast to int for the comparison.
id_matches = df['Id'].astype(int).eq(1047512490)
df.loc[id_matches]
# df[df['Id']=='1047644182']


Unnamed: 0,Id,Created at,Relationship Role,Total Mentees,Number of Messages Sent,Resource Clicks,Courses Clicks,Mentor school,Mentee school,[mentor and mentee] Career Interests,...,[mentor] Your Characteristics,[mentee] Characteristics you are looking for in a mentor,[mentor] What help can you provide to your mentee?,[mentee] What do you hope to learn?,[mentor and mentee] Anything else?,[mentee] Message frequency,[mentee] Barriers to communication,Admin Comments,Profile,Vector
1400,1047512490,9/10/2020 17:51,mentor,0,0,,,Simon Fraser University,,"At the moment, I�m particularly interested in ...",...,no preference,1,,"I�m hoping to gain experience as a mentor, bui...",,,,,9/10/2020 17:51 mentor 0 0 Simon Fraser Univ...,"[0.85075796, 0.34090993, 0.35404456, -0.236566..."
