In [6]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
with open("../../clean_data/profiles.csv", encoding='utf-8', errors='ignore') as f:
    df = pd.read_csv(f)

df = df.fillna('').astype(str)


df['Tokens'] = df.apply(word_tokenize)

sentences = df['Tokens']
word2vec_model = Word2Vec(sentences, vector_size=200, window=5, min_count=1, workers=4)

def average_word2vec(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

def apply_average_word2vec(row):
    return average_word2vec(row['Profile'], word2vec_model)

df['Vector'] = df.apply(apply_average_word2vec, axis=1)

X = np.vstack(df['Vector'].values)


svd = TruncatedSVD(n_components=100) 
X_reduced = svd.fit_transform(X)

cos_sim_matrix = cosine_similarity(X_reduced)

# KNN with cosine similarity
k = 5
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X_reduced)

distances, indices = knn.kneighbors(X_reduced)

results = []
for i, profile in enumerate(df['Profile']):
    result = {
        'Id': df['Id'][i],
        'Profile': profile,
        'Relationship Role': df['Relationship Role'][i],
        'Nearest Neighbors': [(df['Id'][indices[i][j]], round(1 - distances[i][j], 2)) for j in range(1, k)]
    }
    results.append(result)

results_df = pd.DataFrame(results)
print(results_df.to_string(index=False))


[nltk_data] Downloading package punkt to
[nltk_data]     /zfs/users/asda2/asda2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


TypeError: expected string or bytes-like object

In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

df = pd.read_csv("../../../clean_data/profiles.csv", encoding='utf-8')
df = df.fillna('').astype(str)

df['Profile'] = df.drop(columns=['Id', 'Created at', 'Relationship Role']).agg(' '.join, axis=1)
df['Tokens'] = df['Profile'].apply(word_tokenize)

sentences = df['Tokens'].tolist()
word2vec_model = Word2Vec(sentences, vector_size=200, window=5, min_count=1, workers=4)

def average_word2vec(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

df['Vector'] = df['Tokens'].apply(lambda tokens: average_word2vec(tokens, word2vec_model))

X = np.vstack(df['Vector'].values)

svd = TruncatedSVD(n_components=100)
X_reduced = svd.fit_transform(X)

cos_sim_matrix = cosine_similarity(X_reduced)

k = 5
knn = NearestNeighbors(n_neighbors=k, metric='cosine')
knn.fit(X_reduced)

distances, indices = knn.kneighbors(X_reduced)

results = []
for i, profile in enumerate(df['Profile']):
    nearest_neighbors = [(df['Id'][indices[i][j]], round(1 - distances[i][j], 2), df['Relationship Role'][indices[i][j]]) for j in range(1, k)]
    result = {
        'Id': df['Id'][i],
        'Profile': profile,
        'Relationship Role': df['Relationship Role'][i],
        'Nearest Neighbors': nearest_neighbors
    }
    results.append(result)

results_df = pd.DataFrame(results)
#print(results_df.to_string(index=False))


[nltk_data] Downloading package punkt to
[nltk_data]     /zfs/users/asda2/asda2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
results_df

#results_df.to_csv("results_rec2.csv")