In [1]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
from sklearn.neighbors import KNeighborsRegressor
from scipy.spatial.distance import cosine
from sklearn.metrics import mean_squared_error

In [3]:
df_train = pd.read_csv("../../data/ml-100k/ub.base", sep="\t", 
                 header=None, names=['user id', 'item id', 'rating', 'timestamp'])
df_test = pd.read_csv("../../data/ml-100k/ub.test", sep="\t", 
                 header=None, names=['user id', 'item id', 'rating', 'timestamp'])

In [4]:
user_movies_train = (
    pd.pivot(data=df_train, index='user id', columns='item id', values='rating')
)

# I subtract each user's average rating to magnify individual preferences
# Replace NaN with 0.0, as this is now the "neutral" value

train_matrix = user_movies_train.sub(user_movies_train.mean(axis=1), axis=0).fillna(0.0)

In [5]:
user_movies_test = (
    pd.pivot(data=df_test, index='user id', columns='item id', values='rating')
)

# I subtract each user's average rating to magnify individual preferences
# Replace NaN with 0.0, as this is now the "neutral" value

test_matrix = user_movies_test.sub(user_movies_test.mean(axis=1), axis=0).fillna(0.0)

In [22]:
item_models = {}

for item in tqdm(df_train['item id'].unique()):
    ids_ratings = df_train[df_train['item id'] == item][['user id', 'rating']]
    X = train_matrix.loc[ids_ratings['user id'], :]
    y = ids_ratings['rating']
    
    if y.shape[0] >= 5:
        model = KNeighborsRegressor(n_neighbors=min(30, y.shape[0]),
                                    weights='distance', metric=cosine, n_jobs=-1)
        fitted_model = model.fit(X, y)

        item_models[item] = fitted_model

  0%|          | 0/1675 [00:00<?, ?it/s]

In [23]:
average_rating = df_train['rating'].mean()
average_rating

3.5236612564866956

In [28]:
item_ratings_predictions = []

for item in tqdm(df_test['item id'].unique()):    
    ratings_movie = user_movies_test[item][user_movies_test[item].notna()]
    
    model = item_models.get(item, None)
    if model is None:
        predictions = [average_rating * len(ratings_movie.index)]
    else:
        predictions = model.predict(train_matrix.loc[ratings_movie.index, :])
    
    item_ratings_predictions += [list(zip(ratings_movie, predictions))]

  0%|          | 0/1145 [00:00<?, ?it/s]

In [93]:
rts_preds = np.array([(k,v) for lst in item_ratings_predictions for k,v in lst])
results = pd.DataFrame(rts_preds, columns=['rating', 'preds'])
results['preds'] = results['preds'].clip(upper=5)
mean_squared_error(results['rating'], results['preds'], squared=False)

1.0336275283845844

## Node 2 Vec similarity

In [39]:
import networkx as nx
from node2vec import Node2Vec

In [70]:
df_train['source'] = df_train['user id']
df_train['target'] = df_train['item id'] + df_train['user id'].max()

assert (
    df_train['source'].isin(df_train['target']).sum() ==
    df_train['target'].isin(df_train['source']).sum() == 
    0
)

In [71]:
G = nx.from_pandas_edgelist(df_train, source='source', target='target', edge_attr='rating')

In [97]:
node2vec = Node2Vec(
    G,
    weight_key='rating',
    workers=8
)

Computing transition probabilities:   0%|          | 0/2618 [00:00<?, ?it/s]

Generating walks (CPU: 1): 100%|██████████| 2/2 [00:07<00:00,  3.82s/it]
Generating walks (CPU: 2): 100%|██████████| 2/2 [00:07<00:00,  3.85s/it]
Generating walks (CPU: 3): 100%|██████████| 1/1 [00:03<00:00,  3.91s/it]
Generating walks (CPU: 4): 100%|██████████| 1/1 [00:03<00:00,  3.75s/it]
Generating walks (CPU: 5): 100%|██████████| 1/1 [00:03<00:00,  3.76s/it]
Generating walks (CPU: 6): 100%|██████████| 1/1 [00:03<00:00,  3.80s/it]
Generating walks (CPU: 7): 100%|██████████| 1/1 [00:03<00:00,  3.72s/it]
Generating walks (CPU: 8): 100%|██████████| 1/1 [00:03<00:00,  3.31s/it]


In [99]:
# Embed nodes
model = node2vec.fit(window=10, min_count=1, batch_words=4)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)

In [100]:
embeds = model.wv.get_normed_vectors()

In [101]:
df_train['source'].nunique() + df_train['target'].nunique()

2618

In [102]:
len(embeds)

2618

In [103]:
user_user_embeds = pd.DataFrame(embeds[:943], index=range(1, 944))

In [104]:
user_user_embeds.shape

(943, 128)

In [105]:
item_models_node2vec = {}

for item in tqdm(df_train['item id'].unique()):
    ids_ratings = df_train[df_train['item id'] == item][['user id', 'rating']]
    X = user_user_embeds.loc[ids_ratings['user id'], :]
    y = ids_ratings['rating']
    
    if y.shape[0] >= 5:
        model = KNeighborsRegressor(n_neighbors=min(30, y.shape[0]),
                                    weights='distance', metric=cosine, n_jobs=-1)
        fitted_model = model.fit(X, y)

        item_models_node2vec[item] = fitted_model

  0%|          | 0/1675 [00:00<?, ?it/s]

In [106]:
item_ratings_predictions_node2vec = []

for item in tqdm(df_test['item id'].unique()):    
    ratings_movie = user_movies_test[item][user_movies_test[item].notna()]
    
    model = item_models_node2vec.get(item, None)
    if model is None:
        predictions = [average_rating * len(ratings_movie.index)]
    else:
        predictions = model.predict(user_user_embeds.loc[ratings_movie.index, :])
    
    item_ratings_predictions_node2vec += [list(zip(ratings_movie, predictions))]

  0%|          | 0/1145 [00:00<?, ?it/s]

In [107]:
rts_preds_n2v = np.array([(k,v) for lst in item_ratings_predictions_node2vec for k,v in lst])
results_n2v = pd.DataFrame(rts_preds_n2v, columns=['rating', 'preds'])
results_n2v['preds'] = results_n2v['preds'].clip(upper=5)
mean_squared_error(results_n2v['rating'], results_n2v['preds'], squared=False)

1.0584383418668615