# Node2vec embeddings for recommendation systems

In [23]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
import math
import networkx as nx
from node2vec import Node2Vec
from sklearn.model_selection import train_test_split

from data.data_helper_functions import *

In [24]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Load the Data

In [25]:
# Directory handed to the project's load_data helper, relative to this notebook.
data_path = '../data/'

# The helper's result is unpacked in (books, users, ratings) order.
loaded_tables = load_data(data_path)
books_df, users_df, ratings_df = loaded_tables

# Graph over *all* ratings, kept for reference only (the notebook uses a train-only graph):
# G = load_bipartite_graph(books_df, users_df, ratings_df)

Train test split evenly for each user

In [5]:
# A plain row-wise split, kept for reference only:
# train_ratings_df, test_ratings_df = train_test_split(ratings_df, test_size=0.2, random_state=0)

# Hold-out settings forwarded to the project's per-user splitting helper.
split_options = {"test_size": 0.2, "random_state": 0}
train_df, test_df = split_ratings_by_user(ratings_df, **split_options)

Load bipartite graph containing ratings from train_set

In [6]:
# Build the graph from train_df (not ratings_df), so the held-out test ratings
# are not handed to the graph builder.
G_train = load_new_bipartite_graph(books_df, users_df, train_df)

Create Node Embeddings using Node2Vec

In [8]:
# Walk-generation settings. `dimensions` must stay in sync with
# `node_embedding_dim` used when the neural network is defined further down.
walk_settings = dict(
    dimensions=128,
    walk_length=8,
    num_walks=250,
    weight_key='sigmoid_weight',  # edge attribute used as the walk weight
    workers=12,
)
node2vec = Node2Vec(G_train, **walk_settings)

# Fit the embedding model on the generated walks.
n2v_model = node2vec.fit(
    window=10,
    min_count=1,
    batch_words=4,
)

Computing transition probabilities: 100%|██████████| 49686/49686 [05:29<00:00, 150.96it/s] 


Store or Read node embeddings from Pickle

In [9]:
# Collect the learned vector of every graph node. The fitted model is indexed
# by the string form of the node, while the dictionary keeps the node itself as key.
embedding_lookup = n2v_model.wv
node_embeddings = {}
for graph_node in G_train.nodes():
    node_embeddings[graph_node] = embedding_lookup[str(graph_node)]

# Save the embeddings dictionary to a file
# with open('node_embeddings_128_250.pkl', 'wb') as f:
#     pickle.dump(node_embeddings, f)

In [7]:
# `pickle` is not imported in the notebook's import cell; import it here so this
# cell also runs on a fresh kernel instead of relying on a name leaked by a
# star-import.
import pickle

# NOTE: unpickling can execute arbitrary code -- only load embedding files that
# were written by this notebook (see the commented-out dump in the previous cell).
# This replaces any `node_embeddings` computed earlier in the session.
with open('node_embeddings_128_250.pkl', 'rb') as f:
    node_embeddings = pickle.load(f)

In [8]:
# Separate user and book embeddings in a single pass over the graph nodes.
# Nodes prefixed with 'u-' are users and 'b-' are books; the two-character
# prefix is stripped so the new dictionaries are keyed by the bare id string.
user_embeddings = {}
book_embeddings = {}
for graph_node in G_train.nodes():
    if graph_node.startswith('u-'):
        user_embeddings[graph_node[2:]] = node_embeddings[graph_node]
    elif graph_node.startswith('b-'):
        book_embeddings[graph_node[2:]] = node_embeddings[graph_node]

## Using node embeddings to make recommendations

Feed the node embeddings to a neural network

In [9]:
from keras.models import Sequential
from keras.layers import Dense
from keras.metrics import RootMeanSquaredError, MeanAbsoluteError, MeanSquaredError
from keras.regularizers import l2

In [None]:
node_embedding_dim = 128

# Small regression network. The input is a user vector concatenated with a
# book vector, hence 2 * node_embedding_dim features; the single linear output
# is the predicted rating.
model = Sequential([
    Dense(32, activation='relu', input_dim=2*node_embedding_dim, kernel_regularizer=l2(0.001)),
    Dense(8, activation='relu', kernel_regularizer=l2(0.001)),
    Dense(1, activation='linear'),
])

In [11]:
# Prepare the Training data
# Every rating row becomes one sample: the user's embedding followed by the
# book's embedding as features, and the observed rating as target.
train_samples = [
    (
        np.concatenate((
            user_embeddings[str(rating_row['user_id'])],
            book_embeddings[str(rating_row['book_id'])],
        )),
        rating_row['rating'],
    )
    for _, rating_row in train_df.iterrows()
]

X_train = np.array([features for features, _ in train_samples])
y_train = np.array([target for _, target in train_samples])

In [12]:
# Prepare the Test data
# Same layout as the training samples: user embedding followed by book
# embedding as features, observed rating as target.
test_samples = [
    (
        np.concatenate((
            user_embeddings[str(rating_row['user_id'])],
            book_embeddings[str(rating_row['book_id'])],
        )),
        rating_row['rating'],
    )
    for _, rating_row in test_df.iterrows()
]

X_test = np.array([features for features, _ in test_samples])
y_test = np.array([target for _, target in test_samples])

In [16]:
# Create validation set from training data
# NOTE: this overwrites X_train / y_train with the 80% remainder of the split,
# so the cell is not idempotent -- running it a second time splits the already
# reduced arrays again. Re-run the training-data preparation cell before
# re-running this one.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

In [None]:
# Metrics tracked during training; the evaluation cell reads them back by
# position, so keep this order (MSE, RMSE, MAE).
tracked_metrics = [MeanSquaredError(), RootMeanSquaredError(), MeanAbsoluteError()]
model.compile(optimizer='adam', loss='mean_squared_error', metrics=tracked_metrics)

history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_val, y_val),
)

In [None]:
# Evaluate the model
results = model.evaluate(X_test, y_test, verbose=1)

# results[0] is not used; positions 1-3 are read as MSE, RMSE and MAE, matching
# the order of the metrics passed to model.compile.
mse, rmse, mae = results[1], results[2], results[3]

for label, value in (
    ("Mean squared error: ", mse),
    ("Root mean squared error: ", rmse),
    ("Mean absolute error: ", mae),
):
    print(label, value)

## Evaluate the model using precision and recall at K

In [19]:
from tqdm import tqdm
import random

# Evaluate on a reproducible random subset of users instead of on all of them.
N_SAMPLE_USERS = 500

user_ids = ratings_df['user_id'].unique()
random.seed(42)
# random.sample raises ValueError when asked for more items than the population
# holds, so cap the request at the number of distinct users.
sample_user_ids = random.sample(list(user_ids), min(N_SAMPLE_USERS, len(user_ids)))

# Number of recommendations requested per user (the K of precision/recall at K).
k = 200

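Too slow: every sampled user is scored against every book they have not rated yet, so the next cell takes a long time to run.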

In [None]:
def get_top_recommendation_for_user(user_id, all_books, model, k=10, batch_size=1024):
    """Return the ``k`` books with the highest predicted rating for one user.

    Candidates are the books in ``all_books`` that the user has not rated in
    the module-level ``train_df``. All candidates are scored with a single
    batched ``model.predict`` call instead of one call per book, which removes
    the per-call overhead that dominated the run time of this step.

    Args:
        user_id: User identifier as stored in ``train_df['user_id']``; its
            string form must be a key of the module-level ``user_embeddings``.
        all_books: Set of candidate book ids; the string form of every
            candidate must be a key of the module-level ``book_embeddings``.
        model: Object exposing a Keras-style
            ``predict(x, batch_size=..., verbose=...)`` method that returns one
            row of scores per input row.
        k: Maximum number of book ids to return.
        batch_size: Number of rows per prediction batch, forwarded to
            ``model.predict``.

    Returns:
        List of at most ``k`` book ids sorted by predicted rating, highest
        first. Empty when the user has already rated every book in
        ``all_books``.

    Raises:
        KeyError: If the user or one of the candidate books has no embedding.
    """
    user_rated_books = set(train_df.loc[train_df['user_id'] == user_id, 'book_id'])
    books_not_yet_rated = list(all_books - user_rated_books)
    if not books_not_yet_rated:
        # Nothing to score; also avoids stacking an empty list below.
        return []

    # The user vector is identical for every candidate, so look it up once.
    user_embedding = user_embeddings[str(user_id)]
    feature_matrix = np.stack([
        np.concatenate((user_embedding, book_embeddings[str(book_id)]))
        for book_id in books_not_yet_rated
    ])

    predictions = np.asarray(
        model.predict(feature_matrix, batch_size=batch_size, verbose=0)
    )
    # Take the first output of every row, as the per-book version did with
    # `.flatten()[0]`.
    scores = predictions.reshape(len(books_not_yet_rated), -1)[:, 0]

    # sorted() is stable, so books with equal scores keep their candidate order.
    ranked = sorted(zip(books_not_yet_rated, scores), key=lambda pair: pair[1], reverse=True)
    return [book_id for book_id, _ in ranked[:k]]

# Candidate pool: every book id that occurs in at least one rating.
all_book_ids = set(ratings_df['book_id'].unique())

# Fill the mapping user id -> top-k book ids one user at a time, so results
# gathered so far remain available if the long-running loop is interrupted.
recommendations = {}
progress = tqdm(sample_user_ids, desc="Getting recommendations for users")
for user_id in progress:
    recommendations[user_id] = get_top_recommendation_for_user(
        user_id, all_book_ids, model, k=k
    )