In [1]:
%load_ext autoreload
%autoreload 2

# Introduction 

These changes are implemented in the new architecture: GNNRecommenderwithSkipConnections. To summarise, there are 3 changes from the BaseGNNRecomender:

1. Added skip connections
2. Added batch norm layers
3. Simplified MLP predictor to single feedforward layer

# Imports

In [2]:
import pandas as pd
import numpy as np
import torch

from utils.setup_nodes import create_edge_lists
from utils.graph_helpers import train_model, plot_train_val_loss, final_evaluation, make_df
from utils.graph_model import GNNRecommenderwithSkipConnections
from utils.general import seed_everything
from utils.predictions import recommend_products, pretty_print_recomendations, get_top_k_preds

# Set up

In [3]:
seed_everything()

In [4]:
user_split = "train_test_valid"
prod_embed_name = "meta_features_512"
user_embed_name = "user_reviews_features_512"

data_dir = "data"
product_dir = "full_data"
embedding_dir = "embedding"

device = "cuda"

product_cols = ["parent_asin", "average_rating", "rating_number"]
user_cols = ["user_id", "rating_mean", "rating_count", "helpful_vote_mean", "helpful_vote_gte_1", "verified_purchase_mean", "last_active_in_days_min",
            "last_active_in_days_max", "word_count_mean"]
edge_cols = ["user_id", "parent_asin", "rating"]

fill_users = "zero"

# Load data

In [5]:
product_df = pd.read_parquet(f"{data_dir}/{product_dir}/product_df.parquet", columns = product_cols)
train_user_df = pd.read_parquet(f"{data_dir}/{user_split}_split/train_agg.parquet", columns = user_cols)
train_user_edges = pd.read_parquet(f"{data_dir}/{user_split}_split/train.parquet", columns = edge_cols)

if user_split == "train_test_valid":
    test_user_df = pd.read_parquet(f"{data_dir}/{user_split}_split/valid_agg.parquet", columns = user_cols)
    test_user_edges = pd.read_parquet(f"{data_dir}/{user_split}_split/valid.parquet", columns = edge_cols)
else:
    test_user_df = pd.read_parquet(f"{data_dir}/{user_split}_split/test_agg.parquet", columns = user_cols)
    test_user_edges = pd.read_parquet(f"{data_dir}/{user_split}_split/test.parquet", columns = edge_cols)


In [6]:
product_embedding = torch.load(f"{data_dir}/{embedding_dir}/product/{prod_embed_name}.pt")
train_user_embedding = torch.load(f"{data_dir}/{embedding_dir}/{user_split}_split/train_{user_embed_name}.pt")
if user_split == "train_test_valid":
    test_user_embedding = torch.load(f"{data_dir}/{embedding_dir}/{user_split}_split/valid_{user_embed_name}.pt")
else:
    test_user_embedding = torch.load(f"{data_dir}/{embedding_dir}/{user_split}_split/test_{user_embed_name}.pt")

# Make nodes

In [7]:
# Add the embedding 
train_user_df["embedding"] = list(train_user_embedding.numpy())
test_user_df["embedding"] = list(test_user_embedding.numpy())

# Concat user nodes 
additional_test_users = test_user_df[~test_user_df.user_id.isin(train_user_df.user_id)].copy()

## these are users that need to be zero-ed out 
additional_test_users = test_user_df[~test_user_df.user_id.isin(train_user_df.user_id)].copy()
additional_test_users["embedding"] = list(torch.zeros((len(additional_test_users), test_user_embedding.shape[1])).numpy())
if fill_users == "zero":
    additional_test_users.iloc[:, 1:-1] = 0
elif fill_users == "mean":
    additional_test_users.iloc[:, 1:-1] = train_user_df.iloc[:, 1:].mean()

# Make super user df
user_df = pd.concat([train_user_df, additional_test_users])

In [8]:
# Set up id mapping
offset = user_df.user_id.nunique()
user_id_to_idx = {unique_id : idx for idx, unique_id in enumerate(user_df.user_id.unique())}
prod_id_to_idx = {unique_id : offset + idx for idx, unique_id in enumerate(product_df.parent_asin.unique())}

# Add to df
product_df["prod_idx"] = product_df.parent_asin.apply(lambda x: prod_id_to_idx[x])
train_user_edges["user_idx"] = train_user_edges.user_id.apply(lambda x: user_id_to_idx[x])
test_user_edges["user_idx"] = test_user_edges.user_id.apply(lambda x: user_id_to_idx[x])
train_user_edges["prod_idx"] = train_user_edges.parent_asin.apply(lambda x: prod_id_to_idx[x])
test_user_edges["prod_idx"] = test_user_edges.parent_asin.apply(lambda x: prod_id_to_idx[x])

In [9]:
# Concat product nodes 
product_nodes = torch.cat([torch.tensor(product_df.drop(["parent_asin", "prod_idx"], axis = 1).to_numpy()), product_embedding], dim = 1)

In [10]:
# concat user nodes 
user_embed = torch.tensor(np.vstack(user_df["embedding"].values))
user_info = torch.tensor(user_df.drop(["user_id", "embedding"], axis = 1).to_numpy())
user_nodes = torch.cat([user_info, user_embed], dim = 1)

In [11]:
# Create edge list
train_edge_index, train_edge_weights = create_edge_lists(train_user_edges)
test_edge_index, test_edge_weights = create_edge_lists(train_user_edges)

# Move to GPU 

In [12]:
product_nodes = product_nodes.type(torch.float).to(device)
user_nodes = user_nodes.type(torch.float).to(device)
train_edge_index = train_edge_index.to(device)
train_edge_weights = train_edge_weights.to(device)
test_edge_index = test_edge_index.to(device)
test_edge_weights = test_edge_weights.to(device)

# Model

In [13]:
# Set up model features (fixed)
num_users = len(user_df)
num_products = len(product_df)
user_feature_dim = user_nodes.shape[1]
product_feature_dim = product_nodes.shape[1]

num_epochs = 1000

In [14]:
# Config ls 

config_ls = []

embedding_dim_ls = [32, 64, 128, 256]
learning_rate_ls = [0.01, 0.05, 0.1]

for embedding_dim in embedding_dim_ls:
    for learning_rate in learning_rate_ls:
        config_ls.append((embedding_dim, learning_rate))

In [15]:
train_loss_ls = []
test_loss_ls = []
full_test_loss_ls = []
best_epoch_ls = []
best_test_loss_ls = []

In [16]:
for config in config_ls:
    embedding_dim, learning_rate = config
    model = GNNRecommenderwithSkipConnections(num_users, num_products, user_feature_dim, product_feature_dim, embedding_dim)
    model.to(device=device)
    train_loss, test_loss, best_model, best_epoch = train_model(model, train_edge_index, train_edge_weights, test_edge_index, test_edge_weights,
                                                    user_nodes, product_nodes, num_epochs = num_epochs, print_progress=False, lr = learning_rate, 
                                                    give_epoch= True)
    
    full_test_loss = final_evaluation(model, test_edge_index, test_edge_weights, user_nodes, product_nodes, device, print_test=False)
    model.load_state_dict(best_model)
    best_test_loss = final_evaluation(model, test_edge_index, test_edge_weights, user_nodes, product_nodes, device, print_test=False)
    train_loss_ls.append(train_loss)
    test_loss_ls.append(test_loss)
    full_test_loss_ls.append(full_test_loss.item())
    best_epoch_ls.append(best_epoch)
    best_test_loss_ls.append(best_test_loss.item())

In [17]:
make_df(config_ls, ["embedding_dim", "learning_rate"], train_loss_ls, test_loss_ls, full_test_loss_ls, best_epoch_ls, best_test_loss_ls)

Unnamed: 0,embedding_dim,learning_rate,train_loss,test_loss,final_test_loss,best_epoch,best_test_loss
0,32,0.01,"[18.93882179260254, 18.372299194335938, 18.077...","[18.48277473449707, 18.092578887939453, 17.870...",0.103291,1000,0.103291
1,32,0.05,"[17.829692840576172, 17.09220314025879, 16.145...","[17.571622848510742, 16.24557876586914, 15.492...",0.096932,997,0.094286
2,32,0.1,"[18.65825843811035, 17.49795150756836, 15.9016...","[17.891389846801758, 16.399417877197266, 15.15...",0.114216,980,0.099692
3,64,0.01,"[17.54790496826172, 17.23591423034668, 16.9813...","[17.25389862060547, 17.06599998474121, 16.8226...",0.097904,999,0.097105
4,64,0.05,"[17.35498046875, 16.629972457885742, 15.791384...","[16.74798583984375, 16.026737213134766, 15.136...",0.145437,980,0.09379
5,64,0.1,"[18.486141204833984, 17.37009048461914, 15.801...","[17.57076072692871, 16.180830001831055, 14.527...",0.105063,993,0.09807
6,128,0.01,"[18.422361373901367, 18.089866638183594, 17.86...","[18.226402282714844, 17.88574981689453, 17.695...",0.104073,982,0.100792
7,128,0.05,"[17.653156280517578, 17.086856842041016, 16.23...","[17.2088623046875, 16.510927200317383, 15.6126...",0.096001,881,0.095034
8,128,0.1,"[18.49266242980957, 17.47332000732422, 15.8714...","[17.721370697021484, 16.173261642456055, 14.55...",0.097684,997,0.0973
9,256,0.01,"[18.62061882019043, 18.369264602661133, 18.161...","[18.416627883911133, 18.213382720947266, 18.01...",0.094777,1000,0.094777
