In [1]:
%load_ext autoreload
%autoreload 2

# Introduction 

This is an attempt to compute NDCG

In [2]:
import torch
import pandas as pd
import numpy as np

from utils.graph_model import GNNSAGERecommenderwithSkipConnections 
from utils.setup_nodes import create_edge_lists, create_user_df
from utils.predictions import pretty_print_recomendations
from utils.ndcg_computations import recommend_products_with_ndcg

# Set up

In [3]:
user_split = "train_test"
prod_embed_name = "meta_features_512"
user_embed_name = "user_reviews_features_512"

data_dir = "data"
product_dir = "full_data"
embedding_dir = "embedding"
results_folder = "final_model"

device = "cuda"

product_cols = ["parent_asin", "average_rating", "rating_number"]
user_cols = ["user_id", "rating_mean", "rating_count", "helpful_vote_mean", "helpful_vote_gte_1", "verified_purchase_mean", "last_active_in_days_min",
            "last_active_in_days_max", "word_count_mean"]
edge_cols = ["user_id", "parent_asin", "rating"]

# Load data

In [4]:
product_df = pd.read_parquet(f"{data_dir}/{product_dir}/product_df.parquet", columns = product_cols)
train_user_df = pd.read_parquet(f"{data_dir}/{user_split}_split/train_agg.parquet", columns = user_cols)
train_user_edges = pd.read_parquet(f"{data_dir}/{user_split}_split/train.parquet", columns = edge_cols)

if user_split == "train_test_valid":
    test_user_df = pd.read_parquet(f"{data_dir}/{user_split}_split/valid_agg.parquet", columns = user_cols)
    test_user_edges = pd.read_parquet(f"{data_dir}/{user_split}_split/valid.parquet", columns = edge_cols)
else:
    test_user_df = pd.read_parquet(f"{data_dir}/{user_split}_split/test_agg.parquet", columns = user_cols)
    test_user_edges = pd.read_parquet(f"{data_dir}/{user_split}_split/test.parquet", columns = edge_cols)


In [5]:
product_embedding = torch.load(f"{data_dir}/{embedding_dir}/product/{prod_embed_name}.pt")
train_user_embedding = torch.load(f"{data_dir}/{embedding_dir}/{user_split}_split/train_{user_embed_name}.pt")
if user_split == "train_test_valid":
    test_user_embedding = torch.load(f"{data_dir}/{embedding_dir}/{user_split}_split/valid_{user_embed_name}.pt")
else:
    test_user_embedding = torch.load(f"{data_dir}/{embedding_dir}/{user_split}_split/test_{user_embed_name}.pt")

# Make nodes

In [6]:
# Add the embedding 
train_user_df["embedding"] = list(train_user_embedding.numpy())
test_user_df["embedding"] = list(test_user_embedding.numpy())

# Make super user df
user_df = create_user_df(train_user_df, test_user_df)

In [7]:
# Set up id mapping
offset = user_df.user_id.nunique()
user_id_to_idx = {unique_id : idx for idx, unique_id in enumerate(user_df.user_id.unique())}
prod_id_to_idx = {unique_id : offset + idx for idx, unique_id in enumerate(product_df.parent_asin.unique())}

# Add to df
product_df["prod_idx"] = product_df.parent_asin.apply(lambda x: prod_id_to_idx[x])
train_user_edges["user_idx"] = train_user_edges.user_id.apply(lambda x: user_id_to_idx[x])
test_user_edges["user_idx"] = test_user_edges.user_id.apply(lambda x: user_id_to_idx[x])
train_user_edges["prod_idx"] = train_user_edges.parent_asin.apply(lambda x: prod_id_to_idx[x])
test_user_edges["prod_idx"] = test_user_edges.parent_asin.apply(lambda x: prod_id_to_idx[x])

In [8]:
# Concat product nodes 
product_nodes = torch.cat([torch.tensor(product_df.drop(["parent_asin", "prod_idx"], axis = 1).to_numpy()), product_embedding], dim = 1)

In [9]:
# concat user nodes 
user_embed = torch.tensor(np.vstack(user_df["embedding"].values))
user_info = torch.tensor(user_df.drop(["user_id", "embedding"], axis = 1).to_numpy())
user_nodes = torch.cat([user_info, user_embed], dim = 1)

In [10]:
# Create edge list
train_edge_index, train_edge_weights = create_edge_lists(train_user_edges)
test_edge_index, test_edge_weights = create_edge_lists(train_user_edges)

# Move to GPU 

In [11]:
product_nodes = product_nodes.type(torch.float).to(device)
user_nodes = user_nodes.type(torch.float).to(device)
train_edge_index = train_edge_index.to(device)
train_edge_weights = train_edge_weights.to(device)
test_edge_index = test_edge_index.to(device)
test_edge_weights = test_edge_weights.to(device)

# Model

In [12]:
# Set up model features
num_users = len(user_df)
num_products = len(product_df)
user_feature_dim = user_nodes.shape[1]
product_feature_dim = product_nodes.shape[1]
embedding_dim = 256
dropout_prob = 0.2

In [13]:
# Instantiate the model
model = GNNSAGERecommenderwithSkipConnections(num_users, num_products, user_feature_dim, product_feature_dim, embedding_dim, dropout_prob)

In [14]:
# move the model 
model.to(device)

GNNSAGERecommenderwithSkipConnections(
  (user_feature_transform): Linear(in_features=776, out_features=256, bias=True)
  (product_feature_transform): Linear(in_features=770, out_features=256, bias=True)
  (conv1): SAGEConv(256, 256, aggr=mean)
  (bn1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (conv2): SAGEConv(256, 256, aggr=mean)
  (bn2): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): Linear(in_features=256, out_features=1, bias=True)
)

In [15]:
# Load trained model 
model.load_state_dict(torch.load("results/final_model/model.pt", weights_only=True))

<All keys matched successfully>

# Test NDCG

In [16]:
title_mapping = pd.read_parquet(f"{data_dir}/{product_dir}/product_df.parquet", columns = ["parent_asin", "title"])
prod_idx_to_id = {idx - offset: asin for asin, idx in prod_id_to_idx.items()}

In [17]:
user_id = test_edge_index[0][torch.randint(0, test_edge_index.shape[1], (1,))].item() #Here we are only selecting from users in test set
print(user_id, "\n")
print("Recommended products", "\n")
recomendations = recommend_products_with_ndcg(model, test_edge_index, test_edge_weights, user_id, user_id_to_idx, prod_id_to_idx, user_nodes, product_nodes, 
                                              top_k=10, batch_size=2048)
pretty_print_recomendations(recomendations[0], title_mapping, "title")
print("\n")
print("NDCG score is: ", recomendations[1].numpy())
print("Ratings are: ", recomendations[2].cpu().numpy())

128754 

Recommended products 

Running batch number 1 out of 1...
Product 1: NewBeauty Magazine New Fall 2011 The World's Most Unique Beauty Magazine
Product 2: Charles J. Wahba Side Comb (Paired) - 17 Teeth (Black Color) - Handmade in France
Product 3: The Men From Shiloh
Product 4: PUNARA Organic & Natural Makeup Music Box Set for Kids, Merry Go Round Beauty Treasure 10pcs, Safe & Gentle for Sensitive Skin, ECOCERT Certified Ingredients
Product 5: Candlelight Shadowsense by Senegence
Product 6: Charles J. Wahba - Side Combs for Thin Hair (Paired) - Made in France (Demi Blonde (Pair))
Product 7: Diane oil-infused lift comb, 6-1/4", green, DBC024
Product 8: TANGLE Jr. MetallicTexture - Purple
Product 9: ETUDE HOUSE HERSHEY's Chocolate Drink Kit #Original - Play Color Eyes Mini Eyeshadow Palette & Tumbler - Special Limited edition
Product 10: Sustainability: Radical Solutions Inspiring Hope


NDCG score is:  0.0
Ratings are:  [5.233306  5.2319746 5.2272487 5.224562  5.2228847 5.222747 

In [18]:
print("Actual products bought", "\n")
actual_prod = test_edge_index[1][test_edge_index[0] == user_id]
actual_weights = test_edge_weights[test_edge_index[0] == user_id]
actual_recommendations = [prod_idx_to_id[x.item() - offset] for x in actual_prod]
pretty_print_recomendations(actual_recommendations, title_mapping, "title")
print("\n")
print("Ratings are: ", [x.item() for x in actual_weights])
print("\n")
print("\n")

Actual products bought 

Product 1: e.l.f. Moisturizing Foundation Stick 83187 Almond by e.l.f. Cosmetics


Ratings are:  [5.0]




