# Introduction

This notebook focuses on a what-if analysis.  
We note that most users with no prior information are predicted the same items.  
So what if the predicted items were instead the most frequent items in the training data?

In [17]:
import pandas as pd
import torch
from utils.setup_nodes import create_user_df

# Set up

In [18]:
# Data split and names of the pre-computed embedding files
user_split = "train_test"
prod_embed_name = "meta_features_512"
user_embed_name = "user_reviews_features_512"

# Directory / folder names used to build file paths
data_dir = "data"
product_dir = "full_data"
embedding_dir = "embedding"
results_folder = "final_model"

device = "cuda"

# Columns read from each parquet file
product_cols = [
    "parent_asin",
    "average_rating",
    "rating_number",
]
user_cols = [
    "user_id",
    "rating_mean",
    "rating_count",
    "helpful_vote_mean",
    "helpful_vote_gte_1",
    "verified_purchase_mean",
    "last_active_in_days_min",
    "last_active_in_days_max",
    "word_count_mean",
]
edge_cols = [
    "user_id",
    "parent_asin",
    "rating",
]

# Load data

In [19]:
# Read only the required columns of the product table and of the per-split
# user aggregates and user-product edges.
product_df = pd.read_parquet(f"{data_dir}/{product_dir}/product_df.parquet", columns = product_cols)
train_user_df = pd.read_parquet(f"{data_dir}/{user_split}_split/train_agg.parquet", columns = user_cols)
train_user_edges = pd.read_parquet(f"{data_dir}/{user_split}_split/train.parquet", columns = edge_cols)
test_user_df = pd.read_parquet(f"{data_dir}/{user_split}_split/test_agg.parquet", columns = user_cols)
test_user_edges = pd.read_parquet(f"{data_dir}/{user_split}_split/test.parquet", columns = edge_cols)

# Load the pre-computed embeddings onto the CPU: they are converted with
# .numpy() below, which raises for tensors living on a CUDA device (this
# would happen if the files were saved from a GPU).
product_embedding = torch.load(f"{data_dir}/{embedding_dir}/product/{prod_embed_name}.pt", map_location = "cpu")
train_user_embedding = torch.load(f"{data_dir}/{embedding_dir}/{user_split}_split/train_{user_embed_name}.pt", map_location = "cpu")
test_user_embedding = torch.load(f"{data_dir}/{embedding_dir}/{user_split}_split/test_{user_embed_name}.pt", map_location = "cpu")

# Add the embedding (one array per user row)
train_user_df["embedding"] = list(train_user_embedding.numpy())
test_user_df["embedding"] = list(test_user_embedding.numpy())

# Make super user df
user_df = create_user_df(train_user_df, test_user_df)

# Set up id mapping. Product indices start at `offset` (the number of user
# indices) so user and product indices never overlap. The offset is taken from
# the same array that is enumerated, so the two can never disagree.
unique_user_ids = user_df.user_id.unique()
offset = len(unique_user_ids)
user_id_to_idx = {unique_id : idx for idx, unique_id in enumerate(unique_user_ids)}
prod_id_to_idx = {unique_id : offset + idx for idx, unique_id in enumerate(product_df.parent_asin.unique())}

# Add to df
product_df["prod_idx"] = product_df.parent_asin.map(prod_id_to_idx)
for edges in (train_user_edges, test_user_edges):
    edges["user_idx"] = edges.user_id.map(user_id_to_idx)
    edges["prod_idx"] = edges.parent_asin.map(prod_id_to_idx)
    # Series.map yields NaN for ids missing from the mapping; fail loudly,
    # as a plain dict lookup would.
    if edges[["user_idx", "prod_idx"]].isna().any().any():
        raise KeyError("edge table references a user_id or parent_asin missing from the id mappings")

In [20]:
# Load the saved model predictions from the configured results folder
# (results_folder is set in the set-up cell) instead of a hard-coded path.
predictions = pd.read_parquet(f"results/{results_folder}/predictions.parquet")

In [27]:
# Find the 10 most frequent products in the training edges.
# Sorting on (count, prod_idx) makes the cut-off deterministic when several
# products share the same count; a single-key sort leaves the order of ties
# unspecified.
top_10_prods = (
    train_user_edges.groupby("prod_idx")
    .parent_asin.count()
    .reset_index()
    .sort_values(["parent_asin", "prod_idx"])
    .tail(10)
    .prod_idx.to_numpy()
)

In [40]:
# Collect the index of every test user whose user_id is absent from the
# training users.
additional_test_users = test_user_df.loc[
    ~test_user_df.user_id.isin(train_user_df.user_id), ["user_id"]
].copy()
additional_test_users["user_idx"] = additional_test_users.user_id.map(lambda uid: user_id_to_idx[uid])
additional_test_users_user_idx = set(additional_test_users.user_idx.unique())

In [43]:
# Flag prediction rows that belong to test-only users (vectorized membership
# test instead of a Python-level lambda per row).
predictions["new_user"] = predictions.user_idx.isin(additional_test_users_user_idx)

In [45]:
# Rows flagged as new users get the most frequent training products; all
# other rows keep their original prediction.
predictions["prediction_adjusted"] = predictions.apply(
    lambda row: top_10_prods if row.new_user else row.prediction,
    axis = 1,
)

In [47]:
# Per row: number of distinct items shared by prod_idx and the adjusted prediction.
predictions["adj_match_count"] = predictions.apply(
    lambda row: len(set(row.prod_idx) & set(row.prediction_adjusted)),
    axis = 1,
)

In [50]:
# Mean over rows of (matched items / number of items in prod_idx).
predictions["adj_match_count"].div(predictions.prod_idx.map(len)).mean()

np.float64(0.00826960176381425)

In [53]:
# Store the per-row ratio of matched items to the number of items in prod_idx.
items_per_row = predictions.prod_idx.map(len)
predictions["adj_recall"] = predictions.adj_match_count / items_per_row
del items_per_row

In [52]:
# Mean adjusted recall as a percentage, computed from the data rather than
# from a number pasted out of an earlier cell's output.
float(predictions["adj_recall"].mean() * 100)

0.8269601763814249