# Introduction 

This notebook builds a fixed test set (together with the matching train and validation splits) for all future modelling.  
It improves on the original version by allowing more flexibility in how the datasets are created.

# Import

In [1]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


# Load data

In [2]:
# Pull the raw "All_Beauty" reviews and product metadata directly from the source dataset repository.
# NOTE(review): trust_remote_code=True appears to allow code shipped with the dataset repo to run locally — confirm it is still required.
reviews_dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    trust_remote_code=True,
)
product_dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_meta_All_Beauty",
    split="full",
    trust_remote_code=True,
)

# Convert both to pandas. The reviews were loaded without `split`, so the "full" split is selected here.
review_df = reviews_dataset["full"].to_pandas()
product_df = product_dataset.to_pandas()

In [3]:
# Cleaned data
# User-level and product-level metadata CSVs read from data/cleaned.
# NOTE(review): these files are not created in this notebook — presumably they come from an earlier cleaning step; confirm.
agg_user_df = pd.read_csv("data/cleaned/user_metadata.csv")
new_product_df = pd.read_csv("data/cleaned/product_metadata.csv")

# Tally the ids

In [4]:
# Products that appear in the reviews, the raw product metadata AND the cleaned product metadata.
useful_parent_asin = (
    set(review_df.parent_asin)
    & set(product_df.parent_asin)
    & set(new_product_df.parent_asin)
)

In [5]:
# Keep only the rows whose product is present in all three sources.
review_df = review_df.loc[review_df["parent_asin"].isin(useful_parent_asin)]
product_df = product_df.loc[product_df["parent_asin"].isin(useful_parent_asin)]
new_product_df = new_product_df.loc[new_product_df["parent_asin"].isin(useful_parent_asin)]

In [6]:
# Users that appear in both the (already product-filtered) reviews and the cleaned user metadata.
useful_user_id = set(review_df.user_id) & set(agg_user_df.user_id)

In [7]:
# Keep only the rows whose user is present in both sources.
review_df = review_df.loc[review_df["user_id"].isin(useful_user_id)]
agg_user_df = agg_user_df.loc[agg_user_df["user_id"].isin(useful_user_id)]

# Remove duplicate reviews 

A user can have more than one rating for the same product; this section keeps a single review per (user, product) pair.  
There are several ways to de-duplicate, and the approach used here is only one of them.

In [8]:
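# Two possible ways to collapse repeated (user_id, parent_asin) pairs (about 2% of the rows):
#   1. average over the repeated rows:
#      .groupby(["user_id", "parent_asin"]).mean().reset_index()
#   2. keep the first occurrence of each pair, then order by user and timestamp:
#      .drop_duplicates(subset = ["user_id", "parent_asin"]).sort_values(["user_id", "timestamp"]).reset_index(drop = True)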

In [9]:
# Fraction of review rows that survive de-duplication on (user_id, parent_asin).
# duplicated() flags every repeat after the first occurrence, so the unflagged rows are exactly what drop_duplicates keeps.
n_unique_pairs = int((~review_df.duplicated(subset=["user_id", "parent_asin"])).sum())
n_unique_pairs / len(review_df)

0.9891005772412371

In [10]:
# Keep the first row seen for each (user_id, parent_asin) pair and renumber the index from 0.
is_repeat = review_df.duplicated(subset=["user_id", "parent_asin"])
review_df = review_df.loc[~is_repeat].reset_index(drop=True)

# Make the full data

1. DataFrames for the numeric user features — two versions: aggregated, and pre-aggregation (one row per review)
2. DataFrames for the string user features — two versions: aggregated, and pre-aggregation (one row per review)
3. df for the product features (numeric)
4. df for the product features (string)
5. df for the edges of the graph

In [11]:
# Aggregated user frames: everything except the "reviews" column vs. the id plus the "reviews" column.
user_features_numeric_agg = agg_user_df.drop(columns="reviews")
user_features_string_agg = agg_user_df[["user_id", "reviews"]]

# Pre-aggregation user frames: one row per retained review, keyed by (user_id, parent_asin).
pre_agg_numeric_cols = ["user_id", "parent_asin", "rating", "helpful_vote", "verified_purchase"]
pre_agg_string_cols = ["user_id", "parent_asin", "title", "text"]
user_features_numeric_pre_agg = review_df[pre_agg_numeric_cols]
user_features_string_pre_agg = review_df[pre_agg_string_cols]

In [12]:
# Product columns taken from the raw metadata: identifier, category, rating statistics and price.
numeric_product_cols = ["parent_asin", "main_category", "average_rating", "rating_number", "price"]
product_features_numeric = product_df[numeric_product_cols]

# Text-like raw columns joined with the cleaned product metadata.
# NOTE(review): no `on=` is passed, so the join uses every column name the two frames share — confirm that is only parent_asin.
string_product_cols = ["parent_asin", "title", "features", "description", "store", "details"]
product_features_string = product_df[string_product_cols].merge(new_product_df)

In [13]:
# One graph edge per retained review: who rated what, with which rating, and when.
# .copy() gives an independent frame so index columns can be added to it later.
edge_columns = ["user_id", "parent_asin", "rating", "timestamp"]
edge_df = review_df[edge_columns].copy()

# Mapping dict 

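These mappings are built from the full graph (all edges, before the train/validation/test split).  
We assume no new products or users appear later, so every ID gets exactly one index.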

In [14]:
# Assign consecutive integer indices (0, 1, 2, ...) to IDs in order of first appearance in edge_df.
unique_user_ids = edge_df.user_id.unique()
unique_prod_ids = edge_df.parent_asin.unique()

user_id_to_idx = dict(zip(unique_user_ids, range(len(unique_user_ids))))
prod_id_to_idx = dict(zip(unique_prod_ids, range(len(unique_prod_ids))))

In [15]:
# Translate the raw IDs into integer indices.
# Series.map with a dict is a vectorised lookup, replacing the per-row Python lambda.
edge_df["user_idx"] = edge_df.user_id.map(user_id_to_idx)
edge_df["prod_idx"] = edge_df.parent_asin.map(prod_id_to_idx)

# map() yields NaN for an ID missing from the dict instead of raising, so fail loudly here.
assert edge_df[["user_idx", "prod_idx"]].notna().all().all(), "unmapped user_id / parent_asin in edge_df"

# Split into train test val 

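Only the edges are split; the user and product feature tables are saved unsplit. The split is chronological: edges up to the 70th timestamp percentile form the training set, edges from the 85th percentile onwards form the test set, and the edges in between form the validation set.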

In [16]:
# Timestamp cut-offs: the 70th percentile ends the training window, the 85th starts the test window.
train_mark, test_mark = [np.quantile(edge_df.timestamp, q) for q in (0.7, 0.85)]

In [17]:
# Chronological split: train up to train_mark (inclusive), test from test_mark (inclusive),
# validation strictly between the two marks.
edge_ts = edge_df.timestamp
in_train = edge_ts <= train_mark
in_test = edge_ts >= test_mark
in_val = (edge_ts > train_mark) & (edge_ts < test_mark)

train_edges = edge_df.loc[in_train].copy()
test_edges = edge_df.loc[in_test].copy()
val_edges = edge_df.loc[in_val].copy()

# Save files 



In [18]:
# All artefacts are written under data/cleaned_v2.
OUTPUT_DIR = Path("data/cleaned_v2")
# Create the folder up front so the notebook also runs on a fresh checkout.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Each frame is written to "<name>.parquet", where <name> matches its variable name.
frames_to_save = {
    "user_features_numeric_agg": user_features_numeric_agg,
    "user_features_numeric_pre_agg": user_features_numeric_pre_agg,
    "user_features_string_agg": user_features_string_agg,
    "user_features_string_pre_agg": user_features_string_pre_agg,
    "product_features_numeric": product_features_numeric,
    "product_features_string": product_features_string,
    "train_edges": train_edges,
    "test_edges": test_edges,
    "val_edges": val_edges,
}
for frame_name, frame in frames_to_save.items():
    frame.to_parquet(OUTPUT_DIR / f"{frame_name}.parquet")

# The ID -> index dictionaries are pickled. The `with` block guarantees that each
# file is flushed and closed (the handles were previously left open).
mappings_to_save = {
    "user_id_to_idx": user_id_to_idx,
    "prod_id_to_idx": prod_id_to_idx,
}
for mapping_name, mapping in mappings_to_save.items():
    with open(OUTPUT_DIR / f"{mapping_name}.pkl", "wb") as fh:
        pickle.dump(mapping, fh)
