# Introduction 

This notebook creates a fixed, time-based train/test split of the Amazon All_Beauty reviews, so that all future modelling is evaluated against the same test set.

# Import

In [36]:
import pandas as pd
import numpy as np

from datasets import load_dataset

# Load data

In [42]:
# Fetch the raw "All_Beauty" review configuration from the Hugging Face hub.
# NOTE: trust_remote_code=True allows the dataset repository's own loading
# script to run locally, so only use it with a source you trust.
reviews_dataset = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_All_Beauty",
    trust_remote_code=True,
)

# The "full" entry of the loaded dataset is converted to a pandas DataFrame.
review_df = reviews_dataset["full"].to_pandas()

In [63]:
# Cleaned product metadata; its parent_asin column is used further down to
# restrict the reviews to known products.
new_product_df = pd.read_csv(
    filepath_or_buffer="data/cleaned/product_metadata.csv",
)

In [64]:
def train_test_split(data, train_frac=0.80, timestamp_col="timestamp"):
    """Split ``data`` chronologically into a train frame and a test frame.

    The cut-off is the ``train_frac`` quantile of ``data[timestamp_col]`` as
    computed by ``np.quantile``. Rows strictly before the cut-off go to the
    train frame; rows at or after it go to the test frame, so no test row is
    older than any train row.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding one row per observation and a ``timestamp_col`` column
        with orderable numeric values.
    train_frac : float, default 0.80
        Quantile of the timestamps used as the cut-off. Roughly this share of
        the rows ends up in the train frame (ties at the cut-off all go to
        the test frame).
    timestamp_col : str, default "timestamp"
        Name of the column to split on.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``; both keep the original index labels.

    Raises
    ------
    ValueError
        If ``train_frac`` lies outside ``[0, 1]`` (raised by ``np.quantile``).
    KeyError
        If ``timestamp_col`` is not a column of ``data``.
    """
    if len(data) == 0:
        # A quantile of zero values is undefined; an empty input simply
        # yields two empty frames with the same columns.
        return data.copy(), data.copy()

    timestamps = data[timestamp_col]
    timestamp_split = np.quantile(timestamps, train_frac)

    # Two explicit comparisons (rather than one mask and its negation) so the
    # strict "<" / ">=" boundary of the split is visible at a glance.
    train_df = data[timestamps < timestamp_split]
    test_df = data[timestamps >= timestamp_split]
    return train_df, test_df

# Full split

In [65]:
# Chronological split of every review: older reviews form the train set,
# the most recent ones the test set.
train_df, test_df = train_test_split(review_df)

# The two parts must partition the input. A missing timestamp would make the
# cut-off NaN and silently send rows to neither side, so fail loudly here
# before anything is written to disk.
assert len(train_df) + len(test_df) == len(review_df), (
    "train/test split lost or duplicated rows"
)

In [67]:
# Persist both halves of the full split so later notebooks can reuse them.
train_df.to_parquet(path="data/cleaned/train_reviews.parquet")
test_df.to_parquet(path="data/cleaned/test_reviews.parquet")

In [68]:
# Share of train reviews per star rating, ordered by rating.
train_df["rating"].value_counts().div(len(train_df)).sort_index()

rating
1.0    0.132147
2.0    0.059627
3.0    0.079763
4.0    0.117959
5.0    0.610504
Name: count, dtype: float64

In [69]:
# Share of test reviews per star rating, ordered by rating.
test_df["rating"].value_counts().div(len(test_df)).sort_index()

rating
1.0    0.198965
2.0    0.068208
3.0    0.082263
4.0    0.093938
5.0    0.556626
Name: count, dtype: float64

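# Reduced split

The same time-based split, restricted to reviews whose `parent_asin` appears in the cleaned product metadata. The timestamp cut-off is recomputed on this reduced set.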

In [71]:
# Keep only the reviews whose parent_asin also occurs in the cleaned product
# metadata, then renumber the rows from zero.
reduced_review_df = (
    review_df
    .loc[review_df["parent_asin"].isin(new_product_df["parent_asin"])]
    .reset_index(drop=True)
)

In [72]:
# Chronological split of the reduced review set; the cut-off is computed from
# the reduced data's own timestamps.
reduced_train_df, reduced_test_df = train_test_split(reduced_review_df)

# The two parts must partition the input. A missing timestamp would make the
# cut-off NaN and silently send rows to neither side, so fail loudly here
# before anything is written to disk.
assert len(reduced_train_df) + len(reduced_test_df) == len(reduced_review_df), (
    "reduced train/test split lost or duplicated rows"
)

In [74]:
# Share of reduced-train reviews per star rating, ordered by rating.
reduced_train_df["rating"].value_counts().div(len(reduced_train_df)).sort_index()

rating
1.0    0.129482
2.0    0.059525
3.0    0.080284
4.0    0.120157
5.0    0.610553
Name: count, dtype: float64

In [75]:
# Share of reduced-test reviews per star rating, ordered by rating.
reduced_test_df["rating"].value_counts().div(len(reduced_test_df)).sort_index()

rating
1.0    0.183994
2.0    0.066429
3.0    0.079801
4.0    0.091356
5.0    0.578420
Name: count, dtype: float64

In [76]:
# Persist both halves of the reduced split next to the full-split files.
reduced_train_df.to_parquet(path="data/cleaned/reduced_train_reviews.parquet")
reduced_test_df.to_parquet(path="data/cleaned/reduced_test_reviews.parquet")