In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules before
# each cell runs, so edits to the local `utils` package take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

# Introduction 

This notebook splits the data into the train, test and validation sets.  
It also preprocesses the string-based (text) data.

# Imports 

In [2]:
import numpy as np
import os 
import pandas as pd
import pickle

from utils.pre_processing import concat_item_metadata, concat_user_review, prep_user_nodes, make_user_nodes

# Set variables

In [3]:
# Reviews stamped at or after this value are discarded during cleaning.
# NOTE(review): the magnitude suggests epoch milliseconds -- confirm the unit.
cut_timestamp = 1_628_643_414_042

# Quantile fractions of the review timestamps used for the chronological split:
# the earliest `train_size` share trains, the latest `test_size` share tests,
# and whatever lies between the two marks becomes the validation set.
train_size, test_size = 0.6, 0.2

# for saving: `data` is the root output folder, the others are sub-folders of it
data = "data"
mapping = "mapping"
train_test_split = "train_test_split"
train_test_valid_split = "train_test_valid_split"
full_data = "full_data"
blair_train = "blair_train"

# Load data

In [4]:
# Raw inputs: one frame of user reviews and one of product metadata.
# Plain string literals are enough here -- the paths contain no placeholders.
review_df = pd.read_parquet("data/original/review_df.parquet")
product_df = pd.read_parquet("data/original/product_df.parquet")

# Clean products

In [5]:
# Build the "meta" value of every product row with the project helper
# concat_item_metadata (presumably a text blob -- confirm in utils.pre_processing).
product_df["meta"] = product_df.apply(concat_item_metadata, axis=1)

# Keep only the products whose "meta" value has a len() above 30.
has_enough_meta = product_df["meta"].apply(len) > 30
product_df = product_df[has_enough_meta]

# Clean reviews

In [6]:
# Build the "review" value of every review row with the project helper
# concat_user_review (presumably a single text string -- confirm in utils.pre_processing).
review_df["review"] = review_df.apply(concat_user_review, axis=1)

In [7]:
# Ids of the products that survived cleaning, as a set for fast membership tests.
product_ids = {*product_df["parent_asin"].unique()}

In [8]:
# Temporary boolean flags; True means the row passes that check.
# The timestamp test stays a negated ">=" so rows with a missing timestamp are
# still flagged True, exactly as before.
review_df["filter_timestamp"] = ~review_df["timestamp"].ge(cut_timestamp)
# Review text must have a len() above 30.
review_df["filter_review_len"] = review_df["review"].apply(len) > 30
# The reviewed product must be one of the cleaned products.
review_df["filter_product_id"] = review_df["parent_asin"].isin(product_ids)

In [9]:
# Keep the rows that pass all three checks, then discard the helper flag columns.
flag_columns = ["filter_timestamp", "filter_review_len", "filter_product_id"]
passes_all_checks = review_df[flag_columns].all(axis=1)
review_df = review_df[passes_all_checks].drop(columns=flag_columns)

# Create BLAIR training data 

In [10]:
# Pair every cleaned review with the metadata text of the product it refers to.
# An inner join on parent_asin keeps only reviews whose product is present.
review_text = review_df[["parent_asin", "review"]]
product_text = product_df[["parent_asin", "meta"]]
df = review_text.merge(product_text, on="parent_asin", how="inner")

In [11]:
# BLAIR training pairs: only the two text columns, as an independent copy.
blair_train_data = df.loc[:, ["review", "meta"]].copy()

# Drop duplicate user reviews

Some users reviewed the same product more than once; we collapse these duplicates into a single row.  
There are several ways to do this; we keep only the first row of each (user, product) pair.

In [12]:
# One row per (user, product) pair: keep the first occurrence and renumber the
# index from 0.
review_df = review_df.drop_duplicates(
    subset=["user_id", "parent_asin"],
    keep="first",
    ignore_index=True,
)

# Make sure ids tally

In [13]:
# Product ids that appear in BOTH the reviews and the product table.
useful_parent_asin = set(review_df.parent_asin) & set(product_df.parent_asin)

In [14]:
# Restrict both frames to the shared product ids so every review has a product
# row and every product has at least one review.
review_has_shared_id = review_df["parent_asin"].isin(useful_parent_asin)
product_has_shared_id = product_df["parent_asin"].isin(useful_parent_asin)
review_df = review_df.loc[review_has_shared_id]
product_df = product_df.loc[product_has_shared_id]

# Make unique ids

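Each user and each product gets a unique integer index for use as an edge endpoint: users are numbered from 0 and products continue after the last user index.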

In [15]:
# Users take indices 0 .. n_users - 1, in order of first appearance in review_df.
user_id_to_idx = {user_id: idx for idx, user_id in enumerate(review_df.user_id.unique())}

# Products continue right after the last user index so the two id spaces never
# overlap. The offset is taken from the mapping itself rather than from
# `nunique()`: `nunique()` skips missing values while `unique()` keeps them, so
# a missing user_id would otherwise make the last user index collide with the
# first product index.
offset = len(user_id_to_idx)
prod_id_to_idx = {
    asin: idx for idx, asin in enumerate(product_df.parent_asin.unique(), start=offset)
}

In [16]:
# Translate the raw ids into the integer indices built above. Going through
# dict.__getitem__ keeps the lookup strict: an id missing from a mapping raises
# KeyError instead of silently producing a gap.
review_df["user_idx"] = review_df["user_id"].map(user_id_to_idx.__getitem__)
review_df["prod_idx"] = review_df["parent_asin"].map(prod_id_to_idx.__getitem__)
product_df["prod_idx"] = product_df["parent_asin"].map(prod_id_to_idx.__getitem__)

# Add additional details to review df

In [17]:
# Replace review_df with the output of the project helper prep_user_nodes.
# NOTE(review): the helper lives in utils.pre_processing and is not visible here;
# going by the section heading it adds extra per-review details -- confirm which
# columns it adds and whether it changes the row count.
review_df = prep_user_nodes(review_df)

# Train test validation split 

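The split is chronological and is applied to the review rows, which supply both the user nodes and the edges.

Edges follow the split of the review row they come from.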

In [18]:
# Timestamp cut points for the chronological split: the `train_size` quantile
# ends the training period and the `1 - test_size` quantile starts the test period.
train_mark, test_mark = np.quantile(review_df.timestamp, [train_size, 1 - test_size])

In [19]:
# Two-way split (train / test only): everything strictly before the test mark
# trains, everything from the test mark onwards tests.
before_test_mark = review_df["timestamp"].lt(test_mark)
from_test_mark = review_df["timestamp"].ge(test_mark)
two_split_review_train = review_df.loc[before_test_mark].copy()
two_split_review_test = review_df.loc[from_test_mark].copy()

In [20]:
# Three-way split (train / validation / test): train runs up to and including
# train_mark, validation lies strictly between the two marks, and test starts
# at test_mark.
timestamps = review_df["timestamp"]
three_split_review_train = review_df.loc[timestamps.le(train_mark)].copy()
three_split_review_valid = review_df.loc[timestamps.gt(train_mark) & timestamps.lt(test_mark)].copy()
three_split_review_test = review_df.loc[timestamps.ge(test_mark)].copy()

In [21]:
# Aggregate every raw split with the project helper make_user_nodes, one call
# per split and in the same order as the targets on the left.
# NOTE(review): presumably this yields one row per user -- confirm in
# utils.pre_processing.
(
    two_split_review_train_agg,
    two_split_review_test_agg,
    three_split_review_train_agg,
    three_split_review_test_agg,
    three_split_review_valid_agg,
) = (
    make_user_nodes(raw_split)
    for raw_split in (
        two_split_review_train,
        two_split_review_test,
        three_split_review_train,
        three_split_review_test,
        three_split_review_valid,
    )
)

# Save data

In [22]:
# Create every output folder under the data root up front; exist_ok makes
# re-running the cell harmless.
for subfolder in (mapping, train_test_split, train_test_valid_split, full_data, blair_train):
    os.makedirs(f"{data}/{subfolder}", exist_ok=True)

In [23]:
# Persist both id -> index lookups. Each file is opened in a `with` block so it
# is closed (and therefore fully flushed to disk) as soon as its dump finishes,
# instead of leaving the handle open until garbage collection.
with open(f"{data}/{mapping}/user_id_to_idx.pkl", "wb") as handle:
    pickle.dump(user_id_to_idx, handle)

with open(f"{data}/{mapping}/prod_id_to_idx.pkl", "wb") as handle:
    pickle.dump(prod_id_to_idx, handle)

In [24]:
# Blair data: review/meta text pairs as TSV (tab-separated, "\n" line endings,
# no index column).
blair_train_data.to_csv(f'{data}/{blair_train}/clean_review_meta.tsv', sep='\t', lineterminator='\n', index=False)

# Every other output is a parquet file, written in the order listed below.
parquet_outputs = [
    # full df
    (product_df, f"{data}/{full_data}/product_df.parquet"),
    (review_df, f"{data}/{full_data}/review_df.parquet"),
    # Raw users
    (two_split_review_train, f"{data}/{train_test_split}/train.parquet"),
    (two_split_review_test, f"{data}/{train_test_split}/test.parquet"),
    (three_split_review_train, f"{data}/{train_test_valid_split}/train.parquet"),
    (three_split_review_test, f"{data}/{train_test_valid_split}/test.parquet"),
    (three_split_review_valid, f"{data}/{train_test_valid_split}/valid.parquet"),
    # Aggregated users
    (two_split_review_train_agg, f"{data}/{train_test_split}/train_agg.parquet"),
    (two_split_review_test_agg, f"{data}/{train_test_split}/test_agg.parquet"),
    (three_split_review_train_agg, f"{data}/{train_test_valid_split}/train_agg.parquet"),
    (three_split_review_test_agg, f"{data}/{train_test_valid_split}/test_agg.parquet"),
    (three_split_review_valid_agg, f"{data}/{train_test_valid_split}/valid_agg.parquet"),
]
for frame, destination in parquet_outputs:
    frame.to_parquet(destination)