In [4]:
import numpy as np
import pandas as pd
import torch
import torchtext
from builder import PandasGraphBuilder
from data_utils import *

import dgl

import os
import pickle
import re

import gzip
import json
import shutil

import random

In [5]:
data_dir = "./data"
reviews_fn = "Electronics.csv"
metadata_fn = "preprocessed_metadata.csv"
data_out_fn = "Amazon_Electronics.pkl"

# Process metadata

In [6]:
# Load metadata
metadata_path = os.path.join(data_dir, metadata_fn)

In [7]:
metadata_df = pd.read_csv(metadata_path)

In [8]:
metadata_df.reset_index(drop=True, inplace=True)

In [9]:
metadata_df = metadata_df.dropna()

In [10]:
len(metadata_df)

101128

In [11]:
# Get distinct asin list
asin_array = metadata_df['parent_asin'].unique()
len(asin_array)

101128

# Process reviews

In [12]:
# Load reviews
reviews_path = os.path.join(data_dir, reviews_fn)
reviews_df = pd.read_csv(reviews_path)

In [13]:
len(reviews_df)

15473536

In [14]:
# filter reviewed asin with metadata
reviews_df = reviews_df[reviews_df['parent_asin'].isin(asin_array)]

In [15]:
out_reviews_path = "./data/preprocessed_reviews.csv"
# Write the DataFrame to a CSV file
reviews_df.to_csv(out_reviews_path, index=False)

In [16]:
len(reviews_df)

1744048

# Process users

In [17]:
users_df = reviews_df[['user_id']].drop_duplicates()

In [18]:
len(users_df)

974207

In [19]:
out_reviews_path = "users.csv"
# Write the DataFrame to a CSV file
users_df.to_csv(out_reviews_path, index=False)

# Process event

In [16]:
events_df = reviews_df[['user_id', 'parent_asin', 'timestamp', 'rating']]

In [17]:
len(events_df)

1744048

# Build graph

In [18]:
graph_builder = PandasGraphBuilder()
graph_builder.add_entities(users_df, "user_id", 'user')
graph_builder.add_entities(metadata_df, "parent_asin", 'product')
graph_builder.add_binary_relations(events_df, 'user_id', 'parent_asin', 'reviewed')
graph_builder.add_binary_relations(events_df, 'parent_asin', 'user_id', 'reviewed-by')

In [19]:
g = graph_builder.build()

# Assign node features

In [20]:
# Process price feature
metadata_df['price'].apply(lambda x: int( float(x)*100 ))
g.nodes['product'].data['price'] = torch.LongTensor(np.array(metadata_df['price'].values))

In [21]:
# Process avg rating feature
g.nodes['product'].data['average_rating'] = torch.LongTensor(metadata_df['average_rating'].values)

In [22]:
# Process niche feature
unique_niches = metadata_df['niche'].unique()
niche_to_index = {niche: index for index, niche in enumerate(unique_niches)}
# Replace niche values with their corresponding integer indices
metadata_df['niche_index'] = metadata_df['niche'].map(niche_to_index)
g.nodes['product'].data['niche'] = torch.LongTensor(np.array(metadata_df['niche_index'].values))

In [23]:
# Process store
unique_stores = metadata_df['store'].unique()
store_to_index = {store: index for index, store in enumerate(unique_stores)}
# Replace store values with their corresponding integer indices
metadata_df['store_index'] = metadata_df['store'].map(niche_to_index)
g.nodes['product'].data['store'] = torch.LongTensor(np.array(metadata_df['store_index'].values))

# Assign edge features

In [24]:
# add edge features
g.edges['reviewed'].data['rating'] = torch.FloatTensor(events_df['rating'].values)
g.edges['reviewed'].data['timestamp'] = torch.LongTensor(events_df['timestamp'].values)

g.edges['reviewed-by'].data['rating'] = torch.FloatTensor(events_df['rating'].values)
g.edges['reviewed-by'].data['timestamp'] = torch.LongTensor(events_df['timestamp'].values)

# Train-validation-test split

In [25]:
train_indices, val_indices, test_indices = train_test_split_by_time(
	events_df, "timestamp", "user_id")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["train_mask"] = np.ones((len(df),), dtype=np.bool_)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["val_mask"] = np.zeros((len(df),), dtype=np.bool_)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["test_mask"] = np.zeros((len(df),), dtype=np.bool_)
  Before: .apply(func)
  After:  .apply(f

                         user_id parent_asin      timestamp  rating  \
10  AGBFYI2DDIKXC5Y4FARTYDTQBMFQ  B00RH29CJO  1535118501776     5.0   
12  AGBFYI2DDIKXC5Y4FARTYDTQBMFQ  B0093162RM  1578593798362     5.0   

    train_mask  val_mask  test_mask  
10        True     False      False  
12       False     False       True  


# Build train graph

In [26]:
train_g = build_train_graph(
	g, train_indices, 'user', 'product', 'reviewed', 'reviewed-by')
assert train_g.out_degrees(etype='reviewed').min() > 0

In [27]:
# Build sparse matrix for validation and test
val_matrix, test_matrix = build_val_test_matrix(
	g, val_indices, test_indices, 'user', 'product', 'reviewed')
# return type: (ssp.coo_matrix, ssp.coo_matrix)

In [28]:
textual_dataset = {
	'title': metadata_df['title'].values.astype(str)
}

In [29]:
dataset = {
	'full-graph': g,
	'train-graph': train_g,
	'val-matrix': val_matrix,
	'test-matrix': test_matrix,
	'item-texts': textual_dataset,
	'item-images': None,
	'user-list': users_df['user_id'].values,
	'product-list': metadata_df['parent_asin'].values,
	'user-type': 'user',
	'item-type': 'product',
	'user-to-item-type': 'reviewed',
	'item-to-user-type': 'reviewed-by',
	'timestamp-edge-column': 'timestamp'
}

# Save dataset

In [30]:
with open(os.path.join("./data/processed_pinsage_Amazon_electronics.pkl"), "wb") as f:
	pickle.dump(dataset, f)