In [1]:
import os
import gzip
import json
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
DATASET = 'Grocery_and_Gourmet_Food'
RAW_PATH = os.path.join('./', DATASET)
DATA_FILE = '{}_5.json.gz'.format(DATASET)
META_FILE = 'meta_{}.json.gz'.format(DATASET)

RANDOM_SEED = 0
NEG_ITEMS = 99

# Load Data

1. Load interaction data and item metadata
2. Filter out unuseful items in metadata
3. Calculate basic statistics

In [3]:
# Download data if not exists

if not os.path.exists(RAW_PATH):
    subprocess.call('mkdir ' + RAW_PATH, shell=True)
if not os.path.exists(os.path.join(RAW_PATH, DATA_FILE)):
    print('Downloading interaction data into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/{}_5.json.gz'
        .format(RAW_PATH, DATASET), shell=True)
if not os.path.exists(os.path.join(RAW_PATH, META_FILE)):
    print('Downloading item metadata into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://deepyeti.ucsd.edu/jianmo/amazon/metaFiles2/meta_{}.json.gz'
        .format(RAW_PATH, DATASET), shell=True)

Downloading interaction data into ./Grocery_and_Gourmet_Food
Downloading item metadata into ./Grocery_and_Gourmet_Food


In [4]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def get_df(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

In [5]:
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
data_df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"11 19, 2014",A1QVBUH9E1V6I8,4639725183,Jamshed Mathur,No adverse comment.,Five Stars,1416355200,,,
1,5.0,True,"10 13, 2016",A3GEOILWLK86XM,4639725183,itsjustme,Gift for college student.,Great product.,1476316800,,,
2,5.0,True,"11 21, 2015",A32RD6L701BIGP,4639725183,Krystal Clifton,"If you like strong tea, this is for you. It mi...",Strong,1448064000,,,
3,5.0,True,"08 12, 2015",A2UY1O1FBGKIE6,4639725183,U. Kane,Love the tea. The flavor is way better than th...,Great tea,1439337600,,,
4,5.0,True,"05 28, 2015",A3QHVBQYDV7Z6U,4639725183,The Nana,I have searched everywhere until I browsed Ama...,This is the tea I remembered!,1432771200,,,


In [6]:
meta_df = get_df(os.path.join(RAW_PATH, META_FILE))
meta_df.head()

Unnamed: 0,category,tech1,description,fit,title,also_buy,tech2,brand,feature,rank,also_view,main_cat,similar_item,date,price,asin,imageURL,imageURLHighRes,details
0,"[Grocery & Gourmet Food, Dairy, Cheese & Eggs,...",,"[BEEMSTER GOUDA CHEESE AGED 18/24 MONTHS, Stat...",,Beemster Gouda - Aged 18/24 Months - App. 1.5 Lbs,[],,Ariola Imports,[],"165,181 in Grocery & Gourmet Food (","[B0000D9MYM, B0000D9MYL, B00ADHIGBA, B00H9OX59...",Grocery,,,$41.91,681727810,[],[],
1,"[Grocery & Gourmet Food, Cooking & Baking, Sug...",,"[Shipped from UK, please allow 10 to 21 busine...",,Trim Healthy Mama Xylitol,"[B01898YHXK, B01BCM6LAC, B00Q4OL47O, B00Q4OL5Q...",,,[],"315,867 in Grocery & Gourmet Food (",[],Grocery,,,,853347867,[https://images-na.ssl-images-amazon.com/image...,[https://images-na.ssl-images-amazon.com/image...,
2,"[Grocery & Gourmet Food, Cooking & Baking, Fro...",,[Jazz up your cakes with a sparkling monogram ...,,Letter C - Swarovski Crystal Monogram Wedding ...,[],,Unik Occasions,[],"[>#669,941 in Kitchen & Dining (See Top 100 in...",[B07DXN65TF],Amazon Home,,"September 21, 2010",$29.95,1888861118,[],[],
3,"[Grocery & Gourmet Food, Cooking & Baking, Fro...",,"[Large Letter - Height 4.75""]",,Letter H - Swarovski Crystal Monogram Wedding ...,[],,Other,"[Large Letter - Height 4.75""]","[>#832,581 in Kitchen & Dining (See Top 100 in...",[],Amazon Home,,"September 11, 2011",$11.45,1888861517,[],[],
4,"[Grocery & Gourmet Food, Cooking & Baking, Fro...",,"[4.75""]",,Letter S - Swarovski Crystal Monogram Wedding ...,[],,Unik Occasions,"[4.75"" height]","[>#590,999 in Kitchen & Dining (See Top 100 in...",[],Amazon Home,,"September 11, 2011",$15.00,1888861614,[],[],


In [7]:
# Only retain items that appear in interaction data

useful_meta_df = meta_df[meta_df['asin'].isin(data_df['asin'])].reset_index(drop=True)
data_df = data_df[data_df['asin'].isin(useful_meta_df['asin'])].reset_index(drop=True)

### Statistics

In [8]:
# Gather statistics

n_users = data_df['reviewerID'].value_counts().size
n_items = data_df['asin'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['unixReviewTime'].min()
max_time = data_df['unixReviewTime'].max()

In [9]:
# Print statistics

time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

# Users: 127496
# Items: 41280
# Interactions: 1143063
Time Span: 2000-08-09/2018-10-02


# Build Dataset

### Interaction data

In [10]:
np.random.seed(RANDOM_SEED)

In [11]:
# Leave only user_id, item_id and time columns

out_df = data_df.rename(columns={'asin': 'item_id', 'reviewerID': 'user_id', 'unixReviewTime': 'time'})
out_df = out_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df = out_df.sort_values(by=['time', 'user_id'], kind='mergesort').reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,A1KXONFPU2XQ5K,B00004S1C5,965779200
1,A1TIVD0Y3KJSCN,B00005IX97,1023321600
2,A2P1DJ0G8PEO14,B00004W4VD,1038614400
3,A23GFTVIETX7DS,B0000CH39R,1068249600
4,A281NPSIMI1C2R,B0000DBN2F,1070582400


In [12]:
# Reindex (start from 1)

uids = sorted(out_df['user_id'].unique())
user2id = dict(zip(uids, range(1, len(uids) + 1)))
iids = sorted(out_df['item_id'].unique())
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df.head()

Unnamed: 0,user_id,item_id,time
0,19384,5,965779200
1,27232,11,1023321600
2,56893,6,1038614400
3,36479,47,1068249600
4,40797,90,1070582400


In [13]:
# Leave one out spliting

clicked_item_set = dict()
for user_id, seq_df in out_df.groupby('user_id'):
    clicked_item_set[user_id] = set(seq_df['item_id'].values.tolist())
    
def generate_dev_test(data_df):
    result_dfs = []
    n_items = data_df['item_id'].value_counts().size
    for idx in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), NEG_ITEMS))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_item_set[uid]
            for j in range(len(neg_items[i])):
                while neg_items[i][j] in user_clicked:
                    neg_items[i][j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [14]:
# Train, validation, test split

leave_df = out_df.groupby('user_id').head(1)
data_df = out_df.drop(leave_df.index)

[test_df, dev_df], data_df = generate_dev_test(data_df)
train_df = pd.concat([leave_df, data_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

(823240, 127307, 127463)

In [15]:
train_df.head()

Unnamed: 0,user_id,item_id,time
0,19384,5,965779200
1,27232,11,1023321600
2,56893,6,1038614400
3,36479,47,1068249600
4,40797,90,1070582400


In [16]:
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
310,11372,1675,1154044800,"[2733, 21244, 30404, 32104, 20758, 14936, 1543..."
313,29158,2156,1154044800,"[14255, 23701, 37074, 38300, 23311, 31786, 150..."
422,44525,8353,1156550400,"[17192, 2122, 24452, 3887, 22451, 15213, 9988,..."
543,74504,36480,1158883200,"[14133, 7602, 27868, 11060, 740, 29546, 21267,..."
663,15757,36708,1162684800,"[8011, 13538, 5764, 19341, 7397, 36667, 14706,..."


In [17]:
# Save results

train_df.to_csv(os.path.join(RAW_PATH, 'train.csv'), sep='\t', index=False)
dev_df.to_csv(os.path.join(RAW_PATH, 'dev.csv'), sep='\t', index=False)
test_df.to_csv(os.path.join(RAW_PATH, 'test.csv'), sep='\t', index=False)

### Item Metadata

In [18]:
# Level-2 category

l2_cate_lst = list()
for cate_lst in useful_meta_df['category']:
    l2_cate_lst.append(cate_lst[0][2] if len(cate_lst[0]) > 2 else np.nan)
useful_meta_df['l2_category'] = l2_cate_lst  
l2_cates = sorted(useful_meta_df['l2_category'].dropna().unique())
l2_dict = dict(zip(l2_cates, range(1, len(l2_cates) + 1)))
useful_meta_df['l2_category'] = useful_meta_df['l2_category'].apply(lambda x: l2_dict[x] if x == x else 0)

In [19]:
# For every item we consider items from column 'also_buy' as complements and from column 'also_view' as substitutes

item_meta_data = dict()
for idx in range(len(useful_meta_df)):
    #info = useful_meta_df.iloc[idx]['related']
    also_buy = list(useful_meta_df.iloc[idx]['also_buy'])
    also_view = list(useful_meta_df.iloc[idx]['also_view'])
    also_buy_new, also_view_new = [], []
    
    for item in also_buy:
      if item in item2id.keys():
        also_buy_new.append(item)
    
    for item in also_view:
      if item in item2id.keys():
        also_view_new.append(item) 

    item_meta_data[idx] = {
        'item_id': item2id[useful_meta_df.iloc[idx]['asin']],
        'i_category': useful_meta_df.iloc[idx]['l2_category'],
        'r_complement': list(map(lambda x: item2id[x], also_buy_new)),
        'r_substitute': list(map(lambda x: item2id[x], also_view_new)),
    }

item_meta_df = pd.DataFrame.from_dict(item_meta_data, orient='index')
item_meta_df = item_meta_df[['item_id', 'i_category', 'r_complement', 'r_substitute']]
item_meta_df.head()

Unnamed: 0,item_id,i_category,r_complement,r_substitute
0,1,1,"[20780, 10333, 11746, 3176, 27679, 10733, 1211...","[10333, 3176, 11746, 20780, 3174, 10733, 2, 12..."
1,2,1,"[3174, 1, 28564, 3803, 11265]","[3174, 11265, 1, 3803, 11746, 18391]"
2,3,1,"[27182, 12575, 26233, 36162, 26231, 28304, 253...","[27182, 26233, 12575, 36162, 28304, 26231, 175..."
3,4,1,"[1697, 1696, 12005, 17514, 33632, 34152, 28461...","[12005, 1696, 17748, 12350, 1697, 4607, 3435, ..."
4,5,1,[],"[31181, 11142, 39827, 40303, 41]"


In [20]:
# Save results

item_meta_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)