In [1]:
import os
import gzip
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
def parse(path):
    """Yield one record per line of the gzip-compressed file at ``path``.

    Every line is read as bytes and evaluated as a Python expression; the
    resulting object is yielded.

    WARNING: ``eval`` executes arbitrary code, so only use this on files
    from a trusted source. If every line is a plain literal,
    ``ast.literal_eval`` on the decoded line is a safer alternative.
    """
    # The context manager closes the file as soon as the generator is
    # exhausted or closed, instead of leaving that to garbage collection.
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield eval(l)

def get_df(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are numbered 0, 1, 2, ... in file order. That number becomes the
    row label and the keys of each record become the columns.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

In [3]:
# Dataset name; the working directory and both file names derive from it.
DATASET = 'Video_Games'
# Directory that holds the downloaded files.
RAW_PATH = os.path.join('./', DATASET)
# File names of the review (interaction) file and the item-metadata file.
DATA_FILE, META_FILE = (
    template.format(DATASET)
    for template in ('reviews_{}_5.json.gz', 'meta_{}.json.gz')
)

# Seed passed to np.random.seed, and number of negative items drawn per row.
RANDOM_SEED, NEG_ITEMS = 0, 99

# Load Data

1. Load interaction data and item metadata
2. Filter out metadata for items that never appear in the interaction data
3. Calculate basic statistics

In [10]:
# download data if not exists

def _download_if_missing(file_name, description):
    """Fetch ``file_name`` into RAW_PATH with curl unless it is already there.

    The transfer is written to a temporary ``.part`` file and renamed only
    after curl exits successfully, so an interrupted or failed download is
    not mistaken for a complete file on the next run.

    Args:
        file_name: name of the file, both remotely and inside RAW_PATH.
        description: human-readable label used in the progress message.

    Raises:
        subprocess.CalledProcessError: if curl exits with a non-zero status
            (``--fail`` makes HTTP error responses count as failures).
    """
    target = os.path.join(RAW_PATH, file_name)
    if os.path.exists(target):
        return
    print('Downloading ' + description + ' into ' + RAW_PATH)
    partial = target + '.part'
    # Argument list instead of a shell string: no quoting issues and no `cd`.
    subprocess.check_call([
        'curl', '--fail', '-o', partial,
        'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/' + file_name,
    ])
    os.replace(partial, target)


# Portable replacement for shelling out to `mkdir`.
os.makedirs(RAW_PATH, exist_ok=True)
_download_if_missing(DATA_FILE, 'interaction data')
_download_if_missing(META_FILE, 'item metadata')

Downloading interaction data into ./Video_Games


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  107M  100  107M    0     0  1316k      0  0:01:23  0:01:23 --:--:-- 1582k


In [11]:
# Parse the gzip-compressed review file into a DataFrame and preview the first rows.
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
data_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2HD75EMZR8QLN,700099867,123,"[8, 12]",Installing the game was a struggle (because of...,1.0,Pay to unlock content? I don't think so.,1341792000,"07 9, 2012"
1,A3UR8NLLY1ZHCX,700099867,"Alejandro Henao ""Electronic Junky""","[0, 0]",If you like rally cars get this game you will ...,4.0,Good rally game,1372550400,"06 30, 2013"
2,A1INA0F5CWW3J4,700099867,"Amazon Shopper ""Mr.Repsol""","[0, 0]",1st shipment received a book instead of the ga...,1.0,Wrong key,1403913600,"06 28, 2014"
3,A1DLMTOTHQ4AST,700099867,ampgreen,"[7, 10]","I got this version instead of the PS3 version,...",3.0,"awesome game, if it did not crash frequently !!",1315958400,"09 14, 2011"
4,A361M14PU2GUEG,700099867,"Angry Ryan ""Ryan A. Forrest""","[2, 2]",I had Dirt 2 on Xbox 360 and it was an okay ga...,4.0,DIRT 3,1308009600,"06 14, 2011"


In [12]:
# Parse the gzip-compressed item-metadata file into a DataFrame and preview the first rows.
meta_df = get_df(os.path.join(RAW_PATH, META_FILE))
meta_df.head()

Unnamed: 0,asin,description,price,imUrl,related,salesRank,categories,title,brand
0,0078764343,Brand new sealed!,37.98,http://ecx.images-amazon.com/images/I/513h6dPb...,"{'also_bought': ['B000TI836G', 'B003Q53VZC', '...",{'Video Games': 28655},"[[Video Games, Xbox 360, Games]]",,
1,043933702X,In Stock NOW. Eligible for FREE Super Saving ...,23.5,http://ecx.images-amazon.com/images/I/61KKRndV...,"{'also_viewed': ['B000067NP1', '0439573947', '...",{'Video Games': 44080},"[[Video Games, PC, Games]]",,
2,0439339987,Grandma Groupers kelp seeds are missing and wi...,8.95,http://ecx.images-amazon.com/images/I/416QZg89...,"{'also_bought': ['B000314VVU', 'B000PXUOTE', '...",{'Video Games': 49836},"[[Video Games, PC, Games]]",,
3,0439342260,This software is BRAND NEW. Packaging may diff...,,http://ecx.images-amazon.com/images/I/61Wvu-Uj...,{'also_viewed': ['043934302X']},{'Video Games': 49156},"[[Video Games, PC, Games]]",,
4,0439339960,a scholastic clubs fairs cd rom game,,http://ecx.images-amazon.com/images/I/51k3oRCF...,{'also_viewed': ['B00028D7TG']},{'Video Games': 52262},"[[Video Games, PC, Games]]",,


In [18]:
# Total number of interaction rows
len(data_df)

231780

In [15]:
# Number of distinct reviewerID values (users) in the interaction data
data_df['reviewerID'].nunique()

24303

In [17]:
# Number of distinct asin values (items) in the unfiltered metadata
meta_df['asin'].nunique()

50953

In [19]:
# Keep only the metadata rows whose item occurs in the interaction data
useful_meta_df = meta_df.loc[meta_df['asin'].isin(data_df['asin'])].reset_index(drop=True)
# asin values of the retained items, used to prune the `related` lists
all_items = set(useful_meta_df['asin'].tolist())

def related_filter(related_dict, valid_items=None):
    """Restrict every list in a ``related`` mapping to known items.

    Args:
        related_dict: mapping ``relation name -> list of asin`` or a
            missing-value placeholder (NaN / None) when the item has no
            related data.
        valid_items: set of asin values to keep. Defaults to the
            module-level ``all_items``.

    Returns:
        A new dict with the same keys whose lists contain only members of
        ``valid_items`` (order is not preserved). An empty dict is returned
        when ``related_dict`` is not a dict.
    """
    if valid_items is None:
        valid_items = all_items
    # A type check is used instead of `is not np.nan`: the identity test only
    # recognises the np.nan singleton and lets other NaN objects or None
    # through to the loop, where they raise TypeError.
    if not isinstance(related_dict, dict):
        return dict()
    return {
        relation: list(valid_items & set(asins))
        for relation, asins in related_dict.items()
    }

# Replace each item's `related` entry with the output of related_filter.
useful_meta_df['related'] = useful_meta_df['related'].apply(related_filter)

### Statistics

In [20]:
# Basic corpus statistics taken from the raw interaction frame.
n_clicks = data_df.shape[0]
n_users, n_items = (data_df[col].nunique() for col in ('reviewerID', 'asin'))
# Earliest and latest review timestamps (Unix seconds).
min_time, max_time = data_df['unixReviewTime'].min(), data_df['unixReviewTime'].max()

In [22]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
# Unix seconds -> UTC calendar date. pd.to_datetime(..., unit='s') is used
# because datetime.utcfromtimestamp is deprecated since Python 3.12; the
# formatted dates are the same.
print('Time Span: {}/{}'.format(
    pd.to_datetime(min_time, unit='s').strftime(time_format),
    pd.to_datetime(max_time, unit='s').strftime(time_format))
)

# Users: 24303
# Items: 10672
# Interactions: 231780
Time Span: 1999-10-14/2014-07-22


# Build Dataset

### Interaction data

In [23]:
# Seed NumPy's global RNG so the np.random.randint draws in generate_dev_test
# are repeatable. Re-run this cell before re-running the split to obtain the
# same negatives again.
np.random.seed(RANDOM_SEED)

In [24]:
# Keep the three interaction columns under their new names, drop duplicate
# interactions, then order the rows by timestamp and user.
out_df = (
    data_df[['reviewerID', 'asin', 'unixReviewTime']]
    .rename(columns={'reviewerID': 'user_id', 'asin': 'item_id', 'unixReviewTime': 'time'})
    .drop_duplicates(['user_id', 'item_id', 'time'])
    .sort_values(by=['time', 'user_id'], kind='mergesort')
    .reset_index(drop=True)
)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,A2AXQTB83VMK4L,B0000296O5,939859200
1,A1QA8K3LD9K892,B00000K1V2,942192000
2,A3VWWQT4XDSBGQ,B00000K4AX,942192000
3,AMGJMFJ63DWWH,B00000JL6V,942192000
4,AMGJMFJ63DWWH,B00001NFUA,942192000


In [25]:
# reindex (start from 1)

# Raw ids are sorted and numbered 1, 2, 3, ... in that order.
uids = sorted(out_df['user_id'].unique())
user2id = {raw_uid: new_id for new_id, raw_uid in enumerate(uids, start=1)}
iids = sorted(out_df['item_id'].unique())
item2id = {raw_iid: new_id for new_id, raw_iid in enumerate(iids, start=1)}

# Replace the raw ids in place with their integer codes.
out_df['user_id'] = out_df['user_id'].map(user2id)
out_df['item_id'] = out_df['item_id'].map(item2id)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,8429,251,939859200
1,4700,110,942192000
2,18572,130,942192000
3,21894,99,942192000
4,21894,171,942192000


In [27]:
# leave one out splitting

# Items each user has interacted with anywhere in out_df; generate_dev_test
# consults this so that a sampled negative is never one of these items.
clicked_item_set = {
    uid: set(user_rows['item_id'].tolist())
    for uid, user_rows in out_df.groupby('user_id')
}
    
def generate_dev_test(data_df, n_items=None, clicked_items=None, n_neg=None):
    """Split off each user's last two rows as test and dev examples.

    Two rounds are run. In each round the last remaining row of every user
    (in the row order of ``data_df``) is removed and paired with ``n_neg``
    randomly drawn item ids that are not in that user's clicked set. The
    first round yields the test rows, the second the dev rows.

    Args:
        data_df: DataFrame with ``user_id`` and ``item_id`` columns.
        n_items: negatives are drawn from ``1..n_items``. Defaults to the
            largest ``item_id`` in ``data_df``. Pass the total number of
            items when ids are contiguous but ``data_df`` is only a subset,
            otherwise the highest ids can never be drawn.
        clicked_items: mapping ``user_id -> set of item ids`` to exclude.
            Defaults to the module-level ``clicked_item_set``.
        n_neg: negatives per row. Defaults to the module-level ``NEG_ITEMS``.

    Returns:
        ``([test_df, dev_df], remaining_df)``. Both result frames carry an
        extra ``neg_items`` column holding a list of ints per row.

    Raises:
        ValueError: if a user's clicked set covers every id in
            ``1..n_items`` (resampling could never terminate).
    """
    if clicked_items is None:
        clicked_items = clicked_item_set
    if n_neg is None:
        n_neg = NEG_ITEMS
    if n_items is None:
        # The largest id is used, not the number of distinct ids: when ids are
        # contiguous and some items are absent from this subset, the distinct
        # count is smaller than the largest id and would exclude the top ids.
        n_items = int(data_df['item_id'].max())

    result_dfs = []
    for _ in range(2):
        result_df = data_df.groupby('user_id').tail(1).copy()
        data_df = data_df.drop(result_df.index)
        neg_items = np.random.randint(1, n_items + 1, (len(result_df), n_neg))
        for i, uid in enumerate(result_df['user_id'].values):
            user_clicked = clicked_items[uid]
            # Cheap length test first; the full coverage test only runs for
            # users whose clicked set is at least as large as the id range.
            if len(user_clicked) >= n_items and all(
                    item in user_clicked for item in range(1, n_items + 1)):
                raise ValueError(
                    'user {} has interacted with every item; '
                    'no negative can be sampled'.format(uid))
            for j in range(n_neg):
                # Resample until the candidate is not one of the user's items.
                while neg_items[i][j] in user_clicked:
                    neg_items[i][j] = np.random.randint(1, n_items + 1)
        result_df['neg_items'] = neg_items.tolist()
        result_dfs.append(result_df)
    return result_dfs, data_df

In [28]:
# The first row of every user (in out_df order) always stays in training;
# the remaining rows go through the leave-one-out split.
leave_df = out_df.groupby('user_id').head(1)
# A separate name is used for the remainder so the raw review frame `data_df`
# loaded earlier is not overwritten and earlier cells can still be re-run.
remain_df = out_df.drop(leave_df.index)

[test_df, dev_df], remain_df = generate_dev_test(remain_df)
train_df = pd.concat([leave_df, remain_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

(183174, 24303, 24303)

In [47]:
# All interactions of user 1612.
# NOTE(review): the neighbouring inspection cells look at user 16127 - confirm 1612 is intended.
out_df.loc[out_df['user_id'].eq(1612)]

Unnamed: 0,user_id,item_id,time
95965,1612,3074,1289347200
125617,1612,3822,1328054400
125618,1612,5985,1328054400
163287,1612,3271,1362441600
163288,1612,8402,1362441600
163289,1612,9111,1362441600


In [45]:
# Test row(s) of user 16127
test_df.loc[test_df['user_id'].eq(16127)]

Unnamed: 0,user_id,item_id,time,neg_items
174955,16127,9950,1369267200,"[7286, 5184, 7207, 5634, 3836, 1739, 4225, 919..."


In [44]:
# Dev row(s) of user 16127
dev_df.loc[dev_df['user_id'].eq(16127)]

Unnamed: 0,user_id,item_id,time,neg_items
174954,16127,9813,1369267200,"[960, 5433, 3979, 34, 2160, 3406, 9597, 733, 2..."


In [43]:
# Training rows of user 16127
train_df.loc[train_df['user_id'].eq(16127)]

Unnamed: 0,user_id,item_id,time
131173,16127,8765,1334016000
145080,16127,6712,1353456000
158007,16127,9999,1359417600


In [35]:
# First row of every user in out_df; these rows are concatenated into train_df.
leave_df

Unnamed: 0,user_id,item_id,time
0,8429,251,939859200
1,4700,110,942192000
2,18572,130,942192000
3,21894,99,942192000
6,22097,153,942537600
...,...,...,...
231717,6136,10056,1405987200
231729,12927,979,1405987200
231744,17792,2600,1405987200
231766,19333,7116,1405987200


In [16]:
# Preview the first training rows
train_df.head()

Unnamed: 0,user_id,item_id,time
0,8429,251,939859200
1,4700,110,942192000
2,18572,130,942192000
3,21894,99,942192000
4,21894,171,942192000


In [17]:
# Preview the first test rows, including the sampled neg_items lists
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
43,17088,295,944524800,"[2733, 9846, 3265, 4860, 9226, 7892, 4374, 587..."
88,20228,265,947548800,"[3456, 6168, 5820, 6522, 6243, 7743, 9124, 101..."
219,7551,475,955238400,"[1242, 9014, 4404, 1199, 2998, 5662, 808, 2122..."
230,1070,633,956534400,"[7445, 3397, 5348, 7035, 596, 648, 574, 6798, ..."
250,23921,216,956880000,"[3517, 2672, 2388, 10061, 5395, 3442, 8011, 10..."


In [18]:
# save results

# Each split is written as a tab-separated file inside RAW_PATH, without the index.
for split_file, split_df in (('train.csv', train_df), ('dev.csv', dev_df), ('test.csv', test_df)):
    split_df.to_csv(os.path.join(RAW_PATH, split_file), sep='\t', index=False)

### Item Metadata

In [19]:
# level-2 category

# Third entry of each item's first category path; NaN when that path has
# fewer than three entries.
l2_cate_lst = [
    cate_lst[0][2] if len(cate_lst[0]) > 2 else np.nan
    for cate_lst in useful_meta_df['categories']
]
useful_meta_df['l2_category'] = l2_cate_lst
# Sorted category names are numbered from 1; 0 is reserved for "missing".
l2_cates = sorted(useful_meta_df['l2_category'].dropna().unique())
l2_dict = {cate: code for code, cate in enumerate(l2_cates, start=1)}
# `x != x` is true only for NaN.
useful_meta_df['l2_category'] = useful_meta_df['l2_category'].apply(lambda x: 0 if x != x else l2_dict[x])

In [20]:
# Build one metadata record per retained item. The three needed columns are
# walked in parallel instead of calling .iloc[idx] several times per row,
# which builds a fresh row Series on every access.
item_meta_data = dict()
for idx, (asin, l2_category, info) in enumerate(zip(
        useful_meta_df['asin'],
        useful_meta_df['l2_category'],
        useful_meta_df['related'])):
    item_meta_data[idx] = {
        'item_id': item2id[asin],
        'i_category': l2_category,
        # also_bought feeds r_complement, also_viewed feeds r_substitute;
        # a missing relation becomes an empty list.
        'r_complement': [item2id[x] for x in info.get('also_bought', [])],
        'r_substitute': [item2id[x] for x in info.get('also_viewed', [])],
    }

item_meta_df = pd.DataFrame.from_dict(item_meta_data, orient='index')
item_meta_df = item_meta_df[['item_id', 'i_category', 'r_complement', 'r_substitute']]
item_meta_df.head()

Unnamed: 0,item_id,i_category,r_complement,r_substitute
0,1,28,"[5346, 3620, 6155, 8392, 4510, 5467, 6494, 954...",[]
1,2,1,"[7546, 6483, 6314, 5610, 10016, 6301, 5618, 57...",[]
2,3,0,"[1879, 8893, 4495, 4223, 3399, 2253, 1662, 702...",[]
3,4,1,"[9753, 8893, 4541, 6594, 6649, 6782, 4167, 740...",[]
4,5,28,"[7762, 3807, 7071, 8672, 4899, 6962, 7573, 101...",[]


In [21]:
# save results
# The item metadata is written as a tab-separated file inside RAW_PATH, without the index.

item_meta_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)

In [22]:
# Preview the first training rows
train_df.head()

Unnamed: 0,user_id,item_id,time
0,8429,251,939859200
1,4700,110,942192000
2,18572,130,942192000
3,21894,99,942192000
4,21894,171,942192000


In [45]:
# Training rows of user 20440
train_df.loc[train_df['user_id'].eq(20440)]

Unnamed: 0,user_id,item_id,time
31964,20440,3084,1119830400
45025,20440,2217,1173744000
46036,20440,2699,1177459200
81384,20440,7031,1262649600
100391,20440,6627,1294704000
112626,20440,5703,1312848000
131520,20440,7410,1334534400
132448,20440,6874,1335744000


In [39]:
# Per-user count of non-null values in each remaining training column
count_times = train_df.groupby("user_id").count()
count_times

Unnamed: 0_level_0,item_id,time
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4,4
2,25,25
3,4,4
4,3,3
5,4,4
...,...,...
24299,7,7
24300,8,8
24301,3,3
24302,6,6


In [35]:
# Largest per-user count in each column of count_times
count_times.max(axis=0)

771

In [36]:
# Smallest per-user count in each column of count_times
count_times.min(axis=0)

3