In [21]:
import gzip
import os
import subprocess
from datetime import datetime, timezone

import numpy as np
import pandas as pd

In [22]:
def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Args:
        path: location of the ``.gz`` file to read.

    Yields:
        Whatever object each line evaluates to (the callers in this notebook
        treat every record as a dict).
    """
    # The context manager closes the archive as soon as the generator is
    # exhausted or closed, instead of leaving the handle open.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY: eval() executes whatever expression the line contains.
            # Only run this on files from a trusted source.
            # NOTE(review): if every line is a plain literal, ast.literal_eval
            # would be a safe drop-in -- confirm against the raw files first.
            yield eval(l)

def get_df(path):
    """Collect every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed 0, 1, 2, ... in reading order and become the rows of
    the returned frame.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

In [23]:
# Dataset name; it also names the working directory and both raw files.
DATASET = 'Office_Products'
RAW_PATH = os.path.join('./', DATASET)
DATA_FILE = f'reviews_{DATASET}_5.json.gz'
META_FILE = f'meta_{DATASET}.json.gz'

# Seed for NumPy's RNG and the number of negatives sampled per held-out row.
RANDOM_SEED = 0
NEG_ITEMS = 99

# Load Data

1. Load interaction data and item metadata
2. Filter out metadata for items that never appear in the interaction data
3. Calculate basic statistics

In [24]:
# Download the raw files that are not already present in RAW_PATH.

# Portable replacement for `mkdir`; a no-op when the directory exists.
os.makedirs(RAW_PATH, exist_ok=True)

url_root = 'http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/'
for label, file_name in (('interaction data', DATA_FILE), ('item metadata', META_FILE)):
    target = os.path.join(RAW_PATH, file_name)
    if os.path.exists(target):
        continue
    print('Downloading ' + label + ' into ' + RAW_PATH)
    partial = target + '.part'
    # Argument list instead of a shell string, so nothing in the path or
    # dataset name is interpreted by a shell. --fail makes curl exit non-zero
    # on an HTTP error instead of saving the error page, and check=True turns
    # that exit code into a CalledProcessError rather than ignoring it.
    subprocess.run(['curl', '--fail', '-o', partial, url_root + file_name], check=True)
    # The final name only appears after a complete download, so an interrupted
    # run is retried next time instead of being mistaken for a finished file.
    os.replace(partial, target)

In [16]:
# Read the raw review records into a frame (one row per record).
reviews_path = os.path.join(RAW_PATH, DATA_FILE)
data_df = get_df(reviews_path)
data_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1KLRMWW2FWPL4,31887,"Amazon Customer ""cameramom""","[0, 0]",This is a great tutu and at a really great pri...,5.0,Great tutu- not cheaply made,1297468800,"02 12, 2011"
1,A2G5TCU2WDFZ65,31887,Amazon Customer,"[0, 0]",I bought this for my 4 yr old daughter for dan...,5.0,Very Cute!!,1358553600,"01 19, 2013"
2,A1RLQXYNCMWRWN,31887,Carola,"[0, 0]",What can I say... my daughters have it in oran...,5.0,I have buy more than one,1357257600,"01 4, 2013"
3,A8U3FAMSJVHS5,31887,Caromcg,"[0, 0]","We bought several tutus at once, and they are ...",5.0,"Adorable, Sturdy",1398556800,"04 27, 2014"
4,A3GEOILWLK86XM,31887,CJ,"[0, 0]",Thank you Halo Heaven great product for Little...,5.0,Grammy's Angels Love it,1394841600,"03 15, 2014"


In [25]:
# Read the raw item metadata records into a frame (one row per record).
meta_path = os.path.join(RAW_PATH, META_FILE)
meta_df = get_df(meta_path)
meta_df.head()

Unnamed: 0,asin,description,price,imUrl,related,salesRank,categories,title,brand
0,0078800242,All in one TeacherWorks Plus CD-ROM,93.06,http://ecx.images-amazon.com/images/I/41K1aBkl...,{'buy_after_viewing': ['007861970X']},{'Software': 18529},"[[Office Products, Office & School Supplies, C...",,
1,0113000316,High quality inkjet cartridges use high-densit...,,http://ecx.images-amazon.com/images/I/51AMwP3D...,,,"[[Office Products, Office & School Supplies, P...",123GetInk -14-pack 5-black 3-cyan 3-magenta 3-...,
2,043928631X,"Harry Potter living bookmark showing Harry, He...",,http://ecx.images-amazon.com/images/I/41SulB7T...,,,"[[Office Products, Office & School Supplies, L...",Harry Potter Lenticular Hologram Bookmark - Ha...,
3,0439340039,Windows based computer game.,,http://ecx.images-amazon.com/images/I/51zQE0w%...,,{'Software': 32784},"[[Office Products, Office & School Supplies, E...",,
4,0439394058,"126 pieces: 23"" tall schoolhouse calendar, 12 ...",11.64,http://ecx.images-amazon.com/images/I/51DFp0Lg...,"{'also_bought': ['B000QE1HHU', 'B00207MG4Y', '...",,"[[Office Products, Office & School Supplies, E...",Scholastic SC939405 All-In-One Schoolhouse Cal...,Scholastic


In [18]:
# Keep only the metadata rows whose item occurs in the interaction data.

in_interactions = meta_df['asin'].isin(data_df['asin'])
useful_meta_df = meta_df[in_interactions].reset_index(drop=True)
# Set of retained item identifiers, used for membership tests below.
all_items = set(useful_meta_df['asin'].tolist())

def related_filter(related_dict):
    """Restrict an item's ``related`` mapping to the items kept in ``all_items``.

    Args:
        related_dict: dict mapping a relation name to a list of item
            identifiers, or a missing value (NaN / None) when the item has
            no ``related`` entry.

    Returns:
        dict with the same relation names, each mapped to a list of the
        identifiers that are also in ``all_items``. The lists come from a set
        intersection, so their order is not the input order. Missing input
        yields an empty dict.
    """
    # Type check rather than `is not np.nan`: the identity test only matches
    # the np.nan singleton, so any other NaN object (or None) would fall
    # through and be iterated, raising TypeError.
    if not isinstance(related_dict, dict):
        return {}
    return {
        relation: list(all_items & set(ids))
        for relation, ids in related_dict.items()
    }

# Overwrite each item's `related` entry with the output of related_filter.
useful_meta_df['related'] = useful_meta_df['related'].apply(related_filter)

### Statistics

In [19]:
# Distinct users / items, total number of review rows, and the time range.
n_users = data_df['reviewerID'].nunique()
n_items = data_df['asin'].nunique()
n_clicks = data_df.shape[0]
review_times = data_df['unixReviewTime']
min_time = review_times.min()
max_time = review_times.max()

In [20]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
# datetime.utcfromtimestamp is deprecated since Python 3.12; an aware UTC
# datetime formats to the same calendar date.
print('Time Span: {}/{}'.format(
    datetime.fromtimestamp(min_time, tz=timezone.utc).strftime(time_format),
    datetime.fromtimestamp(max_time, tz=timezone.utc).strftime(time_format))
)

# Users: 39387
# Items: 23033
# Interactions: 278677
Time Span: 2003-03-29/2014-07-23


# Build Dataset

### Interaction data

In [43]:
# Seed NumPy's global RNG; generate_dev_test draws its negatives from it.
np.random.seed(RANDOM_SEED)

In [44]:
# One row per distinct (user, item, time) triple, ordered by time and then
# user; mergesort is stable, so ties keep their incoming order.
column_names = {'asin': 'item_id', 'reviewerID': 'user_id', 'unixReviewTime': 'time'}
out_df = (
    data_df.rename(columns=column_names)[['user_id', 'item_id', 'time']]
    .drop_duplicates(['user_id', 'item_id', 'time'])
    .sort_values(by=['time', 'user_id'], kind='mergesort')
    .reset_index(drop=True)
)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,A1ABVP0DV1ZN89,B00000JBAT,929232000
1,A6ILK3FXYH595,B00000J4FS,931132800
2,A7RV1KU5O0II9,B00000JFMK,931392000
3,A1JTSRG8SU4VFO,B00000JFE3,932688000
4,AJ6TMOHHFJJAJ,B00000JMO4,932688000


In [45]:
# reindex (start from 1)

# Ids follow the order in which each user / item first appears in out_df
# (unique() keeps first-appearance order; the values are not sorted).
uids = out_df['user_id'].unique()
user2id = {raw_uid: new_id for new_id, raw_uid in enumerate(uids, start=1)}
iids = out_df['item_id'].unique()
item2id = {raw_iid: new_id for new_id, raw_iid in enumerate(iids, start=1)}

out_df['user_id'] = [user2id[raw_uid] for raw_uid in out_df['user_id']]
out_df['item_id'] = [item2id[raw_iid] for raw_iid in out_df['item_id']]
out_df.head()

Unnamed: 0,user_id,item_id,time
0,1,1,929232000
1,2,2,931132800
2,3,3,931392000
3,4,4,932688000
4,5,5,932688000


In [46]:
# leave-one-out splitting

# For every user, the set of all item ids that user interacted with.
clicked_item_set = {
    user_id: set(seq_df['item_id'].values.tolist())
    for user_id, seq_df in out_df.groupby('user_id')
}
    
def generate_dev_test(data_df):
    """Peel off each user's last row twice and attach sampled negatives.

    Two rounds are run. Each round takes the final row of every user in the
    current frame, removes those rows from the frame, and adds a
    ``neg_items`` column holding NEG_ITEMS randomly drawn item ids per row,
    none of which is in that user's entry of ``clicked_item_set``.

    Args:
        data_df: frame with 'user_id' and 'item_id' columns; row order
            decides which row counts as a user's last one.

    Returns:
        ``([first_round_df, second_round_df], remaining_df)``.
    """
    # NOTE(review): the sampling range is the number of distinct item ids in
    # the frame passed in; confirm that this matches the largest item id.
    item_count = data_df['item_id'].nunique()
    remaining = data_df
    held_out = []
    for _ in range(2):
        last_rows = remaining.groupby('user_id').tail(1).copy()
        remaining = remaining.drop(last_rows.index)
        candidates = np.random.randint(1, item_count + 1, (len(last_rows), NEG_ITEMS))
        # Each `row` is a view into `candidates`, so writes land in the matrix.
        for row, uid in zip(candidates, last_rows['user_id'].values):
            seen = clicked_item_set[uid]
            for col in range(len(row)):
                # Redraw until the candidate is an item the user has not seen.
                while row[col] in seen:
                    row[col] = np.random.randint(1, item_count + 1)
        last_rows['neg_items'] = candidates.tolist()
        held_out.append(last_rows)
    return held_out, remaining

In [47]:
# Every user's first interaction always stays in the training set.
leave_df = out_df.groupby('user_id').head(1)
# A dedicated name is used here instead of rebinding `data_df`: that name
# holds the raw review frame that earlier cells read, and overwriting it made
# those cells fail when re-run after this one.
remain_df = out_df.drop(leave_df.index)

# The first frame returned holds each user's last row, the second the row
# before it.
[test_df, dev_df], remain_df = generate_dev_test(remain_df)
train_df = pd.concat([leave_df, remain_df]).sort_index()

len(train_df), len(dev_df), len(test_df)

(1304382, 192403, 192403)

In [48]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,user_id,item_id,time
0,1,1,929232000
1,2,2,931132800
2,3,3,931392000
3,4,4,932688000
4,5,5,932688000


In [49]:
# Preview the first rows of the test split, including the sampled neg_items.
test_df.head()

Unnamed: 0,user_id,item_id,time,neg_items
104,85,47,948931200,"[2733, 43568, 42614, 52417, 45892, 21244, 3040..."
307,166,110,960595200,"[27258, 16299, 12373, 51404, 4421, 45831, 3987..."
458,307,19,966038400,"[15876, 8287, 44259, 38764, 18729, 13641, 3092..."
565,33,76,969580800,"[34010, 43672, 58166, 4404, 17583, 29845, 1938..."
801,151,107,976320000,"[17814, 61642, 44890, 18931, 31201, 14300, 167..."


In [50]:
# Write the three splits as tab-separated files into RAW_PATH.

for split_name, split_df in (('train', train_df), ('dev', dev_df), ('test', test_df)):
    split_df.to_csv(os.path.join(RAW_PATH, split_name + '.csv'), sep='\t', index=False)

### Item Metadata

In [51]:
# level-2 category

# Third entry of each item's first category path; NaN when that path has
# fewer than three entries.
l2_cate_lst = [
    paths[0][2] if len(paths[0]) > 2 else np.nan
    for paths in useful_meta_df['categories']
]
useful_meta_df['l2_category'] = l2_cate_lst
# Category names are numbered from 1 in sorted order; 0 stands for "missing".
l2_cates = sorted(useful_meta_df['l2_category'].dropna().unique())
l2_dict = {cate: cate_id for cate_id, cate in enumerate(l2_cates, start=1)}
useful_meta_df['l2_category'] = useful_meta_df['l2_category'].map(
    lambda cate: 0 if pd.isnull(cate) else l2_dict[cate]
)

In [52]:
# brand

# Copy of the brand column with every null value normalised to NaN.
brand_lst = [np.nan if pd.isnull(name) else name for name in useful_meta_df['brand']]
useful_meta_df['l1_brand'] = brand_lst
# Brands are numbered from 1 in order of first appearance (not sorted);
# 0 stands for "missing".
brands = useful_meta_df['l1_brand'].dropna().unique()
brand_dict = {name: brand_id for brand_id, name in enumerate(brands, start=1)}
useful_meta_df['l1_brand'] = useful_meta_df['l1_brand'].map(
    lambda name: 0 if pd.isnull(name) else brand_dict[name]
)

In [53]:
# Display the filtered metadata, now including l2_category and l1_brand.
useful_meta_df

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand,l2_category,l1_brand
0,0528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[[Electronics, GPS & Navigation, Vehicle GPS, ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B004PNVETE', 'B007X3DAMM', '...",,114,0
1,0594451647,http://ecx.images-amazon.com/images/I/51RjSETO...,HDTV Adapter Kit for NOOK HD and NOOK HD+\nThi...,"[[Electronics, Computers & Accessories, Touch ...",Barnes &amp; Noble HDTV Adapter Kit for NOOK H...,49.95,,"{'also_bought': ['B00AFXGMN6', 'B0063W7XJK', '...",,108,0
2,0594481813,http://ecx.images-amazon.com/images/I/41K7ymN5...,Power up your device with this Barnes &amp; No...,"[[Electronics, eBook Readers & Accessories, Po...",Barnes &amp; Noble OV/HB-ADP Universal Power Kit,19.65,,"{'also_bought': ['B00BN1Q5JA', 'B0069QPC0W', '...",Barnes &amp; Noble,78,1
3,0972683275,http://ecx.images-amazon.com/images/I/41hYJ9Mw...,The VideoSecu TV mount is a mounting solution ...,"[[Electronics, Accessories & Supplies, Audio &...",VideoSecu 24&quot; Long Arm TV Wall Mount Low ...,29.99,{},"{'also_bought': ['B005LJQMOS', 'B000WYVBR0', '...",VideoSecu,6,2
4,1400532620,http://ecx.images-amazon.com/images/I/519ca3cu...,Barnes & Noble Nook eReader - no 3GMeet nook. ...,"[[Electronics, eBook Readers & Accessories]]",Barnes &amp; Noble Nook eReader - no 3G,74.95,{'Electronics': 23071},"{'also_bought': ['140053271X', 'B0069ZFYCY', '...",Barnes &amp; Noble,0,1
...,...,...,...,...,...,...,...,...,...,...,...
62996,B00L2442H0,http://ecx.images-amazon.com/images/I/51NFBdDZ...,Description:Add up to 4 peripherals quickly an...,"[[Electronics, Computers & Accessories, Networ...",Sabrent 4 Port Portable USB 2.0 Hub (9.5&quot;...,5.99,{'Electronics': 1383},"{'also_bought': ['B001DHECXA', 'B00DBV28TG', '...",Sabrent,72,31
62997,B00L26YDA4,http://ecx.images-amazon.com/images/I/41061q4C...,Description:The Sabrent Wifi Receiver lets you...,"[[Electronics, Car & Vehicle Electronics, Vehi...",Sabrent Wifi Audio Receiver (Supports DLNA and...,35.99,{'Cell Phones & Accessories': 6139},"{'also_bought': ['B00FDKAFAW', 'B005GM1Q1O', '...",Sabrent,113,31
62998,B00L21HC7A,http://ecx.images-amazon.com/images/I/513kT0it...,Description:The Sabrent CR-CCU3 3-Slot Card Re...,"[[Electronics, Computers & Accessories, Cables...",Sabrent USB 3.0 SuperSpeed 3 slot Memory Card ...,14.99,{},"{'also_viewed': ['B00L26YDA4', 'B008VQ2YUY', '...",Sabrent,24,31
62999,B00L3YHF6O,http://ecx.images-amazon.com/images/I/41SBx7QY...,"Mind-Shattering Performance, Precision-Tuned F...","[[Electronics, Home Audio, Stereo Components, ...",NEW! Creative Sound Blaster Roar: Portable NFC...,149.99,{'Cell Phones & Accessories': 131},"{'also_bought': ['B002XVYZ82', 'B004LTEUDO', '...",,99,0


In [54]:
# One record per item: remapped item id, encoded category and brand, plus the
# remapped ids of its 'also_bought' (complement) and 'also_viewed'
# (substitute) relations. A missing relation becomes an empty list.
# The columns are walked once in lockstep; indexing `.iloc[idx]` for every
# field built a fresh row Series several times per item.
item_meta_data = [
    {
        'item_id': item2id[asin],
        'i_category': category_id,
        'i_brand': brand_id,
        'r_complement': [item2id[x] for x in info.get('also_bought', [])],
        'r_substitute': [item2id[x] for x in info.get('also_viewed', [])],
    }
    for asin, category_id, brand_id, info in zip(
        useful_meta_df['asin'],
        useful_meta_df['l2_category'],
        useful_meta_df['l1_brand'],
        useful_meta_df['related'],
    )
]

# Passing `columns` fixes the column order and also yields a correctly shaped
# (empty) frame when there are no items.
item_meta_df = pd.DataFrame(
    item_meta_data,
    columns=['item_id', 'i_category', 'i_brand', 'r_complement', 'r_substitute'],
)
item_meta_df

Unnamed: 0,item_id,i_category,i_brand,r_complement,r_substitute
0,24136,114,0,[],"[31564, 51985, 45367, 42432, 15923, 36500, 174..."
1,61235,108,0,"[54780, 39816, 56707, 56040, 56516, 52021, 516...",[]
2,55994,78,1,"[55632, 42389, 55596, 45090, 58113, 57544, 595...",[]
3,9586,6,2,"[38035, 21535, 13296, 12094, 8184, 26354, 5293...",[]
4,23036,0,1,"[32013, 41087, 26017, 37774, 48568, 38788, 535...",[]
...,...,...,...,...,...
62996,62453,72,31,"[59918, 57309, 62372, 59797, 61954, 34245, 368...",[]
62997,62963,113,31,"[59419, 35196, 50597, 62372, 18881, 13350, 618...",[]
62998,62978,24,31,[],"[62963, 46878, 62858, 50196, 56896, 39437, 586..."
62999,62994,99,0,"[23059, 29791, 60592, 62801, 57292, 34365, 581...",[]


In [55]:
# Write the item metadata table as a tab-separated file into RAW_PATH.

item_meta_df.to_csv(os.path.join(RAW_PATH, 'item_meta.csv'), sep='\t', index=False)

In [57]:
# Display the filtered metadata again before building the text file.
useful_meta_df

Unnamed: 0,asin,imUrl,description,categories,title,price,salesRank,related,brand,l2_category,l1_brand
0,0528881469,http://ecx.images-amazon.com/images/I/51FnRkJq...,"Like its award-winning predecessor, the Intell...","[[Electronics, GPS & Navigation, Vehicle GPS, ...",Rand McNally 528881469 7-inch Intelliroute TND...,299.99,,"{'also_viewed': ['B004PNVETE', 'B007X3DAMM', '...",,114,0
1,0594451647,http://ecx.images-amazon.com/images/I/51RjSETO...,HDTV Adapter Kit for NOOK HD and NOOK HD+\nThi...,"[[Electronics, Computers & Accessories, Touch ...",Barnes &amp; Noble HDTV Adapter Kit for NOOK H...,49.95,,"{'also_bought': ['B00AFXGMN6', 'B0063W7XJK', '...",,108,0
2,0594481813,http://ecx.images-amazon.com/images/I/41K7ymN5...,Power up your device with this Barnes &amp; No...,"[[Electronics, eBook Readers & Accessories, Po...",Barnes &amp; Noble OV/HB-ADP Universal Power Kit,19.65,,"{'also_bought': ['B00BN1Q5JA', 'B0069QPC0W', '...",Barnes &amp; Noble,78,1
3,0972683275,http://ecx.images-amazon.com/images/I/41hYJ9Mw...,The VideoSecu TV mount is a mounting solution ...,"[[Electronics, Accessories & Supplies, Audio &...",VideoSecu 24&quot; Long Arm TV Wall Mount Low ...,29.99,{},"{'also_bought': ['B005LJQMOS', 'B000WYVBR0', '...",VideoSecu,6,2
4,1400532620,http://ecx.images-amazon.com/images/I/519ca3cu...,Barnes & Noble Nook eReader - no 3GMeet nook. ...,"[[Electronics, eBook Readers & Accessories]]",Barnes &amp; Noble Nook eReader - no 3G,74.95,{'Electronics': 23071},"{'also_bought': ['140053271X', 'B0069ZFYCY', '...",Barnes &amp; Noble,0,1
...,...,...,...,...,...,...,...,...,...,...,...
62996,B00L2442H0,http://ecx.images-amazon.com/images/I/51NFBdDZ...,Description:Add up to 4 peripherals quickly an...,"[[Electronics, Computers & Accessories, Networ...",Sabrent 4 Port Portable USB 2.0 Hub (9.5&quot;...,5.99,{'Electronics': 1383},"{'also_bought': ['B001DHECXA', 'B00DBV28TG', '...",Sabrent,72,31
62997,B00L26YDA4,http://ecx.images-amazon.com/images/I/41061q4C...,Description:The Sabrent Wifi Receiver lets you...,"[[Electronics, Car & Vehicle Electronics, Vehi...",Sabrent Wifi Audio Receiver (Supports DLNA and...,35.99,{'Cell Phones & Accessories': 6139},"{'also_bought': ['B00FDKAFAW', 'B005GM1Q1O', '...",Sabrent,113,31
62998,B00L21HC7A,http://ecx.images-amazon.com/images/I/513kT0it...,Description:The Sabrent CR-CCU3 3-Slot Card Re...,"[[Electronics, Computers & Accessories, Cables...",Sabrent USB 3.0 SuperSpeed 3 slot Memory Card ...,14.99,{},"{'also_viewed': ['B00L26YDA4', 'B008VQ2YUY', '...",Sabrent,24,31
62999,B00L3YHF6O,http://ecx.images-amazon.com/images/I/41SBx7QY...,"Mind-Shattering Performance, Precision-Tuned F...","[[Electronics, Home Audio, Stereo Components, ...",NEW! Creative Sound Blaster Roar: Portable NFC...,149.99,{'Cell Phones & Accessories': 131},"{'also_bought': ['B002XVYZ82', 'B004LTEUDO', '...",,99,0


In [61]:
import html
import re
from tqdm import tqdm
def clean_text(text):
    """Decode HTML entities, then drop double quotes and line breaks.

    Args:
        text: string to clean.

    Returns:
        The cleaned string.
    """
    decoded = html.unescape(text)
    return re.sub(r'["\n\r]*', '', decoded)
# Write one "<asin>\t<text>" line per item. The text is the title (first 32
# whitespace-separated tokens), followed by the last entry of every category
# path under this dataset's root category, followed by the brand.

# Root category to match against the first entry of each category path. It
# was hard-coded to "Electronics", which never matches when DATASET names a
# different category, so no category text was written.
# NOTE(review): replacing '_' with ' ' matches names such as
# 'Office Products'; roots containing '&' or ',' need an explicit mapping.
root_category = DATASET.replace('_', ' ')

# `with` guarantees the file is flushed and closed, including on error.
with open(os.path.join(RAW_PATH, f'{DATASET}.text'), "w") as outf:
    outf.write("id\ttext\n")
    for index, row in tqdm(useful_meta_df.iterrows(), total=len(useful_meta_df)):
        item_id, title, categories, brand = row['asin'], row["title"], row['categories'], row['brand']
        text = ""
        if not pd.isnull(title):
            title = " ".join(title.strip().split()[:32])
            title = clean_text(title)
            text += title
        for cate in categories:
            # `cate and ...` skips an empty path instead of raising IndexError.
            if cate and cate[0] == root_category:
                category = clean_text(cate[-1])
                text += " " + category
        if not pd.isnull(brand):
            brand = clean_text(brand)
            text += " " + brand
        outf.write(f"{item_id}\t{text}\n")

100%|██████████| 63001/63001 [00:14<00:00, 4306.76it/s]


In [60]:
def load_unit2index(file):
    """Read a ``unit<TAB>index`` mapping file into a dict.

    Args:
        file: path of a text file with one ``unit\\tindex`` pair per line.

    Returns:
        dict mapping each unit string to its integer index.

    Raises:
        ValueError: if a non-blank line has no tab or a non-integer index.
    """
    unit2index = dict()
    with open(file, 'r') as fp:
        for line in fp:
            line = line.strip()
            # Blank lines would otherwise fail the two-way unpack below.
            if not line:
                continue
            # Split on the last tab only, so a unit that itself contains a
            # tab is still parsed.
            unit, index = line.rsplit('\t', 1)
            unit2index[unit] = int(index)
    return unit2index


def write_remap_index(unit2index, file):
    """Write a mapping to ``file`` as one ``unit<TAB>index`` line per entry.

    Args:
        unit2index: dict from unit string to index; written in dict order.
        file: destination path; an existing file is overwritten.
    """
    with open(file, 'w') as fp:
        for unit, index in unit2index.items():
            fp.write('\t'.join((unit, str(index))) + '\n')

# Persist the raw-id -> integer-id mappings next to the other outputs.
for mapping, suffix in ((user2id, 'user2index'), (item2id, 'item2index')):
    write_remap_index(mapping, os.path.join(RAW_PATH, f'{DATASET}.{suffix}'))