In [7]:
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForMaskedLM
from tqdm import tqdm
import pandas as pd
from matplotlib import pyplot as plt
from scipy import stats

In [5]:
############# Build id2name #############
# Read item metadata by column position: column 0 is the raw item id,
# column 1 is the item name.
# Loop variables are deliberately NOT called `id`: at notebook top level that
# would shadow the builtin `id()` for every later cell in the kernel.
raw_id2name = {}
for row in pd.read_csv('meta_9946.csv').values.tolist():
    item_id, item_name = row[0], row[1]
    # Surface duplicated raw ids; the last occurrence wins.
    if item_id in raw_id2name: print(item_id)
    raw_id2name[item_id] = item_name

# Re-number raw ids to dense indices 0..N-1 in sorted order (trims empty ids),
# so that a new id can be used directly as a list index.
sorted_raw_ids = sorted(raw_id2name)
oldid2newid = {old_id: new_id for new_id, old_id in enumerate(sorted_raw_ids)}

# id2name[new_id] -> item name
id2name = [raw_id2name[old_id] for old_id in sorted_raw_ids]

############# Build user2ids #############
# Read events by column position: column 0 is the raw item id, column 1 is the
# user, column 3 is the value events are ordered by (presumably a timestamp --
# confirm against the CSV header).
# Avoids binding `id` (builtin) and `_` (IPython's last-output variable) in the
# kernel namespace.
user2events = {}
for row in pd.read_csv('events_297018.csv').values.tolist():
    # A raw item id that is absent from oldid2newid raises KeyError here.
    user_id, item_id, event_time = row[1], oldid2newid[row[0]], row[3]
    user2events.setdefault(user_id, []).append((item_id, event_time))

# user2ids[user] -> tuple of re-indexed item ids, ordered by column 3.
# sorted() is stable, so events with equal sort keys keep their file order.
user2ids = {
    user_id: tuple(item_id for item_id, _event_time in sorted(events, key=lambda event: event[1]))
    for user_id, events in user2events.items()
}

In [8]:
# Dataset size summary. A user's history length is the number of interactions
# recorded for that user.
history_lengths = [len(ids) for ids in user2ids.values()]
print('# user:', len(user2ids), ',', '# item:', len(id2name), ',', '# interaction:', sum(history_lengths))
print('History:', stats.describe(history_lengths))

# Item-name length, measured in whitespace-separated words and in tokenizer
# tokens (special tokens excluded).
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
words_per_name = [len(name.split()) for name in id2name]
print('# words in item name:', stats.describe(words_per_name))
tokens_per_name = [len(tokenizer(name, add_special_tokens=False)['input_ids']) for name in id2name]
print('# tokens in item name:', stats.describe(tokens_per_name))

# user: 75601 , # item: 9946 , # interaction: 297018
History: DescribeResult(nobs=75601, minmax=(2, 201), mean=3.9287575561169827, variance=24.440585822714898, skewness=9.44282152586692, kurtosis=180.0857730869049)
# words in item name: DescribeResult(nobs=9946, minmax=(2, 34), mean=10.67052081238689, variance=16.712448134961477, skewness=1.5549009139109813, kurtosis=3.680650137290357)
# tokens in item name: DescribeResult(nobs=9946, minmax=(3, 76), mean=23.487331590589182, variance=77.176561466105, skewness=1.4819774965770933, kurtosis=3.410896833209314)


In [9]:
def save_jsonl(data, filename):
    """Write `data` to `filename` in JSON Lines format.

    Args:
        data: Iterable of JSON-serializable objects; each becomes one line.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w') as fout:
        # One json.dumps() result per record, each terminated by a newline.
        fout.writelines(json.dumps(record) + '\n' for record in data)

# One record per user: the user key plus that user's item-id sequence.
data = [{'user': user, 'ids': user2ids[user]} for user in user2ids]
save_jsonl(data, 'data.jsonl')

# Write id2name as JSON. The `with` block closes (and therefore flushes) the
# file deterministically; passing a bare open(...) to json.dump leaves the
# handle open until it happens to be garbage-collected.
with open('id2name.json', 'w') as fout:
    json.dump(id2name, fout)

In [13]:
# Spot-check one user: show the item names, then the raw id sequence and its length.
user = 'A1STOHTZ9BOA43'
history = user2ids[user]
print([id2name[item_id] for item_id in history])
print(history, len(history))

['PASTA RONI SHELL WHT CHEDDAR 6.2OZ.', 'Softsoap Liquid Hand Soap, Lavender and Chamomile - 7.5 Fluid Ounce.', 'Softsoap Liquid Hand Soap, Lavender and Chamomile - 7.5 Fluid Ounce.', 'Twinings of London Earl Grey Black Tea Bags, 100 Count.', 'King Arthur White Whole Wheat Flour, 100% Whole Grain, 5 Pounds.', 'Del Monte Pear Halves, Lite, 15 Oz.', 'Del Monte Canned Yellow Cling Sliced Peaches in Extra Light Syrup, 15-Ounce.', 'Arm Hammer Laundry Detergent Plus OxiClean, Fresh Scent, 122.5 Oz.', 'Crystal Light Drink Mix, Peach Tea, Pitcher Packets, 6 Count.', 'Rice-A-Roni Chicken and Mushroom, 5 Ounce.', 'Kraft Barbecue Sauce Slow-Simmered Sauce, Sweet Honey, 18 Ounce.', "Campbell's Condensed Soup, Tomato, 10.75 oz, 6 Count.", 'Arm Hammer Advance White Extreme Whitening Toothpaste, 6 oz Twin Pack (Packaging May Vary).', 'GOOD PLENTY Licorice Candy, 5 Pound Bulk Candy.', 'Boost Original Complete Nutritional Drink, Rich Chocolate, 8 fl oz Bottle, 12 Pack.', 'Boost Original Complete Nutrit