In [2]:
import os
import gzip
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [3]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def get_df(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [4]:
DATASET = 'Office_Products'
RAW_PATH = os.path.join('./data/', DATASET)
DATA_FILE = 'reviews_{}_5.json.gz'.format(DATASET)
META_FILE = 'meta_{}.json.gz'.format(DATASET)

# Load Data

1. Load interaction data and item metadata
2. Filter out unuseful items in metadata
3. Calculate basic statistics

In [5]:
# download data if not exists

if not os.path.exists(RAW_PATH):
    subprocess.call('mkdir ' + RAW_PATH, shell=True)
if not os.path.exists(os.path.join(RAW_PATH, DATA_FILE)):
    print('Downloading interaction data into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_{}_5.json.gz'
        .format(RAW_PATH, DATASET), shell=True)
if not os.path.exists(os.path.join(RAW_PATH, META_FILE)):
    print('Downloading item metadata into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_{}.json.gz'
        .format(RAW_PATH, DATASET), shell=True)

python(27308) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(27312) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

Downloading interaction data into ./datasets/Office_Products


100 17.6M  100 17.6M    0     0   779k      0  0:00:23  0:00:23 --:--:-- 1198k
python(27662) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0

Downloading item metadata into ./datasets/Office_Products


100 45.3M  100 45.3M    0     0  1864k      0  0:00:24  0:00:24 --:--:-- 2819k


In [6]:
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
data_df.head()

asin_counts = data_df['asin'].value_counts()
min_asin_frequency = asin_counts.min()

reviewerName_counts = data_df['reviewerName'].value_counts()
min_reviewerName_frequency = reviewerName_counts.min()



In [50]:
meta_df = get_df(os.path.join(RAW_PATH, META_FILE))
meta_df.head()

Unnamed: 0,asin,description,price,imUrl,related,salesRank,categories,title,brand
0,0078800242,All in one TeacherWorks Plus CD-ROM,93.06,http://ecx.images-amazon.com/images/I/41K1aBkl...,{'buy_after_viewing': ['007861970X']},{'Software': 18529},"[[Office Products, Office & School Supplies, C...",,
1,0113000316,High quality inkjet cartridges use high-densit...,,http://ecx.images-amazon.com/images/I/51AMwP3D...,,,"[[Office Products, Office & School Supplies, P...",123GetInk -14-pack 5-black 3-cyan 3-magenta 3-...,
2,043928631X,"Harry Potter living bookmark showing Harry, He...",,http://ecx.images-amazon.com/images/I/41SulB7T...,,,"[[Office Products, Office & School Supplies, L...",Harry Potter Lenticular Hologram Bookmark - Ha...,
3,0439340039,Windows based computer game.,,http://ecx.images-amazon.com/images/I/51zQE0w%...,,{'Software': 32784},"[[Office Products, Office & School Supplies, E...",,
4,0439394058,"126 pieces: 23"" tall schoolhouse calendar, 12 ...",11.64,http://ecx.images-amazon.com/images/I/51DFp0Lg...,"{'also_bought': ['B000QE1HHU', 'B00207MG4Y', '...",,"[[Office Products, Office & School Supplies, E...",Scholastic SC939405 All-In-One Schoolhouse Cal...,Scholastic


In [52]:
# Only retain items that appear in interaction data

useful_meta_df = meta_df[meta_df['asin'].isin(data_df['asin'])].reset_index(drop=True)
all_items = set(useful_meta_df['asin'].values.tolist())

def related_filter(related_dict):
    out_dict = dict()
    if related_dict is not np.nan:
        for r in related_dict:
            out_dict[r] = list(all_items & set(related_dict[r]))
    return out_dict

useful_meta_df['related'] = useful_meta_df['related'].apply(related_filter)

### Statistics

In [53]:
n_users = data_df['reviewerID'].value_counts().size
n_items = data_df['asin'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['unixReviewTime'].min()
max_time = data_df['unixReviewTime'].max()

In [54]:
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

# Users: 4905
# Items: 2420
# Interactions: 53258
Time Span: 2000-09-29/2014-07-23


In [21]:
df = useful_meta_df
((df.isnull().sum())/df.shape[0]).sort_values(ascending=False).map(lambda x:"{:.2%}".format(x))

salesRank      76.90%
brand          19.88%
price           2.73%
description     1.61%
title           0.12%
asin            0.00%
imUrl           0.00%
related         0.00%
categories      0.00%
dtype: object

# Build Dataset

### Interaction data

In [55]:
out_df = data_df.rename(columns={'asin': 'item_id', 'reviewerID': 'user_id', 'unixReviewTime': 'time'})
out_df = out_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df = out_df.sort_values(by=['user_id', 'time'], kind='mergesort').reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,A00473363TJ8YSZ3YAGG9,B004APM26Q,1357430400
1,A00473363TJ8YSZ3YAGG9,B0073W70BK,1357430400
2,A00473363TJ8YSZ3YAGG9,B00007E7D2,1387843200
3,A00473363TJ8YSZ3YAGG9,B007ZYF266,1387843200
4,A00473363TJ8YSZ3YAGG9,B00D51XMLU,1387843200


In [56]:
# reindex (start from 1)

# uids = sorted(out_df['user_id'].unique())
uids = out_df['user_id'].unique()
user2id = dict(zip(uids, range(1, len(uids) + 1)))
# iids = sorted(out_df['item_id'].unique())
iids = out_df['item_id'].unique()
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df.head()

Unnamed: 0,user_id,item_id,time
0,1,1,1357430400
1,1,2,1357430400
2,1,3,1387843200
3,1,4,1387843200
4,1,5,1387843200


In [57]:
useful_meta_df['item_id'] = useful_meta_df['asin'].apply(lambda x: item2id[x])

In [59]:
# save data
out_df.to_csv(RAW_PATH+'/inter.csv', index=False)
useful_meta_df.to_csv(RAW_PATH+'/meta.csv', index=False)

In [128]:
# read data
inter = pd.read_csv(RAW_PATH+'/inter.csv')
useful_meta_df = pd.read_csv(RAW_PATH+'/meta.csv')

In [129]:
# seq slide augmentation

def prepare_data_augmentation(df):
    max_item_list_len = 20

    last_uid = None
    uid_list, item_list, target, item_list_length = [], [], [], []

    for index, row in df.iterrows():
        uid, item_id = row['user_id'], row['item_id']
        if last_uid != uid:
            last_uid = uid
            seq = []
        else:
            if len(seq) > max_item_list_len:
                seq = seq[1:]
            uid_list.append(uid)
            item_list.append(seq[:])
            target.append(item_id)
            item_list_length.append(len(seq))
        seq.append(item_id)

    return uid_list, item_list, target, item_list_length
 

uid_list, item_list, target, item_list_length = prepare_data_augmentation(inter)

In [130]:
import ast
# useful_meta_df
useful_meta_df['categories'] = useful_meta_df['categories'].apply(ast.literal_eval)
useful_meta_df['categories'] = useful_meta_df['categories'].apply(lambda x: x[0][2:])
columns_to_drop = ['asin', 'description','imUrl', 'related', 'salesRank']
useful_meta_df_dropped = useful_meta_df.drop(columns=columns_to_drop)
meta_dict = useful_meta_df_dropped.set_index('item_id').T.to_dict()

In [164]:
prompt_input = {}
for i in range(len(item_list)):
    uid, items = uid_list[i], item_list[i]
    # print([uid] + items)
    key = ":".join(map(str, [uid] + items))
    title_list = []
    for item in items:
        title_list.append(meta_dict[item]['title'])
    if i < 10:
        print(items)
        print(title_list)
    formatted_titles = [f'<{title}>' for title in title_list]
    formatted_string = '; '.join(formatted_titles)
    prompt_input[key] = formatted_string

[1]
['Brownline 2013 Monthly Desk Pad Calendar, January - December, 22 x 17 Inches (C1731-13)']
[1, 2]
['Brownline 2013 Monthly Desk Pad Calendar, January - December, 22 x 17 Inches (C1731-13)', 'Panasonic KX-TG4741B DECT 6.0 Cordless Phone with Answering System, Black, 1 Handset']
[1, 2, 3]
['Brownline 2013 Monthly Desk Pad Calendar, January - December, 22 x 17 Inches (C1731-13)', 'Panasonic KX-TG4741B DECT 6.0 Cordless Phone with Answering System, Black, 1 Handset', 'Avery Self-Adhesive Laminating Sheets, 9 x 12 Inches, Box of 50 (73601)']
[1, 2, 3, 4]
['Brownline 2013 Monthly Desk Pad Calendar, January - December, 22 x 17 Inches (C1731-13)', 'Panasonic KX-TG4741B DECT 6.0 Cordless Phone with Answering System, Black, 1 Handset', 'Avery Self-Adhesive Laminating Sheets, 9 x 12 Inches, Box of 50 (73601)', 'Pilot B2P - Bottle to Pen - Retractable Ball Point Pens Made from Recycled Bottles, 2 Pen Pack, Fine Point, Black (32605)']
[6]
['Zebra Z-Grip  Retractable Ballpoing Pens Medium, 1.0 

In [175]:
import pickle
with open(RAW_PATH+'/user_prompt_input.pkl', 'wb') as pickle_file:
    pickle.dump(prompt_input, pickle_file)

In [186]:
system_input = """Assume you are an office products recommendation expert.
You will be provided with a user's historical purchases of office products in chronological order, given in the following format:
<The title of item1>; <The title of item2>; <The title of item3>;... 
Please conclude the user's preference in purchasing office products. Note that your response should be a coherent paragraph of no more than 100 words.
"""

In [289]:
## generate item summray: what kind of users will like this item

# step 1: find the sequences ending with the current item

inter = pd.read_csv(RAW_PATH+'/inter.csv')
useful_meta_df = pd.read_csv(RAW_PATH+'/meta.csv')
useful_meta_df['brand'] = useful_meta_df['brand'].fillna('missing or unkown')
useful_meta_df['title'] = useful_meta_df['title'].fillna('missing')
useful_meta_df['description'] = useful_meta_df['description'].fillna('missing')
import ast
useful_meta_df['categories'] = useful_meta_df['categories'].apply(ast.literal_eval)
useful_meta_df['categories'] = useful_meta_df['categories'].apply(lambda x: x[0][2:])
columns_to_drop = ['asin', 'imUrl', 'related', 'salesRank']
useful_meta_df_dropped = useful_meta_df.drop(columns=columns_to_drop)
meta_dict = useful_meta_df_dropped.set_index('item_id').T.to_dict()

# seq slide augmentation

def prepare_data_augmentation(df):
    max_item_list_len = 20

    last_uid = None
    uid_list, item_list, target, item_list_length = [], [], [], []

    for index, row in df.iterrows():
        uid, item_id = row['user_id'], row['item_id']
        if last_uid != uid:
            last_uid = uid
            seq = []
        else:
            if len(seq) > max_item_list_len:
                seq = seq[1:]
            uid_list.append(uid)
            item_list.append(seq[:])
            target.append(item_id)
            item_list_length.append(len(seq))
        seq.append(item_id)

    return uid_list, item_list, target, item_list_length
 

uid_list, item_list, target, item_list_length = prepare_data_augmentation(inter)


In [290]:
filtered_list = [sublist[-11:] for sublist in item_list if (len(sublist) >= 2 and len(sublist) <= 21)]
print("length of filtered_list", len(filtered_list))

last_values = set(sublist[-1] for sublist in filtered_list)

end_dict = {value: [] for value in last_values}

for sublist in filtered_list:
    last_value = sublist[-1]
    end_dict[last_value].append(sublist)

import random
random.seed(0)

for key in end_dict:
    if len(end_dict[key]) > 5:
        end_dict[key] = random.sample(end_dict[key], 5)

end_dict_text = {}
for key in end_dict:
    end_dict_text[key] = []
    for ls in end_dict[key]:
        ls_text = [meta_dict[item]['title'] for item in ls]
        ls_text = [val for val in ls_text if val != "missing"]
        end_dict_text[key].append(ls_text)


end_dict_text_formatted = {}
for item_id in end_dict_text.keys():
    same_target_seqs = ''
    for seq in end_dict_text[item_id]:
        print(seq)
        seq = ' -> '.join(seq) 
        seq = '[' + seq + '] # '
        seq += " \n "
        same_target_seqs += seq
    end_dict_text_formatted[item_id] = same_target_seqs

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [291]:
def item_template(item_dict):
    text = (
        f"The product name is {item_dict['title']}; "
        f"brand is {item_dict['brand']}; "
        f"categories are {item_dict['categories']}; "
        f"price is {item_dict['price']}; "
        f"The detailed description of the product is {item_dict['description'][:1000]}..."
    )
    return text

item_meta_formatted = {}
for item_id in item2id.values():
    item_meta_formatted[item_id] = item_template(meta_dict[item_id])
    
    

In [292]:
def complete_prompt_gen(item_info, history):
    prompt = f"""
    Assume you are an office products recommendation expert. Please help me analyze a specific office product. You will be provided with the following information:
    1) The attributes of the office product: {item_info};
    2) The historical purchase information of users who have bought this product: {history}. Here, different sequences are separated by '#', and each sequence is in LIST format, representing a certain user's historical purchases. Items in each sequence are separated by '->', and the last item in all sequences is the specific product mentioned above.
    
    Requirements:
    1) Please briefly describe the given target item.
    2) Based on the provided sequences, please analyze what type of users would purchase this item. Please do not generally say office workers or people who like office supplies, as all users in this context have a need to purchase office supplies. Please provide a more detailed granularity.
    Please provide your answer in JSON format, following this structure:
    {{
    "item summary": "A description of the item, no more than 80 words.", 
    "potential user analysis": "what type of users would purchase this item, no more than 50 words."
    }}
    """
    return prompt

item_prompt = {}
for item_id in item2id.values():
    item_info = item_meta_formatted[item_id]
    if item_id in end_dict_text_formatted.keys():
        history = end_dict_text_formatted[item_id]
    else:
        history = ' [None] '
    item_prompt[item_id] = complete_prompt_gen(item_info, history)

In [293]:
import pickle
with open(RAW_PATH+'/item_prompt_input.pkl', 'wb') as pickle_file:
    pickle.dump(item_prompt, pickle_file)

In [300]:

inter = inter.drop(columns=['time'])

inter.columns = ['user_id:token', 'item_id:token']

inter.to_csv(RAW_PATH+'/Amazon_Office.inter', sep='\t', index=False)

In [305]:
df = pd.read_csv(RAW_PATH+'/inter.csv')

df = df.drop(columns=['time'])

df.to_csv(RAW_PATH+'/Office.txt', sep=' ', index=False, header=False)