In [20]:
import os
import gzip
import subprocess
import pandas as pd
import numpy as np
from datetime import datetime

In [21]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def get_df(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [22]:
DATASET = 'Beauty'
RAW_PATH = os.path.join('./data/', DATASET)
DATA_FILE = 'reviews_{}_5.json.gz'.format(DATASET)
META_FILE = 'meta_{}.json.gz'.format(DATASET)

In [23]:
# download data if not exists

if not os.path.exists(RAW_PATH):
    subprocess.call('mkdir ' + RAW_PATH, shell=True)
if not os.path.exists(os.path.join(RAW_PATH, DATA_FILE)):
    print('Downloading interaction data into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_{}_5.json.gz'
        .format(RAW_PATH, DATASET), shell=True)
if not os.path.exists(os.path.join(RAW_PATH, META_FILE)):
    print('Downloading item metadata into ' + RAW_PATH)
    subprocess.call(
        'cd {} && curl -O http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/meta_{}.json.gz'
        .format(RAW_PATH, DATASET), shell=True)

In [24]:
data_df = get_df(os.path.join(RAW_PATH, DATA_FILE))
meta_df = get_df(os.path.join(RAW_PATH, META_FILE))
data_df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1YJEY40YUW4SE,7806397051,Andrea,"[3, 4]",Very oily and creamy. Not at all what I expect...,1.0,Don't waste your money,1391040000,"01 30, 2014"
1,A60XNB876KYML,7806397051,Jessica H.,"[1, 1]",This palette was a decent price and I was look...,3.0,OK Palette!,1397779200,"04 18, 2014"
2,A3G6XNM240RMWA,7806397051,Karen,"[0, 1]",The texture of this concealer pallet is fantas...,4.0,great quality,1378425600,"09 6, 2013"
3,A1PQFP6SAJ6D80,7806397051,Norah,"[2, 2]",I really can't tell what exactly this thing is...,2.0,Do not work on my face,1386460800,"12 8, 2013"
4,A38FVHZTNQ271F,7806397051,Nova Amor,"[0, 0]","It was a little smaller than I expected, but t...",3.0,It's okay.,1382140800,"10 19, 2013"


In [25]:
def filter_df(df):
    while True:
        print(df.shape)
        
        asin_counts = df['asin'].value_counts()
        reviewerName_counts = df['reviewerID'].value_counts()

       
        asin_to_remove = asin_counts[asin_counts < 5].index
        reviewerName_to_remove = reviewerName_counts[reviewerName_counts < 5].index

        
        if len(asin_to_remove) == 0 and len(reviewerName_to_remove) == 0:
            break

       
        df = df[~df['asin'].isin(asin_to_remove)]
        df = df[~df['reviewerID'].isin(reviewerName_to_remove)]

    return df


n_users = data_df['reviewerID'].value_counts().size
n_items = data_df['asin'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['unixReviewTime'].min()
max_time = data_df['unixReviewTime'].max()
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

data_df = filter_df(data_df)

print("========after filtering=============")
n_users = data_df['reviewerID'].value_counts().size
n_items = data_df['asin'].value_counts().size
n_clicks = len(data_df)
min_time = data_df['unixReviewTime'].min()
max_time = data_df['unixReviewTime'].max()
time_format = '%Y-%m-%d'

print('# Users:', n_users)
print('# Items:', n_items)
print('# Interactions:', n_clicks)
print('Time Span: {}/{}'.format(
    datetime.utcfromtimestamp(min_time).strftime(time_format),
    datetime.utcfromtimestamp(max_time).strftime(time_format))
)

# Users: 22363
# Items: 12101
# Interactions: 198502
Time Span: 2002-06-12/2014-07-23
(198502, 9)
# Users: 22363
# Items: 12101
# Interactions: 198502
Time Span: 2002-06-12/2014-07-23


In [26]:
# Only retain items that appear in interaction data
meta_df = get_df(os.path.join(RAW_PATH, META_FILE))

useful_meta_df = meta_df[meta_df['asin'].isin(data_df['asin'])].reset_index(drop=True)
all_items = set(useful_meta_df['asin'].values.tolist())

def related_filter(related_dict):
    out_dict = dict()
    if related_dict is not np.nan:
        for r in related_dict:
            out_dict[r] = list(all_items & set(related_dict[r]))
    return out_dict

useful_meta_df['related'] = useful_meta_df['related'].apply(related_filter)

In [9]:
df = useful_meta_df
((df.isnull().sum())/df.shape[0]).sort_values(ascending=False).map(lambda x:"{:.2%}".format(x))

brand          17.12%
price           4.82%
salesRank       1.73%
description     1.70%
title           0.06%
imUrl           0.06%
asin            0.00%
categories      0.00%
related         0.00%
dtype: object

# Build Dataset

In [27]:
out_df = data_df.rename(columns={'asin': 'item_id', 'reviewerID': 'user_id', 'unixReviewTime': 'time'})
out_df = out_df[['user_id', 'item_id', 'time']]
out_df = out_df.drop_duplicates(['user_id', 'item_id', 'time'])
out_df = out_df.sort_values(by=['user_id', 'time'], kind='mergesort').reset_index(drop=True)
out_df.head()

Unnamed: 0,user_id,item_id,time
0,A00414041RD0BXM6WK0GX,B007IY97U0,1405296000
1,A00414041RD0BXM6WK0GX,B00870XLDS,1405296000
2,A00414041RD0BXM6WK0GX,B008MIRO88,1405296000
3,A00414041RD0BXM6WK0GX,B00BQYYMN0,1405296000
4,A00414041RD0BXM6WK0GX,B00GRTQBTM,1405296000


In [28]:
# reindex (start from 1)
# uids = sorted(out_df['user_id'].unique())
uids = out_df['user_id'].unique()
user2id = dict(zip(uids, range(1, len(uids) + 1)))
# iids = sorted(out_df['item_id'].unique())
iids = out_df['item_id'].unique()
item2id = dict(zip(iids, range(1, len(iids) + 1)))

out_df['user_id'] = out_df['user_id'].apply(lambda x: user2id[x])
out_df['item_id'] = out_df['item_id'].apply(lambda x: item2id[x])
out_df.head()

Unnamed: 0,user_id,item_id,time
0,1,1,1405296000
1,1,2,1405296000
2,1,3,1405296000
3,1,4,1405296000
4,1,5,1405296000


In [29]:
import json
with open(RAW_PATH+'/item2id.json', 'w') as f:
    json.dump(item2id, f, indent=4)

In [30]:
useful_meta_df['item_id'] = useful_meta_df['asin'].apply(lambda x: item2id[x])
# save data
out_df.to_csv(RAW_PATH+'/inter.csv', index=False)
useful_meta_df.to_csv(RAW_PATH+'/meta.csv', index=False)

inter = out_df.drop(columns=['time'])
inter.columns = ['user_id:token', 'item_id:token']
inter.to_csv(RAW_PATH+'/Amazon_'+DATASET+'.inter', sep='\t', index=False)

df_txt = out_df.drop(columns=['time'])
df_txt.to_csv(RAW_PATH+'/'+ DATASET+'.txt', sep=' ', index=False, header=False)


# def transform_input(input_file, output_file):
#     with open(input_file, 'r') as file:
#         lines = file.readlines()

#     data = {}
#     for line in lines:
#         key, value = line.strip().split()
#         key, value = int(key), int(value)
#         if key not in data:
#             data[key] = []
#         data[key].append(value)

#     with open(output_file, 'w') as file:
#         for key in sorted(data.keys()):
#             file.write(f"{key} {' '.join(map(str, data[key]))}\n")

# input_file = RAW_PATH+'/'+ DATASET+'.txt'
# output_file = RAW_PATH+'/'+ DATASET+'_icsrec.txt'
# transform_input(input_file, output_file)

# User Prompt

In [31]:
# read data
inter = pd.read_csv(RAW_PATH+'/inter.csv')
useful_meta_df = pd.read_csv(RAW_PATH+'/meta.csv')

# seq slide augmentation

def prepare_data_augmentation(df):
    max_item_list_len = 20

    last_uid = None
    uid_list, item_list, target, item_list_length = [], [], [], []

    for index, row in df.iterrows():
        uid, item_id = row['user_id'], row['item_id']
        if last_uid != uid:
            last_uid = uid
            seq = []
        else:
            if len(seq) > max_item_list_len:
                seq = seq[1:]
            uid_list.append(uid)
            item_list.append(seq[:])
            target.append(item_id)
            item_list_length.append(len(seq))
        seq.append(item_id)

    return uid_list, item_list, target, item_list_length
 

uid_list, item_list, target, item_list_length = prepare_data_augmentation(inter)


import ast
# useful_meta_df
useful_meta_df['categories'] = useful_meta_df['categories'].apply(ast.literal_eval)
useful_meta_df['categories'] = useful_meta_df['categories'].apply(lambda x: x[0][2:])
# columns_to_drop = ['asin', 'description','imUrl', 'related', 'salesRank']
columns_to_drop = ['asin', 'description','imUrl', 'related']
useful_meta_df_dropped = useful_meta_df.drop(columns=columns_to_drop)
meta_dict = useful_meta_df_dropped.set_index('item_id').T.to_dict()

requests = {}
for i in range(len(item_list)):
    uid, items = uid_list[i], item_list[i]
    # print([uid] + items)
    key = ":".join(map(str, [uid] + items))
    title_list = []
    for item in items:
        title_list.append(meta_dict[item]['title'])
    if i < 10:
        print(items)
        print(title_list)
    formatted_titles = [f'<{title}>' for title in title_list]
    formatted_string = '; '.join(formatted_titles)
    requests[key] = formatted_string

import pickle
with open(RAW_PATH+'/user_prompt_input.pkl', 'wb') as pickle_file:
    pickle.dump(requests, pickle_file)
    
# with open(RAW_PATH+"/user_prompt_input.pkl", 'rb') as pickle_file:
#     question_dic = pickle.load(pickle_file)

system_input = """Assume you are a beauty products recommendation expert.
You will be provided with a user's historical purchases of beauty products in chronological order, given in the following format:
<The title of item1>; <The title of item2>; <The title of item3>;... 
Please summarize the user's specific preference when purchasing beauty products. Note that your response should be a coherent paragraph of no more than 100 words.
"""

[1]
['63cm Long Zipper Beige+pink Wavy Cosplay Hair Wig Rw157']
[1, 2]
['63cm Long Zipper Beige+pink Wavy Cosplay Hair Wig Rw157', 'MapofBeauty Long Wave Curly Hair Wig Full Wig for Women Long (Black)']
[1, 2, 3]
['63cm Long Zipper Beige+pink Wavy Cosplay Hair Wig Rw157', 'MapofBeauty Long Wave Curly Hair Wig Full Wig for Women Long (Black)', 'MapofBeauty Cosplay Costume Long Curly Hair Wig Ladies Synthetic Wigs (White)']
[1, 2, 3, 4]
['63cm Long Zipper Beige+pink Wavy Cosplay Hair Wig Rw157', 'MapofBeauty Long Wave Curly Hair Wig Full Wig for Women Long (Black)', 'MapofBeauty Cosplay Costume Long Curly Hair Wig Ladies Synthetic Wigs (White)', '32&quot; 80cm Long Hair Heat Resistant Spiral Curly Cosplay Wig (Red Dark)']
[1, 2, 3, 4, 5]
['63cm Long Zipper Beige+pink Wavy Cosplay Hair Wig Rw157', 'MapofBeauty Long Wave Curly Hair Wig Full Wig for Women Long (Black)', 'MapofBeauty Cosplay Costume Long Curly Hair Wig Ladies Synthetic Wigs (White)', '32&quot; 80cm Long Hair Heat Resistant S

# Item Prompt

In [32]:
inter = pd.read_csv(RAW_PATH+'/inter.csv')
useful_meta_df = pd.read_csv(RAW_PATH+'/meta.csv')

useful_meta_df['brand'] = useful_meta_df['brand'].fillna('missing')
useful_meta_df['salesRank'] = useful_meta_df['salesRank'].fillna('missing')
useful_meta_df['title'] = useful_meta_df['title'].fillna('missing')
useful_meta_df['description'] = useful_meta_df['description'].fillna('missing')
useful_meta_df['price'] = useful_meta_df['price'].fillna('missing')
import ast
useful_meta_df['categories'] = useful_meta_df['categories'].apply(ast.literal_eval)
useful_meta_df['categories'] = useful_meta_df['categories'].apply(lambda x: x[0][2:])
# columns_to_drop = ['asin', 'imUrl', 'related', 'salesRank']
columns_to_drop = ['asin', 'imUrl', 'related', ]
useful_meta_df_dropped = useful_meta_df.drop(columns=columns_to_drop)
meta_dict = useful_meta_df_dropped.set_index('item_id').T.to_dict()

# seq slide augmentation

def prepare_data_augmentation(df):
    max_item_list_len = 20

    last_uid = None
    uid_list, item_list, target, item_list_length = [], [], [], []

    for index, row in df.iterrows():
        uid, item_id = row['user_id'], row['item_id']
        if last_uid != uid:
            last_uid = uid
            seq = []
        else:
            if len(seq) > max_item_list_len:
                seq = seq[1:]
            uid_list.append(uid)
            item_list.append(seq[:])
            target.append(item_id)
            item_list_length.append(len(seq))
        seq.append(item_id)

    return uid_list, item_list, target, item_list_length
 

uid_list, item_list, target, item_list_length = prepare_data_augmentation(inter)

In [33]:
filtered_list = [sublist[-11:] for sublist in item_list if (len(sublist) >= 2 and len(sublist) <= 21)]
print("length of filtered_list", len(filtered_list))

last_values = set(sublist[-1] for sublist in filtered_list)

end_dict = {value: [] for value in last_values}

for sublist in filtered_list:
    last_value = sublist[-1]
    end_dict[last_value].append(sublist)

import random
random.seed(0)

for key in end_dict:
    if len(end_dict[key]) > 5:
        end_dict[key] = random.sample(end_dict[key], 5)

end_dict_text = {}
for key in end_dict:
    end_dict_text[key] = []
    for ls in end_dict[key]:
        ls_text = [meta_dict[item]['title'] for item in ls]
        ls_text = [val for val in ls_text if val != "missing"]
        end_dict_text[key].append(ls_text)


end_dict_text_formatted = {}
for item_id in end_dict_text.keys():
    same_target_seqs = ''
    for seq in end_dict_text[item_id]:
        print(seq)
        seq = ' -> '.join(seq) 
        seq = '[' + seq + '] # '
        seq += " \n "
        same_target_seqs += seq
    end_dict_text_formatted[item_id] = same_target_seqs

length of filtered_list 153776


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [34]:
def item_template(item_dict):
    text = (
        f"The product name is {item_dict['title']}; "
        f"brand is {item_dict['brand']}; "
        f"categories are {item_dict['categories']}; "
        f"salesRank is {item_dict['salesRank']}; "
        f"price is {item_dict['price']}; "
        f"The detailed description of the product is {item_dict['description'][:500]}......"
    )
    return text

item_meta_formatted = {}
for item_id in item2id.values():
    item_meta_formatted[item_id] = item_template(meta_dict[item_id])
    
def complete_prompt_gen(item_info, history):
    prompt = f"""
    Assume you are a Beauty products recommendation expert. Please help me analyze a specific Beauty product. You will be provided with the following information:
    1) The basic information of the Beauty product: {item_info};
    2) The historical purchase information of users who have bought this product: {history}. Here, different sequences are separated by '#', and each sequence is in LIST format, representing a certain user's historical purchases. Items in each sequence are separated by '->', and the last item in all sequences is the specific product mentioned above.
    
    Requirements:
    1) Please briefly describe the given Beauty product.
    2) Based on the provided sequences, please analyze what type of users would purchase this specific product. Please do not generally say makeup enthusiasts, as all users in this context have a need to purchase Beauty products. Instead, please provide a more detailed granularity.
    Please provide your answer in JSON format, following this structure:
    {{
    "item summary": "A description of the item, no more than 80 words.", 
    "potential user analysis": "what type of users would purchase this item, no more than 50 words."
    }}
    """
    return prompt

item_prompt = {}
for item_id in item2id.values():
    item_info = item_meta_formatted[item_id]
    if item_id in end_dict_text_formatted.keys():
        history = end_dict_text_formatted[item_id]
    else:
        history = ' [None] '
    item_prompt[item_id] = complete_prompt_gen(item_info, history)

import pickle
with open(RAW_PATH+'/item_prompt_input.pkl', 'wb') as pickle_file:
    pickle.dump(item_prompt, pickle_file)