### README

This notebook shows how we process EXTRA dataset to ours. 

        1. Convert to train.csv, valid.csv, test.csv
        2. find attribute
        3. find time

#### Amazon

In [1]:
import json
import pickle
import pandas as pd
from tqdm import tqdm
from datetime import datetime
import nltk
import random
random.seed(43)

# load Amazon interactions
with open('./Amazon/id2exp.json', 'r') as f:
    id2exp = json.load(f)
with open('./Amazon/IDs.pickle', 'rb') as f:
    inters = pickle.load(f)

data = []
for i in tqdm(range(len(inters))):
    user = inters[i]["user"] # str
    item = inters[i]["item"] # str
    exp = random.choice(inters[i]["exp_idx"])  # str
    sentence = id2exp[exp] # str
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    nouns = [word for word, tag in tags if tag in ['NN', 'NNS','NNP']]
    if nouns:
        attribute = random.choice(nouns)
    else:
    	attribute = "None"
    date_object = datetime.strptime(inters[i]["time"], '%Y-%m-%d')
    timestamp = int(date_object.timestamp())
    data.append({'user': user, 'item': item, 'exp': exp, 'sentence': sentence, 'attribute': attribute, 'timestamp': timestamp})

100%|████████████████████████████████████████████████████████████████████████████████████████| 569838/569838 [02:49<00:00, 3361.02it/s]


In [2]:
df = pd.DataFrame(data)
print(len(df))
df.head(5)

569838


Unnamed: 0,user,item,exp,sentence,attribute,timestamp
0,ATWFMTZEH77H1,5019281,547,An American Christmas Carol,Carol,1349708400
1,A3UEOL981G9TXY,5019281,547,An American Christmas Carol,Carol,1328194800
2,A1U0A5PVKWJOL7,5019281,547,An American Christmas Carol,Christmas,1354201200
3,A1GHUN5HXMHZ89,5019281,547,An American Christmas Carol,Carol,1387638000
4,A1BBBIN5NMNPV7,5019281,547,An American Christmas Carol,Carol,1396537200


In [3]:
# iteratively filter users and items with less than 5 interactions
for i in range(20):
    user_interactions = df['user'].value_counts()
    item_interactions = df['item'].value_counts()
    filtered_users = user_interactions[user_interactions >= 5].index
    filtered_items = item_interactions[item_interactions >= 5].index
    filtered_df = df[df['user'].isin(filtered_users) & df['item'].isin(filtered_items)]
    df = filtered_df
    print(len(df))

348441
320456
310869
307890
306639
306139
305960
305897
305861
305837
305833
305825
305809
305805
305801
305801
305801
305801
305801
305801


In [4]:
# grouping by users and sort by their time.
df = df.sort_values(by=['user', 'timestamp'])

# Reset the index to maintain a continuous index for the sorted DataFrame
df.reset_index(drop=True, inplace=True)
df.head(5)

Unnamed: 0,user,item,exp,sentence,attribute,timestamp
0,A010997525FU27TAPMJCG,B002VPE1AW,26355588,One of the greatest movies I've ever seen,movies,1386169200
1,A010997525FU27TAPMJCG,B00FRILRL6,4528130,The acting is great,acting,1386601200
2,A010997525FU27TAPMJCG,B00BEIYL6G,25177608,is the movie was just ok,movie,1386687600
3,A010997525FU27TAPMJCG,B0090SI56Y,13949923,Great fighting scenes,Great,1387119600
4,A010997525FU27TAPMJCG,B008JFUQZ2,24040512,The idea is simple,idea,1387206000


In [5]:
# Create index mappings
user_mapping = {user: idx for idx, user in enumerate(df['user'].unique())}
item_mapping = {item: idx for idx, item in enumerate(df['item'].unique())}
exp_mapping = {exp: idx for idx, exp in enumerate(df['exp'].unique())}
attribute_mapping = {attribute: idx for idx, attribute in enumerate(df['attribute'].unique())}

# Replace columns with their corresponding index columns
df['user_idx'] = df['user'].map(user_mapping)
df['item_idx'] = df['item'].map(item_mapping)
df['exp_idx'] = df['exp'].map(exp_mapping)
df['attribute_idx'] = df['attribute'].map(attribute_mapping)
df.head(5)

Unnamed: 0,user,item,exp,sentence,attribute,timestamp,user_idx,item_idx,exp_idx,attribute_idx
0,A010997525FU27TAPMJCG,B002VPE1AW,26355588,One of the greatest movies I've ever seen,movies,1386169200,0,0,0,0
1,A010997525FU27TAPMJCG,B00FRILRL6,4528130,The acting is great,acting,1386601200,0,1,1,1
2,A010997525FU27TAPMJCG,B00BEIYL6G,25177608,is the movie was just ok,movie,1386687600,0,2,2,2
3,A010997525FU27TAPMJCG,B0090SI56Y,13949923,Great fighting scenes,Great,1387119600,0,3,3,3
4,A010997525FU27TAPMJCG,B008JFUQZ2,24040512,The idea is simple,idea,1387206000,0,4,4,4


In [6]:
df.to_csv('./Processed_data/Amazon/full.csv', index=False)

In [7]:
# create train, valid, test
df = pd.read_csv('./Processed_data/Amazon/full.csv')
train_df = []
train_valid_df = []
valid_df = []
test_df = []

for user_idx, user_group in tqdm(df.groupby('user_idx')):
    train_rows = user_group.iloc[:-2]
    train_valid_rows = user_group.iloc[:-1]
    valid_row = user_group.iloc[-2]
    test_row = user_group.iloc[-1]
    train_df.append(train_rows)
    train_valid_df.append(train_valid_rows)
    valid_df.append(dict(valid_row))
    test_df.append(dict(test_row))

train_df = pd.concat(train_df, ignore_index=True)
train_valid_df = pd.concat(train_valid_df, ignore_index=True)
valid_df = pd.DataFrame(valid_df)
test_df = pd.DataFrame(test_df)
# Save the DataFrames to CSV files
train_df.to_csv('./Processed_data/Amazon/train.csv', index=False)
train_valid_df.to_csv('./Processed_data/Amazon/train_valid.csv', index=False)
valid_df.to_csv('./Processed_data/Amazon/valid.csv', index=False)
test_df.to_csv('./Processed_data/Amazon/test.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 22248/22248 [00:06<00:00, 3393.58it/s]


#### TripAdvisor  

Since TripAdvisor has no timestamp, so we cannot test on it.

#### Yelp

In [8]:
# load Amazon interactions
random.seed(43)
with open('./Yelp/id2exp.json', 'r') as f:
    id2exp = json.load(f)
with open('./Yelp/IDs.pickle', 'rb') as f:
    inters = pickle.load(f)
    
data = []
for i in tqdm(range(len(inters))):
    user = inters[i]["user"] # str
    item = inters[i]["item"] # str
    exp = random.choice(inters[i]["exp_idx"])  # str
    sentence = id2exp[exp] # str
    words = nltk.word_tokenize(sentence)
    tags = nltk.pos_tag(words)
    nouns = [word for word, tag in tags if tag in ['NN', 'NNS','NNP']]
    if nouns:
        attribute = random.choice(nouns)
    else:
    	attribute = "None"
    date_object = datetime.strptime(inters[i]["time"], '%Y-%m-%d')
    timestamp = int(date_object.timestamp())
    data.append({'user': user, 'item': item, 'exp': exp, 'sentence': sentence, 'attribute': attribute, 'timestamp': timestamp})

100%|██████████████████████████████████████████████████████████████████████████████████████| 2608860/2608860 [12:58<00:00, 3350.34it/s]


In [11]:
df = pd.DataFrame(data)
print(len(df))
df.head(5)

2608860


Unnamed: 0,user,item,exp,sentence,attribute,timestamp
0,lkNKfz9obz93SH5RufVBxw,I5RoKagQjGYCWgY4_fK_rQ,19595520,Perfect portion,portion,1323183600
1,0mTTpKROlr4a8bj3NiLXmQ,Er8DpPwf_lHHv64ncWhD8g,19595520,Perfect portion,portion,1531148400
2,wxaAnhOsEpXj-kK7qBdMSA,qAg8GkN9rforFWmGxBqVqA,70002434,young couples,couples,1335798000
3,TE3QbpBjlgCFrgigCxZENQ,IEqtZj1OLEAwkXlrW-OqDg,30980480,All good things,things,1521385200
4,EDA6hQNBRD2fXif2l1jVPg,iw1WJt3wl6-NquZGKVamLw,19595520,Perfect portion,portion,1496761200


In [12]:
# iteratively filter users and items with less than 5 interactions
for i in range(20):
    user_interactions = df['user'].value_counts()
    item_interactions = df['item'].value_counts()
    filtered_users = user_interactions[user_interactions >= 5].index
    filtered_items = item_interactions[item_interactions >= 5].index
    filtered_df = df[df['user'].isin(filtered_users) & df['item'].isin(filtered_items)]
    df = filtered_df
    print(len(df))

1326736
1239213
1212007
1206976
1205201
1204736
1204555
1204523
1204515
1204515
1204515
1204515
1204515
1204515
1204515
1204515
1204515
1204515
1204515
1204515


In [13]:
# grouping by users and sort by their time.
df = df.sort_values(by=['user', 'timestamp'])

# Reset the index to maintain a continuous index for the sorted DataFrame
df.reset_index(drop=True, inplace=True)
df.head(5)

Unnamed: 0,user,item,exp,sentence,attribute,timestamp
0,---1lKK3aKOuomHnwAkAow,rq5dgoksPHkJwJNQKlGQ7w,53407747,Best coffee in town,coffee,1287154800
1,---1lKK3aKOuomHnwAkAow,CWNMLT-ppaUjLMmrnYDPVg,48523110,Want great food,Want,1288882800
2,---1lKK3aKOuomHnwAkAow,vW65SNLam99SyOuVagNuvg,62185472,The food was okay,food,1289228400
3,---1lKK3aKOuomHnwAkAow,qmymSqVwHYRqdwfcBatzpQ,50511872,The food is really good,food,1290438000
4,---1lKK3aKOuomHnwAkAow,y8d90Pt16Nip-B5UXWBP-w,46413824,good beer,beer,1294153200


In [14]:
# Create index mappings
user_mapping = {user: idx for idx, user in enumerate(df['user'].unique())}
item_mapping = {item: idx for idx, item in enumerate(df['item'].unique())}
exp_mapping = {exp: idx for idx, exp in enumerate(df['exp'].unique())}
attribute_mapping = {attribute: idx for idx, attribute in enumerate(df['attribute'].unique())}

# Replace columns with their corresponding index columns
df['user_idx'] = df['user'].map(user_mapping)
df['item_idx'] = df['item'].map(item_mapping)
df['exp_idx'] = df['exp'].map(exp_mapping)
df['attribute_idx'] = df['attribute'].map(attribute_mapping)
df.head(5)

Unnamed: 0,user,item,exp,sentence,attribute,timestamp,user_idx,item_idx,exp_idx,attribute_idx
0,---1lKK3aKOuomHnwAkAow,rq5dgoksPHkJwJNQKlGQ7w,53407747,Best coffee in town,coffee,1287154800,0,0,0,0
1,---1lKK3aKOuomHnwAkAow,CWNMLT-ppaUjLMmrnYDPVg,48523110,Want great food,Want,1288882800,0,1,1,1
2,---1lKK3aKOuomHnwAkAow,vW65SNLam99SyOuVagNuvg,62185472,The food was okay,food,1289228400,0,2,2,2
3,---1lKK3aKOuomHnwAkAow,qmymSqVwHYRqdwfcBatzpQ,50511872,The food is really good,food,1290438000,0,3,3,2
4,---1lKK3aKOuomHnwAkAow,y8d90Pt16Nip-B5UXWBP-w,46413824,good beer,beer,1294153200,0,4,4,3


In [15]:
df.to_csv('./Processed_data/Yelp/full.csv', index=False)

In [16]:
# create train, valid, test
df = pd.read_csv('./Processed_data/Yelp/full.csv')
train_df = []
train_valid_df = []
valid_df = []
test_df = []

for user_idx, user_group in tqdm(df.groupby('user_idx')):
    train_rows = user_group.iloc[:-2]
    train_valid_rows = user_group.iloc[:-1]
    valid_row = user_group.iloc[-2]
    test_row = user_group.iloc[-1]
    train_df.append(train_rows)
    train_valid_df.append(train_valid_rows)
    valid_df.append(dict(valid_row))
    test_df.append(dict(test_row))

train_df = pd.concat(train_df, ignore_index=True)
train_valid_df = pd.concat(train_valid_df, ignore_index=True)
valid_df = pd.DataFrame(valid_df)
test_df = pd.DataFrame(test_df)
# Save the DataFrames to CSV files
train_df.to_csv('./Processed_data/Yelp/train.csv', index=False)
train_valid_df.to_csv('./Processed_data/Yelp/train_valid.csv', index=False)
valid_df.to_csv('./Processed_data/Yelp/valid.csv', index=False)
test_df.to_csv('./Processed_data/Yelp/test.csv', index=False)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 90742/90742 [00:26<00:00, 3411.76it/s]
