In [1]:
from itertools import islice
import os
from collections import defaultdict
import seaborn as sns
import numpy as np
import torch
import json
import pandas as pd
from datetime import datetime

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def time_converter(time):
    m, d, y = time.split(' ')
    if m[0] == '0':
        m = int(m[1])
    else:
        m = int(m)

    d = d[:-1]
    if d[0] == '0':
        d = int(d[1])
    else:
        d = int(d)

    y = int(y)

    dtime = datetime(y, m, d)

    return dtime.timestamp()

In [3]:
data_file = open("Books_5.json")
data = []

user_id_map, item_id_map = {}, {}
user_count, item_count = 0, 0

stars = []

for line in data_file:
    sub_data = json.loads(line)
#     print(sub_data)

    user, item, star, time = sub_data['reviewerID'], sub_data['asin'], sub_data['overall'], sub_data['reviewTime']
    
    time = time_converter(time)
    
    if user not in user_id_map:
        user_id_map[user] = user_count
        user_count += 1

    if item not in item_id_map:
        item_id_map[item] = item_count
        item_count += 1

    data.append([time, user_id_map[user], item_id_map[item], star])
    stars.append(star)



In [4]:
#the original dataset is so large, using only the last 25% of data
data = sorted(data)
data = data[-int(0.25*len(data)):]

#check the unique value of the star
print(np.unique(np.array(stars)))

[0. 1. 2. 3. 4. 5.]


In [5]:
new_data = [[time, user, business] for time, user, business, star in data if star >= 4]
data = new_data

In [6]:
print(len(data))

5914699


In [7]:
#calculate degree of each user and business, 10-core filtering iteratively
k = 10
iteration = 30

while iteration > 0:
    deg_dict_user, deg_dict_business = defaultdict(int), defaultdict(int)
    user_set, business_set = set(), set()

    for time, user, business in data:
        deg_dict_user[user] += 1
        deg_dict_business[business] += 1

        if deg_dict_user[user] >= k:
            user_set.add(user)

        if deg_dict_business[business] >= k:
            business_set.add(business) 

    #10-core filtering
    filter_data = [[time, user, business] for time, user, business in data if user in user_set and business in business_set]
    print(len(data), len(filter_data))
    data = filter_data    
    iteration -= 1

print(len(data))

5914699 2525398
2525398 2137030
2137030 1992554
1992554 1936857
1936857 1906576
1906576 1892303
1892303 1883521
1883521 1879064
1879064 1876218
1876218 1874642
1874642 1873615
1873615 1873062
1873062 1872723
1872723 1872525
1872525 1872435
1872435 1872373
1872373 1872337
1872337 1872310
1872310 1872292
1872292 1872274
1872274 1872265
1872265 1872247
1872247 1872238
1872238 1872238
1872238 1872238
1872238 1872238
1872238 1872238
1872238 1872238
1872238 1872238
1872238 1872238
1872238


In [8]:
user_dict, business_dict = {}, {}
user_count, business_count = 0, 0
new_data = []

for time, user, business in data:
    if user not in user_dict:
        user_dict[user] = user_count
        user_count += 1

    if business not in business_dict:
        business_dict[business] = business_count
        business_count += 1

    new_data.append([time, user_dict[user], business_dict[business]])

data = new_data

In [9]:
len(data)

1872238

In [10]:
#For each user, split the edges into 8/1/1
data = sorted(data)
edge_dict_by_user = defaultdict(list)

for time, user, business in data:
    edge_dict_by_user[user].append(business)

def split_train_val(edge_dict_by_user):
    train, val, test = [[], []], [[], []], [[], []]

    for user in edge_dict_by_user:
        edges = edge_dict_by_user[user]
        
        test_edges = edges[-1*len(edges)//10:]
        
        train_val_edges = edges[:-1*len(edges)//10]
        train_edges = train_val_edges[:8*len(train_val_edges)//9]
        val_edges = train_val_edges[8*len(train_val_edges)//9:]
            
        train[0].extend([user]*len(train_edges))
        train[1].extend(train_edges)
        
        val[0].extend([user]*len(val_edges))
        val[1].extend(val_edges)
        
        test[0].extend([user]*len(test_edges))
        test[1].extend(test_edges)
    
    return np.array(train, dtype = int).transpose(1, 0), np.array(val, dtype = int).transpose(1, 0), np.array(test, dtype = int).transpose(1, 0)

train, val, test = split_train_val(edge_dict_by_user)
print(train.shape, val.shape, test.shape)

(1446346, 2) (208271, 2) (217621, 2)


In [11]:
np.savetxt('./train.txt', train, fmt='%i')
np.savetxt('./val.txt', val, fmt='%i')
np.savetxt('./test.txt', test, fmt='%i')