In [12]:
from itertools import islice
import os
from collections import defaultdict
import seaborn as sns
import numpy as np
import torch
import json
import pandas as pd
from datetime import datetime

In [13]:
data_file = open("yelp_academic_dataset_review.json")
data = []

user_id_map, business_id_map = {}, {}
user_count, business_count = 0, 0

stars = []

for line in data_file:
    sub_data = json.loads(line)
    user, business, star, time = sub_data['user_id'], sub_data['business_id'], sub_data['stars'], sub_data['date']
    
    time = datetime.strptime(time, '%Y-%m-%d %H:%M:%S')
    time = datetime.timestamp(time)
    
    if user not in user_id_map:
        user_id_map[user] = user_count
        user_count += 1

    if business not in business_id_map:
        business_id_map[business] = business_count
        business_count += 1

    data.append([time, user_id_map[user], business_id_map[business], star])
    stars.append(star)

print(len(data))

6990280


In [14]:
#check the unique value of the star
print(np.unique(np.array(stars)))

[1. 2. 3. 4. 5.]


In [15]:
new_data = [[time, user, business] for time, user, business, star in data if star >= 4]
data = new_data

In [16]:
print(len(data))

4684545


In [17]:
#calculate degree of each user and business, 10-core filtering iteratively
k = 10
iteration = 20

while iteration > 0:
    deg_dict_user, deg_dict_business = defaultdict(int), defaultdict(int)
    user_set, business_set = set(), set()

    for time, user, business in data:
        deg_dict_user[user] += 1
        deg_dict_business[business] += 1

        if deg_dict_user[user] >= k:
            user_set.add(user)

        if deg_dict_business[business] >= k:
            business_set.add(business) 

    #10-core filtering
    filter_data = [[time, user, business] for time, user, business in data if user in user_set and business in business_set]
    print(len(data), len(filter_data))
    data = filter_data    
    iteration -= 1

print(len(data))

4684545 1844827
1844827 1629371
1629371 1548500
1548500 1529233
1529233 1520938
1520938 1518540
1518540 1517454
1517454 1517168
1517168 1517042
1517042 1517024
1517024 1517024
1517024 1517024
1517024 1517024
1517024 1517024
1517024 1517024
1517024 1517024
1517024 1517024
1517024 1517024
1517024 1517024
1517024 1517024
1517024


In [18]:
user_dict, business_dict = {}, {}
user_count, business_count = 0, 0
new_data = []

for time, user, business in data:
    if user not in user_dict:
        user_dict[user] = user_count
        user_count += 1

    if business not in business_dict:
        business_dict[business] = business_count
        business_count += 1

    new_data.append([time, user_dict[user], business_dict[business]])

data = new_data

In [19]:
len(data)

1517024

In [20]:
#For each user, split the edges into 8/1/1
data = sorted(data)
edge_dict_by_user = defaultdict(list)

for time, user, business in data:
    edge_dict_by_user[user].append(business)

def split_train_val(edge_dict_by_user):
    train, val, test = [[], []], [[], []], [[], []]

    for user in edge_dict_by_user:
        edges = edge_dict_by_user[user]
        
        test_edges = edges[-1*len(edges)//10:]
        
        train_val_edges = edges[:-1*len(edges)//10]
        train_edges = train_val_edges[:8*len(train_val_edges)//9]
        val_edges = train_val_edges[8*len(train_val_edges)//9:]
            
        train[0].extend([user]*len(train_edges))
        train[1].extend(train_edges)
        
        val[0].extend([user]*len(val_edges))
        val[1].extend(val_edges)
        
        test[0].extend([user]*len(test_edges))
        test[1].extend(test_edges)
    
    return np.array(train, dtype = int).transpose(1, 0), np.array(val, dtype = int).transpose(1, 0), np.array(test, dtype = int).transpose(1, 0)

train, val, test = split_train_val(edge_dict_by_user)
print(train.shape, val.shape, test.shape)

(1166918, 2) (170767, 2) (179339, 2)


In [21]:
np.savetxt('./train.txt', train, fmt='%i')
np.savetxt('./val.txt', val, fmt='%i')
np.savetxt('./test.txt', test, fmt='%i')