In [5]:
from itertools import islice
import os
from collections import defaultdict
import seaborn as sns
import numpy as np
import torch

In [6]:
# Seed NumPy's legacy global RNG so that any np.random draws are repeatable.
# NOTE(review): none of the cells visible here sample from np.random -- confirm
# whether this seed is still needed.
np.random.seed(1)

In [7]:
# Parse the raw tab-separated interaction log into `data`, a list of
# [time, user_index, item_index] rows.
#   * raw user / item ids (columns 0 and 1) are mapped to dense 0-based
#     indices in order of first appearance (user_dict / item_dict);
#   * only the first occurrence (in file order) of each (user, item) pair is
#     kept -- edge_set remembers the pairs already emitted.
user_dict, item_dict = {}, {}
user_count, item_count = 0, 0

data = []
edge_set = set()

# The context manager closes the file handle even if a malformed row raises.
with open('gowalla.inter', 'r') as f:
    f.readline()  # consume the first line of the file

    for i, l in enumerate(f):
        # NOTE(review): the first line was already consumed by readline()
        # above, so this guard drops the *second* line of the file as well.
        # If the file has a single header line, the first interaction is
        # being discarded -- confirm against the actual file format.
        if i == 0:
            continue

        log = l.split('\t')

        user = int(log[0])
        item = int(log[1])
        time = float(log[2])

        if user not in user_dict:
            user_dict[user] = user_count
            user_count += 1

        if item not in item_dict:
            item_dict[item] = item_count
            item_count += 1

        edge = (user_dict[user], item_dict[item])
        if edge not in edge_set:
            data.append([time, edge[0], edge[1]])
            edge_set.add(edge)

In [8]:
# Number of [time, user, item] rows currently held in `data`.
print(len(data))

3981333


In [9]:
#Iterative 10-core filtering: repeatedly drop every interaction whose user or
#business has fewer than k interactions in the current data. Removing rows can
#push other nodes below k, so the pass is repeated a fixed number of times.
k = 10
iteration = 25

while iteration > 0:
    iteration -= 1

    # Pass 1: degree of every user and business in the current data.
    deg_dict_user, deg_dict_business = defaultdict(int), defaultdict(int)
    for time, user, business in data:
        deg_dict_user[user] += 1
        deg_dict_business[business] += 1

    # Pass 2: nodes whose degree reaches the threshold.
    user_set = {node for node, degree in deg_dict_user.items() if degree >= k}
    business_set = {node for node, degree in deg_dict_business.items() if degree >= k}

    # Keep an interaction only when both of its endpoints survive.
    filter_data = [
        [t, u, b]
        for t, u, b in data
        if u in user_set and b in business_set
    ]
    # Row count before / after this pass; equal numbers mean it has converged.
    print(len(data), len(filter_data))
    data = filter_data

print(len(data))

3981333 1339107
1339107 1174009
1174009 1090399
1090399 1064842
1064842 1047466
1047466 1040488
1040488 1035096
1035096 1032423
1032423 1030382
1030382 1029388
1029388 1028664
1028664 1028354
1028354 1028009
1028009 1027850
1027850 1027679
1027679 1027562
1027562 1027490
1027490 1027472
1027472 1027463
1027463 1027463
1027463 1027463
1027463 1027463
1027463 1027463
1027463 1027463
1027463 1027463
1027463


In [10]:
# Re-index the users and businesses that survived filtering to dense 0-based
# ids, assigned in order of first appearance in `data`.
# Note: user_dict is rebuilt here, replacing the mapping created while loading.
user_dict, business_dict = {}, {}
new_data = []

for time, user, business in data:
    # setdefault stores the current dict size as the index the first time an
    # id is seen and returns the stored index on every later occurrence.
    user_index = user_dict.setdefault(user, len(user_dict))
    business_index = business_dict.setdefault(business, len(business_dict))
    new_data.append([time, user_index, business_index])

# Totals of distinct users / businesses after re-indexing.
user_count, business_count = len(user_dict), len(business_dict)

data = new_data

In [11]:
# Row count of `data`, displayed as the cell's output.
len(data)

1027463

In [13]:
#For each user, split the edges into 8/1/1
# Rows are [time, user, business]; sorting them orders the interactions by
# time first (ties broken by user, then business), so each user's list below
# ends up in chronological order.
data = sorted(data)

edge_dict_by_user = defaultdict(list)
for record in data:
    user, business = record[1], record[2]
    edge_dict_by_user[user].append(business)

def split_train_val(edge_dict_by_user):
    """Split every user's edge list into train / validation / test parts.

    For a user with ``n`` edges (taken in the order they are stored):
      * test  = the last ``ceil(n / 10)`` edges,
      * train = the first ``floor(8 * m / 9)`` of the remaining ``m`` edges,
      * val   = whatever is left between train and test.

    Parameters
    ----------
    edge_dict_by_user : mapping
        Maps a user id to a sequence of item ids.

    Returns
    -------
    (train, val, test) : tuple of numpy integer arrays
        Each array has shape ``(num_edges, 2)`` with columns ``user, item``.
        Users appear in the iteration order of ``edge_dict_by_user``.
    """
    train_users, train_items = [], []
    val_users, val_items = [], []
    test_users, test_items = [], []

    for user, edges in edge_dict_by_user.items():
        total = len(edges)
        num_test = -((-total) // 10)           # ceil(total / 10)
        num_train_val = total - num_test
        num_train = (8 * num_train_val) // 9   # floor of 8/9 of the remainder

        parts = (
            (train_users, train_items, edges[:num_train]),
            (val_users, val_items, edges[num_train:num_train_val]),
            (test_users, test_items, edges[num_train_val:]),
        )
        for user_column, item_column, chunk in parts:
            user_column.extend([user] * len(chunk))
            item_column.extend(chunk)

    def as_pairs(user_column, item_column):
        # Stack the two columns as rows, then flip to one (user, item) per row.
        return np.array([user_column, item_column], dtype=int).T

    return (
        as_pairs(train_users, train_items),
        as_pairs(val_users, val_items),
        as_pairs(test_users, test_items),
    )

train, val, test = split_train_val(edge_dict_by_user)

# Group the rows of each split by user id (column 0).
# A stable sort is requested explicitly: np.argsort's default algorithm does
# not preserve the relative order of equal keys, so without it the order of a
# user's edges inside each split is not guaranteed and may differ between
# NumPy builds. With kind='stable' each user's rows stay in the order
# split_train_val produced them.
train = train[np.argsort(train[:, 0], kind='stable')]
val = val[np.argsort(val[:, 0], kind='stable')]
test = test[np.argsort(test[:, 0], kind='stable')]
print(train.shape, val.shape, test.shape)

(797565, 2) (113070, 2) (116828, 2)


In [14]:
# Write each split to its own text file in the working directory, with every
# value formatted as an integer ('%i').
for split_path, split_rows in (
    ('./train.txt', train),
    ('./val.txt', val),
    ('./test.txt', test),
):
    np.savetxt(split_path, split_rows, fmt='%i')