In [176]:
import pandas as pd
import numpy as np
import random
import pickle as pkl
from tqdm import tqdm

In [3]:
source_pth = '../dataset_tsmc2014/dataset_TSMC2014_TKY.txt'

dist_pth = '../processed_data/tky/'


In [7]:
col_names = ['uid', 'poi', 'cat_id', 'cat_name', 'latitude', 'longitude', 'offset', 'time']
data= pd.read_csv(source_pth, sep='\t', header=None, names=col_names, encoding='unicode_escape')
# transfer 'time' from str into datetime type
data['time']=pd.to_datetime(data['time'])

In [5]:
data.head()

Unnamed: 0,uid,poi,cat_id,cat_name,latitude,longitude,offset,time
0,1541,4f0fd5a8e4b03856eeb6c8cb,4bf58dd8d48988d10c951735,Cosmetics Shop,35.705101,139.61959,540,Tue Apr 03 18:17:18 +0000 2012
1,868,4b7b884ff964a5207d662fe3,4bf58dd8d48988d1d1941735,Ramen / Noodle House,35.715581,139.800317,540,Tue Apr 03 18:22:04 +0000 2012
2,114,4c16fdda96040f477cc473a5,4d954b0ea243a5684a65b473,Convenience Store,35.714542,139.480065,540,Tue Apr 03 19:12:07 +0000 2012
3,868,4c178638c2dfc928651ea869,4bf58dd8d48988d118951735,Food & Drink Shop,35.725592,139.776633,540,Tue Apr 03 19:12:13 +0000 2012
4,1458,4f568309e4b071452e447afe,4f2a210c4b9023bd5841ed28,Housing Development,35.656083,139.734045,540,Tue Apr 03 19:18:23 +0000 2012


In [27]:
user_no= pd.unique(data['uid']).shape[0]
poi_no= pd.unique(data['poi']).shape[0]

print("#Users: {}".format(user_no))
print("#POIs: {}".format(poi_no))

uid_dict = dict(zip(pd.unique(data['uid']), range(1, user_no+1)))
poi_dict = dict(zip(pd.unique(data['poi']), range(1, poi_no+1)))

data['uid_old']=data['uid']
data['poi_old']=data['poi']

data['uid']= data['uid_old'].map(uid_dict)
data['poi']= data['poi_old'].map(poi_dict)

#Users: 2293
#POIs: 61858


In [117]:
coords_= data.groupby('poi').agg({'latitude':np.mean, 'longitude':np.mean})
coords = dict(zip(coords_.index,list(zip(coords_['latitude'],coords_['longitude']))))

In [118]:
train_set, eval_set = [], []
for uid in range(1, user_no+1):
# for uid in range(1, 2):
    true_seq= data[data['uid']==uid]['poi'].tolist()
    # as source code, take only the pois that the uid never visits before.
    false_seq= np.random.choice(list(set(range(1, user_no+1))-set(true_seq)),size=len(true_seq),replace=True)
    
    seqlen=len(true_seq)
    
    for i in range(1,seqlen-1):
        train_set.append((uid, true_seq[i], true_seq[:i], coords[true_seq[i]], 1))
        train_set.append((uid, false_seq[i], true_seq[:i], coords[false_seq[i]], 0))
    
    eval_set.append((uid, true_seq[seqlen-1], true_seq[:seqlen-1], coords[true_seq[seqlen-1]], 1))
    eval_set.append((uid, false_seq[seqlen-1], true_seq[:seqlen-1], coords[false_seq[seqlen-1]], 0))
    



In [119]:
random.shuffle(train_set)
random.shuffle(eval_set)

In [120]:
sep = len(eval_set)//2
val_set = eval_set[:sep]
test_set = eval_set[sep:]

print(f'#Train: {len(train_set)}')
print(f'#Validation: {len(val_set)}')
print(f'#Test: {len(test_set)}')

#Train: 1138234
#Validation: 2293
#Test: 2293


In [141]:
# save the datasets
with open(dist_pth+'train.pkl', 'wb') as f:
    pkl.dump(train_set, f, pkl.HIGHEST_PROTOCOL)
    pkl.dump((user_no, poi_no), f, pkl.HIGHEST_PROTOCOL)
with open(dist_pth+'test.pkl', 'wb') as f:
    pkl.dump(test_set, f, pkl.HIGHEST_PROTOCOL)
    pkl.dump((user_no, poi_no), f, pkl.HIGHEST_PROTOCOL)
with open(dist_pth+'val.pkl', 'wb') as f:
    pkl.dump(val_set, f, pkl.HIGHEST_PROTOCOL)
    pkl.dump((user_no, poi_no), f, pkl.HIGHEST_PROTOCOL)

In [173]:
from math import cos, asin, sqrt, pi

def distance(lat1, lon1, lat2, lon2):
    r = 6371
    p = pi / 180
    a = 0.5 - cos((lat2 - lat1) * p) / 2 + cos(lat1 * p) * cos(lat2 * p) * (1 - cos((lon2 - lon1) * p)) / 2
    return 2 * r * asin(sqrt(a))

In [177]:
# construct Gerographical Graph

# all the vertex
v=list(range(1, poi_no+1))

dist_m = np.zeros((poi_no, poi_no))

for i in tqdm(range(poi_no-1)):
    for j in range(i+1, poi_no):
        dist_m[i,j] = distance(coords[i+1][0], coords[i+1][1], coords[j+1][0], coords[j+1][1])



100%|█████████████████████████████████████| 61857/61857 [38:02<00:00, 27.10it/s]


NameError: name 'threshold' is not defined

In [212]:
# dist_m is too large
# np.save(dist_pth+'dist_mat.npy', dist_m)

In [184]:
# threshold distance is 0.5km
threshold_d = 0.5
adj_m = (dist_m<=threshold_d)&(dist_m>0)

In [238]:
v_neighbor = {poi: [] for poi in v}
v_edges=[[],[] ]
for poi in tqdm(v_neighbor.keys()):
    tmp = list(np.array(v)[adj_m[poi-1,:]])+list(np.array(v)[adj_m[:,poi-1]])
    v_neighbor[poi] = tmp
    v_edges[0] += [poi for _ in range(len(tmp))]
    v_edges[1] += tmp

100%|████████████████████████████████████| 61858/61858 [05:41<00:00, 181.12it/s]


In [241]:
with open(dist_pth+'dist_graph.pkl', 'wb') as f:
    pkl.dump(v_edges, f, pkl.HIGHEST_PROTOCOL)
    pkl.dump(v_neighbor, f, pkl.HIGHEST_PROTOCOL)

In [242]:
# dist_m is too large, save only the distances which have edges.

np.save(dist_pth +'dist_on_graph.npy', dist_m[np.array(v_edges[0])-1,np.array(v_edges[1])-1])