# preprocess
0. Get basic information
1. Add "region" column
2. Add "time_delta" and "dist_delta" columns
3. Split the dataset into train and test data.
4. Get vector of every POI and region using node2vec
5. Calculate the distance between every pair of regions as the weight of "Region-Graph1"
6. Calculate the visit times between 2 regions in time_period as the weight of Graph2
7. Save all data as corresponding files


In [1]:
import pandas as pd
import numpy as np
import geohash
from geopy.distance import distance
import datetime
from tqdm import tqdm
from collections import Counter
import random
import pickle

In [2]:
data_path = "./modeldata/gowalla/dataset_gowalla.csv"
# Length of the geohash prefix that identifies a region:
# gowalla,brightkite --5; foursquareNY,foursquareTK--6
region_size = 5

# Load the check-ins and order them per user, chronologically, so that the row
# directly above any row is the same user's previous check-in (except on the
# first row of each user).
data = pd.read_csv(data_path, parse_dates=['time'])
data = data.sort_values(by=['user', 'time'], ascending=True)
data = data.reset_index(drop=True)

# same_user[i] is True when row i belongs to the same user as row i-1.
# Row 0 and the first row of every user stay False.
users = data["user"].to_numpy()
same_user = np.zeros(len(data), dtype=bool)
same_user[1:] = users[1:] == users[:-1]

# get "region" column: geohash of the coordinates, truncated to region_size.
data["region"] = [
    geohash.encode(float(lat), float(lon))[:region_size]
    for lat, lon in zip(data["latitude"], data["longitude"])
]

# get "time_delta" column: seconds since the same user's previous check-in,
# 0 on each user's first row. total_seconds() returns float seconds directly
# and does not depend on how a given pandas version casts timedelta64 to float.
time_delta = data["time"].diff().dt.total_seconds()
data["time_delta"] = time_delta.where(same_user, 0.0).fillna(0.0)

# get "dist_delta" column: metres from the same user's previous check-in,
# 0 on each user's first row. Coordinates are passed to geopy as numeric
# (lat, lon) tuples rather than "lat,lon" strings, so values whose str() form
# uses scientific notation never go through string parsing. Cross-user pairs
# are skipped instead of being computed and overwritten afterwards.
latitudes = data["latitude"].to_numpy()
longitudes = data["longitude"].to_numpy()
dist_delta = np.zeros(len(data), dtype=float)
for i in tqdm(range(1, len(data)), ncols=80, colour="blue"):
    if same_user[i]:
        here = (latitudes[i], longitudes[i])
        previous = (latitudes[i - 1], longitudes[i - 1])
        dist_delta[i] = round(distance(here, previous).m, 2)
data["dist_delta"] = dist_delta

print(data.head(30))

# Dataset-level counts; regions_num is reused by later cells to size the
# region weight matrices.
regions_num = len(set(data["region"]))
user_num = len(set(data["user"]))
location_num = len(set(data["location"]))
print(f"there are totally {regions_num} regions")
print(f"there are totally {user_num} users")
print(f"there are totally {location_num} locations")

100%|[34m███████████████████████████████████[0m| 79631/79631 [00:53<00:00, 1480.78it/s][0m
100%|[34m██████████████████████████████████[0m| 79631/79631 [00:03<00:00, 19947.74it/s][0m

    user                time   latitude  longitude  location region  \
0      0 2010-09-24 14:58:57  30.267910 -97.749312     21714  9v6kp   
1      0 2010-09-24 21:32:13  30.269103 -97.749395    420315  9v6kp   
2      0 2010-09-25 02:00:54  30.317016 -97.719569   1145567  9v6s2   
3      0 2010-09-25 19:21:51  30.244761 -97.748961     25151  9v6kp   
4      0 2010-09-26 16:43:45  30.250618 -97.765900    121955  9v6kp   
5      0 2010-09-27 14:52:01  30.269103 -97.749395    420315  9v6kp   
6      0 2010-09-28 23:47:07  30.231375 -97.797455     15590  9v67y   
7      0 2010-09-29 00:31:24  30.237911 -97.799961    539065  9v6kn   
8      0 2010-09-30 23:23:22  30.264336 -97.741046    211286  9v6kp   
9      0 2010-10-01 14:36:39  30.267910 -97.749312     21714  9v6kp   
10     0 2010-10-02 01:48:43  30.255814 -97.763418   4256132  9v6kp   
11     0 2010-10-02 22:21:22  30.405304 -97.878399   1151119  9v6ku   
12     0 2010-10-03 22:21:49  30.244860 -97.757163     18417  9v6kp   
13    




In [3]:
def split_data(data, ratio, seed=6):
    """Split check-ins into train and test sets at user level.

    Every distinct user is assigned to the train set with probability
    ``ratio`` and to the test set otherwise, so all rows of one user end up
    in the same part.

    Parameters
    ----------
    data : pandas.DataFrame
        Check-in table with at least the columns "user" and "location".
    ratio : float
        Probability that a user is placed in the train set.
    seed : int, optional
        Seed for ``random``; the values used so far were
        gowalla--6; brightkite--6,foursquareNY--10,foursquareTK--10.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        Rows keep their original index labels. Users appear in the reverse of
        the order in which the user sets are iterated (the row order the
        previous prepend-in-a-loop implementation produced).
    """
    random.seed(seed)
    train_user, test_user = set(), set()
    for user in set(data["user"]):
        # One draw per user, compared against the requested ratio.
        if random.random() < ratio:
            train_user.add(user)
        else:
            test_user.add(user)
    print(f"there are {len(train_user)} users in train set")
    print(f"there are {len(test_user)} users in test set")
    if test_user:
        print("the rate of train set and test set is", len(train_user)/len(test_user))
    else:
        # Avoid ZeroDivisionError when every user landed in the train set.
        print("the rate of train set and test set is", "undefined (test set is empty)")

    # Slice the table once per user instead of re-scanning it for every user.
    frames_by_user = {user: frame for user, frame in data.groupby("user", sort=False)}

    def _gather(users):
        """Concatenate the rows of ``users`` in a single pd.concat call."""
        frames = [frames_by_user[u] for u in reversed(list(users)) if u in frames_by_user]
        if not frames:
            # Empty part: keep the column layout (and dtypes) of the input.
            return data.iloc[0:0].copy()
        return pd.concat(frames)

    train_data = _gather(train_user)
    test_data = _gather(test_user)

    # Share of all POIs that occur in the train part; computed from `data`
    # itself so the function does not rely on a notebook-level global.
    train_POI_num = len(set(train_data["location"]))
    total_POI_num = len(set(data["location"]))
    share = round(train_POI_num/total_POI_num*100, 2) if total_POI_num else 0.0
    print(f"there are {train_POI_num} POIs in train set,{share}% of all")
    return train_data, test_data

In [4]:
# Output directory for the files written below (also used by later cells).
filepath = "./modeldata/gowalla/"

# User-level split; each part is renumbered from 0 afterwards.
train_data, test_data = (
    part.reset_index(drop=True) for part in split_data(data, ratio=0.8)
)

# Persist both parts without writing the index column.
train_data.to_csv(filepath + "train_data.csv", index=False)
test_data.to_csv(filepath + "test_data.csv", index=False)

there are 2256 users in train set
there are 588 users in test set
the rate of train set and test set is 3.836734693877551


100%|[34m██████████████████████████████████████[0m| 2256/2256 [00:09<00:00, 236.75it/s][0m
100%|[34m████████████████████████████████████████[0m| 588/588 [00:01<00:00, 428.71it/s][0m


there are 14121 POIs in train set,87.96% of all


# POI2Vec
Get POI2idx, idx2POI and poi_embedding

In [5]:
%%time
from node2vec import node2vec
Node2Vec_P = node2vec(data, train_data, filepath, node = "location")
print("loaction number of train data = ",len(set(train_data["location"])))
POI2idx, idx2POI, poi_embedding = Node2Vec_P.get_vec(C = 2, K = 15, epoch = 2,
                                                     MAX_VOCAB_SIZE = 20000,
                                                     EMBEDDING_SIZE = 100,
                                                     batch_size = 64,
                                                     lr = 0.001)
print(len(POI2idx))
print(poi_embedding.shape)

loaction number of train data =  14121
- -- -- -- -- -- -- -- -- -- -
----------start training----------
epoch 0 iteration 0 256.69427490234375
epoch 0 iteration 300 255.176513671875
epoch 0 iteration 600 232.31893920898438
epoch 0 iteration 900 231.271240234375
epoch 1 iteration 0 223.53216552734375
epoch 1 iteration 300 218.6048126220703
epoch 1 iteration 600 213.3433074951172
epoch 1 iteration 900 203.63412475585938
epoch 2 iteration 0 208.79315185546875
epoch 2 iteration 300 209.73739624023438
epoch 2 iteration 600 195.9639892578125
epoch 2 iteration 900 194.16796875
epoch 3 iteration 0 179.53750610351562
epoch 3 iteration 300 171.20785522460938
epoch 3 iteration 600 171.16656494140625
epoch 3 iteration 900 155.7832794189453
epoch 4 iteration 0 154.16192626953125
epoch 4 iteration 300 143.3174591064453
epoch 4 iteration 600 131.94546508789062
epoch 4 iteration 900 136.5543212890625
epoch 5 iteration 0 119.89389038085938
epoch 5 iteration 300 112.84535217285156
epoch 5 iteration 600

In [6]:
import numpy as np
import pickle

# Reload the "location" artefacts from disk (presumably the files written by
# the node2vec run above -- confirm against node2vec.py).
poi_prefix = filepath + "location"

poi_embedding = np.load(poi_prefix + 'embedding.npy')
print("poi_embedding.shape = ", poi_embedding.shape)

# NOTE: pickle.load can execute arbitrary code; only load files you produced.
with open(poi_prefix + 'idx2node.pickle', 'rb') as f:
    idx2POI = pickle.load(f)
print(len(idx2POI))

with open(poi_prefix + 'node2idx.pickle', 'rb') as f:
    POI2idx = pickle.load(f)
print(len(POI2idx))

poi_embedding.shape =  (21932, 100)
16054
16054


# Region2Vec

In [7]:
%%time
from node2vec import node2vec
Node2Vec_R = node2vec(data, train_data, filepath, node = "region")
region2idx, idx2region, region_embedding = Node2Vec_R.get_vec(C = 6, K = 15, epoch = 3,
                                                              MAX_VOCAB_SIZE = len(set(train_data["region"])),
                                                              EMBEDDING_SIZE = 50,
                                                              batch_size = 64,
                                                              lr = 0.001)
print(len(region2idx))
print(region_embedding.shape)

- -- -- -- -- -- -- -- -- -- -
----------start training----------
epoch 0 iteration 0 615.6592407226562
epoch 0 iteration 300 242.82127380371094
epoch 0 iteration 600 168.8892364501953
epoch 0 iteration 900 102.94479370117188
epoch 1 iteration 0 111.2987060546875
epoch 1 iteration 300 89.10466003417969
epoch 1 iteration 600 63.6519660949707
epoch 1 iteration 900 70.28202056884766
epoch 2 iteration 0 73.64806365966797
epoch 2 iteration 300 59.85749053955078
epoch 2 iteration 600 46.93395233154297
epoch 2 iteration 900 48.37794494628906
epoch 3 iteration 0 49.073551177978516
epoch 3 iteration 300 59.65852355957031
epoch 3 iteration 600 40.528072357177734
epoch 3 iteration 900 47.431884765625
epoch 4 iteration 0 45.318172454833984
epoch 4 iteration 300 44.669857025146484
epoch 4 iteration 600 40.87523651123047
epoch 4 iteration 900 39.6197509765625
epoch 5 iteration 0 40.70005798339844
epoch 5 iteration 300 41.39249801635742
epoch 5 iteration 600 42.50654983520508
epoch 5 iteration 900 39

In [8]:
# Reload the "region" artefacts from disk (presumably the files written by
# the node2vec run above -- confirm against node2vec.py).
region_prefix = filepath + "region"

region_embedding = np.load(region_prefix + 'embedding.npy')
print("region_embedding.shape = ", region_embedding.shape)

# NOTE: pickle.load can execute arbitrary code; only load files you produced.
with open(region_prefix + 'idx2node.pickle', 'rb') as f:
    idx2region = pickle.load(f)
print(len(idx2region))

with open(region_prefix + 'node2idx.pickle', 'rb') as f:
    region2idx = pickle.load(f)
print(len(region2idx))

region_embedding.shape =  (594, 50)
594
594


# weight matrix of dist-graph
Calculate the distance between every pair of regions as the weight of "Region-Graph1"

In [9]:
def distance_cal(regions_num, id2region, max_dist=30000, ndigits=4):
    """Build the column-normalised inverse-distance matrix between regions.

    Parameters
    ----------
    regions_num : int
        Number of regions; the result is ``regions_num x regions_num``.
    id2region : mapping or sequence
        ``id2region[i]`` is the geohash string of region ``i`` for every
        ``i`` in ``range(regions_num)``.
    max_dist : float, optional
        Pairs whose distance in metres is not strictly between 0 and
        ``max_dist`` get weight 0.
    ndigits : int, optional
        Number of decimals the raw weight ``1 / metres`` is rounded to.
        NOTE(review): with the default of 4, pairs further apart than about
        20 km round to 0.0 and closer pairs collapse onto multiples of
        0.0001 -- confirm this coarseness is intended.

    Returns
    -------
    numpy.ndarray
        float32 matrix, symmetric before normalisation, whose non-zero
        columns are scaled to unit L2 norm.
    """
    # Decode each geohash once instead of once per pair.
    centers = [geohash.decode(id2region[i]) for i in range(regions_num)]

    distance_of_regions = np.zeros([regions_num, regions_num])
    for i in tqdm(range(regions_num), ncols=80, colour="red"):
        for j in range(i):
            # Single distance evaluation per pair (lower triangle only).
            meters = distance(centers[i], centers[j]).m
            if 0 < meters < max_dist:
                distance_of_regions[i, j] = round(1 / meters, ndigits)
    # Mirror the lower triangle to obtain the symmetric matrix.
    distance_of_regions = distance_of_regions + distance_of_regions.T
    distance_of_regions = distance_of_regions.astype(np.float32)
    # normalization of every columns (all-zero columns are left untouched)
    for col in tqdm(range(regions_num), ncols=80, colour="red"):
        s = np.sum(distance_of_regions[:, col]**2)
        if s != 0:
            distance_of_regions[:, col] = distance_of_regions[:, col]/np.sqrt(s)

    return distance_of_regions

In [10]:
# Build the distance-based region weight matrix from the index -> geohash mapping.
dist_matrix = distance_cal(regions_num, idx2region)
# save dist_matrix
np.save(filepath+'dist_matrix.npy',dist_matrix)

100%|[31m█████████████████████████████████████████[0m| 594/594 [00:44<00:00, 13.41it/s][0m
100%|[31m██████████████████████████████████████[0m| 594/594 [00:00<00:00, 18853.80it/s][0m


# weight matrix of visit-graph
Calculate the visit times between 2 regions in time_period as the weight of Graph2

In [11]:
def covisit_cal(timedelta, data, regions_num, region2idx):
    """Build the column-normalised co-visit weight matrix between regions.

    For every user and every ordered pair of that user's check-ins
    (earlier -> later) whose time gap ``d`` satisfies ``0 < d < timedelta``,
    ``1 / d`` (``d`` in hours) is added to the cell
    ``[region of earlier, region of later]``.

    Parameters
    ----------
    timedelta : datetime.timedelta
        Exclusive upper bound on the gap between two check-ins.
    data : pandas.DataFrame
        Check-ins with the columns "user", "time" and "region".
    regions_num : int
        Number of regions; the result is ``regions_num x regions_num``.
    region2idx : mapping
        Region identifier -> row/column index.

    Returns
    -------
    numpy.ndarray
        float32 matrix whose non-zero columns are scaled to unit L2 norm.
    """
    times_between_regions = np.zeros([regions_num, regions_num])
    no_gap = datetime.timedelta(hours=0)
    one_hour = np.timedelta64(1, 'h')
    for _, group in tqdm(data.groupby("user"), ncols=80, colour="red"):
        if len(group) < 2:
            # A single check-in cannot form a pair.
            continue
        group = group.sort_values('time')
        # Pull the columns out once; per-pair DataFrame.loc access and the
        # repeated region2idx lookups dominated the run time.
        visit_times = group["time"].tolist()
        visit_regions = [region2idx[region] for region in group["region"]]

        # Times are ascending, so the earliest check-in still inside the
        # window can only move forward as i advances.
        window_start = 0
        for i in range(1, len(visit_times)):
            while (window_start < i
                   and visit_times[i] - visit_times[window_start] >= timedelta):
                window_start += 1
            for j in range(window_start, i):
                gap = visit_times[i] - visit_times[j]
                if gap > no_gap:  # simultaneous check-ins contribute nothing
                    hours = gap / one_hour
                    times_between_regions[visit_regions[j], visit_regions[i]] += 1 / hours
    times_between_regions = times_between_regions.astype(np.float32)
    # normalization of every columns (all-zero columns are left untouched)
    for col in tqdm(range(regions_num), ncols=80, colour="red"):
        s = np.sum(times_between_regions[:, col]**2)
        if s != 0:
            times_between_regions[:, col] = times_between_regions[:, col]/np.sqrt(s)

    return times_between_regions

In [12]:
# Only check-in pairs of the same user less than 6 days apart contribute.
max_timedelta = datetime.timedelta(days = 6)
visit_matrix = covisit_cal(max_timedelta, train_data, regions_num, region2idx)
# save visit_matrix
np.save(filepath+'visit_matrix.npy',visit_matrix)

100%|[31m███████████████████████████████████████[0m| 2256/2256 [01:28<00:00, 25.39it/s][0m
100%|[31m██████████████████████████████████████[0m| 594/594 [00:00<00:00, 37816.92it/s][0m


In [13]:
# Sanity check: after column normalisation the squared entries of a non-zero
# column (here column 1) should sum to approximately 1.
np.sum(visit_matrix[:,1]**2)

0.99999994

# weight matrix of visit-graph

In [None]:
def visit_num_cal(data, regions_num, region2idx):
    """Build the column-normalised visit-count matrix of the regions.

    The number of rows of ``data`` that fall into a region is added to every
    entry of that region's row; afterwards each non-zero column is scaled to
    unit L2 norm.

    NOTE(review): because a whole row receives the count, all columns of the
    result are identical -- confirm that this broadcast is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Check-ins with a "region" column.
    regions_num : int
        Number of regions; the result is ``regions_num x regions_num``.
    region2idx : mapping
        Region identifier -> row index.

    Returns
    -------
    numpy.ndarray
        float64 matrix of normalised visit counts.
    """
    visit_num_m = np.zeros([regions_num, regions_num])

    # One entry per region: how many check-ins it received.
    region_counts = data.groupby("region").size()
    for region, count in tqdm(region_counts.items(), total=len(region_counts),
                              ncols=80, colour="red"):
        visit_num_m[region2idx[region]] += count

    # Scale every column to unit L2 norm, skipping all-zero columns.
    for col in tqdm(range(regions_num), ncols=80, colour="red"):
        column = visit_num_m[:, col]
        squared_sum = np.sum(column**2)
        if squared_sum == 0:
            continue
        visit_num_m[:, col] = column/np.sqrt(squared_sum)
    return visit_num_m

In [None]:
# Build the visit-count weight matrix from the train split.
visit_num_matrix = visit_num_cal(train_data, regions_num,region2idx)
# save visit_num_matrix
np.save(filepath+'visit_num_matrix.npy',visit_num_matrix)