# Document Overview
**Purpose:**
1. Generate training and testing sequences
2. Generate negative samples for each sequence
3. Generate category distribution matrix for each collective POI
4. Generate index map for reindexed POIs and Users

**Input file:** 
1. Original Data: 'data_SIN.csv'
2. Helper Function Library: 'Helper_Functions.py'

**Output file:** 
1. Sample sets consisting of POI, distance, time, type, category, ground-truth and negative-sample sequences
2. POI, user, category id mapping from old to new
3. Collective POI's category distribution dictionary
4. POI distance matrix 
5. Other parameters: POI max_distance and max_sequence_length

**Creation Date:** 4th Nov 2019

In [1]:
# dependencies
import numpy as np
import pandas as pd
import os
# import argparse

In [2]:
# import helper functions

import Helper_Functions as Helper

### Adjustable parameters:

1. **small_sample** *(boolean)*: Whether to use a small sample (the first 20000 rows of the data) for testing
2. **augment_sample** *(boolean)*: Whether to perform sample augmentation
3. **pad_data** *(boolean)*: Whether to perform padding on data sequence

4. **min_seq_len** *(int)*: Minimum No. POIs for a valid sequence
5. **min_seq_num** *(int)*: Minimum No. valid sequences for a valid user
6. **neg_sample_num** *(int)*: Number of negative samples for each POI

In [3]:
# setup parameters (for ipython execution)

small_sample = False  # if True, only the first rows of the data are loaded (quick test run)
augment_sample = True  # if True, visit sequences are augmented with Helper.aug_sequence
pad_data = False  # if True, visit sequences are padded with Helper.pad_sequence

min_seq_len = 2  # minimum number of POIs for a valid sequence
min_seq_num = 2  # minimum number of valid sequences for a valid user
neg_sample_num = 5  # number of negative POIs generated for each sequence

## 1. Import data

In [4]:
# load the check-in table once; a small-sample run keeps only the first 20000 rows
data = pd.read_csv('./data_SIN.csv')
if small_sample:
    data = data[:20000]

In [5]:
data.head()

Unnamed: 0,User_id,Local_sg_time,UTC_time,POI_id,POI_name,POI_Type,Location_id,Location_name,L1_Venue_category_name,L2_Venue_category_name,street_info,Latitude,Longitude,Date,Time,L1_id,L2_id,L1_fre,L2_fre
0,1,2012/05/07 22:59:32,Mon May 07 14:59:32 +0000 2012,884,HOLLAND VILLAGE,Combined,2689,Bar @ HOLLAND VILLAGE,Bar,Bar,"21A Lorong Mambong Holland Village, Singapore ...",1.311546,103.794714,20120507,225932,1,1,2989,922
1,1,2012/05/07 23:45:07,Mon May 07 15:45:07 +0000 2012,884,HOLLAND VILLAGE,Combined,816,Ice Cream Shop @ HOLLAND VILLAGE,Dessert Shop,Ice Cream Shop,"21 Lor Mambong, Singapore 277680",1.311395,103.794778,20120507,234507,2,2,2217,1073
2,1,2012/05/12 06:16:09,Fri May 11 22:16:09 +0000 2012,1270,"Pier Tanah Merah Ferry Terminal, 50 Tanah Mera...",Independent,117,"Pier Tanah Merah Ferry Terminal, 50 Tanah Mera...",Pier,Pier,"Tanah Merah Ferry Terminal, 50 Tanah Merah Fer...",1.313664,103.988574,20120512,61609,3,3,393,393
3,1,2012/05/13 10:39:50,Sun May 13 02:39:50 +0000 2012,1270,"Pier Tanah Merah Ferry Terminal, 50 Tanah Mera...",Independent,117,"Pier Tanah Merah Ferry Terminal, 50 Tanah Mera...",Pier,Pier,"Tanah Merah Ferry Terminal, 50 Tanah Merah Fer...",1.313664,103.988574,20120513,103950,3,3,393,393
4,1,2012/05/14 17:18:58,Mon May 14 09:18:58 +0000 2012,1530,THE CENTREPOINT,Combined,2529,Dessert Shop @ THE CENTREPOINT,Dessert Shop,Dessert Shop,"176 Orchard Rd, Singapore 238843",1.301605,103.839778,20120514,171858,2,4,2217,1107


In [6]:
data.columns

Index(['User_id', 'Local_sg_time', 'UTC_time', 'POI_id', 'POI_name',
       'POI_Type', 'Location_id', 'Location_name', 'L1_Venue_category_name',
       'L2_Venue_category_name', 'street_info', 'Latitude', 'Longitude',
       'Date', 'Time', 'L1_id', 'L2_id', 'L1_fre', 'L2_fre'],
      dtype='object')

## 2. Generate Visit Sequence 
Generate valid index sequences for each valid user

In [7]:
# verify that the id columns are consecutive; skipped for small-sample runs
# (the second argument 1 is presumably the expected first id -- confirm in Helper_Functions.py)

if not small_sample:
    check_columns = ['User_id', 'Location_id', 'POI_id', 'L1_id', 'L2_id']
    for id_column in check_columns:
        Helper.check_is_consecutive(np.array(data[id_column]), 1)

In [8]:
# build the visit sequences; the helper also returns the maximum sequence length,
# the row indices of the visits that were kept, and the user re-index mapping

(visit_sequences,
 max_seq_len,
 valid_visits,
 user_reIndex_mapping) = Helper.generate_sequence(data, min_seq_len, min_seq_num)

# stop right away when the filtering left no sequence at all
assert visit_sequences, 'no qualified sequence after filtering!'



In [9]:
Helper.peep_dictionary(visit_sequences)

5  :
 [list([41, 42, 43]) list([45, 46, 47, 48])]
dictionary size:  862


In [10]:
max_seq_len 

20

In [11]:
len(valid_visits)

11771

In [12]:
user_reIndex_mapping

array([   5,    7,   17,   25,   26,   40,   60,   63,   66,   69,   80,
         85,   90,  107,  108,  113,  116,  118,  119,  120,  121,  124,
        129,  130,  131,  135,  141,  144,  145,  146,  149,  151,  162,
        166,  177,  181,  185,  186,  191,  199,  211,  212,  215,  223,
        224,  226,  227,  231,  234,  239,  242,  244,  250,  254,  265,
        266,  270,  273,  278,  280,  288,  299,  300,  306,  313,  314,
        325,  326,  345,  346,  349,  357,  367,  372,  377,  378,  379,
        387,  388,  389,  390,  399,  410,  414,  427,  428,  435,  439,
        444,  446,  449,  450,  464,  480,  484,  486,  487,  494,  503,
        531,  532,  547,  552,  554,  572,  573,  576,  577,  582,  607,
        613,  621,  637,  639,  646,  649,  650,  651,  657,  661,  663,
        667,  668,  669,  671,  675,  680,  688,  694,  698,  700,  706,
        711,  717,  719,  725,  729,  732,  734,  737,  744,  758,  768,
        770,  773,  788,  791,  796,  798,  805,  8

In [13]:
# augment sequences (optional)
# NOTE(review): ground_truth_dict is only assigned inside this branch, yet later
# cells read it unconditionally -- with augment_sample = False they would
# presumably fail with a NameError; confirm and add a fallback if that mode is needed.

if augment_sample:
    # aug_sequence returns the augmented sequences together with their ground truths
    visit_sequences, ground_truth_dict = Helper.aug_sequence(visit_sequences, min_len=3)

In [14]:
Helper.peep_dictionary(visit_sequences)

5  :
 [list([41, 42, 43]) list([45, 46, 47]) list([45, 46, 47, 48])]
dictionary size:  862


In [15]:
Helper.peep_dictionary(ground_truth_dict)

5  :
 [list([43]) list([47, 48]) list([48])]
dictionary size:  862


In [16]:
# pad sequences (optional)
# when enabled, visit_sequences is replaced by the output of
# Helper.pad_sequence, which is given max_seq_len

if pad_data:
    
    visit_sequences = Helper.pad_sequence(visit_sequences, max_seq_len)

In [17]:
Helper.peep_dictionary(visit_sequences)

5  :
 [list([41, 42, 43]) list([45, 46, 47]) list([45, 46, 47, 48])]
dictionary size:  862


## 3. Prepare Input Sequences
Five input sequences, parallel to the Visit Sequence, are prepared:
1. POI sequence
2. Distance sequence
3. Time sequence
4. Type sequence
5. Category sequence

In [18]:
data[41:44]

Unnamed: 0,User_id,Local_sg_time,UTC_time,POI_id,POI_name,POI_Type,Location_id,Location_name,L1_Venue_category_name,L2_Venue_category_name,street_info,Latitude,Longitude,Date,Time,L1_id,L2_id,L1_fre,L2_fre
41,5,2013-01-25 10:00:23,Fri Jan 25 02:00:23 +0000 2013,905,HSBC VIVOCITY,Combined,1031,Food Court @ HSBC VIVOCITY,Food Court,Food Court,"1 Harbourfront Walk, Singapore 098585",1.264741,103.821788,20130125,100023,5,6,9500,9500
42,5,2013-01-25 11:00:01,Fri Jan 25 03:00:01 +0000 2013,1316,RESORTS WORLD SENTOSA,Combined,2353,Resort @ RESORTS WORLD SENTOSA,Hotel,Resort,"Resort World Sentosa, 26 Sentosa Gateway, #01-...",1.25631,103.820897,20130125,110001,19,20,1878,1585
43,5,2013-01-25 11:18:17,Fri Jan 25 03:18:17 +0000 2013,1599,"Theme Park 5 Garden Ave, Singapore 099625",Independent,278,"Theme Park 5 Garden Ave, Singapore 099625",Theme Park,Theme Park,"5 Garden Ave, Singapore 099625",1.256461,103.821359,20130125,111817,13,14,1168,1150


In [19]:
# generate POI sequence
# returns the per-user POI sequences (re-indexed POI ids) and the re-index mapping;
# POI_reIndex_mapping is indexed by the new id and holds the original POI_id
# (it is used that way below when filtering by POI_reIndex_mapping[0])

POI_sequences, POI_reIndex_mapping = Helper.generate_POI_sequences(data, visit_sequences)

In [20]:
POI_sequences[0] # POI_sequence for first user

[[715, 1010, 1214], [490, 565, 614], [490, 565, 614, 974]]

In [21]:
POI_reIndex_mapping

array([   1,    2,    3, ..., 1631, 1632, 1633], dtype=int64)

In [22]:
print(POI_reIndex_mapping[16])
print(POI_reIndex_mapping[37])
print(POI_reIndex_mapping[50])

21
45
62


In [23]:
# generate distance sequence
# also returns max_dist, the maximum distance between two consecutive visits
# (saved later for the distance embedding)

dist_sequences, max_dist = Helper.generate_dist_sequences(data, visit_sequences)

In [24]:
dist_sequences[0] # dist_sequence for first user # can perform analysis

[[0, 1, 1], [0, 2, 7], [0, 2, 7, 2]]

In [25]:
max_dist # maximum distance between two consecutive visits 

28

In [26]:
# generate time sequence
# NOTE(review): judging by the sample output, each entry looks like the hour of day
# of the visit -- confirm against Helper_Functions.py

time_sequences = Helper.generate_time_sequences(data, visit_sequences)

In [27]:
time_sequences[0] # time_sequence for first user

[[10, 11, 11], [11, 19, 19], [11, 19, 19, 19]]

In [28]:
# generate type sequence
# NOTE(review): in the sample output the flag appears to be 1 for 'Combined' POIs
# and 0 for 'Independent' ones -- confirm against Helper_Functions.py

type_sequences = Helper.generate_type_sequence(data, visit_sequences)

In [29]:
type_sequences[0] # type_sequence for first user

[[1, 1, 0], [0, 0, 1], [0, 0, 1, 0]]

In [30]:
# generate category sequence
# returns the per-user category sequences (re-indexed category ids) and the
# category re-index mapping (index = new id, value = original id; the original
# ids appear to be L2_id values -- TODO confirm)

cat_sequences, cat_reIndex_mapping = Helper.generate_cat_sequences(data, visit_sequences)

In [31]:
cat_sequences[0] # cat_sequence for first user

[[5, 19, 13], [25, 5, 11], [25, 5, 11, 26]]

In [32]:
cat_reIndex_mapping 

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 102, 103, 104, 105, 106,
       107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120,
       121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
       135, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
       149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 162,
       163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 175, 176,
       177, 178, 179, 180, 182, 183, 184, 185, 186, 187, 188, 18

In [33]:
print(cat_reIndex_mapping[2])
print(cat_reIndex_mapping[12])
print(cat_reIndex_mapping[8])

3
13
9


In [34]:
# generate ground truth for each sequence
# built from ground_truth_dict (produced by the augmentation step) and
# POI_reIndex_mapping; the displayed result holds re-indexed POI ids that line up
# with the entries of POI_sequences

ground_truth_sequences = Helper.generate_ground_truth_sequences(data, ground_truth_dict, POI_reIndex_mapping)

In [35]:
ground_truth_sequences[0]

[[1214], [614, 974], [974]]

## 4. Extra Data Preparation

### Collective POI's category distribution

For each collective POI, count the number of stores belonging to each category it contains.
The distribution is recorded in a 2-layer dictionary of form:

{ POI_id (new id) : { category_id (new id): store count (int)} }

In [36]:
# generate collective POI's category distribution
# result has the form {new POI id: Counter({new category id: store count})}
# NOTE(review): this call emits a pandas SettingWithCopyWarning raised inside the
# helper (assignment to collective_POI_visit['L2_id']); the fix belongs in
# Helper_Functions.py, not in this cell

poi_cat_distrib = Helper.generate_cat_distrib(data, valid_visits, POI_reIndex_mapping, cat_reIndex_mapping)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  collective_POI_visit['L2_id'] = collective_POI_visit['L2_id'].apply(lambda x: _old_id_to_new(cat_reIndex_mapping, x))


In [37]:
Helper.peep_dictionary(poi_cat_distrib)

715  :
 Counter({5: 29, 53: 13, 15: 8, 153: 5, 35: 4, 6: 3, 41: 3, 18: 2, 43: 2, 121: 2, 114: 1, 157: 1, 7: 1})
dictionary size:  426


In [38]:
POI_reIndex_mapping

array([   1,    2,    3, ..., 1631, 1632, 1633], dtype=int64)

In [39]:
data[data['POI_id'] == 1]

Unnamed: 0,User_id,Local_sg_time,UTC_time,POI_id,POI_name,POI_Type,Location_id,Location_name,L1_Venue_category_name,L2_Venue_category_name,street_info,Latitude,Longitude,Date,Time,L1_id,L2_id,L1_fre,L2_fre
115,12,2012-07-14 13:28:17,Sat Jul 14 05:28:17 +0000 2012,1,313 @ SOMERSET,Combined,1221,Coffee Shop @ 313 @ SOMERSET,Coffee Shop,Coffee Shop,"313 Orchard Rd, Singapore 238895",1.301198,103.838527,20120714,132817,6,7,5519,5519
116,12,2012-07-14 15:32:41,Sat Jul 14 07:32:41 +0000 2012,1,313 @ SOMERSET,Combined,1613,Clothing Store @ 313 @ SOMERSET,Clothing Store,Clothing Store,"313 Orchard Rd, Singapore 238895",1.301339,103.838439,20120714,153241,16,17,2762,1038
117,12,2012-07-14 19:10:26,Sat Jul 14 11:10:26 +0000 2012,1,313 @ SOMERSET,Combined,1382,Swiss Restaurant @ 313 @ SOMERSET,Swiss Restaurant,Swiss Restaurant,"313 Orchard Rd, Singapore 238895",1.301124,103.838139,20120714,191026,41,47,270,270
152,16,2012-08-20 10:19:23,Mon Aug 20 02:19:23 +0000 2012,1,313 @ SOMERSET,Combined,1382,Swiss Restaurant @ 313 @ SOMERSET,Swiss Restaurant,Swiss Restaurant,"313 Orchard Rd, Singapore 238895",1.301124,103.838139,20120820,101923,41,47,270,270
212,22,2012-07-15 18:37:41,Sun Jul 15 10:37:41 +0000 2012,1,313 @ SOMERSET,Combined,339,Clothing Store @ 313 @ SOMERSET,Clothing Store,Clothing Store,"313 Orchard Rd, Singapore 238895",1.300915,103.838664,20120715,183741,16,17,2762,1038
396,40,2013-08-10 16:25:57,Sat Aug 10 08:25:57 +0000 2013,1,313 @ SOMERSET,Combined,1613,Clothing Store @ 313 @ SOMERSET,Clothing Store,Clothing Store,"313 Orchard Rd, Singapore 238895",1.301339,103.838439,20130810,162557,16,17,2762,1038
609,61,2013-06-05 17:20:07,Wed Jun 05 09:20:07 +0000 2013,1,313 @ SOMERSET,Combined,3392,Fried Chicken Joint @ 313 @ SOMERSET,Fried Chicken Joint,Fried Chicken Joint,"313 Orchard Rd, Singapore 238895",1.301224,103.838527,20130605,172007,48,56,1816,1816
694,70,2012-06-21 14:05:38,Thu Jun 21 06:05:38 +0000 2012,1,313 @ SOMERSET,Combined,556,Clothing Store @ 313 @ SOMERSET,Clothing Store,Clothing Store,"313 Orchard Rd, Singapore 238895",1.300914,103.838447,20120621,140538,16,17,2762,1038
697,70,2012-06-21 19:00:22,Thu Jun 21 11:00:22 +0000 2012,1,313 @ SOMERSET,Combined,556,Clothing Store @ 313 @ SOMERSET,Clothing Store,Clothing Store,"313 Orchard Rd, Singapore 238895",1.300914,103.838447,20120621,190022,16,17,2762,1038
794,80,2012-04-12 12:41:19,Thu Apr 12 04:41:19 +0000 2012,1,313 @ SOMERSET,Combined,1382,Swiss Restaurant @ 313 @ SOMERSET,Swiss Restaurant,Swiss Restaurant,"313 Orchard Rd, Singapore 238895",1.301124,103.838139,20120412,124119,41,47,270,270


In [40]:
poi_cat_distrib[0]

Counter({16: 24,
         45: 9,
         5: 14,
         59: 9,
         3: 2,
         48: 4,
         6: 14,
         39: 8,
         76: 2,
         172: 13,
         34: 2,
         30: 1,
         114: 6,
         107: 3,
         121: 1})

In [41]:
# keep only the rows of data whose index is listed in valid_visits
valid_visit_data = data[data.index.isin(valid_visits)]

In [42]:
valid_visit_data[valid_visit_data['POI_id'] == POI_reIndex_mapping[0]]

Unnamed: 0,User_id,Local_sg_time,UTC_time,POI_id,POI_name,POI_Type,Location_id,Location_name,L1_Venue_category_name,L2_Venue_category_name,street_info,Latitude,Longitude,Date,Time,L1_id,L2_id,L1_fre,L2_fre
396,40,2013-08-10 16:25:57,Sat Aug 10 08:25:57 +0000 2013,1,313 @ SOMERSET,Combined,1613,Clothing Store @ 313 @ SOMERSET,Clothing Store,Clothing Store,"313 Orchard Rd, Singapore 238895",1.301339,103.838439,20130810,162557,16,17,2762,1038
794,80,2012-04-12 12:41:19,Thu Apr 12 04:41:19 +0000 2012,1,313 @ SOMERSET,Combined,1382,Swiss Restaurant @ 313 @ SOMERSET,Swiss Restaurant,Swiss Restaurant,"313 Orchard Rd, Singapore 238895",1.301124,103.838139,20120412,124119,41,47,270,270
1064,107,2012-07-06 14:09:37,Fri Jul 06 06:09:37 +0000 2012,1,313 @ SOMERSET,Combined,257,Food Court @ 313 @ SOMERSET,Food Court,Food Court,"313 Orchard Rd, Singapore 238895",1.301178,103.838418,20120706,140937,5,6,9500,9500
1832,177,2012-08-05 20:02:04,Sun Aug 05 12:02:04 +0000 2012,1,313 @ SOMERSET,Combined,339,Clothing Store @ 313 @ SOMERSET,Clothing Store,Clothing Store,"313 Orchard Rd, Singapore 238895",1.300915,103.838664,20120805,200204,16,17,2762,1038
2366,223,2012-12-09 16:04:19,Sun Dec 09 08:04:19 +0000 2012,1,313 @ SOMERSET,Combined,2773,Japanese Restaurant @ 313 @ SOMERSET,Asian Restaurant,Japanese Restaurant,"313 Orchard Rd, Singapore 238895",1.300864,103.838374,20121209,160419,18,61,11526,1750
3608,325,2013-01-26 13:37:14,Sat Jan 26 05:37:14 +0000 2013,1,313 @ SOMERSET,Combined,257,Food Court @ 313 @ SOMERSET,Food Court,Food Court,"313 Orchard Rd, Singapore 238895",1.301178,103.838418,20130126,133714,5,6,9500,9500
4025,357,2012-06-03 15:04:55,Sun Jun 03 07:04:55 +0000 2012,1,313 @ SOMERSET,Combined,2057,Dessert Shop @ 313 @ SOMERSET,Dessert Shop,Dessert Shop,"313 Orchard Rd, Singapore 238895",1.301038,103.838568,20120603,150455,2,4,2217,1107
7405,582,2012-05-29 16:43:39,Tue May 29 08:43:39 +0000 2012,1,313 @ SOMERSET,Combined,257,Food Court @ 313 @ SOMERSET,Food Court,Food Court,"313 Orchard Rd, Singapore 238895",1.301178,103.838418,20120529,164339,5,6,9500,9500
8677,657,2012-04-22 13:56:22,Sun Apr 22 05:56:22 +0000 2012,1,313 @ SOMERSET,Combined,2775,Japanese Restaurant @ 313 @ SOMERSET,Asian Restaurant,Japanese Restaurant,"313 Orchard Rd, Singapore 238895",1.300910,103.838375,20120422,135622,18,61,11526,1750
8744,661,2012-10-28 17:45:50,Sun Oct 28 09:45:50 +0000 2012,1,313 @ SOMERSET,Combined,2798,Snack Place @ 313 @ SOMERSET,Snack Place,Snack Place,"313 Orchard Rd, Singapore 238895",1.301190,103.838294,20121028,174550,43,50,161,161


In [43]:
print(cat_reIndex_mapping[10])
print(cat_reIndex_mapping[26])

11
27


### Negative Samples for Each Sequence

For every sequence of every user, generate `neg_sample_num` negative POIs.

Negative POIs satisfy the following criteria:

1. The POI does not appear in the true sequence 

2. The following two distances should be as close to each other as possible:
    *a) the distance between the negative POI and the true destination* and 
    *b) the distance between the true second-last POI and the true destination*
   
The output neg_sequences should be a 3d array of shape [user, seq, neg_sample]

In [44]:
# store distance between each valid POI (time consuming)
# built from data and POI_reIndex_mapping; the displayed matrix looks
# lower-triangular (zeros on and above the diagonal) -- TODO confirm
    
dist_mat = Helper.generate_POI_dist_mat(data, POI_reIndex_mapping)



In [45]:
dist_mat

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 5.31944421,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 3.90112899,  4.84153765,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 1.7214249 ,  4.8141696 ,  5.08556577, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.3748249 ,  4.59032709,  5.5002234 , ...,  0.67173523,
         0.        ,  0.        ],
       [14.81053363, 19.96239947, 16.08917572, ..., 16.03372476,
        16.63653582,  0.        ]])

In [46]:
POI_reIndex_mapping[:10]

array([ 1,  2,  3,  4,  5,  6,  7,  8, 10, 11], dtype=int64)

In [47]:
data[data['POI_id'] == 1].iloc[0]

User_id                                                 12
Local_sg_time                          2012-07-14 13:28:17
UTC_time                    Sat Jul 14 05:28:17 +0000 2012
POI_id                                                   1
POI_name                                    313 @ SOMERSET
POI_Type                                          Combined
Location_id                                           1221
Location_name                 Coffee Shop @ 313 @ SOMERSET
L1_Venue_category_name                         Coffee Shop
L2_Venue_category_name                         Coffee Shop
street_info               313 Orchard Rd, Singapore 238895
Latitude                                            1.3012
Longitude                                          103.839
Date                                              20120714
Time                                                132817
L1_id                                                    6
L2_id                                                   

In [48]:
data[data['POI_id'] == 136].iloc[0]

User_id                                               17
Local_sg_time                        2013-07-20 17:25:43
UTC_time                  Sat Jul 20 09:25:43 +0000 2013
POI_id                                               136
POI_name                  Beach Beach Station, Singapore
POI_Type                                     Independent
Location_id                                         3148
Location_name             Beach Beach Station, Singapore
L1_Venue_category_name                             Beach
L2_Venue_category_name                             Beach
street_info                     Beach Station, Singapore
Latitude                                         1.25139
Longitude                                        103.818
Date                                            20130720
Time                                              172543
L1_id                                                 26
L2_id                                                 28
L1_fre                         

In [49]:
# generate negative samples
# produces neg_sample_num negative samples for every sequence of every user;
# each sample is displayed as a triple, presumably
# [POI id, category id (or -1), type flag] -- TODO confirm against Helper_Functions.py

neg_sequences = Helper.generate_neg_sequences(POI_sequences, dist_mat, neg_sample_num, data, POI_reIndex_mapping, cat_reIndex_mapping)

In [50]:
neg_sequences[0] # negative samples for each sequence of 1st user

[[[1241, 127, 0],
  [1221, -1, 1],
  [1240, 117, 0],
  [1218, -1, 1],
  [1217, 130, 0]],
 [[620, -1, 1], [831, -1, 1], [830, -1, 1], [829, -1, 1], [828, -1, 1]],
 [[1241, 127, 0], [1073, -1, 1], [1074, -1, 1], [1075, -1, 1], [1076, -1, 1]]]

In [51]:
cat_reIndex_mapping

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  46,  47,  48,  49,  50,  51,  52,  53,
        54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
        67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
        80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,
        93,  94,  95,  96,  97,  98,  99, 100, 102, 103, 104, 105, 106,
       107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118, 119, 120,
       121, 122, 123, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134,
       135, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148,
       149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 162,
       163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 175, 176,
       177, 178, 179, 180, 182, 183, 184, 185, 186, 187, 188, 18

## 5. Form Sample Sets

Concatenate the input sequences with the ground truth and the negative samples to form a sample, which is a tuple consisting of: (POI_seq, dist_seq, time_seq, type_seq, cat_seq, ground_truth_seq, neg_sample)

Organise samples according to users in a dictionary of form:

{ User_id (new id) : sample sets } 

In [52]:
# form sample set for each user
# each sample is a tuple in the argument order used here:
# (POI, distance, time, type, category, ground truth, negative samples);
# the helper also prints the total number of users and samples

sample_sets = Helper.form_sample_sets(POI_sequences, dist_sequences, time_sequences, type_sequences, cat_sequences, ground_truth_sequences, neg_sequences)

Total user: 862 -- Total sample: 5800


In [53]:
Helper.peep_dictionary(sample_sets)

0  :
 [([715, 1010, 1214], [0, 1, 1], [10, 11, 11], [1, 1, 0], [5, 19, 13], [1214], [[1241, 127, 0], [1221, -1, 1], [1240, 117, 0], [1218, -1, 1], [1217, 130, 0]]), ([490, 565, 614], [0, 2, 7], [11, 19, 19], [0, 0, 1], [25, 5, 11], [614, 974], [[620, -1, 1], [831, -1, 1], [830, -1, 1], [829, -1, 1], [828, -1, 1]]), ([490, 565, 614, 974], [0, 2, 7, 2], [11, 19, 19, 19], [0, 0, 1, 0], [25, 5, 11, 26], [974], [[1241, 127, 0], [1073, -1, 1], [1074, -1, 1], [1075, -1, 1], [1076, -1, 1]])]
dictionary size:  862


In [54]:
sample_sets

{0: [([715, 1010, 1214],
   [0, 1, 1],
   [10, 11, 11],
   [1, 1, 0],
   [5, 19, 13],
   [1214],
   [[1241, 127, 0],
    [1221, -1, 1],
    [1240, 117, 0],
    [1218, -1, 1],
    [1217, 130, 0]]),
  ([490, 565, 614],
   [0, 2, 7],
   [11, 19, 19],
   [0, 0, 1],
   [25, 5, 11],
   [614, 974],
   [[620, -1, 1], [831, -1, 1], [830, -1, 1], [829, -1, 1], [828, -1, 1]]),
  ([490, 565, 614, 974],
   [0, 2, 7, 2],
   [11, 19, 19, 19],
   [0, 0, 1, 0],
   [25, 5, 11, 26],
   [974],
   [[1241, 127, 0],
    [1073, -1, 1],
    [1074, -1, 1],
    [1075, -1, 1],
    [1076, -1, 1]])],
 1: [([715, 489, 607],
   [0, 6, 6],
   [9, 13, 17],
   [1, 0, 1],
   [5, 25, 15],
   [607],
   [[620, -1, 1], [831, -1, 1], [830, -1, 1], [829, -1, 1], [828, -1, 1]]),
  ([164, 929],
   [0, 2],
   [9, 13],
   [1, 1],
   [33, 34],
   [929],
   [[1241, 127, 0],
    [1028, 19, 0],
    [1029, 19, 0],
    [1030, 19, 0],
    [1031, 19, 0]])],
 2: [([599, 1100],
   [0, 1],
   [15, 15],
   [1, 0],
   [4, 22],
   [1100],
   [[

## 6. Output Files

In [55]:
# choose the output directory: small-sample runs are written to a separate folder
# NOTE: `dir` shadows the Python builtin of the same name; the name is kept
# because the save cells that follow refer to it.

dir = './test_np_save/' if small_sample else './np_save/'

In [56]:
# create directory if not exists
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the check-then-create race of a separate os.path.exists() test

os.makedirs(dir, exist_ok=True)

In [57]:
# save concatenated samples
# writes the per-user sample_sets dictionary to 'sample_sets.pkl' in the output directory

Helper.save_dict(sample_sets, dir + 'sample_sets.pkl')

In [58]:
# save id mappings: each re-index array goes to its own .npy file in the output directory

for file_name, id_mapping in (
    ('POI_reIndex_mapping.npy', POI_reIndex_mapping),
    ('user_reIndex_mapping.npy', user_reIndex_mapping),
    ('cat_reIndex_mapping.npy', cat_reIndex_mapping),
):
    np.save(dir + file_name, id_mapping)

In [59]:
# save collective POI's category distribution dictionary
# written to 'poi_cat_distrib.pkl' in the output directory

Helper.save_dict(poi_cat_distrib, dir + 'poi_cat_distrib.pkl')

In [60]:
# save POI distance matrix
# written to 'dist_mat.npy' in the output directory

np.save(dir + 'dist_mat.npy', dist_mat)

In [61]:
# save other relevant parameters, one .npy file per value

for file_name, value in (
    ('max_dist.npy', max_dist),              # max distance (for distance embedding)
    ('max_seq_len.npy', max_seq_len),        # max sequence length (for input size)
    ('neg_sample_num.npy', neg_sample_num),  # number of negative samples (for negative input size)
):
    np.save(dir + file_name, value)