In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
# Read the full record table from disk and preview its first five rows.
fsq_nyc = pd.read_csv("dataSet_fsq_nyc.csv")
fsq_nyc.head(5)

Unnamed: 0,id,user_id,location_id,category,name,latitude,longitude,start_day,start_min,weekday,x,y,timestamp
0,377,1,4b376908f964a520ef4025e3,4bf58dd8d48988d132941735,Church,40.816816,-73.941393,0,1088,1,1000472.2,236870.78,1333476000.0
1,378,1,4b376908f964a520ef4025e3,4bf58dd8d48988d132941735,Church,40.816816,-73.941393,4,825,5,1000472.2,236870.78,1333806000.0
2,379,1,4b376908f964a520ef4025e3,4bf58dd8d48988d132941735,Church,40.816816,-73.941393,5,503,6,1000472.2,236870.78,1333873000.0
3,380,1,4b376908f964a520ef4025e3,4bf58dd8d48988d132941735,Church,40.816816,-73.941393,7,1117,1,1000472.2,236870.78,1334083000.0
4,381,1,4b376908f964a520ef4025e3,4bf58dd8d48988d132941735,Church,40.816816,-73.941393,12,505,6,1000472.2,236870.78,1334478000.0


# Basic statistics - Table 1

In [4]:
# Dataset-level counts: distinct locations and distinct users
n_unique_locations = fsq_nyc['location_id'].nunique()
n_unique_users = fsq_nyc['user_id'].nunique()

print(f"The number of unique locations: {n_unique_locations}")

print(f"The number of unique users: {n_unique_users}")

# Summary statistics of the number of records (stays) per user;
# left as the last expression so the notebook displays it
fsq_nyc['user_id'].value_counts().describe()

The number of unique locations: 4019
The number of unique users: 535


count     535.000000
mean      178.265421
std       206.082636
min        31.000000
25%        83.000000
50%       116.000000
75%       191.000000
max      2499.000000
Name: count, dtype: float64

In [5]:
# Summary statistics of how many distinct locations each user has visited
(
    fsq_nyc
    .groupby('user_id')
    .agg({'location_id': 'nunique'})
    .describe()
)

Unnamed: 0,location_id
count,535.0
mean,34.706542
std,21.37278
min,3.0
25%,20.5
50%,32.0
75%,44.0
max,226.0


In [7]:
# Summary statistics of the number of records per (user, day) pair;
# only pairs with at least one record appear in the groupby result
(
    fsq_nyc
    .groupby(['user_id', 'start_day'])
    .size()
    .describe()
)

count    44833.000000
mean         2.127272
std          2.105456
min          1.000000
25%          1.000000
50%          1.000000
75%          2.000000
max         32.000000
dtype: float64

In [9]:
# Number of distinct start_day values that occur in the data
# (missing values, if any, are not counted)
fsq_nyc['start_day'].nunique(dropna=True)

319

In [5]:
# Deserialize the training samples.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
with open("fsq_nyc_7_train.pk", "rb") as train_file:
    train_data = pickle.load(train_file)

print(f"len(train_data): {len(train_data)}")

# Show the first two samples to inspect their structure
train_data[0:2]

len(train_data): 48606


[{'X': array([1610, 1610, 1610]),
  'lon_X': array([-73.9413925, -73.9413925, -73.9413925]),
  'lat_X': array([40.8168158, 40.8168158, 40.8168158]),
  'x_X': array([1000472.2, 1000472.2, 1000472.2]),
  'y_X': array([236870.78, 236870.78, 236870.78]),
  'user_X': 1,
  'weekday_X': array([1, 5, 6]),
  'start_min_X': array([1088,  825,  503]),
  'timestamp_X': array([1.33347649e+09, 1.33380632e+09, 1.33387342e+09]),
  'diff': array([7, 3, 2]),
  'Y': 1610,
  'lon_Y': -73.94139250202096,
  'lat_Y': 40.81681580433024,
  'x_Y': 1000472.2,
  'y_Y': 236870.78,
  'weekday_Y': 1,
  'start_min_Y': 1117,
  'timestamp_Y': 1334083028.0,
  'poi_cname_X': array(['Church', 'Church', 'Church'], dtype=object)},
 {'X': array([1610, 1610, 1316]),
  'lon_X': array([-73.9413925, -73.9413925, -73.977499 ]),
  'lat_X': array([40.8168158, 40.8168158, 40.77693  ]),
  'x_X': array([1000472.2 , 1000472.2 ,  990481.83]),
  'y_X': array([236870.78, 236870.78, 222334.37]),
  'user_X': 1,
  'weekday_X': array([6, 2, 3

# Get the pretrain corpus from .pk file

In [30]:
# train_corpus is a list of lists of numbers: for every sample, the
# history sequence "X" followed by the target "Y" as the final element.
train_corpus = [
    sample["X"].tolist() + [sample["Y"]]
    for sample in train_data
]


In [31]:
# Preview the first two sequences of the corpus (history followed by target)
train_corpus[:2]

[[1610, 1610, 1610, 1610], [1610, 1610, 1316, 3203]]

In [32]:
# Mean number of entries per sequence in train_corpus
lengths = list(map(len, train_corpus))
average_length = np.mean(lengths)
print(f"Average length of train_corpus: {average_length}")

Average length of train_corpus: 24.112228942928855


In [34]:
import os

In [36]:
# Scratch check of how os.path.join combines path components; the result is
# only displayed, not stored. Requires the `import os` from the cell above.
os.path.join("11", "_mhsa_7_train.pk", "89")

'11/_mhsa_7_train.pk/89'

# Try sampling the locations for inductive setting

In [6]:
# Load the training table; the bare name on the last line makes the
# notebook display it (pandas truncates the rendering of long frames).
fsq_nyc_train = pd.read_csv("fsq_nyc_train.csv")
fsq_nyc_train

Unnamed: 0,id,user_id,location_id,category,name,latitude,longitude,start_day,start_min,weekday,x,y,timestamp,location_id_num
0,377,1,4b376908f964a520ef4025e3,4bf58dd8d48988d132941735,Church,40.816816,-73.941393,0,1088,1,1000472.20,236870.78,1.333476e+09,1610
1,378,1,4b376908f964a520ef4025e3,4bf58dd8d48988d132941735,Church,40.816816,-73.941393,4,825,5,1000472.20,236870.78,1.333806e+09,1610
2,379,1,4b376908f964a520ef4025e3,4bf58dd8d48988d132941735,Church,40.816816,-73.941393,5,503,6,1000472.20,236870.78,1.333873e+09,1610
3,380,1,4b376908f964a520ef4025e3,4bf58dd8d48988d132941735,Church,40.816816,-73.941393,7,1117,1,1000472.20,236870.78,1.334083e+09,1610
4,381,1,4b376908f964a520ef4025e3,4bf58dd8d48988d132941735,Church,40.816816,-73.941393,12,505,6,1000472.20,236870.78,1.334478e+09,1610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59696,227338,535,3fd66200f964a52020e71ee3,4bf58dd8d48988d1df931735,BBQ Joint,40.744557,-73.990650,164,1265,4,986840.72,210539.17,1.347657e+09,27
59697,227339,535,4eda64ced5fb8f213a5d740e,4bf58dd8d48988d176941735,Gym / Fitness Center,40.746119,-73.993070,165,880,5,986170.12,211108.20,1.347720e+09,3442
59698,227341,535,42911d00f964a520f5231fe3,4bf58dd8d48988d129951735,Train Station,40.750795,-73.993576,188,519,0,986029.71,212811.73,1.349686e+09,300
59699,227343,535,4a53d9a7f964a520c7b21fe3,4bf58dd8d48988d124941735,Office,40.745518,-73.992351,188,536,0,986369.32,210889.07,1.349687e+09,866


In [7]:
# Distinct numeric location ids that occur in the training table, ascending
locs_id_num = np.sort(fsq_nyc_train["location_id_num"].unique())
locs_id_num

array([   2,    3,    4, ..., 3815, 3816, 3817])

In [None]:
# Candidate pool: every distinct location id of the training table, ascending
locs_id_num = np.sort(fsq_nyc_train["location_id_num"].unique())

# Each draw takes 10% of the pool (rounded down)
n_sampled = int(0.1 * len(locs_id_num))

# Randomly sample 10% of the locations 5 times, each one with a different seed
for i in range(5):
    # Re-seed the global NumPy RNG so every draw is reproducible by its seed
    np.random.seed(i)
    locs_id_num_sample = np.sort(
        np.random.choice(locs_id_num, size=n_sampled, replace=False)
    )

    print(f"Seed {i}: {len(locs_id_num_sample)}")
    print(f"Seed {i}: {locs_id_num_sample[:7]}")

    # Store the sorted sample as a one-column table, one CSV file per seed
    locs_id_num_sample_df = pd.DataFrame({"location_id_num": locs_id_num_sample})
    locs_id_num_sample_df.to_csv(f"locs2remove_10pct_{i}.csv", index=False)


Seed 0: 381
Seed 0: [  3  24  35  41  47  90 119]
Seed 1: 381
Seed 1: [ 5  8  9 15 19 21 39]
Seed 2: 381
Seed 2: [ 6 20 22 64 73 74 78]
Seed 3: 381
Seed 3: [ 8 23 32 36 40 61 71]
Seed 4: 381
Seed 4: [15 31 38 46 70 79 92]


## Process train, val, test data

In [9]:
# Load the train / validation / test sample lists, in that order.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
loaded_splits = []
for split_path in ("fsq_nyc_7_train.pk", "fsq_nyc_7_validation.pk", "fsq_nyc_7_test.pk"):
    with open(split_path, "rb") as f:
        loaded_splits.append(pickle.load(f))

train_data, vali_data, test_data = loaded_splits

print(f"len(train_data): {len(train_data)}")
print(f"len(vali_data): {len(vali_data)}")
print(f"len(test_data): {len(test_data)}")

len(train_data): 48606
len(vali_data): 11510
len(test_data): 12141


In [None]:
def _split_by_removed_locs(samples, removed_locs):
    """Partition samples by whether they touch any removed location.

    Parameters
    ----------
    samples : iterable of dict
        Each dict must provide "X" (a sequence of location ids) and "Y"
        (a single location id).
    removed_locs : array-like
        Location ids that were sampled for removal.

    Returns
    -------
    kept : list of dict
        Samples whose "X" and "Y" contain none of the removed ids.
    inductive : list of dict
        Samples where "Y" or at least one entry of "X" is a removed id.

    Both lists keep the input order and hold the original dict objects.
    """
    # A set gives O(1) membership tests; testing `in` against a NumPy array
    # scans the whole array for every single element of every sample.
    removed = set(np.asarray(removed_locs).tolist())

    kept, inductive = [], []
    for sample in samples:
        history = np.asarray(sample["X"]).tolist()
        touches_removed = sample["Y"] in removed or not removed.isdisjoint(history)
        (inductive if touches_removed else kept).append(sample)
    return kept, inductive


# Repeat the split once per sampled location set (seeds 0-4)
for i in range(5):
    # Load the location ids sampled for this seed
    locs_id_num_sampled_df = pd.read_csv(f"locs2remove_10pct_{i}.csv")
    locs_id_num_sampled = locs_id_num_sampled_df["location_id_num"].values
    print(locs_id_num_sampled[:5])

    ## Train data: samples touching a sampled location go to the inductive list
    new_train_data, train_data_inductive = _split_by_removed_locs(train_data, locs_id_num_sampled)

    print(f"len(new_train_data): {len(new_train_data)}")
    print(f"len(train_data_inductive): {len(train_data_inductive)}")
    print(f"The length of the sum of the two: {len(new_train_data) + len(train_data_inductive)}")

    ## Validation data
    new_vali_data, vali_data_inductive = _split_by_removed_locs(vali_data, locs_id_num_sampled)

    print(f"len(new_vali_data): {len(new_vali_data)}")
    print(f"len(vali_data_inductive): {len(vali_data_inductive)}")
    print(f"The length of the sum of the two: {len(new_vali_data) + len(vali_data_inductive)}")

    ## Test data
    new_test_data, test_data_inductive = _split_by_removed_locs(test_data, locs_id_num_sampled)

    print(f"len(new_test_data): {len(new_test_data)}")
    print(f"len(test_data_inductive): {len(test_data_inductive)}")
    print(f"The length of the sum of the two: {len(new_test_data) + len(test_data_inductive)}")

    ## Save the new data
    # Only the test split's inductive part is written out; the train and
    # validation inductive lists are computed for the counts above only.
    with open(f"fsq_nyc_7_train_remove10pct_{i}.pk", "wb") as f:
        pickle.dump(new_train_data, f)

    with open(f"fsq_nyc_7_validation_remove10pct_{i}.pk", "wb") as f:
        pickle.dump(new_vali_data, f)

    with open(f"fsq_nyc_7_test_remove10pct_{i}.pk", "wb") as f:
        pickle.dump(new_test_data, f)

    with open(f"fsq_nyc_7_testinductive10pct_{i}.pk", "wb") as f:
        pickle.dump(test_data_inductive, f)

[ 3 24 35 41 47]
len(new_train_data): 19234
len(train_data_zeroshot): 29372
The length of the sum of the two: 48606
len(new_vali_data): 5633
len(vali_data_zeroshot): 5877
The length of the sum of the two: 11510
len(new_test_data): 5826
len(test_data_zeroshot): 6315
The length of the sum of the two: 12141
[ 5  8  9 15 19]
len(new_train_data): 21503
len(train_data_zeroshot): 27103
The length of the sum of the two: 48606
len(new_vali_data): 5873
len(vali_data_zeroshot): 5637
The length of the sum of the two: 11510
len(new_test_data): 6362
len(test_data_zeroshot): 5779
The length of the sum of the two: 12141
[ 6 20 22 64 73]
len(new_train_data): 21840
len(train_data_zeroshot): 26766
The length of the sum of the two: 48606
len(new_vali_data): 6530
len(vali_data_zeroshot): 4980
The length of the sum of the two: 11510
len(new_test_data): 7300
len(test_data_zeroshot): 4841
The length of the sum of the two: 12141
[ 8 23 32 36 40]
len(new_train_data): 21748
len(train_data_zeroshot): 26858
The le

## Get the total sampled locations (L^{new})

In [33]:
# Create an empty frame (no rows, because the index is empty) with a single
# "location_id_num" column, then display it.
locs_id_num_sum = pd.DataFrame({
    "location_id_num": None,
}, index=[])

locs_id_num_sum

Unnamed: 0,location_id_num


In [35]:
# Read the five per-seed samples of removed locations and stack them with a
# single concat call. Growing a frame inside the loop from an empty seed
# frame re-copies the data on every iteration, upcasts the id column to
# object dtype, and concatenating an empty frame is deprecated in recent
# pandas releases.
sampled_frames = [pd.read_csv(f"locs2remove_10pct_{i}.csv") for i in range(5)]

# Row labels are kept as read from each file (they restart at 0 per file),
# matching what the incremental concat produced.
locs_id_num_sum = pd.concat(sampled_frames, axis=0)

locs_id_num_sum

  locs_id_num_sum = pd.concat([locs_id_num_sum, locs_id_num_sampled_df], axis=0)


Unnamed: 0,location_id_num
0,3
1,24
2,35
3,41
4,47
...,...
376,3739
377,3745
378,3786
379,3796


In [37]:
# Row count before de-duplication
n_rows_before = len(locs_id_num_sum)
print(f"len(locs_id_num_sum): {n_rows_before}")

# Count rows that repeat an earlier row (locations drawn under several seeds)
num_duplicated = locs_id_num_sum.duplicated().sum()
print(f"number of duplicated locs: {num_duplicated}")

# Keep the first occurrence of each row only.
# NOTE: this rebinds locs_id_num_sum, so re-running the cell reports 0 duplicates.
locs_id_num_sum = locs_id_num_sum.drop_duplicates()
n_rows_after = len(locs_id_num_sum)
print(f"after dropping duplicated locs: len(locs_id_num_sum): {n_rows_after}")

len(locs_id_num_sum): 1905
number of duplicated locs: 312
after dropping duplicated locs: len(locs_id_num_sum): 1593


In [39]:
# Replace the row labels with a fresh 0..n-1 range, discarding the old labels
locs_id_num_sum = locs_id_num_sum.reset_index(drop=True)
locs_id_num_sum

Unnamed: 0,location_id_num
0,3
1,24
2,35
3,41
4,47
...,...
1588,3719
1589,3724
1590,3745
1591,3786


In [42]:
# Load the location lookup table and display it; the merge further down
# joins against it on its "location_id_num" column.
fsq_nyc_locs = pd.read_csv("fsq_nyc_locs.csv")
fsq_nyc_locs

Unnamed: 0,location_id,location_id_num,latitude,longitude
0,3fd66200f964a52000e71ee3,2,40.733596,-74.003139
1,3fd66200f964a52001e81ee3,3,40.756363,-73.967635
2,3fd66200f964a52003e71ee3,4,40.739685,-74.006020
3,3fd66200f964a52004e41ee3,5,40.718363,-73.990817
4,3fd66200f964a52004e61ee3,6,40.722842,-73.994116
...,...,...,...,...
3811,50ad65d5e4b0bc45ee102a57,3813,40.868763,-73.901206
3812,50aeda7b7ab4aecf17b12bd6,3814,40.757644,-73.985903
3813,50c0437f498ef71d06056bcd,3815,40.745574,-73.812498
3814,50c85b8de4b090305763bf7e,3816,40.755385,-73.983650


In [44]:
# Attach the lookup table's columns to every sampled numeric id (left join
# keeps all sampled rows), then discard the coordinates.
# NOTE: re-running this cell merges again onto the already-merged frame.
locs_id_num_sum = (
    locs_id_num_sum
    .merge(fsq_nyc_locs, on="location_id_num", how="left")
    .drop(columns=["latitude", "longitude"])
)
locs_id_num_sum

Unnamed: 0,location_id_num,location_id
0,3,3fd66200f964a52001e81ee3
1,24,3fd66200f964a5201deb1ee3
2,35,3fd66200f964a52029e61ee3
3,41,3fd66200f964a52034e81ee3
4,47,3fd66200f964a52036eb1ee3
...,...,...
1588,3719,4fb93e19e4b04abc0c479bec
1589,3724,4fbfe16ae4b05190680fe0b1
1590,3745,4fd51cf2e4b0211979c2e273
1591,3786,502826a4e4b0f23b022f3a5e


In [45]:
# Write the combined table of sampled locations to CSV, omitting the row index
locs_id_num_sum.to_csv("locs2remove_10pct_sum.csv", index=False)