In [42]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from scipy.sparse import csr_matrix
import scipy.sparse as sp
from sklearn.model_selection import train_test_split

In [44]:
city = 'foursquare_nyc'
df = pd.read_csv('../' + city + '/foursquare_mapped_NYC.geo')
bvc = df['venue_category_name'].value_counts() >= 20
bvc = bvc[bvc > 0].index
df = df[df['venue_category_name'].isin(bvc)]
df.reset_index(inplace=True, drop=True)

brand2id, cate12id, cate22id = {}, {}, {}
for idx, row in df.iterrows():
    brand, cate_1, cate_2 = row['venue_category_name'], row['topCate'], row['region_id']
    if brand not in brand2id.keys():
        brand2id[brand] = len(brand2id)
    if cate_1 not in cate12id.keys():
        cate12id[cate_1] = len(cate12id)
    if cate_2 not in cate22id.keys():
        cate22id[cate_2] = len(cate22id)

brand2id = pd.DataFrame({'venue_category_name': list(
    brand2id.keys()), 'Brand_ID': list(brand2id.values())})
cate12id = pd.DataFrame(
    {'topCate': list(cate12id.keys()), 'Cate1_ID': list(cate12id.values())})
cate22id = pd.DataFrame(
    {'region_id': list(cate22id.keys()), 'Region_ID': list(cate22id.values())})


df = df.merge(brand2id, on=['venue_category_name'], how='left')
df = df.merge(cate12id, on=['topCate'], how='left')
df = df.merge(cate22id, on=['region_id'], how='left')

df = df[['geo_id', 'venue_category_name', 'Brand_ID', 'Cate1_ID', 'Region_ID']]

print(df['Brand_ID'].max())
print(df['Region_ID'].max())

94
1602


In [41]:
print(df['Brand_ID'].max())
print(df['Region_ID'].max())

94
1602


In [40]:
df.nunique()

geo_id                 9067
venue_category_name      95
Brand_ID                 95
Cate1_ID                  9
Region_ID              1603
dtype: int64

In [35]:
np.random.seed(42)
train_data, test_data = [], []
for i in range(df['Brand_ID'].max() + 1):
    data = df[df['Brand_ID'] == i]
    x_train, x_test, y_train, y_test = train_test_split(
        data[['Brand_ID', 'Cate1_ID']], data['Region_ID'],
        test_size=0.2, random_state=42)
    x_train['Region_ID'] = y_train
    x_test['Region_ID'] = y_test
    train_data.append(x_train)
    test_data.append(x_test)

In [36]:
train_data, test_data = pd.concat(
    train_data, axis=0), pd.concat(test_data, axis=0)
print(train_data.shape, "train_data.shape")
print(test_data.shape, "test_data.shape")

(7217, 3) train_data.shape
(1850, 3) test_data.shape


In [39]:
import os


dir_path = os.path.join(city, 'split')

if not os.path.exists(dir_path):
    os.makedirs(dir_path)
train_data.to_pickle(os.path.join(dir_path, 'train.pkl'))
test_data.to_pickle(os.path.join(dir_path, 'test.pkl'))

In [12]:
city = 'foursquare_nyc'
df = pd.read_csv('../' + city + '/foursquare_mapped_NYC.geo')
df

Unnamed: 0.1,Unnamed: 0,geo_id,type,Long,Lat,venue_category_id,venue_category_name,region_id,cate_id,topCate
0,0,0,Point,-74.003139,40.733596,4bf58dd8d48988d1e7931735,Music Venue,689,145,Arts & Entertainment
1,1,1,Point,-73.967644,40.756368,4bf58dd8d48988d11d941735,Bar,838,21,Nightlife Spot
2,2,2,Point,-73.981393,40.724827,4bf58dd8d48988d118941735,Bar,596,21,Nightlife Spot
3,3,3,Point,-74.006020,40.739685,4bf58dd8d48988d10c941735,French Restaurant,690,89,Food
4,4,4,Point,-73.990817,40.718363,4bf58dd8d48988d116941735,Bar,559,21,Nightlife Spot
...,...,...,...,...,...,...,...,...,...,...
9881,9984,9984,Point,-74.108426,40.889077,4bf58dd8d48988d124941735,Office,1489,150,Professional & Other Places
9882,9985,9985,Point,-73.984446,40.728397,4d954b06a243a5684965b473,Residential Building (Apartment / Condo),642,172,Residence
9883,9986,9986,Point,-74.004831,40.741677,4f2a23984b9023bd5841ed2c,Moving Target,741,142,Travel & Transport
9884,9987,9987,Point,-73.981934,40.757790,4bf58dd8d48988d162941735,Other Great Outdoors,840,151,Outdoors & Recreation


In [13]:
df.nunique()

Unnamed: 0             9886
geo_id                 9886
type                      1
Long                   8850
Lat                    9518
venue_category_id       333
venue_category_name     219
region_id              1671
cate_id                 219
topCate                   9
dtype: int64

In [9]:
city = 'foursquare_nyc'
dyna = pd.read_csv('../' + city + '/' + city+'.dyna')
dyna

Unnamed: 0,dyna_id,type,time,entity_id,location
0,0,trajectory,2012-04-03T14:00:09Z,0,1230
1,1,trajectory,2012-04-03T14:00:25Z,1,1879
2,2,trajectory,2012-04-03T14:02:24Z,2,6161
3,3,trajectory,2012-04-03T14:03:00Z,3,6859
4,4,trajectory,2012-04-03T14:04:00Z,4,4017
...,...,...,...,...,...
179463,179463,trajectory,2013-02-15T21:29:11Z,113,0
179464,179464,trajectory,2013-02-15T21:29:11Z,568,964
179465,179465,trajectory,2013-02-15T21:31:35Z,490,5135
179466,179466,trajectory,2013-02-15T21:33:16Z,691,9933


In [None]:
dyna = dyna.sort_values(['User', 'timestamp'])

In [15]:
threshold = 20
bvc = df['cate_id'].value_counts() >= threshold
bvc = bvc[bvc > 0].index
df = df[df['cate_id'].isin(bvc)]
df.reset_index(inplace=True, drop=True)

In [31]:

brand2id, cate12id, cate22id = {}, {}, {}
for idx, row in df.iterrows():
    brand, cate_1, cate_2 = row['Brand'], row['topCate'], row['region_id']
    if brand not in brand2id.keys():
        brand2id[brand] = len(brand2id)
    if cate_1 not in cate12id.keys():
        cate12id[cate_1] = len(cate12id)
    if cate_2 not in cate22id.keys():
        cate22id[cate_2] = len(cate22id)

brand2id = pd.DataFrame(
    {'Brand': list(brand2id.keys()), 'Brand_ID': list(brand2id.values())})
cate12id = pd.DataFrame(
    {'cate_1': list(cate12id.keys()), 'Cate1_ID': list(cate12id.values())})
cate22id = pd.DataFrame(
    {'cate_2': list(cate22id.keys()), 'Cate2_ID': list(cate22id.values())})

KeyError: 'Brand'

In [20]:
cate12id = {}
for idx, row in df.iterrows():
    cate_1 = row['topCate']
    if cate_1 not in cate12id.keys():
        cate12id[cate_1] = len(cate12id)
cate12id = pd.DataFrame(
    {'topCate': list(cate12id.keys()), 'topCate_ID': list(cate12id.values())})
df = df.merge(cate12id, on=['topCate'], how='left')
df
# df = df[['geo_id', 'venue_category_name', 'cate_id', 'topCate_ID', 'region_id']]

Unnamed: 0.1,Unnamed: 0,geo_id,type,Long,Lat,venue_category_id,venue_category_name,region_id,cate_id,topCate,topCate_ID_x,topCate_ID_y,topCate_ID
0,0,0,Point,-74.003139,40.733596,4bf58dd8d48988d1e7931735,Music Venue,689,145,Arts & Entertainment,0,0,0
1,1,1,Point,-73.967644,40.756368,4bf58dd8d48988d11d941735,Bar,838,21,Nightlife Spot,1,1,1
2,2,2,Point,-73.981393,40.724827,4bf58dd8d48988d118941735,Bar,596,21,Nightlife Spot,1,1,1
3,3,3,Point,-74.006020,40.739685,4bf58dd8d48988d10c941735,French Restaurant,690,89,Food,2,2,2
4,4,4,Point,-73.990817,40.718363,4bf58dd8d48988d116941735,Bar,559,21,Nightlife Spot,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9062,9981,9981,Point,-73.943611,40.703281,4bf58dd8d48988d1f1931735,General Entertainment,481,98,Arts & Entertainment,0,0,0
9063,9984,9984,Point,-74.108426,40.889077,4bf58dd8d48988d124941735,Office,1489,150,Professional & Other Places,3,3,3
9064,9985,9985,Point,-73.984446,40.728397,4d954b06a243a5684965b473,Residential Building (Apartment / Condo),642,172,Residence,8,8,8
9065,9986,9986,Point,-74.004831,40.741677,4f2a23984b9023bd5841ed2c,Moving Target,741,142,Travel & Transport,4,4,4


In [22]:
df = df[['geo_id', 'venue_category_name', 'cate_id', 'topCate_ID', 'region_id']]

In [23]:
df = df.rename(columns={
    'venue_category_name': 'Name',
    'cate_id': 'Brand_ID',
    'topCate_ID': 'Cate1_ID',
    'region_id': 'Region_ID'
})

In [24]:
df

Unnamed: 0,geo_id,Name,Brand_ID,Cate1_ID,Region_ID
0,0,Music Venue,145,0,689
1,1,Bar,21,1,838
2,2,Bar,21,1,596
3,3,French Restaurant,89,2,690
4,4,Bar,21,1,559
...,...,...,...,...,...
9062,9981,General Entertainment,98,0,481
9063,9984,Office,150,3,1489
9064,9985,Residential Building (Apartment / Condo),172,8,642
9065,9986,Moving Target,142,4,741


In [None]:
df.nunique()

geo_id       9067
Name           95
Brand_ID       95
Cate1_ID        9
Region_ID    1603
dtype: int64

In [30]:
df[df['Brand_ID'] == 0]

Unnamed: 0,geo_id,Name,Brand_ID,Cate1_ID,Region_ID


In [29]:
np.random.seed(42)
train_data, test_data = [], []
for i in range(df['Brand_ID'].max() + 1):
    data = df[df['Brand_ID'] == i]
    print(data)
    x_train, x_test, y_train, y_test = train_test_split(
        data[['Brand_ID', 'Cate1_ID']], data['Region_ID'],
        test_size=0.2, random_state=42)
    x_train['Region_ID'] = y_train
    x_test['Region_ID'] = y_test
    train_data.append(x_train)
    test_data.append(x_test)
train_data, test_data = pd.concat(
    train_data, axis=0), pd.concat(test_data, axis=0)
print(train_data.shape, "train_data.shape")
print(test_data.shape, "test_data.shape")

Empty DataFrame
Columns: [geo_id, Name, Brand_ID, Cate1_ID, Region_ID]
Index: []


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
df

Unnamed: 0.1,Unnamed: 0,geo_id,type,Long,Lat,venue_category_id,venue_category_name,region_id,cate_id,topCate,topCate_ID
0,0,0,Point,-74.003139,40.733596,4bf58dd8d48988d1e7931735,Music Venue,689,145,Arts & Entertainment,0
1,1,1,Point,-73.967644,40.756368,4bf58dd8d48988d11d941735,Bar,838,21,Nightlife Spot,1
2,2,2,Point,-73.981393,40.724827,4bf58dd8d48988d118941735,Bar,596,21,Nightlife Spot,1
3,3,3,Point,-74.006020,40.739685,4bf58dd8d48988d10c941735,French Restaurant,690,89,Food,2
4,4,4,Point,-73.990817,40.718363,4bf58dd8d48988d116941735,Bar,559,21,Nightlife Spot,1
...,...,...,...,...,...,...,...,...,...,...,...
9062,9981,9981,Point,-73.943611,40.703281,4bf58dd8d48988d1f1931735,General Entertainment,481,98,Arts & Entertainment,0
9063,9984,9984,Point,-74.108426,40.889077,4bf58dd8d48988d124941735,Office,1489,150,Professional & Other Places,3
9064,9985,9985,Point,-73.984446,40.728397,4d954b06a243a5684965b473,Residential Building (Apartment / Condo),642,172,Residence,8
9065,9986,9986,Point,-74.004831,40.741677,4f2a23984b9023bd5841ed2c,Moving Target,741,142,Travel & Transport,4


In [None]:
print(df['geo_id'].max())
print(df['region_id'].max())

9988
1681


In [None]:
np.random.seed(42)
train_data, test_data = [], []
for i in range(df['geo_id'].max() + 1):
    data = df[df['geo_id'] == i]
    x_train, x_test, y_train, y_test = train_test_split(
        data[['geo_id', 'cate_id']], data['region_id'],
        test_size=0.2, random_state=42)
    x_train['region_id'] = y_train
    x_test['region_id'] = y_test
    train_data.append(x_train)
    test_data.append(x_test)
train_data, test_data = pd.concat(
    train_data, axis=0), pd.concat(test_data, axis=0)

ValueError: With n_samples=1, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [None]:
bvc = bvc[bvc > 0].index
df = df[df['Brand'].isin(bvc)]

KeyError: 'Brand'