# ML Models for Geographic Features

This notebook uses geographic features to create and evaluate ML models and should therefore be run __after__ ``features_geo.ipynb``.

### Import packages

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local ``utils`` module) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import warnings
warnings.filterwarnings(action="ignore")

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.feature_selection import RFECV
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from IPython.display import display_html

from utils import *

Define constants.

- ``PATH``: Path to the base data folder
- ``NO_COUNTS``: Remove count-based features which are not in the cited papers
- ``K_FOLDS``: Number of folds to perform for cross validation

In [3]:
# Base data folder. The original machine-specific location stays the default,
# but it can be overridden through the WIKIPEDIA_RSS_PATH environment variable
# so the notebook also runs on other machines without editing this cell.
PATH = os.environ.get("WIKIPEDIA_RSS_PATH", "C:/Users/Tim/.keras/datasets/wikipedia_rss/")
# File names are appended to PATH by plain string concatenation further down,
# so make sure the folder always ends with a path separator.
if not PATH.endswith(("/", "\\")):
    PATH += "/"

# When True, the count-based feature columns are left out of the feature matrix.
NO_COUNTS = True
# Number of folds used for cross validation.
K_FOLDS = 5

Read structured data with added geographic features.

In [4]:
# Load the geographic feature table and pass it through `add_dummy_category`
# (star-imported from `utils`; judging by the displayed columns it appends the
# `dummy_*` category indicator columns -- confirm against `utils`).
structured = add_dummy_category(
    pd.read_csv(PATH + "structured_geo_features.csv")
)
print(structured.shape)
structured.head(10)

(2836, 449)


Unnamed: 0,venue_id,latitude,longitude,borough,_category,total_visits,jazz_club_count,gym_count,indian_restaurant_count,bowling_alley_count,...,dummy_Hookah_Bar,dummy_Hotel_Bar,dummy_Juice_Bar,dummy_Karaoke_Bar,dummy_Piano_Bar,dummy_Pub,dummy_Sake_Bar,dummy_Sports_Bar,dummy_Whisky_Bar,dummy_Wine_Bar
0,3fd66200f964a52001e51ee3,40.726961,-73.980039,Manhattan,Bar,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3fd66200f964a52003e51ee3,40.724822,-73.981456,Manhattan,Bar,15,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3fd66200f964a52010e51ee3,40.727027,-73.982702,Manhattan,Bar,14,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,3fd66200f964a52011e81ee3,40.762812,-73.967519,Manhattan,Bar,18,0,3,1,0,...,0,0,0,0,0,0,0,0,0,0
4,3fd66200f964a52018e51ee3,40.725112,-73.981278,Manhattan,Bar,29,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,3fd66200f964a5201be41ee3,40.719238,-73.985588,Manhattan,Bar,17,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,3fd66200f964a52025e41ee3,40.72488,-73.994685,Manhattan,Bar,26,1,2,2,0,...,0,0,0,0,0,0,0,0,0,0
7,3fd66200f964a52029e31ee3,40.725638,-73.984561,Manhattan,Bar,15,0,0,8,1,...,0,0,0,0,0,0,0,0,0,0
8,3fd66200f964a5202ee41ee3,40.728543,-73.984699,Manhattan,Bar,34,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,3fd66200f964a52033e61ee3,40.72478,-73.994703,Manhattan,Bar,10,1,2,2,0,...,0,0,0,0,0,0,0,0,0,0


Split data into train and test.

In [6]:
# Pick the feature columns by label slice (pandas label slices include both
# end points). With NO_COUNTS the count-based columns are skipped and the
# slice starts at "area_density"; otherwise it starts at the first count column.
if NO_COUNTS:
    X = structured.loc[:, "area_density":"dummy_Wine_Bar"]
else:
    X = structured.loc[:, "jazz_club_count":"dist_center"]
# Regression target: total number of visits per venue.
y = structured["total_visits"]

# `np.float` was only an alias of the builtin `float`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used to keep this cell
# runnable on current NumPy while producing the same float64 arrays.
# The fixed random_state keeps the 75/25 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X.to_numpy(dtype=float),
                                                    y.to_numpy(dtype=float),
                                                    test_size=0.25, random_state=42)

# Sanity check: full shape followed by the train and test shapes.
print(f"{X.shape}: {X_train.shape} + {X_test.shape}")
print(f"{y.shape}: {y_train.shape} + {y_test.shape}")

(2836, 21): (2127, 21) + (709, 21)
(2836,): (2127,) + (709,)


### Linear(GEO)

In [10]:
# Compare the three linear regressors with default hyper-parameters on the
# hold-out split. Each model sits behind a StandardScaler so the features are
# standardised using the training data only.
for lin_reg in (LinearRegression, Lasso, Ridge):
    print(f"Regressor: {lin_reg.__name__}")
    # Pipeline.fit returns the fitted pipeline, so build and fit in one step.
    reg = make_pipeline(StandardScaler(), lin_reg()).fit(X_train, y_train)
    lin_pred = reg.predict(X_test)
    # `get_metrics` (from `utils`) appears to print MAE / RMSE / NDCG itself.
    # `metrics_lin` is overwritten on every pass and ends up holding the last model's result.
    metrics_lin = get_metrics(y_test, lin_pred)
    print()

Regressor: LinearRegression
MAE:  12.535
RMSE: 20.375
NDCG:  0.798

Regressor: Lasso
MAE:  12.677
RMSE: 20.831
NDCG:  0.782

Regressor: Ridge
MAE:  12.535
RMSE: 20.376
NDCG:  0.798



In [11]:
# K-fold cross validation of the scaled Lasso pipeline on the full feature
# matrix `X` and target `y`, using K_FOLDS folds.
# NOTE(review): `cross_validation` is star-imported from `utils`; presumably it
# does the splitting, fitting and metric printing itself -- confirm there.
lin_reg = make_pipeline(StandardScaler(), Lasso())
lin_results, lin_cols = cross_validation(lin_reg, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  12.72 ± 0.53
RMSE: 20.59 ± 1.55
NDCG: 0.797 ± 0.016


In [13]:
# Validate the scaled Lasso pipeline with `soos_validation` (from `utils`),
# splitting the data by the "borough" column.
estimator = make_pipeline(StandardScaler(), Lasso())
error_df, col_names, metrics, stds = soos_validation(
    estimator,
    structured,
    return_std=True,
    verbose_drop=False,
    split_var="borough",
)
# NOTE(review): the third metric is unpacked as `r_squareds`, while the printed
# summary reports NDCG -- confirm which one `soos_validation` actually returns.
maes, rmses, r_squareds = metrics
# Pair each MAE with a borough name in order of first appearance.
# Assumes `soos_validation` walks the boroughs in that same order -- TODO confirm.
list(zip(maes, structured["borough"].unique()))

Predicting borough 1/5
Predicting borough 2/5
Predicting borough 3/5
Predicting borough 4/5
Predicting borough 5/5

Weighted metrics:
MAE:  13.51 ± 4.36
RMSE: 20.42 ± 6.38
NDCG: 0.697 ± 0.136


[(17.255641562456795, 'Manhattan'),
 (8.178975450841115, 'Brooklyn'),
 (8.01144123414179, 'JC + SI'),
 (9.80570139060686, 'Bronx + Queens'),
 (5.46563475608764, 'Newark')]

### SVR(GEO)

In [14]:
# Support vector regression with default hyper-parameters behind a
# StandardScaler, evaluated on the hold-out split.
svr_reg = make_pipeline(StandardScaler(), SVR())
svr_reg.fit(X_train, y_train)
svr_pred = svr_reg.predict(X_test)
metrics_svr = get_metrics(y_test, svr_pred)

MAE:  11.101
RMSE: 22.365
NDCG:  0.816


In [15]:
# K-fold cross validation of the SVR pipeline on the full data set.
# NOTE(review): `svr_reg` has already been fitted on X_train at this point;
# presumably `cross_validation` refits (or clones) it per fold -- confirm in `utils`.
svr_results, svr_cols = cross_validation(svr_reg, X, y, K_FOLDS)

  0%|          | 0/5 [00:00<?, ?it/s]


MAE:  11.05 ± 0.7
RMSE: 21.71 ± 1.61
NDCG: 0.833 ± 0.01


In [16]:
# Validate the scaled SVR pipeline with `soos_validation` (from `utils`),
# splitting the data by the "borough" column.
estimator = make_pipeline(StandardScaler(), SVR())
error_df, col_names, metrics, stds = soos_validation(
    estimator,
    structured,
    return_std=True,
    verbose_drop=False,
    split_var="borough",
)
# NOTE(review): the third metric is unpacked as `r_squareds`, while the printed
# summary reports NDCG -- confirm which one `soos_validation` actually returns.
maes, rmses, r_squareds = metrics
# Pair each MAE with a borough name in order of first appearance.
# Assumes `soos_validation` walks the boroughs in that same order -- TODO confirm.
list(zip(maes, structured["borough"].unique()))

Predicting borough 1/5
Predicting borough 2/5
Predicting borough 3/5
Predicting borough 4/5
Predicting borough 5/5

Weighted metrics:
MAE:  12.07 ± 4.65
RMSE: 22.49 ± 8.0
NDCG: 0.693 ± 0.155


[(16.003214261644573, 'Manhattan'),
 (5.2187190702874915, 'Brooklyn'),
 (6.685330837005574, 'JC + SI'),
 (8.924913790272523, 'Bronx + Queens'),
 (4.109456610017165, 'Newark')]