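This notebook experiments with basic ML models: it builds the feature table, fits several regression models with time-series cross-validation, and compiles the results.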

ToDo:
- [ ] Add distance to other regions?
- [ ] Add people who listed the region as their future region?

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-data" data-toc-modified-id="Import-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import data</a></span><ul class="toc-item"><li><span><a href="#Partition" data-toc-modified-id="Partition-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Partition</a></span></li><li><span><a href="#Initialize-results-data-frame" data-toc-modified-id="Initialize-results-data-frame-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Initialize results data frame</a></span></li></ul></li><li><span><a href="#Fit-models" data-toc-modified-id="Fit-models-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Fit models</a></span><ul class="toc-item"><li><span><a href="#Define-the-models" data-toc-modified-id="Define-the-models-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Define the models</a></span></li><li><span><a href="#Fit-them" data-toc-modified-id="Fit-them-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Fit them</a></span></li></ul></li><li><span><a href="#Compile-results" data-toc-modified-id="Compile-results-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Compile results</a></span></li></ul></div>

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from dateutil.relativedelta import *

from sklearn.metrics         import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

%matplotlib inline

import sys
# Print the interpreter version so the saved cell output records which Python ran this notebook.
print(sys.version)

import jetson_configs as cfg

from src.ml_helpers.make_ml_models import *
from src.ml_helpers.fit_ml_models import *
from src.ml_helpers.split_ml_dataset import *
from src.ml_helpers.compile_ml_results import *
from src.ml_helpers.make_ml_dataset import *
from src.ml_helpers.make_baseline_models import *
from src.ml_helpers.fit_baseline_models import *

from src.ml_helpers.custom_cv import *

3.7.3 (default, Mar 27 2019, 22:11:17) 
[GCC 7.3.0]


In [2]:
import sklearn
#sorted(sklearn.metrics.SCORERS.keys())

In [3]:
# --- Run configuration ---
# The train/test split date and the current month both come from the project config module.
split_date     = cfg.train_test_split
current_month  = cfg.current_month

admin_level = "admin1"                        # passed to the feature and baseline helpers
horizon     = 3                               # passed to shift_input_features below
fpath       = f'{admin_level}_lag{horizon}'   # tag passed to fit_ml_models

experiment_params = {
    'cv' : "tscv",          # cross-validation scheme: 'tscv' or 'gbtscv'
    'fill_miss_y' : True,   # ymiss: also run fill_missing_values on the target column
    'y_hist'      : False   # yhist: merge baseline-model columns into the feature table
}

# Derive the experiment label from the parameters instead of typing it by hand,
# so the label cannot drift from the settings actually used. With the values
# above this yields exactly 'tscv_yfill_noyhist'.
# NOTE(review): the 'noyfill' and 'yhist' spellings are extrapolated from that
# one hand-written label -- confirm they match the names of any existing results.
experiment = '_'.join([
    experiment_params['cv'],
    'yfill' if experiment_params['fill_miss_y'] else 'noyfill',
    'yhist' if experiment_params['y_hist'] else 'noyhist',
])

## Import data

In [4]:
# Build the feature table for `current_month` at `admin_level` using the project
# helper make_ml_features (star-imported from src.ml_helpers). The helper's
# internals are not visible here; its printed log reports the columns it dropped.
learn_df = make_ml_features(current_month, admin_level)


Dropped entirely missing columns:  []
Dropped duplicate columns:  ['miss_vegetation_cover_ndvi']


In [5]:
# Optional "y history" variant: append baseline-model columns to the feature table.
if experiment_params['y_hist']:
    # Baselines are requested with horizon=0 here.
    # NOTE(review): presumably because shift_input_features applies the lag to
    # every input column later -- confirm against the helper.
    naive = fit_baseline_models(learn_df, admin_level, horizon=0)
    naive = naive[['em', 'ewm_12', 'naive_01', 'naive_02', 'naive_03']]
    # Index-on-index merge; pandas' default how='inner' keeps only index keys
    # present in both frames.
    learn_df = learn_df.merge(naive, left_index=True, right_index=True)

In [6]:
# Name the target column, then create it as a copy of the observed arrivals.
y_col = 'true'
learn_df[y_col] = learn_df['arrivals'].copy()

# Every other column -- the original 'arrivals' column included -- is an input feature.
X_cols = [col for col in learn_df.columns if col != y_col]

In [7]:
# Fill gaps in the input features first.
learn_df = fill_missing_values(learn_df, X_cols)

# OPTIONAL: fill the y values
if experiment_params['fill_miss_y']:
    learn_df = fill_missing_values(learn_df, y_col)

# Shift the input columns relative to the target by `horizon`
# (project helper; exact semantics are defined in src.ml_helpers).
learn_df = shift_input_features(learn_df, X_cols, y_col, horizon, current_month, admin_unit = 'region')

# Drop every row that is missing at least one input feature. Reassigning instead
# of using inplace=True keeps this cell consistent with the statements above.
learn_df = learn_df.dropna(subset=X_cols, how='any')

In [None]:
# Count the distinct dates among rows that still have a target value.
labelled_dates = learn_df.dropna(subset=['true']).index.get_level_values('date')
print("Number of months for training + evaluation: ", len(labelled_dates.unique()))

# Display the 'date' level of the full index (rows without a target included).
learn_df.index.get_level_values('date')

Number of months for training + evaluation:  104


DatetimeIndex(['2011-04-01', '2011-04-01', '2011-04-01', '2011-04-01',
               '2011-04-01', '2011-04-01', '2011-04-01', '2011-04-01',
               '2011-04-01', '2011-04-01',
               ...
               '2020-02-01', '2020-02-01', '2020-02-01', '2020-02-01',
               '2020-02-01', '2020-02-01', '2020-02-01', '2020-02-01',
               '2020-02-01', '2020-02-01'],
              dtype='datetime64[ns]', name='date', length=1926, freq=None)

### Partition

In [None]:
# Partition the table at `split_date` with the project helper split_ml_dataset;
# it returns the seven objects unpacked below, which the fitting step consumes.
(X_train_scaled, y_train,
 X_scaled, X, y,
 train_months, test_months) = split_ml_dataset(
    learn_df, split_date, y_col=y_col, X_cols=X_cols)

Total months: 107
Training months:
 ['2011-04', '2011-05', '2011-06', '2011-07', '2011-08', '2011-09', '2011-10', '2011-11', '2011-12', '2012-01', '2012-02', '2012-03', '2012-04', '2012-05', '2012-06', '2012-07', '2012-08', '2012-09', '2012-10', '2012-11', '2012-12', '2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12', '2015-01', '2015-02', '2015-03', '2015-04', '2015-05', '2015-06', '2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12', '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06', '2016-07', '2016-08', '2016-09', '2016-10', '2016-11', '2016-12', '2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06', '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12', '2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06', '2018-

## Fit models

### Define the models

In [None]:
# Blocking time series split https://hub.packtpub.com/cross-validation-strategies-for-time-series-forecasting-tutorial/
# Cross validator <- group-aware time series split
#

scoring = 'neg_mean_squared_error'
cv_scheme = experiment_params['cv']

# Choose the splitter once and expose a factory that produces a new split
# iterator on every call. NB: the split has to be run again for each algorithm,
# so every model below gets its own call to the factory.
if cv_scheme == 'tscv':
    tscv = TimeSeriesSplit(n_splits=10)

    def _fresh_cv_splits():
        """Return a new train/validation split iterator over y_train."""
        return tscv.split(y_train)

elif cv_scheme == 'gbtscv':
    gbtscv = GroupedBlockingTimeSeriesSplit(n_splits=10)

    def _fresh_cv_splits():
        """Return a new grouped blocking split iterator over y_train."""
        # NOTE(review): 18 presumably is the number of rows per month (the index
        # displayed earlier has 1926 rows over 107 months) -- confirm.
        return gbtscv.split(y_train, n_samples_per_group=18)

else:
    # Fail here with a clear message instead of leaving every model_* undefined
    # and hitting a NameError later in the fitting cell.
    raise ValueError(
        f"Unknown experiment_params['cv'] value {cv_scheme!r}; expected 'tscv' or 'gbtscv'."
    )

# Same constructors, same order and same variable names as before; only the
# source of the `cv` argument is shared.
model_ridge = make_ridge(       cv = _fresh_cv_splits(), scoring=scoring)
model_lasso = make_lasso(       cv = _fresh_cv_splits(), scoring=scoring)
model_svr   = make_svm(         cv = _fresh_cv_splits(), scoring=scoring)
model_ada   = make_adaboost(    cv = _fresh_cv_splits(), scoring=scoring)
model_rf    = make_randomforest(cv = _fresh_cv_splits(), scoring=scoring)
model_tree  = make_decisiontree(cv = _fresh_cv_splits(), scoring=scoring)
model_mlp   = make_perceptron(  cv = _fresh_cv_splits(), scoring=scoring)
model_xgb   = make_xgboost(     cv = _fresh_cv_splits(), scoring=scoring)

### Fit them

In [None]:
# Models to fit, in run order, each paired with the name passed to fit_ml_models.
models_to_fit = [
    (model_ridge, f"Ridge_{experiment}"),
    (model_lasso, f"Lasso_{experiment}"),
    (model_mlp,   f"Perceptron_{experiment}"),
    (model_xgb,   f"Xgboost_{experiment}"),
    (model_ada,   f"Adaboost_{experiment}"),
    (model_tree,  f"Decisiontree_{experiment}"),
    (model_rf,    f"Randomforest_{experiment}"),
    # SVM is currently excluded from the run:
    #(model_svr,   f"SVM_{experiment}")
]

# Positional data arguments shared by every fit_ml_models call.
shared_data = (X_train_scaled, y_train, X_scaled, X, y, train_months, test_months)

for model, modelname in models_to_fit:
    # `mm` keeps the return value of the most recent fit.
    mm = fit_ml_models(*shared_data, m=model, mname=modelname, fpath=fpath)

    print(f"Finished {modelname}")



Finished Ridge_tscv_yfill_noyhist




Finished Lasso_tscv_yfill_noyhist




## Compile results

In [None]:
#compile_ml_results(fpath)