This file plays around with basic ML models.

ToDo:
- [ ] Add distance to other regions?
- [ ] Add people who listed the region as their future region?

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-data" data-toc-modified-id="Import-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import data</a></span><ul class="toc-item"><li><span><a href="#Partition" data-toc-modified-id="Partition-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Partition</a></span></li><li><span><a href="#Initialize-results-data-frame" data-toc-modified-id="Initialize-results-data-frame-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Initialize results data frame</a></span></li></ul></li><li><span><a href="#Fit-models" data-toc-modified-id="Fit-models-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Fit models</a></span><ul class="toc-item"><li><span><a href="#Define-the-models" data-toc-modified-id="Define-the-models-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Define the models</a></span></li><li><span><a href="#Fit-them" data-toc-modified-id="Fit-them-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Fit them</a></span></li></ul></li><li><span><a href="#Compile-results" data-toc-modified-id="Compile-results-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Compile results</a></span></li></ul></div>

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from dateutil.relativedelta import *

from sklearn.metrics         import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import sklearn
#sorted(sklearn.metrics.SCORERS.keys())

import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy_utils import create_database, database_exists, drop_database

%matplotlib inline

import sys
print(sys.version)

import jetson_configs as jcfg

from src.ml_helpers.make_ml_models import *
from src.ml_helpers.fit_ml_models import *
from src.ml_helpers.split_ml_dataset import *
from src.ml_helpers.compile_ml_results import *
from src.ml_helpers.make_ml_dataset import *
from src.ml_helpers.make_baseline_models import *
from src.ml_helpers.fit_baseline_models import *

from src.ml_helpers.custom_cv import *

3.6.10 |Anaconda, Inc.| (default, May  7 2020, 19:46:08) [MSC v.1916 64 bit (AMD64)]


In [2]:
# Run configuration, copied from jetson_configs (jcfg) into notebook-level names.

# Experiment definition: option flags and the label appended to model names.
experiment_name = jcfg.experiment_name
experiment_params = jcfg.experiment_params

# Data scope: reference month, train/test cut-off date, administrative level,
# and forecast horizon (the original inline note gives 1 as an example value).
current_month = jcfg.current_month
split_date = jcfg.train_test_split
admin_level = jcfg.admin_level
horizon = jcfg.horizon

# Path fragment shared by the fitting and result-compilation steps.
fpath = f'{admin_level}_lag{horizon}'


# Configure connection

In [3]:
# Database connection: host and credentials are read from jetson_configs (jcfg);
# the target database name is fixed to "jetson".
database = "jetson"
host = jcfg.sql_host
user = jcfg.sql_user
password = jcfg.sql_password

# NOTE(review): user/password are interpolated into the URL verbatim, so
# credentials containing characters such as '@' or '/' would need URL-escaping.
connection_url = f'postgresql://{user}:{password}@{host}/{database}'
engine = create_engine(connection_url)

In [4]:
# (Re)load the sql_magic IPython extension, then set its SQL.conn_name option to
# 'engine', i.e. the name of the SQLAlchemy engine variable defined in this notebook.
%reload_ext sql_magic
%config SQL.conn_name = 'engine'

## Import data

In [5]:
# Build the learning dataframe for the configured month, horizon and admin level,
# reading from the database through the SQLAlchemy engine.
# NOTE(review): make_ml_features arrives via a wildcard import (presumably from
# src.ml_helpers.make_ml_dataset) - confirm its source and return schema.
learn_df = make_ml_features(
    current_month,
    horizon,
    admin_level,
    sql_engine=engine,
)


Dropped entirely missing columns:  []
Dropped duplicate columns:  ['miss_incidents', 'miss_malaria_cases', 'miss_measles_cases', 'miss_vegetation_cover_ndvi']


## Prepare

### Experiments

#### Add baseline models

In [6]:
# OPTIONAL: append baseline-model outputs to the learning dataframe as extra
# feature columns (the original note lists them as lags, exponential mean and
# exponential weighted means).
if experiment_params['y_hist'] == True:
    baseline_cols = ['em', 'ewm_12',
                     'naive_01', 'naive_02', 'naive_03', 'naive_06', 'naive_12']

    # NOTE(review): horizon is fixed at 1 here rather than using the notebook-level
    # `horizon` - confirm this is intended.
    naive = fit_baseline_models(learn_df, admin_level, horizon=1)[baseline_cols]

    # Index-on-index merge: baseline columns are aligned on the dataframe index.
    learn_df = learn_df.merge(naive, left_index=True, right_index=True)

In [7]:
# OPTIONAL: add, for every selected column, its per-date mean as a new
# '<column>_all_regions' feature.
if experiment_params['means_all_regions'] == True:
    # Columns whose name contains any of these markers are left out of the averaging.
    excluded_markers = ('distance_', 'river_', 'months_since', 'dummies', 'miss_')
    time_varying_cols = [
        col for col in learn_df.columns
        if not any(marker in col for marker in excluded_markers)
    ]

    # transform('mean') keeps the original index, so each row carries the mean
    # of its own 'date' group.
    monthly_means = learn_df[time_varying_cols].groupby(['date']).transform('mean')
    monthly_means.columns = [f'{col}_all_regions' for col in monthly_means.columns]

    learn_df = learn_df.merge(monthly_means, left_index=True, right_index=True)

In [8]:
# Target: a copy of 'arrivals' stored under the name 'true'.
y_col = 'true'
learn_df[y_col] = learn_df['arrivals'].copy()

# Inputs: every column except the target. 'arrivals' itself stays among the inputs.
X_cols = [col for col in learn_df.columns if col != y_col]

In [9]:
# Fill gaps in the input features.
learn_df = fill_missing_values(learn_df, X_cols)

# OPTIONAL: fill the target as well when the experiment asks for it.
# NOTE(review): y_col is passed as a bare string while X_cols is a list -
# confirm fill_missing_values accepts both forms.
if experiment_params['fill_miss_y'] == True:
    learn_df = fill_missing_values(learn_df, y_col)

# Shift the input features relative to the target.
# NOTE(review): presumably lags the inputs by `horizon` within each admin unit;
# admin_unit is fixed to 'region' rather than taken from admin_level - confirm.
learn_df = shift_input_features(
    learn_df,
    X_cols,
    y_col,
    horizon,
    current_month,
    admin_unit='region',
)

# Keep only the rows in which every input feature is present.
learn_df = learn_df.dropna(subset=X_cols, how='any')

In [10]:
# Count the distinct dates that have an observed target value.
rows_with_target = learn_df.dropna(subset=['true'])
months_with_target = rows_with_target.index.get_level_values('date').unique()
print("Number of months for training + evaluation: ", len(months_with_target))

# Show the 'date' level of the full index (including rows without a target).
learn_df.index.get_level_values('date')

Number of months for training + evaluation:  113


DatetimeIndex(['2011-04-01', '2011-04-01', '2011-04-01', '2011-04-01',
               '2011-04-01', '2011-04-01', '2011-04-01', '2011-04-01',
               '2011-04-01', '2011-04-01',
               ...
               '2020-08-01', '2020-08-01', '2020-08-01', '2020-08-01',
               '2020-08-01', '2020-08-01', '2020-08-01', '2020-08-01',
               '2020-08-01', '2020-08-01'],
              dtype='datetime64[ns]', name='date', length=2034, freq=None)

### Partition

In [11]:
# Partition the learning dataframe at split_date into the pieces used by the
# fitting step below.
# NOTE(review): the exact contents of each returned object are defined by
# split_ml_dataset - names suggest scaled/unscaled inputs and month lists; confirm.
(
    X_train_scaled,
    y_train,
    X_scaled,
    X,
    y,
    train_months,
    test_months,
) = split_ml_dataset(learn_df, split_date, y_col=y_col, X_cols=X_cols)

Total months: 113
Training months:
 ['2011-04', '2011-05', '2011-06', '2011-07', '2011-08', '2011-09', '2011-10', '2011-11', '2011-12', '2012-01', '2012-02', '2012-03', '2012-04', '2012-05', '2012-06', '2012-07', '2012-08', '2012-09', '2012-10', '2012-11', '2012-12', '2013-01', '2013-02', '2013-03', '2013-04', '2013-05', '2013-06', '2013-07', '2013-08', '2013-09', '2013-10', '2013-11', '2013-12', '2014-01', '2014-02', '2014-03', '2014-04', '2014-05', '2014-06', '2014-07', '2014-08', '2014-09', '2014-10', '2014-11', '2014-12', '2015-01', '2015-02', '2015-03', '2015-04', '2015-05', '2015-06', '2015-07', '2015-08', '2015-09', '2015-10', '2015-11', '2015-12', '2016-01', '2016-02', '2016-03', '2016-04', '2016-05', '2016-06', '2016-07', '2016-08', '2016-09', '2016-10', '2016-11', '2016-12', '2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06', '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12', '2018-01', '2018-02', '2018-03', '2018-04', '2018-05', '2018-06', '2018-

## Fit models

### Define the models

In [12]:
# Cross-validation strategy used by every model's hyper-parameter search.
# Background on blocking time series splits:
# https://hub.packtpub.com/cross-validation-strategies-for-time-series-forecasting-tutorial/
#   'tscv'   -> sklearn TimeSeriesSplit
#   'gbtscv' -> GroupedBlockingTimeSeriesSplit (group-aware time series split)

scoring = 'neg_mean_squared_error'
n_cv_splits = 10

# Rows per group handed to the grouped splitter.
# NOTE(review): presumably the number of admin units per month (the saved output
# shows 2034 rows over 113 months = 18) - confirm before changing admin_level.
n_samples_per_group = 18

cv_strategy = experiment_params['cv']

if cv_strategy == 'tscv':
    tscv = TimeSeriesSplit(n_splits=n_cv_splits)

    def make_cv_splits():
        """Return a fresh split iterator over y_train (one is needed per model)."""
        return tscv.split(y_train)

elif cv_strategy == 'gbtscv':
    gbtscv = GroupedBlockingTimeSeriesSplit(n_splits=n_cv_splits)

    def make_cv_splits():
        """Return a fresh grouped-blocking split iterator over y_train (one per model)."""
        return gbtscv.split(y_train, n_samples_per_group=n_samples_per_group)

else:
    # Fail here rather than with a NameError (or stale models from an earlier
    # run of this cell) when the models are used further down.
    raise ValueError(
        f"Unsupported experiment_params['cv'] value {cv_strategy!r}; "
        "expected 'tscv' or 'gbtscv'."
    )

# Each model is built with its own split iterator, in the same order as before.
model_ridge = make_ridge(cv=make_cv_splits(), scoring=scoring)
model_lasso = make_lasso(cv=make_cv_splits(), scoring=scoring)
model_svr = make_svm(cv=make_cv_splits(), scoring=scoring)
model_ada = make_adaboost(cv=make_cv_splits(), scoring=scoring)
model_rf = make_randomforest(cv=make_cv_splits(), scoring=scoring)
model_tree = make_decisiontree(cv=make_cv_splits(), scoring=scoring)
model_mlp = make_perceptron(cv=make_cv_splits(), scoring=scoring)
model_xgb = make_xgboost(cv=make_cv_splits(), scoring=scoring)

### Fit them

In [13]:
# Models to fit, in order; each label is combined with experiment_name to form
# the name passed to fit_ml_models.
candidates = [
    ("Ridge", model_ridge),
    ("Lasso", model_lasso),
    ("Perceptron", model_mlp),
    ("Xgboost", model_xgb),
    ("Adaboost", model_ada),
    ("Decisiontree", model_tree),
    ("Randomforest", model_rf),
    # SVM is currently left out of the run: ("SVM", model_svr)
]

for label, model in candidates:
    modelname = f"{label}_{experiment_name}"

    # The return value is kept in `mm` but not used further in this cell.
    mm = fit_ml_models(
        X_train_scaled, y_train,
        X_scaled, X, y, train_months, test_months,
        m=model, mname=modelname, fpath=fpath,
    )

    print(f"Finished {modelname}")

Finished Ridge_tscv_nomeans_noyfill_noyhist


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


Finished Lasso_tscv_nomeans_noyfill_noyhist




Finished Perceptron_tscv_nomeans_noyfill_noyhist
Parameters: { max_iter } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { max_iter } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { max_iter } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { max_iter } might not be used.

  This may not be accurate due to some parameters are 

## Compile results

In [14]:
# Compile the results for this run's path fragment (admin level + lag).
# NOTE(review): presumably gathers the per-model outputs that fit_ml_models
# stored under the same fpath - confirm against compile_ml_results.
compile_ml_results(fpath)