## Parameter Tuning with `hyperopt`

In [36]:
import os
import json
import sys

import numpy as np
import pandas as pd

from keras.layers import SimpleRNN, LSTM, GRU
from models.recurrent import Recurrent

from sklearn.metrics import mean_squared_error
from hyperopt import tpe, hp, fmin, STATUS_OK, Trials
from hyperopt.pyll.base import scope


from tqdm import tqdm_notebook as tqdm
from pprint import pprint
from matplotlib import pyplot as plt

### Back to the crime dataset

We look at two distinct questions:
1. Can we accurately predict the development of the **whole** time series for the city of Chicago?
2. Can we accurately predict the development of the series for **each district**?

#### 1. Load the full series:

The full series contains a single column of summed up incidents per day. We also parse the dates inside the `date` column and set them as the index of the `DataFrame`.

In [37]:
# Daily incident totals for the whole city, indexed by the parsed `date` column.
datapath_whole = os.path.join("data", "crime_total.csv")
whole = pd.read_csv(
    datapath_whole,
    parse_dates=["date"],
    index_col=["date"],
    dtype={"crimes_total": np.float32},
)
whole.head()

Unnamed: 0_level_0,crimes_total
date,Unnamed: 1_level_1
2001-01-01,1814.0
2001-01-02,1143.0
2001-01-03,1151.0
2001-01-04,1166.0
2001-01-05,1267.0


#### 2. Load the series by district:

Here we also parse the dates inside the `Date` column and specify a `MultiIndex` whose levels are `Date` and then `District`. The index structure is what allows our `Recurrent` class (from `models.recurrent`) to distinguish between the series for the whole of Chicago and the crime series for each district.

In [38]:
# Incidents per day and district, indexed by (`Date`, `District`).
datapath_district = os.path.join("data", "crimes_district.csv")
district = pd.read_csv(
    datapath_district,
    parse_dates=["Date"],
    index_col=["Date", "District"],
    dtype={"District": object, "Incidents": np.float32},
)
# Transposed preview of the first 24 rows in index order.
district.sort_index().head(24).T

Date,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01,2001-01-01
District,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,16.0,17.0,18.0,19.0,20.0,21.0,22.0,24.0,25.0,31.0
Incidents,37.0,110.0,103.0,96.0,95.0,84.0,83.0,111.0,109.0,104.0,...,67.0,67.0,72.0,72.0,40.0,0.0,61.0,59.0,120.0,0.0


### Separate data and a holdout set.

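* The hyperparameter search repeatedly evaluates candidate models on the test set.
* The selected parameters can therefore overfit to that test set.
* So we set aside a separate holdout set that the search never sees.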

In [39]:
# Everything from this date onwards is reserved as the holdout set.
cutoff = pd.to_datetime("2018-10-01")
whole_data = whole.loc[whole.index < cutoff]
whole_holdout = whole.loc[whole.index >= cutoff]

# With the MultiIndex the date is the first index level (`Date`, then
# `District`). The holdout uses `>=` so that rows dated exactly on the cutoff
# are not dropped and the split matches the one for the whole series.
district_dates = district.index.get_level_values(0)
district_data = district.loc[district_dates < cutoff]
district_holdout = district.loc[district_dates >= cutoff]

### Define a parameter space

* choose a prior distribution from which parameters are sampled.
* we choose mostly from the uniform distribution.

In [40]:
# TODO: add optimizer, learn-rate,
# Integer-valued parameters: label -> (low, high), sampled in steps of 1.
_int_ranges = {"maxlag": (1, 3),
               "cell_neurons": (1, 30),
               "batch_size": (1, 10)}

paramspace = {
    label: scope.int(hp.quniform(label, low, high, 1))
    for label, (low, high) in _int_ranges.items()
}
# Categorical parameter: one of the listed optimizer names.
paramspace["optimizer"] = hp.choice("optimizer", ["adam", "sgd"])

* and create one dictionary for each cell we want to use. 

In [41]:
# One search space per cell type: the shared parameter space plus the cell class.
spacesdict = {
    cell_cls.__name__: dict(cell=cell_cls, **paramspace)
    for cell_cls in (SimpleRNN, LSTM, GRU)
}

pprint(spacesdict)

{'GRU': {'batch_size': <hyperopt.pyll.base.Apply object at 0x7f8b06b8b400>,
         'cell': <class 'keras.layers.recurrent.GRU'>,
         'cell_neurons': <hyperopt.pyll.base.Apply object at 0x7f8b06b3f8d0>,
         'maxlag': <hyperopt.pyll.base.Apply object at 0x7f8b06b3f828>,
         'optimizer': <hyperopt.pyll.base.Apply object at 0x7f8b06b80048>},
 'LSTM': {'batch_size': <hyperopt.pyll.base.Apply object at 0x7f8b06b8b400>,
          'cell': <class 'keras.layers.recurrent.LSTM'>,
          'cell_neurons': <hyperopt.pyll.base.Apply object at 0x7f8b06b3f8d0>,
          'maxlag': <hyperopt.pyll.base.Apply object at 0x7f8b06b3f828>,
          'optimizer': <hyperopt.pyll.base.Apply object at 0x7f8b06b80048>},
 'SimpleRNN': {'batch_size': <hyperopt.pyll.base.Apply object at 0x7f8b06b8b400>,
               'cell': <class 'keras.layers.recurrent.SimpleRNN'>,
               'cell_neurons': <hyperopt.pyll.base.Apply object at 0x7f8b06b3f8d0>,
               'maxlag': <hyperopt.pyll.base.Ap

### Set up `Trials`-object to store the results

* We can later visualize the search process.
* Based on that, we can make decisions if we want to keep the parameters or not.

In [42]:
# A separate `Trials` object for every (series, model) combination.
trialsdict = {}
for series_name in ("whole", "district"):
    trialsdict[series_name] = {}
    for cell_name in ("SimpleRNN", "LSTM", "GRU"):
        trialsdict[series_name][cell_name] = Trials()
pprint(trialsdict)

{'district': {'GRU': <hyperopt.base.Trials object at 0x7f8b06b46240>,
              'LSTM': <hyperopt.base.Trials object at 0x7f8b06b462e8>,
              'SimpleRNN': <hyperopt.base.Trials object at 0x7f8b06b46278>},
 'whole': {'GRU': <hyperopt.base.Trials object at 0x7f8b06b46198>,
           'LSTM': <hyperopt.base.Trials object at 0x7f8b06b46eb8>,
           'SimpleRNN': <hyperopt.base.Trials object at 0x7f8b3c5c53c8>}}


### Optimizing over the parameter space

* We first define an objective function.
* Then we pass it to the `fmin`-routine.

* With 3 model classes and 2 different datasets we would need 6 nearly identical calls to `fmin`.
* So we do something different: we wrap the call to `fmin` in a decorator, using Python's decorator syntax.

In [43]:
def minimizer(objective):
    """Decorator that turns an objective function into a hyperopt search.

    The decorated name no longer evaluates a single parameter set. Instead it
    becomes ``outer(paramspace, trials, max_evals=3)``, which minimizes
    ``objective`` over ``paramspace`` with ``fmin`` and the TPE algorithm.

    Parameters
    ----------
    objective : callable
        Function that is passed through to ``fmin`` (wrapped only to drive
        the progress bar).

    Returns
    -------
    callable
        ``outer(paramspace, trials, max_evals=3)``, which returns whatever
        ``fmin`` returns for the best point found.
        ``paramspace`` must contain a ``"cell"`` entry with a ``__name__``;
        it is used to label the progress bar.
    """
    def outer(paramspace, trials, max_evals=3):
        pbar = tqdm(total=max_evals, desc=paramspace["cell"].__name__)

        def inner(*args, **kwargs):
            # Tick only after the evaluation has finished, so the bar shows
            # completed evaluations rather than started ones.
            result = objective(*args, **kwargs)
            pbar.update()
            return result

        try:
            return fmin(fn=inner,
                        space=paramspace,
                        algo=tpe.suggest,
                        max_evals=max_evals,
                        trials=trials)
        finally:
            # Close the bar even if the search or the objective raises.
            pbar.close()
    return outer

And now to the function we wish to decorate. It contains 3 steps:
1. Create a model with a certain set of (yet undefined) parameters.
2. Train the model.
3. Calculate the loss on a test set and return a dictionary that contains the loss and a flag indicating that everything went okay.

We do this for the whole series:

In [44]:
@minimizer
def whole_get_loss(params):
    """Fit a `Recurrent` model on `whole_data` and return its test MSE.

    `params` is unpacked into the `Recurrent` constructor. The result is a
    dictionary with the mean squared error between the model's `y_test` and
    its forecast for `X_test`, plus the `STATUS_OK` flag.
    """
    net = Recurrent(whole_data, epochs=3, verbose=False, **params)
    net.train()
    test_mse = mean_squared_error(y_pred=net.forecast(net.X_test),
                                  y_true=net.y_test)
    return dict(loss=test_mse, status=STATUS_OK)

And for the series by district:

In [45]:
@minimizer
def district_get_loss(params):
    """Fit a `Recurrent` model on `district_data` and return its test MSE.

    `params` is unpacked into the `Recurrent` constructor. The result is a
    dictionary with the mean squared error between the model's `y_test` and
    its forecast for `X_test`, plus the `STATUS_OK` flag.
    """
    net = Recurrent(district_data, epochs=3, verbose=False, **params)
    net.train()
    test_mse = mean_squared_error(y_pred=net.forecast(net.X_test),
                                  y_true=net.y_test)
    return dict(loss=test_mse, status=STATUS_OK)

In [46]:
import time

# WARNING: while DRY_RUN is True, the real objectives defined in the cells
# above are replaced by cheap stand-ins that sleep briefly and return a random
# loss. That keeps the tuning loop fast for demonstration purposes, but the
# resulting "best" parameters are meaningless. Set DRY_RUN = False to tune
# with the real objectives.
DRY_RUN = True

if DRY_RUN:
    @minimizer
    def district_get_loss(params):
        """Stand-in objective: ignore `params`, return a random loss."""
        time.sleep(1)
        return {"loss": np.random.uniform(), "status": STATUS_OK}

    @minimizer
    def whole_get_loss(params):
        """Stand-in objective: ignore `params`, return a random loss."""
        time.sleep(2)
        return {"loss": np.random.uniform(), "status": STATUS_OK}

### Tune the parameters

* We pass through all our models 
* for each dataset and finally
* store the best hyperparameters in a dictionary.

In [47]:
models = ("SimpleRNN", "LSTM", "GRU")
best = {"whole": {}, "district": {}}

# For every model run the search on the whole series first, then on the
# district series, and keep what each search returns.
_searches = (("whole", whole_get_loss), ("district", district_get_loss))

for model in models:
    for series_name, run_search in _searches:
        best[series_name][model] = run_search(spacesdict[model],
                                              trialsdict[series_name][model])

HBox(children=(IntProgress(value=0, description='SimpleRNN', max=3, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='SimpleRNN', max=3, style=ProgressStyle(description_width='ini…




HBox(children=(IntProgress(value=0, description='LSTM', max=3, style=ProgressStyle(description_width='initial'…




HBox(children=(IntProgress(value=0, description='LSTM', max=3, style=ProgressStyle(description_width='initial'…




HBox(children=(IntProgress(value=0, description='GRU', max=3, style=ProgressStyle(description_width='initial')…




HBox(children=(IntProgress(value=0, description='GRU', max=3, style=ProgressStyle(description_width='initial')…




In [53]:
# Replace the in-memory search results with the parameters stored on disk.
with open("analysis/best_params.json") as params_file:
    best = json.loads(params_file.read())

In [54]:
# Note: the numeric values are stored as floats and `optimizer` as an index
# into the list of optimizer choices; `typecast` below converts both.
pprint(best)

{'district': {'GRU': {'batch_size': 7.0,
                      'cell_neurons': 30.0,
                      'maxlag': 2.0,
                      'optimizer': 0},
              'LSTM': {'batch_size': 7.0,
                       'cell_neurons': 3.0,
                       'maxlag': 2.0,
                       'optimizer': 0},
              'SimpleRNN': {'batch_size': 6.0,
                            'cell_neurons': 14.0,
                            'maxlag': 3.0,
                            'optimizer': 0}},
 'whole': {'GRU': {'batch_size': 1.0,
                   'cell_neurons': 21.0,
                   'maxlag': 3.0,
                   'optimizer': 1},
           'LSTM': {'batch_size': 1.0,
                    'cell_neurons': 17.0,
                    'maxlag': 3.0,
                    'optimizer': 1},
           'SimpleRNN': {'batch_size': 5.0,
                         'cell_neurons': 14.0,
                         'maxlag': 3.0,
                         'optimizer': 1}}}


## Visualize the search process

First for the districts:

<img src="analysis/district-hyperopt-SimpleRNN.png">

<img src="analysis/district-hyperopt-LSTM.png">

<img src="analysis/district-hyperopt-GRU.png">

And for the whole series:

<img src="analysis/whole-hyperopt-SimpleRNN.png">

<img src="analysis/whole-hyperopt-LSTM.png">

<img src="analysis/whole-hyperopt-GRU.png">

In [55]:
def typecast(params):
    """Convert stored hyperparameters into values a model can consume.

    * ``optimizer`` is mapped from its index in ``["adam", "sgd"]`` to the
      optimizer name. A value that already is a name is passed through, and
      float indices (e.g. ``1.0`` from JSON) are accepted.
    * Whole-numbered values such as ``7.0`` are converted to ``int``.
    * Everything else is returned unchanged: values that cannot be converted
      to ``int`` (such as the cell class) and values with a fractional part,
      which would otherwise be silently truncated.

    Parameters
    ----------
    params : dict
        Mapping of parameter name to raw value.

    Returns
    -------
    dict
        New dictionary with the same keys and converted values; ``params``
        itself is not modified.
    """
    optimizers = ["adam", "sgd"]
    out = {}
    for key, val in params.items():
        if key == "optimizer":
            out[key] = val if isinstance(val, str) else optimizers[int(val)]
            continue
        try:
            as_int = int(val)
        except (TypeError, ValueError, OverflowError):
            # Not a number (e.g. a class or a non-numeric string), NaN or inf.
            out[key] = val
            continue
        if not isinstance(val, (str, bytes)) and as_int != val:
            # Fractional value: converting would lose information.
            out[key] = val
        else:
            out[key] = as_int
    return out

### Evaluate the obtained Hyperparameters 

* We can compute the proportion of observations before our cutoff point
* and use the remainder for the new test set.

In [56]:
# Share of observations dated before the cutoff. Using the same `< cutoff`
# mask as the train/holdout split also works when the cutoff day is missing
# from the index or when dates repeat, where `get_loc` would fail.
train_size = float((whole.index < cutoff).mean())

In [57]:
# Short display name -> recurrent cell class.
name2cell = dict(RNN=SimpleRNN, LSTM=LSTM, GRU=GRU)
# Series label -> full dataset (including the observations after the cutoff).
series2data = dict(whole=whole, district=district)
# Columns of the result table, filled by the evaluation loop.
series_col, cell_col, loss_col = [], [], []

In [64]:
# Retrain every cell type with its stored parameters on each full series and
# record the loss on the part after the cutoff (selected via `train_size`).
for series, data in tqdm(series2data.items(), desc="data loop"):
    # Use a fixed label here: the loop variable `name` is not bound yet when
    # the inner progress bar is created, so `desc=name` fails on a fresh kernel.
    for name, cell in tqdm(name2cell.items(), desc="cell loop"):
        # The stored parameters are keyed by the class name (e.g.
        # "SimpleRNN"), not by the short display name (e.g. "RNN").
        bestparams = typecast({"cell": cell,
                               **best[series][cell.__name__]})

        model = Recurrent(data, epochs=3, train_size=train_size,
                          **bestparams, verbose=False)
        model.train()

        predictions = model.forecast(model.X_test)
        loss = mean_squared_error(y_true=model.y_test,
                                  y_pred=predictions)
        series_col.append(series)
        cell_col.append(name)
        loss_col.append(loss)

HBox(children=(IntProgress(value=0, description='data loop', max=2, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='cell loop', max=3, style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='cell loop', max=3, style=ProgressStyle(description_width='ini…

#### How does it fit?

<img src="analysis/best_RNN.png">

#### How does it fit?

<img src="analysis/best_LSTM.png">

#### How does it fit?

<img src="analysis/best_GRU.png">

### Loss on the holdout set

In [62]:
# Stored losses, indexed by (Series, Model).
_loss_index = ["Series", "Model"]
validation = pd.read_csv(
    "analysis/loss.csv",
    usecols=_loss_index + ["Validation Loss"],
    index_col=_loss_index,
)

In [66]:
# Transpose so that every (Series, Model) pair becomes one column.
validation.T

Series,whole,whole,whole,district,district,district
Model,RNN,LSTM,GRU,RNN,LSTM,GRU
Validation Loss,0.001171,0.000847,0.000849,0.000247,0.000237,0.000216
