This notebook demonstrates two strategies for handling incremental drift by applying uninformed adaptations. 
Strategies include:
- 1) incremental training/updating of a model after a specific period (e.g. quarterly or yearly)
- 2) training a new model and discarding the old model after a specific period (e.g. quarterly or yearly)

Both strategies are applied to a custom feedforward MLP model and a custom LSTM model. Each model was trained to predict the taxi demand of several areas in New York City simultaneously. 

## Load Modules

In [1]:
import pandas as pd
import numpy as np
 
#from tqdm import tqdm


#load custom deep Models (LSTM, MLP)
from multivar_lstm import MultivariateLSTM

from complex_mlp import ComplexMLP


#import custom functions to store all kinds of results on disk:
import save_files_collection as sv_files

import regular_retraining_collection as rrc


Using TensorFlow backend.


## Load & prepare Data

In [2]:
#path & file settings used to load the data
TRAIN_PATH = '/media/...'
Store_PATH = '/media/...'
file_final = 'preprocessed_data.csv'

#read the csv file; the first row provides the column names
df_m = pd.read_csv(TRAIN_PATH + file_final, header=0)

#parse "date" as UTC and round-trip it through a formatted string,
#which leaves the column as timezone-naive datetimes:
df_m['date'] = pd.to_datetime(
    pd.to_datetime(df_m['date'], utc=True).dt.strftime('%Y-%m-%d %H:%M:%S')
)
#"date" is kept as a regular column for now (the index is set later), since
#the column is still needed to find the highest demand columns..
print(df_m.head())

                 date  1  2  3    4  5  6   7  8  9  ...  254  255  256  257  \
0 2009-01-01 05:00:00  0  0  0   91  0  0  30  0  0  ...    0   50   39    3   
1 2009-01-01 06:00:00  1  0  0  105  0  0  62  0  0  ...    0   77   67    5   
2 2009-01-01 07:00:00  0  0  0   96  0  0  79  0  0  ...    0   90   83    4   
3 2009-01-01 08:00:00  0  0  0   91  0  0  84  0  0  ...    0   54   77    3   
4 2009-01-01 09:00:00  2  0  0   82  0  0  85  0  1  ...    0   66   54    4   

   258  259  260  261  262  263  
0    1    0    3   52  127  326  
1    0    0   15   65  166  476  
2    0    0   19   39  125  460  
3    1    0   19   54   79  313  
4    0    0   13   24   47  224  

[5 rows x 264 columns]


In [3]:
#filter areas with highest demand
#get time series with highest "demand patterns":

#function filters nlargest areas:
def get_nlargest_areas(nlargest, org_dataset=None, utc_offset_hours=-5):
    '''
    Returns the time series of the areas with the highest total demand.

    All columns except "date" are summed up over time, the `nlargest`
    columns with the biggest sums are kept, "date" becomes the index and
    the index is shifted by `utc_offset_hours`.

    Parameters
    ----------
    nlargest : int
        Number of areas (columns) to keep.
    org_dataset : pandas.DataFrame, optional
        Dataset with a datetime "date" column and one column per area.
        If omitted, the global `df_m` is looked up when the function is
        called (not when it is defined), so a reloaded `df_m` is picked up.
    utc_offset_hours : int, optional
        Hours added to the timestamps; the default of -5 moves UTC
        timestamps to the local NYC time used in this notebook.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected columns, ordered by descending total demand
        and indexed by the shifted "date". `org_dataset` is not modified.
    '''
    if org_dataset is None:
        #resolve the default lazily to avoid binding a stale dataset at definition time
        org_dataset = df_m

    #total demand per area (drop() already returns a new frame, no copy needed):
    area_totals = org_dataset.drop(columns=["date"]).sum(axis=0, numeric_only=True)

    #names of the nlargest areas, followed by the "date" column:
    idx_filter = list(area_totals.nlargest(nlargest).index)
    idx_filter.append("date")

    #filter columns with largest values:
    ts_largest = org_dataset[idx_filter].copy(deep=True).set_index("date")

    #shift datetimeindex to use local NYC time not UTC; Timedelta arithmetic is
    #used instead of shift(freq='H'), since the 'H' alias is deprecated in newer pandas.
    #NOTE(review): a fixed offset ignores daylight saving time - confirm this is intended
    ts_largest.index = ts_largest.index + pd.Timedelta(hours=utc_offset_hours)

    return ts_largest


In [4]:
#time series of the 20 and of the 10 areas with the highest demand:
ts_20largest, ts_10largest = (get_nlargest_areas(n_areas) for n_areas in (20, 10))

#sanity check of the resulting shapes:
print('20 largest areas: ', ts_20largest.shape)
print('10 largest areas: ', ts_10largest.shape)


20 largest areas:  (83231, 20)
10 largest areas:  (83231, 10)


## Function to load models from disk

In [5]:
def load_pretrained_model_from_disk(model_type):
    
    '''
    Returns pre-trained model from disk

    The model architecture is read from a JSON file and rebuilt with
    Keras' `model_from_json`; afterwards the stored weights are loaded
    into the rebuilt model.

    Parameters
    ----------
    model_type : str
        Key of the model to load. Files are only configured for
        'ComplexMLP' and 'MultivarLSTM'; 'SingleMLP' and 'SingleLSTM'
        are placeholders without files.

    Returns
    -------
    The model rebuilt from the JSON file with the stored weights loaded.

    Raises
    ------
    ValueError
        If `model_type` is unknown or has no files configured.
    '''

    model_PATH = '/media/...'

    #complex MLP:
    model_architecture_complex_MLP_PATH = '/media/...'    
    complex_MLP_model_file = 'complex_MLP_early_stopping_W168_20areas__y2012.json'
    complex_MLP_weights = 'complex_MLP_early_stopping_W168_20areas__y2012_weights.h5'
    
    #multivar LSTM without additional features:
    model_architecture_multivar_20 = '/media/...'

    multivar_LSTM_file = 'multivar_LSTM_W168_20areas__y2012.json'
    multivar_LSTM_weights = 'multivar_LSTM_W168_20areas__y2012_weights.h5'

    #per model type: (architecture json file, directory of the weights, weights file)
    instances_dict = {'SingleMLP': (),
                  'SingleLSTM': (),
                  'ComplexMLP': (complex_MLP_model_file, model_architecture_complex_MLP_PATH, complex_MLP_weights),
                  'MultivarLSTM': (multivar_LSTM_file, model_architecture_multivar_20, multivar_LSTM_weights),                
                 }

    #validate the request before touching the disk, so that a wrong key fails
    #with a clear message instead of an opaque KeyError / IndexError:
    if model_type not in instances_dict:
        raise ValueError("Unknown model_type '{}', expected one of {}".format(
            model_type, sorted(instances_dict)))
    model_files = instances_dict[model_type]
    if not model_files:
        raise ValueError("No pre-trained files are configured for model_type '{}'".format(model_type))
    json_file_name, weights_dir, weights_file_name = model_files

    #model_from_json is not imported anywhere else in this notebook, so it is
    #imported here to keep the function working on a fresh kernel:
    from keras.models import model_from_json

    #load model architecture; the context manager closes the file even if reading fails.
    #NOTE(review): the json is read from model_PATH while the weights are read from the
    #per-model directory - confirm that both point to the intended locations
    with open(model_PATH + json_file_name, 'r') as json_file:
        loaded_model_json = json_file.read()
    prediction_model = model_from_json(loaded_model_json)

    #load weights of best model:
    prediction_model.load_weights(weights_dir + weights_file_name)

    return prediction_model



#### helper function to create new model instances

In [6]:
#function needed to create new instances, otherwise the same instance is reused!!
def create_model_instance(model_type):
    '''
    Creates and returns a new instance of the requested model class.

    Parameters
    ----------
    model_type : str
        One of 'ComplexMLP', 'MultivarLSTM' or 'MultivarLSTM encoded'.

    Returns
    -------
    A newly constructed ComplexMLP or MultivariateLSTM object.

    Raises
    ------
    KeyError
        If `model_type` is not one of the supported keys.
    '''
    #each key maps to a factory instead of a ready-made object, so that only
    #the requested model is constructed (and not all three on every call):
    model_factories = {'ComplexMLP': lambda: ComplexMLP(),
                       'MultivarLSTM': lambda: MultivariateLSTM(),
                       'MultivarLSTM encoded': lambda: MultivariateLSTM(use_features_per_lag_flag = True, n_hidden_neurons_2 = 128),
                      }
    
    return model_factories[model_type]()

    
    

## MultivarLSTM Model
- without lagged features

### Strategy: Train new model on a quarterly basis
- Params for test purpose: n_epochs = 5, end of dataset: 2011

In [20]:
#model_name reflects the used params:
model_name = 'multivar_lstm_2H_256_32_batch512_drop03_clip_norm_shuffle_scaling_tanh_W168_20largest_areas_reg_retrain_quarterly' 

#new class instance which receives the pre-trained model from disk:
multiLSTM_model_new = create_model_instance('MultivarLSTM')
prediction_model = load_pretrained_model_from_disk('MultivarLSTM')
multiLSTM_model_new.load_model(prediction_model)

#dataset used for slicing:
ts_series_input = ts_20largest.copy()

#run the regular retraining scheme (retraining follows a fixed schedule):
results_tuple_multiLSTM_retrain = rrc.regular_retraining_scheme(
    multiLSTM_model_new,
    org_ts_series=ts_series_input,
    model_name=model_name,
    n_epochs_retrain=5,
    update_weights_flag=False,
    overwrite_params=True,
    start_date_training='2009',
    last_date_training='2010',
    first_date_dataset='2009-01-01 00:00:00',
    start_of_preds_date='2011-01-01 00:00:00',
    end_of_dataset_date='2011-12-31 23:00:00',
    forecast_range_months=3,
    retrain_shifting_window_months=3,
    month_forecasting=True,
    retrain_shifting_window_flag_day=False,
    retraining_range_years=2,
    first_preds_flag=False,
    verbosity=0,
)

#split the returned tuple into its first three entries:
all_multiLSTM_MODELS_dict_retrain, all_multiLSTM_RESULTS_dict_retrain, all_multiLSTM_results_retrain = (
    results_tuple_multiLSTM_retrain[0],
    results_tuple_multiLSTM_retrain[1],
    results_tuple_multiLSTM_retrain[2],
)

Note: "loaded" model must have identical params as the Class currently has
Forecasting based on given months is used...
Shifting Window based on given months is used:  3
Retraining range based on years:  2
## Very first predictions with given pre-defined model are made..
months to predict:  3
## Assigned Dates are double checked..
selected years for training:  ['2009', Timestamp('2010-12-31 23:00:00')]
year_list given:  ['2009', Timestamp('2010-12-31 23:00:00'), '2011-01-01 00:00:00', None]
#### Make predictions model: multivar_lstm_2H_256_32_batch512_drop03_clip_norm_shuffle_scaling_tanh_W168_20largest_areas_reg_retrain_quarterly ####
Shape of org. dataset after shift:  (2160, 20)
Shape of org. dataset after shift:  (1, 20)
## New model is trained and predictions are made..
months to predict:  3
>> Current Time:  23/01/2020 14:26:03
## Assigned Dates are double checked..
# Start training new model and make predictions..
selected years for training:  [Timestamp('2009-04-01 00:00:00'), 

Shape of org. dataset after shift:  (2208, 20)
Shape of org. dataset after shift:  (1, 20)
## New model is trained and predictions are made..
months to predict:  3
>> Current Time:  23/01/2020 14:29:17
## Assigned Dates are double checked..
# Start training new model and make predictions..
selected years for training:  [Timestamp('2009-10-01 00:00:00'), Timestamp('2011-09-30 23:00:00')]
year_list given:  [Timestamp('2009-10-01 00:00:00'), Timestamp('2011-09-30 23:00:00'), Timestamp('2011-10-01 00:00:00'), None]
#### Train model: multivar_lstm_2H_256_32_batch512_drop03_clip_norm_shuffle_scaling_tanh_W168_20largest_areas_reg_retrain_quarterly__trainsize729_s10_2009_e9_2011__stepsize1__p12_2011 ####
Keras Model is used...
#params are overwritten
## New Model is created, old model is discarded..
create stacked LSTM 2 layer non-stateful model:
#Dropout applied
#Clipping Norm applied
Regular 2H-LSTM Model is created...
Train on 17351 samples, validate on 2208 samples
Epoch 1/5
#Current Learn

#### store predictions on disk:

In [None]:
#target paths passed to the storing functions:
model_save_PATH = '/media/...'
df_save_PATH = '/media/...'

#store models & history on disk (return value is not needed):
_ = sv_files.store_model_and_history_on_disk(
    all_multiLSTM_MODELS_dict_retrain,
    model_save_PATH,
    df_save_PATH,
)

#store prediction results on disk (return value is not needed):
_ = sv_files.store_retrained_predictions(
    all_multiLSTM_RESULTS_dict_retrain,
    all_multiLSTM_MODELS_dict_retrain,
    df_save_PATH,
)




## Complex MLP

### Strategy: Train new model on a quarterly basis
- Params for test purpose: n_epochs = 5, end of dataset: 2011

In [9]:
#model_name reflects the used params:
model_name = 'complex_MLP_2H_128_32_batch512_drop03_clip_norm_shuffle_scaling_std_W168_20largest_areas_reg_retrain_quarterly'

#new class instance which receives the pre-trained model from disk:
cplxMLP_model_new = create_model_instance('ComplexMLP')
prediction_model = load_pretrained_model_from_disk('ComplexMLP')
cplxMLP_model_new.load_model(prediction_model)

#dataset used for slicing:
ts_series_input = ts_20largest.copy()

#run the regular retraining scheme (retraining follows a fixed schedule):
results_tuple_MLP_retrain = rrc.regular_retraining_scheme(
    cplxMLP_model_new,
    org_ts_series=ts_series_input,
    model_name=model_name,
    n_epochs_retrain=5,
    update_weights_flag=False,
    overwrite_params=True,
    start_date_training='2009',
    last_date_training='2010',
    first_date_dataset='2009-01-01 00:00:00',
    start_of_preds_date='2011-01-01 00:00:00',
    end_of_dataset_date='2011-12-31 23:00:00',
    forecast_range_months=3,
    retrain_shifting_window_months=3,
    month_forecasting=True,
    retrain_shifting_window_flag_day=False,
    retraining_range_years=2,
    first_preds_flag=False,
    verbosity=0,
)

#split the returned tuple into its first three entries:
all_cplxMLP_MODELS_dict_retrain, all_cplxMLP_RESULTS_dict_retrain, all_cplxMLP_RMSE_results_retrain = (
    results_tuple_MLP_retrain[0],
    results_tuple_MLP_retrain[1],
    results_tuple_MLP_retrain[2],
)

Forecasting based on given months is used...
Shifting Window based on given months is used:  3
Retraining range based on years:  2
## Very first predictions with given pre-defined model are made..
months to predict:  3
## Assigned Dates are double checked..
selected years for training:  ['2009', Timestamp('2010-12-31 23:00:00')]
year_list given:  ['2009', Timestamp('2010-12-31 23:00:00'), '2011-01-01 00:00:00', None]
#### Make predictions model: complex_MLP_2H_128_32_batch512_drop03_clip_norm_shuffle_scaling_std_W168_20largest_areas_reg_retrain_quarterly ####
Shape of org. dataset after shift:  (2160, 20)
Shape of org. dataset after shift:  (1, 20)
## New model is trained and predictions are made..
months to predict:  3
>> Current Time:  23/01/2020 14:06:26
## Assigned Dates are double checked..
# Start training new model and make predictions..
selected years for training:  [Timestamp('2009-04-01 00:00:00'), Timestamp('2011-03-31 23:00:00')]
year_list given:  [Timestamp('2009-04-01 00:

#### store results

In [15]:
#target paths passed to the storing functions:
model_save_PATH = '/media/...'
df_save_PATH = '/media/...'

#store models & history on disk (return value is not needed):
_ = sv_files.store_model_and_history_on_disk(
    all_cplxMLP_MODELS_dict_retrain,
    model_save_PATH,
    df_save_PATH,
)

#store prediction results on disk (return value is not needed):
_ = sv_files.store_retrained_predictions(
    all_cplxMLP_RESULTS_dict_retrain,
    all_cplxMLP_MODELS_dict_retrain,
    df_save_PATH,
)




Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk
Save history_df on disk done
Saved model to disk


Unnamed: 0_level_0,237,161,230,79,236,162,170,234,48,186,142,107,163,68,239,164,141,249,138,90
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2011-01-01 00:00:00,378.765518,357.815909,170.954948,1169.934412,442.598282,377.660637,550.033426,629.927204,595.273397,309.469646,471.259277,729.279806,207.935535,590.287590,428.227081,418.747496,397.209515,592.011109,253.741570,465.114122
2011-01-01 01:00:00,181.884430,168.474030,42.926036,622.392572,233.041191,168.738678,419.629261,424.060136,353.670305,125.292004,222.339844,448.033763,106.633280,385.276383,219.484436,272.536284,282.326519,351.437659,17.621679,298.460461
2011-01-01 02:00:00,241.671692,96.039963,29.937252,584.864774,250.420319,212.491394,414.283463,259.114410,279.256687,142.871590,193.208305,429.281321,89.379490,309.107438,298.728119,264.542053,345.457260,247.204753,-33.437027,207.911047
2011-01-01 03:00:00,189.511894,167.290459,10.560440,472.070137,229.376633,216.780785,359.250160,144.219376,231.996140,101.239594,164.243835,317.994476,102.161911,305.833828,284.499657,174.675640,280.474159,220.177378,4.844269,167.948177
2011-01-01 04:00:00,110.338257,136.352837,83.788517,416.465500,116.343174,176.087357,271.892715,120.585670,314.153542,150.121731,152.050491,315.191330,107.946434,203.333633,174.203522,194.469173,225.335232,149.999481,7.342397,201.800657
2011-01-01 05:00:00,62.164257,54.906208,89.495232,259.148331,61.549626,110.638420,148.955566,99.215950,213.046333,97.153419,82.878651,241.789742,43.310005,122.863838,110.274092,141.823845,125.010246,128.085880,7.036086,144.025841
2011-01-01 06:00:00,66.781725,61.357931,134.885628,295.728729,87.218090,70.971793,112.930840,100.278843,231.373892,102.563267,88.324356,113.528565,73.603363,72.584560,72.351177,123.736671,89.766273,121.528501,17.609750,94.061470
2011-01-01 07:00:00,84.726795,67.493668,153.786304,298.408325,82.917557,86.868332,103.207989,100.012419,156.731279,110.103443,75.645695,121.926693,78.401155,132.896870,85.122429,102.120327,71.124947,133.895611,40.226765,123.242514
2011-01-01 08:00:00,101.432892,100.571232,163.821236,195.815018,127.228683,88.336998,128.774742,111.582752,175.556850,121.871658,107.839367,123.028694,80.145538,124.796234,110.470451,99.240084,119.437119,89.794327,78.869400,92.342709
2011-01-01 09:00:00,121.160362,116.529305,163.194427,195.470207,143.858780,121.915550,144.180702,122.628166,198.226601,129.227875,165.387985,117.207775,80.417599,125.327011,161.142014,90.730434,115.199486,94.248634,119.996246,96.119381


### Strategy: Incremental training / updating of model on a quarterly basis

In [17]:
#model_name reflects the used params:
model_name = 'complex_MLP_2H_128_32_batch512_drop03_clip_norm_shuffle_scaling_std_W168_20largest_areas_update_weights_quarterly'

#new class instance which receives the pre-trained model from disk:
cplxMLP_model_new = create_model_instance('ComplexMLP')
prediction_model = load_pretrained_model_from_disk('ComplexMLP')
cplxMLP_model_new.load_model(prediction_model)

#dataset used for slicing:
ts_series_input = ts_20largest.copy()

#run the regular retraining scheme with update_weights_flag switched on:
results_tuple_MLP_w_update = rrc.regular_retraining_scheme(
    cplxMLP_model_new,
    org_ts_series=ts_series_input,
    model_name=model_name,
    n_epochs_weight=5,
    overwrite_params=True,
    update_weights_flag=True,
    start_date_training='2009',
    last_date_training='2010',
    first_date_dataset='2009-01-01 00:00:00',
    start_of_preds_date='2011-01-01 00:00:00',
    end_of_dataset_date='2011-12-31 23:00:00',
    forecast_range_months=3,
    retrain_shifting_window_months=3,
    month_forecasting=True,
    retrain_shifting_window_flag_day=False,
    retraining_range_years=2,
    first_preds_flag=False,
    verbosity=1,
)

#split the returned tuple into its first three entries:
all_cplxMLP_MODELS_dict_w_update, all_cplxMLP_RESULTS_dict_w_update, all_cplxMLP_RMSE_results_w_update = (
    results_tuple_MLP_w_update[0],
    results_tuple_MLP_w_update[1],
    results_tuple_MLP_w_update[2],
)

Forecasting based on given months is used...
Shifting Window based on given months is used:  3
Retraining range based on years:  2
## Very first predictions with given pre-defined model are made..
months to predict:  3
## Assigned Dates are double checked..
selected years for training:  ['2009', Timestamp('2010-12-31 23:00:00')]
year_list given:  ['2009', Timestamp('2010-12-31 23:00:00'), '2011-01-01 00:00:00', None]
#### Make predictions model: complex_MLP_2H_128_32_batch512_drop03_clip_norm_shuffle_scaling_std_W168_20largest_areas_update_weights_quarterly ####
Shape of org. dataset after shift:  (2160, 20)
Shape of org. dataset after shift:  (1, 20)
## Avg. RMSE of recent predictions: 
[65.07505260778235]
## New model is trained and predictions are made..
months to predict:  3
>> Current Time:  23/01/2020 14:22:02
## Assigned Dates are double checked..
# Start training new model and make predictions..
selected years for training:  [Timestamp('2009-04-01 00:00:00'), Timestamp('2011-03

## Yearly Retraining of existing models..

### ComplexMLP

### Strategy: training of new model on a yearly basis

In [16]:
#model_name reflects the used params:
model_name = 'complex_MLP_2H_128_32_batch512_drop03_clip_norm_shuffle_scaling_std_W168_20largest_areas_reg_retrain_yearly'

#new class instance which receives the pre-trained model from disk:
cplxMLP_model_new = create_model_instance('ComplexMLP')
prediction_model = load_pretrained_model_from_disk('ComplexMLP')
cplxMLP_model_new.load_model(prediction_model)

#dataset used for slicing:
ts_series_input = ts_20largest.copy()

#run the regular retraining scheme with a shifting window of 12 months.
#NOTE(review): forecast_range_months = 7 although the window is shifted by 12 months - confirm this is intended
results_tuple_MLP_retrain_yearly = rrc.regular_retraining_scheme(
    cplxMLP_model_new,
    org_ts_series=ts_series_input,
    model_name=model_name,
    n_epochs_retrain=5,
    update_weights_flag=False,
    overwrite_params=True,
    start_date_training='2009',
    last_date_training='2010',
    first_date_dataset='2009-01-01 00:00:00',
    start_of_preds_date='2011-01-01 00:00:00',
    end_of_dataset_date='2012-12-31 23:00:00',
    forecast_range_months=7,
    retrain_shifting_window_months=12,
    month_forecasting=True,
    retrain_shifting_window_flag_day=False,
    retraining_range_years=2,
    first_preds_flag=True,
    verbosity=0,
)
                               

      
    


Forecasting based on given months is used...
Shifting Window based on given months is used:  12
Retraining range based on years:  2
## New model is trained and predictions are made..
months to predict:  7
>> Current Time:  23/01/2020 14:21:10
## Assigned Dates are double checked..
# Start training new model and make predictions..
selected years for training:  [Timestamp('2010-01-01 00:00:00'), Timestamp('2011-12-31 23:00:00')]
year_list given:  [Timestamp('2010-01-01 00:00:00'), Timestamp('2011-12-31 23:00:00'), Timestamp('2012-01-01 00:00:00'), None]
#### Train model: complex_MLP_2H_128_32_batch512_drop03_clip_norm_shuffle_scaling_std_W168_20largest_areas_reg_retrain_yearly__trainsize729_s1_2010_e12_2011__stepsize1__p12_2012 ####
Keras Model is used...
#params are overwritten
## New Model is created, old model is discarded..
create MLP Model:
#Dropout applied
#Clipping Norm applied
Train on 336940 samples, validate on 102240 samples
Epoch 1/5
#Current LearningRate:  0.001
Epoch 2/5
#C