In [498]:
import sherpa
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time

import sys
sys.path.append('/home/yusukemh/github/yusukemh/StatisticalDownscaling/writeup')
from config import C_COMMON, C_GRID, C_SINGLE, FILENAME

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# enable autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [499]:
columns = C_SINGLE
df = pd.read_csv(FILENAME, usecols=C_COMMON + columns).sort_values(['year', 'month'])

# we use the last 1/5 data as the heldout clean dataset. We do not use this fold for any use except for just reporting the result.
df_train_outer = df.query('fold != 4')
df_test_outer = df.query('fold == 4')
assert (sorted(df_test_outer['skn'].unique()) == sorted(df_train_outer['skn'].unique()))

# split the trainig data into 5 folds for inner cross validation
def assign_inner_fold(df, n_folds=5):
    # assign fold for each sample
    df_len_by_month = pd.DataFrame(df.groupby(by=['year', 'month']).size()).reset_index().rename({0: "len"}, axis=1)
    df_len_by_month = df_len_by_month.sort_values(['year', 'month'])
    df_len_by_month['cumsum'] = df_len_by_month['len'].cumsum()
    n_samples_total = df_len_by_month['cumsum'].iloc[-1]
    n_samples_per_fold = np.ceil(n_samples_total / n_folds)
    
    df_len_by_month['inner_fold'] = df_len_by_month.apply(lambda row: int(row['cumsum'] / n_samples_per_fold), axis=1)
    
    df_w_fold = pd.merge(left=df, right=df_len_by_month, left_on=['year', 'month'], right_on=['year', 'month'])
    
    return df_w_fold

df_inner_split = assign_inner_fold(df_train_outer)

In [500]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model

def define_model(
    input_dim=20,
    n_units=512,
    activation='selu',#selu
    learning_rate=0.00001,
    loss='mse',
    batch_size=64
):
    inputs = Input(shape=(input_dim))
    # x = Dense(units=n_units, activation=activation, kernel_regularizer='l1')(inputs)
    x = Dense(units=n_units, activation=activation)(inputs)
    x = Dropout(rate=0.5)(x)
    x = Dense(units=n_units, activation=activation)(x)
    x = Dropout(rate=0.5)(x)
    x = Dense(units=n_units, activation=activation)(x)
    x = Dropout(rate=0.5)(x)# serves as regularization
    outputs = Dense(units=1, activation='sigmoid')(x)
    
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=[RootMeanSquaredError()]
    )
    return model, batch_size


In [358]:
# def prepare_dataset(df, skn, inner_fold):
#     """
#     Splits dataset into train and test, and scales x
#     """
#     df_station = df[df['skn'] == skn]
#     df_train = df_station[df_station['inner_fold'] != inner_fold]
#     df_test = df_station[df_station['inner_fold'] == inner_fold]
#     x_train, x_test = np.array(df_train[columns]), np.array(df_test[columns])
#     y_train, y_test = np.array(df_train['data_in']), np.array(df_test['data_in'])
    
#     x_scaler = MinMaxScaler()
#     x_train = x_scaler.fit_transform(x_train)
#     x_test = x_scaler.transform(x_test)
    
#     return x_train, x_test, y_train, y_test

# def transform_y(y_train, y_test):
#     scaler = MinMaxScaler(feature_range=(0,1))
#     y_train = np.log(y_train + 1.)
#     y_test = np.log(y_test + 1.)
    
#     y_train = scaler.fit_transform(y_train.reshape(-1, 1))
#     y_test = scaler.transform(y_test.reshape(-1, 1))
    
#     return y_train, y_test, scaler
    
# def inverse_transform_y(y, scaler):
#     y = scaler.inverse_transform(y)
#     y = np.power(np.e, y) - 1
#     return y
    

In [522]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

class NeuralNetwork():
    
    def __init__(self, model_func, params):
        self.model_func = model_func
        self.params = params
        pass
    
    def cross_val_predict(self, df, skn, verbose=0, n_folds=5):
        assert 'inner_fold' in df.columns, 'define fold with column name "inner_fold"'
        df_station = df[df['skn'] == skn]
        
        list_ytrue = []
        list_ypred = []
        for k in range(n_folds):
            # split the dataset
            df_train = df_station[df_station['inner_fold'] != k]
            df_test = df_station[df_station['inner_fold'] == k]
            
            # convert to numpy
            x_train, x_test = np.array(df_train[columns]), np.array(df_test[columns])
            y_train, y_test = np.array(df_train['data_in']), np.array(df_test['data_in'])
            
            # scale the input and output
            x_train, x_test = self.transform_x(x_train, x_test)
            y_train, y_test, y_scaler = self.transform_y(y_train, y_test)
            
            # train the model
            self.train(x_train, y_train, verbose=0, retrain_full=False) # to speed up computation for hyperparaemter tuning
            
            # make prediction and scale
            y_pred = self.model.predict(x_test)
            y_pred = self.inverse_transform_y(y_pred, y_scaler)
            # scale y_test
            y_test = self.inverse_transform_y(y_test, y_scaler)
            
            # keep the record
            list_ytrue.extend(y_test)
            list_ypred.extend(y_pred)
        
        # calculate the loss and return
        return {
            "mse": mean_squared_error(list_ytrue, list_ypred, squared=False),
            "mae": mean_absolute_error(list_ytrue, list_ypred)
        }

    def transform_x(self, x_train, x_test):
        scaler = MinMaxScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
        return x_train, x_test
    
    def transform_y(self, y_train, y_test):
        scaler = MinMaxScaler(feature_range=(0,1))
        y_train = np.log(y_train + 1.)
        y_test = np.log(y_test + 1.)

        y_train = scaler.fit_transform(y_train.reshape(-1, 1))
        y_test = scaler.transform(y_test.reshape(-1, 1))

        return y_train, y_test, scaler
    
    def inverse_transform_y(self, y, scaler):
        y = scaler.inverse_transform(y)
        y = np.power(np.e, y) - 1
        return y
    
    def train(self, x, y, verbose=0, retrain_full=False):
        # build the model
        self.model, batch_size = self.model_func(**self.params)
        
        callbacks = [
            EarlyStopping(
                monitor='val_loss',
                min_delta=0,
                patience=20,
                restore_best_weights=True,
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.95,
                patience=10
            )
        ]
        history = self.model.fit(
            x, y,
            epochs=500,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=callbacks,
            verbose=verbose
        )
        
        if retrain_full:
            epochs = len(history.history['loss'])
            # rebuild the model
            self.model, batch_size = self.model_func(**params)
            callbacks = [EarlyStopping(monitor='loss', min_delta=0, patience=1e3, restore_best_weights=True)]
            history = self.model.fit(
                x, y,
                epochs=epochs,
                validation_split=0,
                callbacks=callbacks,
                batch_size=batch_size,
                verbose=verbose
            )
        return history        

In [523]:
skn = 54
model = NeuralNetwork(
    model_func=define_model,
    params = {
        'input_dim': 16,
        'n_units': 362,
        'learning_rate': 0.000695,
        'loss': 'mse',
        'batch_size': 128
    }
)
model.cross_val_predict(df_inner_split, skn)

{'mse': 5.321785170696368, 'mae': 3.4525449770133205}

In [520]:
'''
	n_units	learning_rate	loss	skn	batch_size	mae	rmse	trial_id
291.966094	0.000833	mae	54.0	256	3.495661	5.385094	0
455.336094	0.008304	mae	54.0	64	18.907637	29.576737	
16	362.231815	0.000695	mse	54.0	128
'''
skn = 54
# df_station = df_inner_split.query(f'skn == {skn}')
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=20,
        restore_best_weights=True,
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.95,
        patience=10
    )
]
batch_size=128
a = []
b = []
for inner_fold in range(5):
    x_train, x_test, y_train, y_test = prepare_dataset(df_inner_split, skn=skn, inner_fold=inner_fold)
    y_train, y_test, scaler = transform_y(y_train, y_test)

    params = {'input_dim': 16,
     'n_units': 362,
     'learning_rate': 0.000695,
     'loss': 'mse'}
    model, _ = define_model(**params)
    model.fit(
        x_train,
        y_train,
        epochs=500,
        batch_size=batch_size,
        validation_split=0.2,
        callbacks=callbacks,
        verbose=0
    )
    y_pred = model.predict(x_test)
    y_pred = inverse_transform_y(y_pred, scaler)
    y_test = inverse_transform_y(y_test, scaler)
    a.extend(y_pred)
    b.extend(y_test)
    print(mean_squared_error(y_test, y_pred, squared=False))

5.538217695379
4.441658202184386
5.186904581988482
6.106725221154468
5.592084881258681


In [521]:
mean_squared_error(a, b, squared=False), mean_absolute_error(a, b)

(5.402684042795685, 3.4198443049863214)

In [518]:
x_train, x_test, y_train, y_test= prepare_dataset(df_inner_split, skn=skn, inner_fold=0)
linear_regression = LinearRegression()
linear_regression.fit(x_train, y_train)
yhat = linear_regression.predict(x_test)
mean_squared_error(y_test, yhat, squared=False)

1.7998480148265161

In [382]:
parameters = [
    sherpa.Continuous('n_units', [256, 512, 1024]),
    sherpa.Continuous('learning_rate', [0.00001, 0.01]),
    sherpa.Choice('batch_size', [64, 128, 192, 256, 512]),
    sherpa.Choice('loss', ['mse', 'mae'])
]
n_run = 2
alg = sherpa.algorithms.RandomSearch(max_num_trials=n_run)
study = sherpa.Study(parameters=parameters, algorithm=alg, lower_is_better=True)
dfs = []

callbacks = [
    EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=20,
        restore_best_weights=True,
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.95,
        patience=10
    )
]
dfs = []
for i, trial in enumerate(study):
    start = time.time()
    params = {
        'input_dim': len(columns),
        'n_units': trial.parameters['n_units'],
        'learning_rate': trial.parameters['learning_rate'],
        'loss': trial.parameters['loss']
    }
    batch_size = trial.parameters['batch_size']
    
    for skn in tqdm(df_inner_split['skn'].unique()):        
        ytest_station = []
        yhat_station = []
        for inner_fold in range(5):
            x_train, x_test, y_train, y_test = prepare_dataset(df_inner_split, skn=skn, inner_fold=inner_fold)
            y_train, y_test, scaler = transform_y(y_train, y_test)
            model = define_model(**params)
            model.fit(x_train, y_train, epochs=500, validation_split=0.2, callbacks=callbacks,
                batch_size=batch_size,
                verbose=0
            )
            yhat = model.predict(x_test)
            yhat = inverse_transform_y(yhat, scaler)
            y_test = inverse_transform_y(y_test, scaler)
            
            # record the result
            yhat_station.extend(yhat)
            ytest_station.extend(y_test)
        
        mae_station = mean_absolute_error(ytest_station, yhat_station)
        rmse_station = mean_squared_error(ytest_station, yhat_station, squared=False)
        
        _ = pd.DataFrame([params])
        _['skn'] = [skn]
        _['batch_size'] = [batch_size]
        _['mae'] = [mae_station]
        _['rmse'] = [rmse_station]
        _['trial_id'] = [i]
        dfs.append(_)
#     pd.concat(dfs)

INFO:sherpa.core:
-------------------------------------------------------
SHERPA Dashboard running. Access via
http://10.100.11.207:8898 if on a cluster or
http://localhost:8898 if running locally.
-------------------------------------------------------


 * Serving Flask app 'sherpa.app.app' (lazy loading)


  0%|          | 0/24 [00:00<?, ?it/s]

 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on


100%|██████████| 24/24 [04:44<00:00, 11.84s/it]
100%|██████████| 24/24 [03:00<00:00,  7.53s/it]


# get the result

In [406]:
df_report = pd.read_csv('nn_report_240_single.csv')

In [414]:
df_groupby = df_report.groupby(by='trial_id').mean()

In [426]:
min_mae_index = df_groupby[df_groupby['mae'] == df_groupby['mae'].min()].index.values[0]
min_mse_index = df_groupby[df_groupby['rmse'] == df_groupby['rmse'].min()].index.values[0]

In [481]:
params = {}
idx = min_mse_index
params['input_dim'] = int(df_report[df_report['trial_id'] == idx].iloc[0]['input_dim'])
params['n_units'] = int(df_report[df_report['trial_id'] == idx].iloc[0]['n_units'])
params['learning_rate'] = float(df_report[df_report['trial_id'] == idx].iloc[0]['learning_rate'])
params['loss'] = df_report[df_report['trial_id'] == idx].iloc[0]['loss']
batch_size = df_report[df_report['trial_id'] == idx].iloc[0]['batch_size']

In [488]:
skn = 54
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        min_delta=0,
        patience=20,
        restore_best_weights=True,
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.95,
        patience=10
    )
]
list_ytrue = []
list_ypred = []
for inner_fold in tqdm(range(5)):
    x_train, x_test, y_train, y_test = prepare_dataset(df_inner_split, skn=skn, inner_fold=inner_fold)
    y_train, y_test, scaler = transform_y(y_train, y_test)
    model = define_model(**params)
    model.fit(
        x_train,
        y_train,
        epochs=500,
        batch_size=batch_size,
        validation_split=0.2,
        callbacks=callbacks,
        verbose=0
    )
    y_pred = model.predict(x_test)
    y_pred = inverse_transform_y(y_pred, scaler)
    y_test = inverse_transform_y(y_test, scaler)
    list_ypred.extend(y_pred)
    list_ytrue.extend(y_test)
    # print(mean_squared_error(y_test, y_pred, squared=False))
print(mean_absolute_error(list_ytrue, list_ypred))

100%|██████████| 5/5 [00:43<00:00,  8.74s/it]

3.6282724449488883





In [483]:
df_report[df_report['trial_id'] == idx]

Unnamed: 0.1,Unnamed: 0,input_dim,n_units,learning_rate,loss,skn,batch_size,mae,rmse,trial_id
1776,0,16,362.231815,0.000695,mse,54.0,128,3.433046,5.296056,74
1777,0,16,362.231815,0.000695,mse,79.0,128,3.886207,5.973462,74
1778,0,16,362.231815,0.000695,mse,338.0,128,2.54407,4.762007,74
1779,0,16,362.231815,0.000695,mse,250.0,128,1.394395,2.049286,74
1780,0,16,362.231815,0.000695,mse,267.0,128,1.505138,2.224552,74
1781,0,16,362.231815,0.000695,mse,296.1,128,0.799617,1.681203,74
1782,0,16,362.231815,0.000695,mse,311.0,128,0.79694,1.520546,74
1783,0,16,362.231815,0.000695,mse,396.0,128,1.051577,1.784216,74
1784,0,16,362.231815,0.000695,mse,400.0,128,1.138457,1.805463,74
1785,0,16,362.231815,0.000695,mse,406.0,128,1.241432,1.891523,74


In [490]:
list_ypred = []
list_ytrue = []
for fold in range(5):
    x_train, x_test, y_train, y_test= prepare_dataset(df_inner_split, skn=skn, inner_fold=0)
    linear_regression = LinearRegression()
    linear_regression.fit(x_train, y_train)
    yhat = linear_regression.predict(x_test)
    list_ytrue.append(y_test)
    list_ypred.append(yhat)
mean_absolute_error(list_ytrue, list_ypred)

4.524202419975154