In [25]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])?  y


In [26]:
import sherpa
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error, mean_absolute_error
import time

import sys
sys.path.append('/home/yusukemh/github/yusukemh/StatisticalDownscaling/writeup')
from config import C_COMMON, C_GRID, C_SINGLE, FILENAME

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# enable autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
def assign_inner_fold(df, n_folds=5):
    # assign fold for each sample
    df_len_by_month = pd.DataFrame(df.groupby(by=['year', 'month']).size()).reset_index().rename({0: "len"}, axis=1)
    df_len_by_month = df_len_by_month.sort_values(['year', 'month'])
    df_len_by_month['cumsum'] = df_len_by_month['len'].cumsum()
    n_samples_total = df_len_by_month['cumsum'].iloc[-1]
    n_samples_per_fold = np.ceil(n_samples_total / n_folds)
    df_len_by_month['inner_fold'] = df_len_by_month.apply(lambda row: int(row['cumsum'] / n_samples_per_fold), axis=1)
    df_w_fold = pd.merge(left=df, right=df_len_by_month, left_on=['year', 'month'], right_on=['year', 'month'])
    
    return df_w_fold

def load_data(columns, filename):
    """
    Loads dataset and splits into train and test.
    It also splits training dataset into 5 folds as a column named 'inner_fold'
    """
    df = pd.read_csv(filename, usecols=C_COMMON + columns).sort_values(['year', 'month'])
    df_train = df.query('fold != 4')
    df_test = df.query('fold == 4')
    assert (sorted(df_train['skn'].unique()) == sorted(df_test['skn'].unique()))
    
    df_train = assign_inner_fold(df_train)
    
    return df_train, df_test

In [28]:
columns = C_SINGLE
df_train, _ = load_data(columns, FILENAME)

In [29]:
# columns = C_SINGLE
# df = pd.read_csv(FILENAME, usecols=C_COMMON + columns).sort_values(['year', 'month'])

# # we use the last 1/5 data as the heldout clean dataset. We do not use this fold for any use except for just reporting the result.
# df_train_outer = df.query('fold != 4')
# df_test_outer = df.query('fold == 4')
# assert (sorted(df_test_outer['skn'].unique()) == sorted(df_train_outer['skn'].unique()))

# # split the trainig data into 5 folds for inner cross validation
# def assign_inner_fold(df, n_folds=5):
#     # assign fold for each sample
#     df_len_by_month = pd.DataFrame(df.groupby(by=['year', 'month']).size()).reset_index().rename({0: "len"}, axis=1)
#     df_len_by_month = df_len_by_month.sort_values(['year', 'month'])
#     df_len_by_month['cumsum'] = df_len_by_month['len'].cumsum()
#     n_samples_total = df_len_by_month['cumsum'].iloc[-1]
#     n_samples_per_fold = np.ceil(n_samples_total / n_folds)
    
#     df_len_by_month['inner_fold'] = df_len_by_month.apply(lambda row: int(row['cumsum'] / n_samples_per_fold), axis=1)
    
#     df_w_fold = pd.merge(left=df, right=df_len_by_month, left_on=['year', 'month'], right_on=['year', 'month'])
    
#     return df_w_fold

# df_inner_split = assign_inner_fold(df_train_outer)

In [30]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model

def define_model(
    input_dim=20,
    n_units=512,
    activation='selu',#selu
    learning_rate=0.00001,
    loss='mse',
    batch_size=64
):
    inputs = Input(shape=(input_dim))
    # x = Dense(units=n_units, activation=activation, kernel_regularizer='l1')(inputs)
    x = Dense(units=n_units, activation=activation)(inputs)
    x = Dropout(rate=0.5)(x)
    x = Dense(units=n_units, activation=activation)(x)
    x = Dropout(rate=0.5)(x)
    x = Dense(units=n_units, activation=activation)(x)
    x = Dropout(rate=0.5)(x)# serves as regularization
    outputs = Dense(units=1, activation='sigmoid')(x)
    
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=[RootMeanSquaredError()]
    )
    return model, batch_size


In [31]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

class NeuralNetwork():
    
    def __init__(self, model_func, params, columns):
        self.model_func = model_func
        self.params = params
        self.columns = columns
        pass
    
    def cross_val_predict(self, df, skn, verbose=0, n_folds=5):
        assert 'inner_fold' in df.columns, 'define fold with column name "inner_fold"'
        df_station = df[df['skn'] == skn]
        
        list_ytrue = []
        list_ypred = []
        for k in range(n_folds):
            # split the dataset
            df_train = df_station[df_station['inner_fold'] != k]
            df_test = df_station[df_station['inner_fold'] == k]
            
            # convert to numpy
            x_train, x_test = np.array(df_train[self.columns]), np.array(df_test[self.columns])
            y_train, y_test = np.array(df_train['data_in']), np.array(df_test['data_in'])
            
            # scale the input and output
            x_train, x_test = self.transform_x(x_train, x_test)
            y_train, y_test, y_scaler = self.transform_y(y_train, y_test)
            
            # train the model
            self.train(x_train, y_train, verbose=0, retrain_full=False) # to speed up computation for hyperparaemter tuning
            
            # make prediction and scale
            y_pred = self.model.predict(x_test)
            y_pred = self.inverse_transform_y(y_pred, y_scaler)
            # scale y_test
            y_test = self.inverse_transform_y(y_test, y_scaler)
            
            # keep the record
            list_ytrue.extend(y_test)
            list_ypred.extend(y_pred)
        
        # calculate the loss and return
        return {
            "mse": mean_squared_error(list_ytrue, list_ypred, squared=False),
            "mae": mean_absolute_error(list_ytrue, list_ypred)
        }

    def transform_x(self, x_train, x_test):
        scaler = MinMaxScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
        return x_train, x_test
    
    def transform_y(self, y_train, y_test):
        scaler = MinMaxScaler(feature_range=(0,1))
        y_train = np.log(y_train + 1.)
        y_test = np.log(y_test + 1.)

        y_train = scaler.fit_transform(y_train.reshape(-1, 1))
        y_test = scaler.transform(y_test.reshape(-1, 1))

        return y_train, y_test, scaler
    
    def inverse_transform_y(self, y, scaler):
        y = scaler.inverse_transform(y)
        y = np.power(np.e, y) - 1
        return y
    
    def train(self, x, y, verbose=0, retrain_full=False):
        # build the model
        self.model, batch_size = self.model_func(**self.params)
        
        callbacks = [
            EarlyStopping(
                monitor='val_loss',
                min_delta=0,
                patience=20,
                restore_best_weights=True,
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.95,
                patience=10
            )
        ]
        history = self.model.fit(
            x, y,
            epochs=500,
            batch_size=batch_size,
            validation_split=0.2,
            callbacks=callbacks,
            verbose=verbose
        )
        
        if retrain_full:
            epochs = len(history.history['loss'])
            # rebuild the model
            self.model, batch_size = self.model_func(**params)
            callbacks = [EarlyStopping(monitor='loss', min_delta=0, patience=1e3, restore_best_weights=True)]
            history = self.model.fit(
                x, y,
                epochs=epochs,
                validation_split=0,
                callbacks=callbacks,
                batch_size=batch_size,
                verbose=verbose
            )
        return history        

In [32]:
for skn in df_train['skn'].unique():
    model = NeuralNetwork(
        model_func=define_model,
        params = {
            'input_dim': 16,
            'n_units': 362,
            'learning_rate': 0.000695,
            'loss': 'mse',
            'batch_size': 128
        },
        columns=columns
    )
    ret = model.cross_val_predict(df_train, skn)
    print(skn, ret)

54.0 {'mse': 5.1910487470013384, 'mae': 3.349645200220334}
79.0 {'mse': 6.044588694398829, 'mae': 3.8787937445801806}


KeyboardInterrupt: 

# get the result

In [406]:
df_report = pd.read_csv('nn_report_240_single.csv')

In [414]:
df_groupby = df_report.groupby(by='trial_id').mean()

In [426]:
min_mae_index = df_groupby[df_groupby['mae'] == df_groupby['mae'].min()].index.values[0]
min_mse_index = df_groupby[df_groupby['rmse'] == df_groupby['rmse'].min()].index.values[0]

In [481]:
params = {}
idx = min_mse_index
params['input_dim'] = int(df_report[df_report['trial_id'] == idx].iloc[0]['input_dim'])
params['n_units'] = int(df_report[df_report['trial_id'] == idx].iloc[0]['n_units'])
params['learning_rate'] = float(df_report[df_report['trial_id'] == idx].iloc[0]['learning_rate'])
params['loss'] = df_report[df_report['trial_id'] == idx].iloc[0]['loss']
batch_size = df_report[df_report['trial_id'] == idx].iloc[0]['batch_size']

In [483]:
df_report[df_report['trial_id'] == idx]

Unnamed: 0.1,Unnamed: 0,input_dim,n_units,learning_rate,loss,skn,batch_size,mae,rmse,trial_id
1776,0,16,362.231815,0.000695,mse,54.0,128,3.433046,5.296056,74
1777,0,16,362.231815,0.000695,mse,79.0,128,3.886207,5.973462,74
1778,0,16,362.231815,0.000695,mse,338.0,128,2.54407,4.762007,74
1779,0,16,362.231815,0.000695,mse,250.0,128,1.394395,2.049286,74
1780,0,16,362.231815,0.000695,mse,267.0,128,1.505138,2.224552,74
1781,0,16,362.231815,0.000695,mse,296.1,128,0.799617,1.681203,74
1782,0,16,362.231815,0.000695,mse,311.0,128,0.79694,1.520546,74
1783,0,16,362.231815,0.000695,mse,396.0,128,1.051577,1.784216,74
1784,0,16,362.231815,0.000695,mse,400.0,128,1.138457,1.805463,74
1785,0,16,362.231815,0.000695,mse,406.0,128,1.241432,1.891523,74


In [490]:
list_ypred = []
list_ytrue = []
for fold in range(5):
    x_train, x_test, y_train, y_test= prepare_dataset(df_inner_split, skn=skn, inner_fold=0)
    linear_regression = LinearRegression()
    linear_regression.fit(x_train, y_train)
    yhat = linear_regression.predict(x_test)
    list_ytrue.append(y_test)
    list_ypred.append(yhat)
mean_absolute_error(list_ytrue, list_ypred)

4.524202419975154