In [30]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression as LR
from tqdm import tqdm
import math


# basic libraries
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker

# sklearn
# from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# others
# import multiprocessing as mp
from tqdm import tqdm

# config
import sys
sys.path.append('/home/yusukemh/github/yusukemh/StatisticalDownscaling/writeup')
from config import C_COMMON, C_SINGLE, C_GRID, FILENAME
from util import load_data

# enable autoreload
%load_ext autoreload
%autoreload 2

class Generator(Sequence):
    # Class is a dataset wrapper for better training performance
    def __init__(self, x_set, y_set, batch_size=256):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.indices = np.arange(self.x.shape[0])

    def __len__(self):
        return math.ceil(self.x.shape[0] / self.batch_size)

    def __getitem__(self, idx):
        # inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]  # Line A
        inds = self.indices.take(range(idx * self.batch_size, (idx + 1) * self.batch_size), mode='wrap')
        batch_x = self.x[inds]
        batch_y = self.y[inds]
        return batch_x, batch_y

    def on_epoch_end(self):
        np.random.shuffle(self.indices)

class NeuralNetwork():
    
    def __init__(self, model_func, params, columns):
        self.model_func = model_func
        self.params = params
        self.columns = columns
        pass
    
    def evaluate_by_station(self, df_train, df_test, skn):
        df_train_station = df_train[df_train['skn'] == skn]
        df_test_station = df_test[df_test['skn'] == skn]

        # convert to numpy
        x_train, x_test = np.array(df_train_station[self.columns]), np.array(df_test_station[self.columns])
        y_train, y_test = np.array(df_train_station['data_in']), np.array(df_test_station['data_in'])

        # scale the input and output
        x_train, x_test = self.transform_x(x_train, x_test)
        y_train, y_test = self.transform_y(y_train, y_test)
        
        # train the model with retrain_full = True
        history = self.train(x_train, y_train, verbose=0, retrain_full=True)

        # make prediction and scale
        y_pred = self.model.predict(x_test)
        y_pred = self.inverse_transform_y(y_pred)

        # scale y_test
        y_test = self.inverse_transform_y(y_test)
        
        return {
            "skn": skn,
            "rmse_nn": mean_squared_error(y_test, y_pred, squared=False),
            "mae_nn": mean_absolute_error(y_test, y_pred)
        }
        
    
    def evaluate(self, df_train, df_test):
        ret_vals = []
        for skn in tqdm(df_train['skn'].unique()):
            r = self.evaluate_by_station(df_train, df_test, skn)
            ret_vals.append(r)

        return pd.DataFrame(ret_vals)
            
            
    
    def cross_val_predict(self, df, skn, verbose=0, n_folds=5):
        assert 'inner_fold' in df.columns, 'define fold with column name "inner_fold"'
        df_station = df[df['skn'] == skn]
        
        list_ytrue = []
        list_ypred = []
        for k in range(n_folds):
            # split the dataset
            df_train = df_station[df_station['inner_fold'] != k]
            df_test = df_station[df_station['inner_fold'] == k]
            
            # convert to numpy
            x_train, x_test = np.array(df_train[self.columns]), np.array(df_test[self.columns])
            y_train, y_test = np.array(df_train['data_in']), np.array(df_test['data_in'])
            
            # scale the input and output
            x_train, x_test = self.transform_x(x_train, x_test)
            y_train, y_test = self.transform_y(y_train, y_test)
            
            # train the model
            history = self.train(x_train, y_train, verbose=0, retrain_full=False) # to speed up computation for hyperparaemter tuning
            
            # make prediction and scale
            y_pred = self.model.predict(x_test)
            y_pred = self.inverse_transform_y(y_pred)
            # scale y_test
            y_test = self.inverse_transform_y(y_test)
            
            # keep the record
            list_ytrue.extend(y_test)
            list_ypred.extend(y_pred)
        
        # calculate the loss and return
        return {
            "rmse": mean_squared_error(list_ytrue, list_ypred, squared=False),
            "mae": mean_absolute_error(list_ytrue, list_ypred),
            'epochs': len(history.history['loss'])
        }

    def transform_x(self, x_train, x_test):
        scaler = MinMaxScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)
        return x_train, x_test
    
    def transform_y(self, y_train, y_test):
        y_train = np.log(y_train + 1.)
        y_test = np.log(y_test + 1.)

        return y_train, y_test# , scaler
    
    def inverse_transform_y(self, y):
        y = np.power(np.e, y) - 1
        return y
    
    def train(self, x, y, verbose=0, retrain_full=False):
        # split into train and validation
        # strictly speaking, this is not appropriate because scaler fit to the union of train/valid
        x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, shuffle=False)
        # build the model
        self.model, batch_size = self.model_func(**self.params)
        # set up callbacks
        callbacks = [
            EarlyStopping(
                monitor='val_loss',
                min_delta=0,
                patience=20,
                restore_best_weights=True,
            ),
            ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.95,
                patience=10
            )
        ]
        
        # set up the generators
        train_datagen = Generator(x_train, y_train, batch_size)
        valid_datagen = Generator(x_valid, y_valid, batch_size)
        
        history = self.model.fit_generator(
            train_datagen,
            steps_per_epoch=np.ceil(len(x_train)/batch_size),
            validation_data=valid_datagen,
            validation_steps=np.ceil(len(x_valid)/batch_size),
            epochs=int(1e3),
            callbacks=callbacks,
            verbose=1
            
            # epochs=int(1e3),
            # batch_size=batch_size,
            # validation_split=0.2,
            # callbacks=callbacks,
            # verbose=0
        )
        
        if retrain_full:
            epochs = len(history.history['loss'])
            train_datagen = Generator(x, y, batch_size)
            # rebuild the model
            self.model, batch_size = self.model_func(**self.params)
            callbacks = [EarlyStopping(monitor='loss', min_delta=0, patience=1e3, restore_best_weights=True)]
            history = self.model.fit(
                train_datagen,
                steps_per_epoch=np.ceil(len(x) / batch_size),
                epochs=epochs,
                callbacks=callbacks,
                verbose=1,
                #epochs=epochs,
                #validation_split=0,
                #callbacks=callbacks,
                #batch_size=batch_size,
                #verbose=0
            )
        return history        


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
test = np.array([1,2,3,4,5,6,7,8,9,10])
test.take(range(5, 11), mode='wrap')

array([ 6,  7,  8,  9, 10,  1])

In [31]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import L2

def define_model(
    input_dim=20,
    n_units=512,
    activation='selu',#selu
    learning_rate=0.00001,
    loss='mse',
    batch_size=64
):
    inputs = Input(shape=(input_dim))
    x = Dense(units=n_units, activation=activation, kernel_regularizer=L2(l2=0.01))(inputs)
    x = Dropout(rate=0.5)(x)
    x = Dense(units=n_units, activation=activation, kernel_regularizer=L2(l2=0.01))(x)
    x = Dropout(rate=0.5)(x)
    x = Dense(units=n_units, activation=activation, kernel_regularizer=L2(l2=0.01))(x)
    x = Dropout(rate=0.5)(x)# serves as regularization
    outputs = Dense(units=1, activation='softplus')(x)
    
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=[RootMeanSquaredError()]
    )
    return model, batch_size

# class Generator(Sequence):
#     # Class is a dataset wrapper for better training performance
#     def __init__(self, x_set, y_set, batch_size=256):
#         self.x, self.y = x_set, y_set
#         self.batch_size = batch_size
#         self.indices = np.arange(self.x.shape[0])

#     def __len__(self):
#         return math.floor(self.x.shape[0] / self.batch_size)

#     def __getitem__(self, idx):
#         inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
#         batch_x = self.x[inds]
#         batch_y = self.y[inds]
#         return batch_x, batch_y

#     def on_epoch_end(self):
#         np.random.shuffle(self.indices)

In [5]:
from tensorflow.keras.utils import Sequence

In [32]:
columns = C_SINGLE
df_train, df_test = load_data(columns + C_COMMON, FILENAME)

In [43]:
skn = 54
# df_train_station = df_train[df_train['skn'] == skn]
# df_test_station = df_test[df_test['skn'] == skn]

# params = {'n_units': 274, 'learning_rate': 0.001701746715659, 'input_dim': 16, 'batch_size': 192, 'loss': 'mse'}
params = {'n_units': 315, 'learning_rate': 0.0018353200721248, 'input_dim': 16, 'batch_size': 512, 'loss': 'mse'}
n_model = NeuralNetwork(
    params=params,
    columns=columns,
    model_func=define_model
)



In [44]:
n_model.evaluate_by_station(df_train, df_test, skn=250)



Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

{'skn': 250, 'rmse_nn': 2.293672673033979, 'mae_nn': 1.3302551504305211}