In [28]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cartopy.crs as ccrs
import cartopy.mpl.ticker as cticker

# sklearn
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# others
# import multiprocessing as mp
from xgboost import XGBRegressor
from tqdm import tqdm

# config
import sys
sys.path.append('/home/yusukemh/github/yusukemh/StatisticalDownscaling/writeup')
from config import C_COMMON, C_SINGLE, C_GRID, FILENAME
from util import load_data, XGB, NeuralNetwork, LinearRegression

# enable autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
columns = C_SINGLE
df_train, df_test = load_data(columns + C_COMMON, FILENAME)

# Model performance comparison

In [47]:
l_model = LinearRegression(columns=columns)
l_pred = l_model.evaluate(df_train, df_test)
l_pred

Unnamed: 0,skn,rmse_lr,mae_lr
0,54.0,4.833861,3.558593
1,79.0,6.301111,4.208636
2,338.0,2.818706,2.123845
3,250.0,2.189325,1.443336
4,267.0,2.243758,1.505239
5,296.1,1.250883,0.888049
6,311.0,1.188327,0.864515
7,396.0,1.308114,0.960739
8,400.0,1.298378,0.976187
9,406.0,1.643493,1.254386


In [48]:
# XGBoost
x_model = XGB(
    params={
        'n_estimators': 120,
        'learning_rate': 0.070755,
        'max_depth': 2
    },
    columns=columns
)
x_pred = x_model.evaluate(df_train, df_test)
x_pred

Unnamed: 0,skn,rmse_xgb,mae_xgb
0,54.0,5.020365,3.724248
1,79.0,6.714039,4.751905
2,338.0,2.948576,2.119263
3,250.0,2.196315,1.43716
4,267.0,2.200451,1.434376
5,296.1,1.222167,0.723007
6,311.0,1.181025,0.671441
7,396.0,1.235662,0.78116
8,400.0,1.260604,0.864615
9,406.0,1.600808,1.175606


In [85]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.models import Model

def define_model(
    input_dim=20,
    n_units=512,
    activation='selu',#selu
    learning_rate=0.00001,
    loss='mse',
    batch_size=64
):
    inputs = Input(shape=(input_dim))
    # x = Dense(units=n_units, activation=activation, kernel_regularizer='l1')(inputs)
    x = Dense(units=n_units, activation=activation)(inputs)
    x = Dropout(rate=0.5)(x)
    # x = Dense(units=n_units, activation=activation, kernel_regularizer='l1')(x)
    x = Dense(units=n_units, activation=activation)(inputs)
    x = Dropout(rate=0.5)(x)
    # x = Dense(units=n_units, activation=activation, kernel_regularizer='l1')(x)
    x = Dense(units=n_units, activation=activation)(inputs)
    x = Dropout(rate=0.5)(x)# serves as regularization
    outputs = Dense(units=1, activation='sigmoid')(x)
    
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=tf.optimizers.Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=[RootMeanSquaredError()]
    )
    return model, batch_size

n_model = NeuralNetwork(
    params={
        'n_units': 835,
        'learning_rate': 0.0002633633813255,
        'input_dim': 16,
        'batch_size': 64,
        'loss': 'mse'
    },
    columns=columns,
    model_func=define_model
)

In [86]:
n_pred = n_model.evaluate(df_train, df_test)

  0%|          | 0/24 [00:29<?, ?it/s]


KeyboardInterrupt: 

In [57]:
n_pred.merge(l_pred, left_on='skn', right_on='skn')

Unnamed: 0,skn,rmse_nn,mae_nn,rmse_lr,mae_lr
0,54.0,4.701864,3.136143,4.833861,3.558593
1,79.0,6.195562,3.861293,6.301111,4.208636
2,338.0,2.72027,1.856383,2.818706,2.123845
3,250.0,2.298945,1.33476,2.189325,1.443336
4,267.0,2.343627,1.454797,2.243758,1.505239
5,296.1,1.234314,0.658452,1.250883,0.888049
6,311.0,1.225338,0.620076,1.188327,0.864515
7,396.0,1.262213,0.733382,1.308114,0.960739
8,400.0,1.298655,0.837869,1.298378,0.976187
9,406.0,1.605307,1.13399,1.643493,1.254386


# site specific hyperparameter for NN

In [75]:
# load data
df_report = pd.read_csv('nn_hyperparameter_tuning/nn_report_500_single.csv')
objective = 'rmse'
param_dicts = []
for skn in df_report['skn'].unique():
    df_station = df_report[df_report['skn'] == skn]
    params = df_station.iloc[df_station[objective].argmin()][['n_units', 'learning_rate', 'input_dim', 'batch_size', 'loss']].to_dict()
    param_dicts.append(
        {
            "skn": skn,
            "params": params
        }
    )

In [78]:
ret_vals = []
for item in tqdm(param_dicts):
    station_model = NeuralNetwork(
        columns=columns,
        params=item['params'],
        model_func=define_model
    )
    r = station_model.evaluate_by_station(df_train, df_test, skn=item['skn'])
    ret_vals.append(r)

100%|██████████| 24/24 [02:44<00:00,  6.87s/it]


In [83]:
pd.DataFrame(ret_vals).merge(l_pred, left_on='skn', right_on='skn')

Unnamed: 0,skn,rmse_nn,mae_nn,rmse_lr,mae_lr
0,54.0,4.675819,3.126183,4.833861,3.558593
1,79.0,6.265651,3.919525,6.301111,4.208636
2,338.0,2.724811,1.86364,2.818706,2.123845
3,250.0,2.310143,1.337126,2.189325,1.443336
4,267.0,2.247264,1.414398,2.243758,1.505239
5,296.1,1.25383,0.663207,1.250883,0.888049
6,311.0,1.261034,0.606666,1.188327,0.864515
7,396.0,1.261544,0.739321,1.308114,0.960739
8,400.0,1.265284,0.831768,1.298378,0.976187
9,406.0,1.609264,1.128342,1.643493,1.254386


In [257]:
def cross_val_predict(df: pd.DataFrame, model, skn: int, columns: list, verbose=False):
    """
    Runs cross_val_predict for a single skn, using XGB or LinearRegression.
    The same functionality as sklearn.model_selection.cross_val_predict,
    except the split is not exactly 1/5 (thus pre-determined by preprocessing).
    This is because the split has to be made in the way it won't separate samples in the same month into different folds.
        Args:
            :param df: dataset for evaluation. Must contain 'fold' column that specifies assignment of each sample to the folds.
            :param model: one of [sklearn.linear_models.LinearRegression, xgboost.XGBRegressor]
            :param skn: identifier for stations
            :param columns: list of str indicating which columns to use as input data for the model
    """
    assert 'fold' in df.columns, "Must contain a column 'fold' to specify assignment of samples to the folds."
    n_folds = len(df['fold'].unique())
    dfs = [] # list of dfs containing result for each fold
    
    iterator = tqdm(range(n_folds)) if verbose else range(n_folds)
    
    for fold in iterator:
        df_train = df.query(f'(fold != {fold}) & (skn == {skn})')
        df_test = df.query(f'(fold == {fold}) & (skn == {skn})')
        
        x_train, x_test = np.array(df_train[columns]), np.array(df_test[columns])
        y_train, y_test = np.array(df_train['data_in']), np.array(df_test['data_in'])
        
        model.fit(x_train, y_train)
        yhat = model.predict(x_test)
        
        _df = pd.DataFrame(
            {
                'skn' : df_test['skn'].values,
                'year': df_test['year'].values,
                'month': df_test['month'].values,
                'observed': df_test['data_in'].values,
                'prediction': yhat,
            }
        )
        dfs.append(_df)
        
    return pd.concat(dfs)

def parallelize(func, args, n_jobs=-1):
    """
    :param args: iterable. list of arguments for the function
    """
    if n_jobs == -1:
        pool = mp.Pool(mp.cpu_count())
    else:
        pool = mp.Pool(n_jobs)
    result_objects = [pool.apply_async(func, args=_args) for _args in args]
    pool.close()
    pool.join()
    return [r.get() for r in result_objects]

In [265]:
linear_regression = LinearRegression()
params = {'n_estimators': 260, 'learning_rate': 0.1, 'max_depth': 3, 'early_stopping_rounds': 8, 'verbosity': 0}
xgboost = XGBRegressor(**params)

for skn in df['skn'].unique():
    print(f"experiment on {skn}")
    df_result = cross_val_predict(df, linear_regression, skn=skn, columns=columns, verbose=False)
    rmse = mean_squared_error(df_result['observed'], df_result['prediction'], squared=False)
    print(f'RMSE using LR: {rmse:.2f}')
    df_result = cross_val_predict(df, xgboost, skn=skn, columns=columns, verbose=False)
    rmse = mean_squared_error(df_result['observed'], df_result['prediction'], squared=False)
    print(f'RMSE using XGB: {rmse:3.2f}')
    print('=================================================================================')
    

experiment on 54.0
RMSE using LR: 5.18
RMSE using XGB: 5.60
experiment on 79.0
RMSE using LR: 6.00
RMSE using XGB: 6.76
experiment on 338.0
RMSE using LR: 4.21
RMSE using XGB: 4.62
experiment on 250.0
RMSE using LR: 2.05
RMSE using XGB: 2.22
experiment on 267.0
RMSE using LR: 2.22
RMSE using XGB: 2.33
experiment on 296.1
RMSE using LR: 1.51
RMSE using XGB: 1.62
experiment on 311.0
RMSE using LR: 1.36
RMSE using XGB: 1.43
experiment on 396.0
RMSE using LR: 1.55
RMSE using XGB: 1.56
experiment on 400.0
RMSE using LR: 1.62
RMSE using XGB: 1.67
experiment on 406.0
RMSE using LR: 1.78
RMSE using XGB: 1.83
experiment on 410.0
RMSE using LR: 1.71
RMSE using XGB: 1.80
experiment on 485.0
RMSE using LR: 2.20
RMSE using XGB: 2.27
experiment on 703.0
RMSE using LR: 1.95
RMSE using XGB: 2.02
experiment on 718.0
RMSE using LR: 5.10
RMSE using XGB: 5.43
experiment on 770.0
RMSE using LR: 2.19
RMSE using XGB: 2.39
experiment on 783.0
RMSE using LR: 4.40
RMSE using XGB: 4.78
experiment on 784.0
RMSE u