In [80]:
# basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# others
import multiprocessing as mp
from xgboost import XGBRegressor
from tqdm import tqdm

# config
import sys

# NOTE(review): absolute, machine-specific path -- the notebook only runs where this
# directory exists. Consider deriving it from the repository root instead.
CONFIG_DIR = '/home/yusukemh/github/yusukemh/StatisticalDownscaling/writeup'
# Guard the append so that re-running this cell does not keep adding duplicates to sys.path.
if CONFIG_DIR not in sys.path:
    sys.path.append(CONFIG_DIR)
from config import C_COMMON, C_SINGLE, C_GRID, FILENAME

# enable autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [108]:
# Input features handed to the models: the column names listed in C_SINGLE.
columns = C_SINGLE

# Load the dataset, keeping only the columns named in C_COMMON and C_SINGLE.
df = pd.read_csv(FILENAME, usecols=C_COMMON + C_SINGLE)

# Linear Regression

In [257]:
def cross_val_predict(df: pd.DataFrame, model, skn: int, columns: list, verbose=False):
    """
    Produce out-of-fold predictions for a single station (skn).

    Same idea as sklearn.model_selection.cross_val_predict, except that the fold
    assignment is read from the 'fold' column of ``df`` (pre-determined by
    preprocessing) instead of being generated here. The split is therefore not
    exactly 1/5, because samples in the same month must not be separated into
    different folds.

        Args:
            :param df: dataset for evaluation. Must contain a 'fold' column that specifies
                the assignment of each sample to a fold, plus the columns 'skn', 'year',
                'month', 'data_in' and every name listed in ``columns``.
            :param model: estimator exposing ``fit(X, y)`` and ``predict(X)``, e.g.
                sklearn.linear_model.LinearRegression or xgboost.XGBRegressor.
                It is refit in place once per fold.
            :param skn: identifier of the station to evaluate
            :param columns: list of str indicating which columns to use as input data for the model
            :param verbose: if True, wrap the fold loop in a tqdm progress bar
        Returns:
            pd.DataFrame with columns 'skn', 'year', 'month', 'observed', 'prediction',
            one row per sample of the station that has a fold label. The per-fold frames
            are concatenated as-is, so index labels restart at 0 for every fold.
        Raises:
            ValueError: if ``df`` has no rows for ``skn`` or the station has fewer than two folds.
    """
    assert 'fold' in df.columns, "Must contain a column 'fold' to specify assignment of samples to the folds."

    # Select the station once (loop-invariant); a boolean mask also avoids
    # formatting the station id into a query string.
    df_station = df[df['skn'] == skn]
    if df_station.empty:
        raise ValueError(f"No samples found for skn={skn}.")

    # Use the fold labels that actually occur for this station instead of assuming
    # they are exactly 0..n_folds-1 and present for every station.
    fold_labels = sorted(df_station['fold'].dropna().unique())
    if len(fold_labels) < 2:
        raise ValueError(f"Need at least two folds for skn={skn}, found {len(fold_labels)}.")

    iterator = tqdm(fold_labels) if verbose else fold_labels

    dfs = []  # one result frame per fold
    for fold in iterator:
        is_test = df_station['fold'] == fold
        df_train, df_test = df_station[~is_test], df_station[is_test]

        x_train, x_test = df_train[columns].to_numpy(), df_test[columns].to_numpy()
        y_train = df_train['data_in'].to_numpy()

        model.fit(x_train, y_train)
        yhat = model.predict(x_test)

        dfs.append(
            pd.DataFrame(
                {
                    'skn': df_test['skn'].values,
                    'year': df_test['year'].values,
                    'month': df_test['month'].values,
                    'observed': df_test['data_in'].values,
                    'prediction': yhat,
                }
            )
        )

    return pd.concat(dfs)

def parallelize(func, args, n_jobs=-1):
    """
    Call ``func`` once per argument tuple in ``args`` using a pool of worker processes.

    :param func: callable executed in the worker processes
    :param args: iterable. list of arguments for the function; each element is the
        tuple of positional arguments for one call
    :param n_jobs: number of worker processes; -1 uses ``mp.cpu_count()``
    :return: list holding the result of each call, in the same order as ``args``.
        ``get()`` re-raises an exception that occurred inside a worker.
    """
    n_workers = mp.cpu_count() if n_jobs == -1 else n_jobs
    # The context manager tears the workers down even if submitting a task raises,
    # so a failure part-way through ``args`` no longer leaks processes.
    with mp.Pool(n_workers) as pool:
        result_objects = [pool.apply_async(func, args=_args) for _args in args]
        # Stop accepting work and wait for every submitted task before collecting results.
        pool.close()
        pool.join()
        return [r.get() for r in result_objects]

In [258]:
linear_regression = LinearRegression()
df_result = cross_val_predict(df, linear_regression, skn=79, columns=columns, verbose=False)
# RMSE over all out-of-fold predictions for station 79. The square root is taken
# explicitly because the `squared=False` keyword is deprecated in newer scikit-learn releases.
np.sqrt(mean_squared_error(df_result['observed'], df_result['prediction']))

6.004229222338949

In [259]:
xgboost = XGBRegressor()
df_result = cross_val_predict(df, xgboost, skn=79, columns=columns, verbose=True)
# RMSE over all out-of-fold predictions for station 79. The square root is taken
# explicitly because the `squared=False` keyword is deprecated in newer scikit-learn releases.
np.sqrt(mean_squared_error(df_result['observed'], df_result['prediction']))

100%|██████████| 5/5 [00:01<00:00,  3.76it/s]


7.792404950412038

In [263]:
# Compare both estimators station by station using the out-of-fold RMSE.
for skn in df['skn'].unique():
    print(f"experiment on {skn}")
    # One loop over (label, estimator) pairs replaces the copy-pasted LR / XGB evaluation.
    for label, estimator in (('LR', linear_regression), ('XGB', xgboost)):
        df_result = cross_val_predict(df, estimator, skn=skn, columns=columns, verbose=False)
        # Explicit square root: the `squared=False` keyword is deprecated in newer scikit-learn.
        rmse = np.sqrt(mean_squared_error(df_result['observed'], df_result['prediction']))
        print(f'RMSE using {label}: {rmse:.2f}')
    print('=================================================================================')
    

experiment on 54.0
RMSE using LR: 5.18
RMSE using XGB: 5.73
experiment on 79.0
RMSE using LR: 6.00
RMSE using XGB: 7.79
experiment on 338.0
RMSE using LR: 4.21
RMSE using XGB: 4.84
experiment on 250.0
RMSE using LR: 2.05
RMSE using XGB: 2.30
experiment on 267.0
RMSE using LR: 2.22
RMSE using XGB: 2.43
experiment on 296.1
RMSE using LR: 1.51
RMSE using XGB: 1.65
experiment on 311.0
RMSE using LR: 1.36
RMSE using XGB: 1.41
experiment on 396.0
RMSE using LR: 1.55
RMSE using XGB: 1.66
experiment on 400.0
RMSE using LR: 1.62
RMSE using XGB: 1.81
experiment on 406.0
RMSE using LR: 1.78
RMSE using XGB: 1.97
experiment on 410.0
RMSE using LR: 1.71
RMSE using XGB: 1.93
experiment on 485.0
RMSE using LR: 2.20
RMSE using XGB: 2.39
experiment on 703.0
RMSE using LR: 1.95
RMSE using XGB: 2.18
experiment on 718.0
RMSE using LR: 5.10
RMSE using XGB: 5.65
experiment on 770.0
RMSE using LR: 2.19
RMSE using XGB: 2.48
experiment on 783.0
RMSE using LR: 4.40
RMSE using XGB: 5.00
experiment on 784.0
RMSE u