In [31]:
import sherpa
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
import time

import sys
sys.path.append('/home/yusukemh/github/yusukemh/StatisticalDownscaling/writeup')
from config import C_COMMON, C_GRID, C_SINGLE, FILENAME

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# enable autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [145]:
df = pd.read_csv(FILENAME, usecols=C_COMMON + C_SINGLE).sort_values(['year', 'month'])
columns = C_SINGLE
column_type = 'single'

# we use the last 1/5 data as the heldout clean dataset. We do not use this fold for any use except for just reporting the result.
df_train_outer = df.query('fold != 4')
df_test_outer = df.query('fold == 4')
assert (sorted(df_test_outer['skn'].unique()) == sorted(df_train_outer['skn'].unique()))
# print(f"{df_train_outer.shape}, {df_test_outer.shape}")

# split the trainig data into 5 folds for inner cross validation
def assign_inner_fold(df, n_folds=5):
    # assign fold for each sample
    df_len_by_month = pd.DataFrame(df.groupby(by=['year', 'month']).size()).reset_index().rename({0: "len"}, axis=1)
    df_len_by_month = df_len_by_month.sort_values(['year', 'month'])
    df_len_by_month['cumsum'] = df_len_by_month['len'].cumsum()
    n_samples_total = df_len_by_month['cumsum'].iloc[-1]
    n_samples_per_fold = np.ceil(n_samples_total / n_folds)
    
    df_len_by_month['inner_fold'] = df_len_by_month.apply(lambda row: int(row['cumsum'] / n_samples_per_fold), axis=1)
    
    df_w_fold = pd.merge(left=df, right=df_len_by_month, left_on=['year', 'month'], right_on=['year', 'month'])
    
    return df_w_fold

df_inner_split = assign_inner_fold(df_train_outer)

In [32]:
start = time.time()
# randomly choose params
parameters = [
    sherpa.Choice('n_estimators', list(range(100, 310, 10))),
    sherpa.Continuous('learning_rate', [0.001, 0.1]),
    sherpa.Discrete('max_depth', [1, 10]),
]

n_run = 1
alg = sherpa.algorithms.RandomSearch(max_num_trials=n_run)
study = sherpa.Study(
    parameters=parameters,
    algorithm=alg,
    lower_is_better=True
)

dfs = []
for trial in study:
    params = {
        "n_estimators": trial.parameters['n_estimators'],
        "learning_rate": trial.parameters['learning_rate'],
        "max_depth": trial.parameters['max_depth'],
        "verbosity": 1
        
    }
    for skn in df_inner_split['skn'].unique():
        df_station = df_inner_split[df_inner_split['skn'] == skn]
        
        # residual sum of squares
        rss = []
        for inner_fold in range(5):
            df_train_station = df_station[df_station['inner_fold'] != inner_fold]
            df_test_station = df_station[df_station['inner_fold'] == inner_fold]

            x_train, y_train = np.array(df_train_station[columns]), np.array(df_train_station['data_in'])
            x_test, y_test = np.array(df_test_station[columns]), np.array(df_test_station['data_in'])

            model = XGBRegressor(**params)
            model.fit(x_train, y_train)
            yhat = model.predict(x_test)
            mse = mean_squared_error(y_test, yhat)
            
            rss.append(mse * df_train_station.shape[0])
        rmse = np.sqrt(np.array(rss).sum() / df_station.shape[0])
        # rmse = np.sqrt(np.array(rss).mean())
        df = pd.DataFrame(
            params, index=[rmse]
        )
        df['skn'] = [skn]
        
        dfs.append(df)
# pd.concat(dfs).to_csv(f'xgb_sitespecific_{n_run}_{column_type}.csv')
            
            
        
end = time.time()
print(end - start)

INFO:sherpa.core:
-------------------------------------------------------
SHERPA Dashboard running. Access via
http://10.100.11.207:8890 if on a cluster or
http://localhost:8890 if running locally.
-------------------------------------------------------


 * Serving Flask app 'sherpa.app.app' (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: on




80.39525961875916


In [29]:
np.array(pd.concat(dfs).index)

6.58423293232745

In [157]:
temp = pd.read_csv('xgb_sitespecific_500_single.csv').rename(columns={'Unnamed: 0': 'rmse'})

In [158]:
params = []
for name, group in temp.groupby('skn'):
    param = group[group['rmse'] == group['rmse'].min()]
    params.append(param)

In [159]:
pd.concat(params)['rmse'].mean()

6.368456988273451

In [160]:
pd.concat(params)

Unnamed: 0,rmse,n_estimators,learning_rate,max_depth,verbosity,skn
3984,10.810689,100,0.079117,3,1,54.0
4417,12.606968,270,0.089092,2,1,79.0
6717,10.604335,230,0.067577,2,1,87.0
1395,4.065325,110,0.078369,1,1,250.0
3268,4.382617,230,0.033368,1,1,267.0
3941,3.110837,210,0.085633,1,1,296.1
11142,2.857972,160,0.033821,1,1,311.0
314,9.14988,110,0.027838,4,1,338.0
9871,3.25449,100,0.090134,2,1,396.0
8432,3.44736,300,0.064388,1,1,400.0


In [161]:
for i, row in df_list.iterrows():
    p = row.drop(['skn', 'rmse']).to_dict()
    skn = row['skn']
    df_train_station = df_train_outer.query(f'skn == {skn}')
    df_test_station = df_test_outer.query(f'skn == {skn}')
    
    x_train, x_test = np.array(df_train_station[columns]), np.array(df_test_station[columns])
    y_train, y_test = np.array(df_train_station['data_in']), np.array(df_test_station['data_in'])
    
    # model = XGBRegressor(
    #     **dict(
    #         n_estimators=int(row['n_estimators']),
    #         max_depth=int(row['max_depth']),
    #         verbosity=int(row['verbosity'])
    #     )
    # )
    '''
    model = XGBRegressor(
        **dict(n_estimators=100, learning_rate=0.039964, max_depth=4)
    )
    model.fit(x_train, y_train)
    yhat = model.predict(x_test)
    
    rmse_xgb = mean_squared_error(y_test, yhat, squared =False)
    
    model = LinearRegression()
    model.fit(x_train, y_train)
    yhat = model.predict(x_test)
    
    rmse_lr = mean_squared_error(y_test, yhat, squared=False)
    '''
    xgboost = XGBRegressor(**dict(n_estimators=100, learning_rate=0.039964, max_depth=4))
    xgboost.fit(x_train, y_train)
    yhat_xgb = xgboost.predict(x_test)
    
    linear_regression = LinearRegression()
    linear_regression.fit(x_train, y_train)
    yhat_lr = linear_regression.predict(x_test)
    
    rmse_xgb = mean_squared_error(y_test, yhat_xgb, squared=False)
    rmse_lr = mean_squared_error(y_test, yhat_lr, squared=False)
    
    print('====================')
    print(f'SKN = {skn}')
    print(f"XGB={rmse_xgb:.3f}, LR={rmse_lr:.3f}")
    

SKN = 54.0
XGB=5.055, LR=4.834
SKN = 79.0
XGB=6.709, LR=6.301
SKN = 87.0
XGB=5.812, LR=5.257
SKN = 250.0
XGB=2.210, LR=2.189
SKN = 267.0
XGB=2.216, LR=2.244
SKN = 296.1
XGB=1.381, LR=1.251
SKN = 311.0
XGB=1.149, LR=1.188
SKN = 338.0
XGB=2.880, LR=2.819
SKN = 396.0
XGB=1.250, LR=1.308
SKN = 400.0
XGB=1.280, LR=1.298
SKN = 406.0
XGB=1.581, LR=1.643
SKN = 410.0
XGB=1.412, LR=1.400
SKN = 485.0
XGB=1.935, LR=1.962
SKN = 702.7
XGB=1.518, LR=1.562
SKN = 703.0
XGB=1.538, LR=1.592
SKN = 718.0
XGB=4.647, LR=4.426
SKN = 770.0
XGB=1.923, LR=1.883
SKN = 783.0
XGB=4.615, LR=4.483
SKN = 784.0
XGB=5.036, LR=4.954
SKN = 965.0
XGB=2.210, LR=2.280
SKN = 1020.1
XGB=3.070, LR=2.787
SKN = 1075.0
XGB=3.343, LR=3.171
SKN = 1117.0
XGB=3.937, LR=3.463
SKN = 1134.0
XGB=3.625, LR=3.172
