In [155]:
import sherpa
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from tqdm import tqdm
from sklearn.metrics import mean_squared_error

from config import C_COMMON, C_GRID, C_SINGLE, FILENAME

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


# enable autoreload
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# conduct nested cross validation for xgboost.
# https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation

In [126]:
df = pd.read_csv(FILENAME, usecols=C_COMMON + C_SINGLE).sort_values(['year', 'month'])
columns = C_SINGLE

# we use the last 1/5 data as the heldout clean dataset. We do not use this fold for any use except for just reporting the result.
df_train_outer = df.query('fold != 4')
df_test_outer = df.query('fold == 4')
assert (sorted(df_test_outer['skn'].unique()) == sorted(df_train_outer['skn'].unique()))
# print(f"{df_train_outer.shape}, {df_test_outer.shape}")

# split the trainig data into 5 folds for inner cross validation
def assign_inner_fold(df, n_folds=5):
    # assign fold for each sample
    df_len_by_month = pd.DataFrame(df.groupby(by=['year', 'month']).size()).reset_index().rename({0: "len"}, axis=1)
    df_len_by_month = df_len_by_month.sort_values(['year', 'month'])
    df_len_by_month['cumsum'] = df_len_by_month['len'].cumsum()
    n_samples_total = df_len_by_month['cumsum'].iloc[-1]
    n_samples_per_fold = np.ceil(n_samples_total / n_folds)
    
    df_len_by_month['inner_fold'] = df_len_by_month.apply(lambda row: int(row['cumsum'] / n_samples_per_fold), axis=1)
    
    df_w_fold = pd.merge(left=df, right=df_len_by_month, left_on=['year', 'month'], right_on=['year', 'month'])
    
    return df_w_fold

df_inner_split = assign_inner_fold(df_train_outer)

In [158]:
dfs = []
for k in tqdm(range(5)):
    df_train = df_inner_split.query(f'inner_fold != {k}')
    df_test = df_inner_split.query(f'inner_fold == {k}')

    x_train, x_test = np.array(df_train[columns]), np.array(df_test[columns])
    y_train, y_test = np.array(df_train['data_in']), np.array(df_test['data_in'])
    
    model = XGBRegressor()
    model.fit(x_train, y_train)
    
    yhat = model.predict(x_test)
    rmse = mean_squared_error(y_test, yhat, squared=False)
    
    dfs.append(pd.DataFrame({'n_data': x_train.shape[0], 'rmse': rmse}, index=[k]))
df_result = pd.concat(dfs)
# calculate the weighted mean
rmse = (df_result["n_data"] * df_result['rmse']).sum() / df_result['n_data'].sum()

100%|██████████| 5/5 [00:07<00:00,  1.43s/it]


In [152]:
rmse

5.532759931338611