In [1]:
import time
from pathlib import Path
import pandas as pd
import numpy as np

import optuna as opt
from optuna.samplers import TPESampler
# suppress info logs
opt.logging.set_verbosity(opt.logging.WARNING)

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error

from xgboost import XGBRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extractArrays(df):
    '''Split a dataframe into a feature matrix and a label array.

    The label is the 'r_useful' column. The 'r_id' column is dropped
    together with the label, so it never ends up among the features.
    The input dataframe itself is left unmodified.

    Returns a tuple (X, y) where X holds the values of every remaining
    column and y holds the values of 'r_useful'.
    '''
    labels = df['r_useful'].values
    feature_frame = df.drop(columns=['r_useful', 'r_id'])
    return feature_frame.values, labels

Settings, load dataset, and constants

In [3]:
# --- Reproducibility and data location -------------------------------------
# NOTE(review): RANDOM_SEED is not referenced by any cell visible here
# (the mean baseline is deterministic) - presumably kept for parity with
# sibling experiment notebooks; confirm.
RANDOM_SEED = 760
DATA_DIR = Path("../../ready_data")

# --- Experiment settings ----------------------------------------------------
# NOTE(review): none of the four constants below is used by the cells visible
# in this notebook; confirm whether they are still needed.
N_OPTUNA_TRIALS = 50
N_FOLDS = 5
# number of repetitions of CV
N_REPS = 6
# early stopping threshold: number of consecutive non-improvement rounds
T_ES = 20

# --- Load the pre-split train / validation / test sets ----------------------
df_train = pd.read_parquet(DATA_DIR.joinpath("100K35F_train_main.parquet.snappy"))
df_val = pd.read_parquet(DATA_DIR.joinpath("100K35F_val_main.parquet.snappy"))
df_test = pd.read_parquet(DATA_DIR.joinpath("100K35F_test_main.parquet.snappy"))

# --- Separate features from labels for each split ---------------------------
X_train, y_train = extractArrays(df_train)
X_val, y_val = extractArrays(df_val)
X_test, y_test = extractArrays(df_test)

# Sanity check: report the feature-matrix shape of every split.
print(f"Shape of the training data : {X_train.shape}")
print(f"Shape of the val data : {X_val.shape}")
print(f"Shape of the test data : {X_test.shape}")

Shape of the training data : (80000, 35)
Shape of the val data : (10000, 35)
Shape of the test data : (10000, 35)


Define model

In [4]:
# Name of the model step inside the pipeline; also used as the prefix when
# routing hyper-parameters to that step ("<prefix>__<param>").
MODEL_PREFIX = "mm"
class MeanModel:
    '''Baseline regressor that always predicts the mean of the training labels.

    The features are ignored entirely: `fit` only looks at `y` and `predict`
    only uses the number of rows in `X`.

    NOTE: the class defines no get_params/set_params, so it has no tunable
    hyper-parameters of its own.
    '''
    def __init__(self):
        # Constant value returned for every sample; None until fit() is called.
        self.prediction = None

    def fit(self, X, y):
        '''Store the mean of `y` as the constant prediction.

        X is accepted for interface compatibility but is not used.
        Returns self, following the scikit-learn estimator convention, so
        that calls can be chained (e.g. MeanModel().fit(X, y).predict(X)).
        '''
        # np.mean also accepts plain sequences, not only objects with .mean().
        self.prediction = np.mean(y)
        return self

    def predict(self, X):
        '''Return a 1-D array with one copy of the stored mean per row of X.

        Raises RuntimeError if the model has not been fitted yet, instead of
        silently returning an array filled with None.
        '''
        if self.prediction is None:
            raise RuntimeError("MeanModel must be fitted before calling predict().")
        # np.shape works for ndarrays as well as nested sequences.
        return np.full((np.shape(X)[0],), self.prediction)

# Pipeline: mean imputation -> standardisation -> model.
# The final step is registered under MODEL_PREFIX so that hyper-parameters
# can be addressed as "<MODEL_PREFIX>__<name>".
model_pipe = Pipeline(
    steps=[
        ("imp", SimpleImputer()),
        ("ss", StandardScaler()),
        (MODEL_PREFIX, MeanModel()),
    ]
)

# needed for setting parameters correctly in pipe
def hp_appender(hp_dict):
    '''Return a new dict whose keys are those of hp_dict prefixed with
    "<MODEL_PREFIX>__"; the values are carried over unchanged.

    hp_dict itself is not modified. An empty dict yields an empty dict.
    '''
    return {
        MODEL_PREFIX + "__" + name: value
        for name, value in hp_dict.items()
    }

Implement experiment procedure

In [5]:
def fit_and_score(model, hps, X_train, y_train, X_test, y_test):
    '''Configure, fit and evaluate `model`, returning (rmse, mae).

    Parameters
    ----------
    model : object exposing set_params, fit and predict (here: the pipeline).
        It is modified in place: its parameters are set and it is refitted.
    hps : dict
        Hyper-parameters keyed by their bare names; hp_appender adds the
        model-step prefix before they are handed to set_params.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : held-out data the predictions are scored against.

    Returns
    -------
    (rmse, mae) : root mean squared error and mean absolute error of the
        predictions on X_test.
    '''
    model.set_params(**hp_appender(hps))
    model.fit(X_train, y_train)
    y_preds = model.predict(X_test)

    # calculate scores
    # RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that
    # keyword was deprecated in scikit-learn 1.4 and later removed, while
    # this form works on every version. For a single target the value is
    # the same as the former `squared=False` result.
    rmse = np.sqrt(mean_squared_error(y_test, y_preds))
    mae = mean_absolute_error(y_test, y_preds)
    return rmse, mae

Obtain model test results

In [6]:
# Merge the training and validation splits into one larger training set:
# feature rows are stacked along axis 0 and the 1-D label arrays are joined
# end to end, train first and val second, so rows and labels stay aligned.
# Background on the numpy stacking helpers:
# https://stackoverflow.com/questions/33356442/when-should-i-use-hstack-vstack-vs-append-vs-concatenate-vs-column-stack
X_train_val = np.concatenate((X_train, X_val), axis=0)
y_train_val = np.concatenate((y_train, y_val))
print(X_train_val.shape)
print(y_train_val.shape)

(90000, 35)
(90000,)


In [7]:
# Fit on train+val and score on the held-out test split.
# The baseline has no hyper-parameters, hence the empty dict.
rmse, mae = fit_and_score(
    model=model_pipe,
    hps={},
    X_train=X_train_val,
    y_train=y_train_val,
    X_test=X_test,
    y_test=y_test,
)

print(f"overall RMSE: {rmse:.4}")
print(f"overall MAE: {mae:.4}")

overall RMSE: 3.729
overall MAE: 1.873


Obtain model validation results

In [8]:
# Fit on the training split only and score on the validation split.
# Note: this refits model_pipe and rebinds rmse / mae.
rmse, mae = fit_and_score(
    model=model_pipe,
    hps={},
    X_train=X_train,
    y_train=y_train,
    X_test=X_val,
    y_test=y_val,
)

print(f"validation RMSE: {rmse:.4}")
print(f"validation MAE: {mae:.4}")

validation RMSE: 4.052
validation MAE: 1.914
