In [1]:
import time
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# --- Experiment configuration -------------------------------------------
RANDOM_SEED = 760
DATA_DIR = Path("../../../ready_data")

N_OPTUNA_TRIALS = 50
K_FOLDS = 3  # used for both the outer and the inner CV

# --- Load the pre-split train / test tables ------------------------------
df_train = pd.read_parquet(DATA_DIR / "100K18F_train_main.parquet.snappy")
df_test = pd.read_parquet(DATA_DIR / "100K18F_test_main.parquet.snappy")

# Features are every column except the `r_useful` target and the `r_id`
# column; everything is converted to plain arrays for scikit-learn.
X_train = df_train.drop(columns=["r_useful", "r_id"]).values
y_train = df_train["r_useful"].values

X_test = df_test.drop(columns=["r_useful", "r_id"]).values
y_test = df_test["r_useful"].values

print(f"Shape of the training data : {X_train.shape}")
print(f"Shape of the test data : {X_test.shape}")

Shape of the training data : (80000, 18)
Shape of the test data : (20000, 18)


In [3]:
class MeanModel:
    """Baseline regressor that always predicts the mean of the training target.

    The feature matrix is ignored when fitting; it is only used at prediction
    time to determine how many predictions to return.
    """

    def __init__(self):
        # Mean of the target seen in `fit`; None until the model is fitted.
        self.prediction = None

    def fit(self, X, y):
        """Store the mean of `y` as the constant prediction.

        Parameters
        ----------
        X : array-like
            Ignored; accepted so the class can be used like an estimator.
        y : array-like
            Target values whose mean becomes the prediction.

        Returns
        -------
        MeanModel
            The fitted instance, so calls can be chained
            (`MeanModel().fit(X, y).predict(X)`).
        """
        # np.mean also accepts plain sequences, unlike the `.mean()` method.
        self.prediction = np.mean(y)
        return self

    def predict(self, X):
        """Return the stored mean once per row of `X`.

        Parameters
        ----------
        X : array-like with a `shape` attribute
            Only `X.shape[0]` is read, to size the output.

        Returns
        -------
        numpy.ndarray
            1-D array of length `X.shape[0]` filled with the fitted mean.

        Raises
        ------
        RuntimeError
            If called before `fit`.
        """
        if self.prediction is None:
            # Without this guard np.full would silently emit an array of None.
            raise RuntimeError("MeanModel.predict called before fit")
        return np.full((X.shape[0],), self.prediction)

# Preprocessing (mean imputation, then standardisation) followed by the
# constant-mean baseline as the final estimator.
model_pipe = Pipeline(
    steps=[
        ("imp", SimpleImputer(strategy="mean")),  # "mean" is the default strategy
        ("ss", StandardScaler()),
        ("mm", MeanModel()),
    ]
)

In [4]:
# KFold is left unshuffled (its default), so every fold is a contiguous
# slice of the training rows.
out_cv = KFold(K_FOLDS)

# Per-fold validation scores, one entry appended per CV iteration.
cv_results = {
    "rmse": [],
    "mae": []
}

# Single CV loop. No nested CV loop since there are no hyperparameters to tune.
for cv_train_ii, cv_val_ii in out_cv.split(X_train, y_train):
    # extract cv data for this fold
    cv_X_train, cv_y_train = X_train[cv_train_ii], y_train[cv_train_ii]
    cv_X_val, cv_y_val = X_train[cv_val_ii], y_train[cv_val_ii]

    # fit on all training data for this fold (the imputer and scaler are
    # re-fitted here too, so no validation data leaks into preprocessing)
    model_pipe.fit(cv_X_train, cv_y_train)
    # predict on all validation data for this fold
    y_preds = model_pipe.predict(cv_X_val)

    # calculate scores
    # RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that
    # argument was deprecated in scikit-learn 1.4 and later removed, whereas
    # this form gives the same value on every version.
    rmse = np.sqrt(mean_squared_error(cv_y_val, y_preds))
    mae = mean_absolute_error(cv_y_val, y_preds)

    # save results for this iteration
    cv_results["rmse"].append(rmse)
    cv_results["mae"].append(mae)

In [5]:
# Show the raw per-fold scores first, then their average across folds.
print(cv_results)

mean_rmse = np.mean(cv_results['rmse'])
mean_mae = np.mean(cv_results['mae'])

print(f"mean CV RMSE: {mean_rmse}")
print(f"mean CV MAE: {mean_mae}")

{'rmse': [4.318820772647303, 4.064917076249376, 3.689184084034623], 'mae': [1.9020208467370534, 1.9028877044442598, 1.8866411304787063]}
mean CV RMSE: 4.0243073109771
mean CV MAE: 1.8971832272200064
