# Data

In [32]:
import openml
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [33]:
dataset = openml.datasets.get_dataset(40498)
X, y, _, attribute_names = dataset.get_data(target=dataset.default_target_attribute)
y = y.astype(float)
data = pd.concat([X, y], axis=1)

data

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,Class
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,4.0
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,4.0
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,4.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,4.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,4.0
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,3.0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,4.0
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,5.0


# Train test split

In [34]:
from sklearn.model_selection import train_test_split, StratifiedKFold

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

x_train

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11
4665,7.3,0.17,0.36,8.20,0.028,44.0,111.0,0.99272,3.14,0.41,12.4
1943,6.3,0.25,0.44,11.60,0.041,48.0,195.0,0.99680,3.18,0.52,9.5
3399,5.6,0.32,0.33,7.40,0.037,25.0,95.0,0.99268,3.25,0.49,11.1
843,6.9,0.19,0.35,1.70,0.036,33.0,101.0,0.99315,3.21,0.54,10.8
2580,7.7,0.30,0.26,18.95,0.053,36.0,174.0,0.99976,3.20,0.50,10.4
...,...,...,...,...,...,...,...,...,...,...,...
4426,6.2,0.21,0.52,6.50,0.047,28.0,123.0,0.99418,3.22,0.49,9.9
466,7.0,0.14,0.32,9.00,0.039,54.0,141.0,0.99560,3.22,0.43,9.4
3092,7.6,0.27,0.52,3.20,0.043,28.0,152.0,0.99129,3.02,0.53,11.4
3772,6.3,0.24,0.29,13.70,0.035,53.0,134.0,0.99567,3.17,0.38,10.6


# Metric and train/optimize/judge function

In [42]:
import optuna
import time
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


def metric(y_hat, y_true,features_shape):
    n,p = features_shape
    r2 = r2_score(y_true, y_hat)
    socres = {
        "MAE": mean_absolute_error(y_true, y_hat),
        "MSE": mean_squared_error(y_true, y_hat),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_hat)),
        "R2": r2,
        "Adjusted R2":  1 - (1 - r2) * (n - 1) / (n - p - 1)
    }

    df_score = pd.DataFrame(socres.items(), columns=["Metric","Value"])

    return df_score

def optimize_train_and_judge(model_class, objective_fn, trails=100):
    global x_train, y_train, x_test, y_test, names

    study = optuna.create_study(direction="maximize")

    start = time.time()
    study.optimize(objective_fn, n_trials=trails, show_progress_bar=True)
    stop = time.time()

    best_params = study.best_params.copy()

    pipe = Pipeline([('scaler', StandardScaler()), ('feature_selection', SelectKBest()), ('model', model_class())])

    pipe.set_params(**best_params)

    pipe.fit(x_train, y_train)
    y_hat_train = pipe.predict(x_train)
    y_hat_test = pipe.predict(x_test)

    print(f"Time {stop - start}\n")
    df_test = metric(y_hat_test, y_test,x_test.shape)

    return pipe, df_test


# Random Forest

In [46]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score


def objective_RF(trial):
    # selector
    k = trial.suggest_int('feature_selection__k', 1, 10)

    #model
    n_estimators = trial.suggest_int('model__n_estimators', 50, 300)
    max_depth = trial.suggest_int('model__max_depth', 3, 20)
    min_samples_split = trial.suggest_int('model__min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('model__min_samples_leaf', 1, 10)

    pipe = Pipeline([('scaler', StandardScaler()), ('feature_selection', SelectKBest(score_func=f_regression, k=k)), ('model', RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf))])
    pipe.fit(x_train, y_train)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(pipe, x_train, y_train, cv=cv, scoring= 'neg_mean_squared_error').mean()

    return score

rf_pipe, rf_metric = optimize_train_and_judge(RandomForestRegressor, objective_RF, trails=30)

rf_metric

[I 2025-11-26 00:35:09,399] A new study created in memory with name: no-name-40603431-ec85-42da-9e1d-4f8d7c7ce3fc


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-11-26 00:35:17,031] Trial 0 finished with value: -0.47104281187827457 and parameters: {'feature_selection__k': 8, 'model__n_estimators': 172, 'model__max_depth': 14, 'model__min_samples_split': 15, 'model__min_samples_leaf': 5}. Best is trial 0 with value: -0.47104281187827457.
[I 2025-11-26 00:35:19,037] Trial 1 finished with value: -0.5440227082085165 and parameters: {'feature_selection__k': 4, 'model__n_estimators': 137, 'model__max_depth': 6, 'model__min_samples_split': 2, 'model__min_samples_leaf': 3}. Best is trial 0 with value: -0.47104281187827457.
[I 2025-11-26 00:35:22,030] Trial 2 finished with value: -0.53193697742857 and parameters: {'feature_selection__k': 4, 'model__n_estimators': 159, 'model__max_depth': 10, 'model__min_samples_split': 6, 'model__min_samples_leaf': 10}. Best is trial 0 with value: -0.47104281187827457.
[I 2025-11-26 00:35:25,838] Trial 3 finished with value: -0.565479648290595 and parameters: {'feature_selection__k': 3, 'model__n_estimators': 20

Unnamed: 0,Metric,Value
0,MAE,0.464948
1,MSE,0.383752
2,RMSE,0.619477
3,R2,0.5045
4,Adjusted R2,0.498869


# xgboost

In [47]:
from xgboost import XGBRegressor

def objective_RF(trial):
    # selector
    k = trial.suggest_int('feature_selection__k', 1, 10)

    #model
    learning_rate = trial.suggest_float('model__learning_rate', 0.01, 0.3)
    subsample = trial.suggest_float('model__subsample', 0.6, 1.0)


    pipe = Pipeline([('scaler', StandardScaler()), ('feature_selection', SelectKBest(score_func=f_regression, k=k)), ('model', XGBRegressor(learning_rate=learning_rate, subsample=subsample))])
    pipe.fit(x_train, y_train)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    score = cross_val_score(pipe, x_train, y_train, cv=cv, scoring= 'neg_mean_squared_error').mean()

    return score

xgb_pipe, xgb_metric = optimize_train_and_judge(XGBRegressor, objective_RF, trails=30)
xgb_metric

[I 2025-11-26 00:39:46,550] A new study created in memory with name: no-name-5792b14c-057b-4b42-bf58-b740892b9775


  0%|          | 0/30 [00:00<?, ?it/s]

[I 2025-11-26 00:39:47,071] Trial 0 finished with value: -0.4510240264707496 and parameters: {'feature_selection__k': 10, 'model__learning_rate': 0.2330803125231316, 'model__subsample': 0.7935871880196892}. Best is trial 0 with value: -0.4510240264707496.
[I 2025-11-26 00:39:47,446] Trial 1 finished with value: -0.4740916343330376 and parameters: {'feature_selection__k': 9, 'model__learning_rate': 0.12855001032251084, 'model__subsample': 0.7176560839100661}. Best is trial 0 with value: -0.4510240264707496.
[I 2025-11-26 00:39:47,789] Trial 2 finished with value: -0.5304467002421532 and parameters: {'feature_selection__k': 7, 'model__learning_rate': 0.2971612955532658, 'model__subsample': 0.6691961243756869}. Best is trial 0 with value: -0.4510240264707496.
[I 2025-11-26 00:39:48,148] Trial 3 finished with value: -0.47120860280269017 and parameters: {'feature_selection__k': 8, 'model__learning_rate': 0.1645468414055689, 'model__subsample': 0.8418124247205072}. Best is trial 0 with value

Unnamed: 0,Metric,Value
0,MAE,0.449331
1,MSE,0.377999
2,RMSE,0.614817
3,R2,0.511927
4,Adjusted R2,0.506381


# Wilcoxon test

In [49]:
from statsmodels.stats.nonparametric import rank_compare_2indep
knn_probs = rf_pipe.predict(x_test)
logistic_probs = xgb_pipe.predict(x_test)

rank_compare_2indep(knn_probs, logistic_probs).summary()

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
prob(x1>x2) c0,0.5084,0.013,0.640,0.522,0.483,0.534
