In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import brier_score_loss
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import wilcoxon

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Random Search: read the per-dataset winners stored in best_models_summary.csv
import ast

rs_best_models = pd.read_csv('RandomForestData/best_models_summary.csv')

# Baseline Brier scores (from the RandomForest.ipynb baseline runs),
# listed in the order Bank, Gym, Heart Disease, Titanic
baseline_brier_scores = [0.1892, 0.1819, 0.1067, 0.1520]

# One entry per dataset: the stored Brier score and the stored hyperparameters.
# NOTE(review): this cell reads the 'brier_score' column, while the Bayesian cell
# reads 'test_brier_score' — confirm both refer to the same kind of score.
random_brier_adj = []
random_best_params = []

for dataset_name in ['Bank', 'Gym', 'Heart Disease', 'Titanic']:
    is_this_dataset = rs_best_models['dataset'] == dataset_name
    row = rs_best_models[is_this_dataset].iloc[0]  # first row matching the dataset name
    random_brier_adj += [row['brier_score']]
    # 'params' is stored as text; literal_eval turns it back into a Python object
    random_best_params += [ast.literal_eval(row['params'])]

print("Random Search - Loaded from files:")
for ds, brier in zip(['Bank', 'Gym', 'Heart Disease', 'Titanic'], random_brier_adj):
    print(f"  {ds}: {brier}")

Random Search - Loaded from files:
  Bank: 0.1858421833675464
  Gym: 0.1534136800030513
  Heart Disease: 0.1277791441181989
  Titanic: 0.1104537560930364


In [3]:
# Bayesian Optimization: load every trial from all_bayesian_results.csv and keep,
# for each dataset, the completed trial with the lowest 'brier_score'.
bayes_results = pd.read_csv('RandomForestData/all_bayesian_results.csv')


def _parse_max_features(raw):
    """Convert a CSV ``max_features`` cell into a RandomForestClassifier argument.

    Parameters
    ----------
    raw : str, float or NaN
        The cell value as read by pandas.

    Returns
    -------
    float, str or None
        A float for numeric cells (including numeric text such as '0.25'),
        the unchanged string for named strategies ('sqrt', 'log2', ...),
        and None for a missing cell — the same missing-value convention this
        cell already applies to max_depth and max_samples.
    """
    if pd.isna(raw):
        return None
    try:
        return float(raw)
    except ValueError:
        # Not a number, so it is a named strategy; pass it through as-is.
        return raw


# Per-dataset test Brier score and hyperparameters of the selected trial
bayes_brier_adj = []
bayes_best_params = []

bayes_dataset_names = ['Bank', 'Gym', 'Heart Disease', 'Titanic']

for dataset_name in bayes_dataset_names:
    # Filter for this dataset and completed trials
    ds_trials = bayes_results[bayes_results['dataset'] == dataset_name]
    ds_completed = ds_trials[ds_trials['state'] == 1]  # State 1 = COMPLETE
    if ds_completed.empty:
        # idxmin() on an empty selection fails with an opaque message; name the dataset instead.
        raise ValueError(f"No completed Bayesian trials found for dataset {dataset_name!r}")

    # Get best trial (lowest brier_score)
    best_trial = ds_completed.loc[ds_completed['brier_score'].idxmin()]

    # The reported value is that trial's 'test_brier_score', not the score used for selection
    bayes_brier_adj.append(best_trial['test_brier_score'])

    # Rebuild the hyperparameter dict; empty CSV cells (NaN) mean None
    params = {
        'n_estimators': int(best_trial['n_estimators']),
        'criterion': best_trial['criterion'],
        'max_depth': None if pd.isna(best_trial['max_depth']) else int(best_trial['max_depth']),
        'min_samples_split': int(best_trial['min_samples_split']),
        'min_samples_leaf': int(best_trial['min_samples_leaf']),
        'max_features': _parse_max_features(best_trial['max_features']),
        'max_samples': None if pd.isna(best_trial['max_samples']) else float(best_trial['max_samples'])
    }
    bayes_best_params.append(params)

print("\nBayesian Optimization - Loaded from files:")
for ds, brier in zip(bayes_dataset_names, bayes_brier_adj):
    print(f"  {ds}: {brier}")


Bayesian Optimization - Loaded from files:
  Bank: 0.1843225378562384
  Gym: 0.1786213871377209
  Heart Disease: 0.1044106469027575
  Titanic: 0.1520592793866828


In [4]:
# Bundle the Random Search results into a single dict of parallel lists
# (entry i of every list belongs to dataset i)
random = dict(
    dataset=['bank', 'gym', 'heart', 'titanic'],
    brier_adj=random_brier_adj,
    baseline_brier=baseline_brier_scores,
    best_params=random_best_params,
)

# Echo the contents as a quick sanity check
print("\nRandom Search dict created from files:")
print(f"  Datasets: {random['dataset']}")
print(f"  Brier scores: {random['brier_adj']}")


Random Search dict created from files:
  Datasets: ['bank', 'gym', 'heart', 'titanic']
  Brier scores: [np.float64(0.1858421833675464), np.float64(0.1534136800030513), np.float64(0.1277791441181989), np.float64(0.1104537560930364)]


In [5]:
# Bundle the Bayesian Optimization results into a single dict of parallel lists
# (entry i of every list belongs to dataset i)
bayes = dict(
    dataset=['bank', 'gym', 'heart', 'titanic'],
    brier_adj=bayes_brier_adj,
    baseline_brier=baseline_brier_scores,
    best_params=bayes_best_params,
)

# Echo the contents as a quick sanity check
print("\nBayesian Optimization dict created from files:")
print(f"  Datasets: {bayes['dataset']}")
print(f"  Brier scores: {bayes['brier_adj']}")


Bayesian Optimization dict created from files:
  Datasets: ['bank', 'gym', 'heart', 'titanic']
  Brier scores: [np.float64(0.1843225378562384), np.float64(0.1786213871377209), np.float64(0.1044106469027575), np.float64(0.1520592793866828)]


In [6]:
# Turn both result dicts into DataFrames (dict keys become the columns)
random_df = pd.DataFrame(data=random)
bayes_df = pd.DataFrame(data=bayes)

In [7]:
def _load_and_split(data_path, target_path):
    """Read one preprocessed dataset and split it into train and test parts.

    Parameters
    ----------
    data_path : str
        CSV file holding the feature table.
    target_path : str
        CSV file holding the target; it is squeezed after reading.

    Returns
    -------
    tuple
        ``(features, target, (X_train, X_test, y_train, y_test))`` where the
        split holds out 20% of the rows, is stratified on the target and uses
        random_state=42.
    """
    features = pd.read_csv(data_path)
    target = pd.read_csv(target_path).squeeze()
    parts = train_test_split(features, target, test_size=0.2, random_state=42, stratify=target)
    return features, target, tuple(parts)


X1, y1, (X1_train, X1_test, y1_train, y1_test) = _load_and_split(
    "preprocessed_datasets/bank_data.csv", "preprocessed_datasets/bank_target.csv")
X2, y2, (X2_train, X2_test, y2_train, y2_test) = _load_and_split(
    "preprocessed_datasets/gym_data.csv", "preprocessed_datasets/gym_target.csv")
X3, y3, (X3_train, X3_test, y3_train, y3_test) = _load_and_split(
    "preprocessed_datasets/heartDisease_data.csv", "preprocessed_datasets/heartDisease_target.csv")
X4, y4, (X4_train, X4_test, y4_train, y4_test) = _load_and_split(
    "preprocessed_datasets/titanic_data.csv", "preprocessed_datasets/titanic_target.csv")

# One (X_train, X_test, y_train, y_test) tuple per dataset: bank, gym, heart disease, titanic
datasets = [
    (X1_train, X1_test, y1_train, y1_test),
    (X2_train, X2_test, y2_train, y2_test),
    (X3_train, X3_test, y3_train, y3_test),
    (X4_train, X4_test, y4_train, y4_test),
]

In [8]:
# Stack the Random Search and Bayesian rows, ordered from lowest to highest brier_adj
df = (
    pd.concat([random_df, bayes_df])
    .sort_values('brier_adj')
    .reset_index(drop=True)
)

# Print the complete parameter dicts of rows 3 and 4 (the table display truncates them)
print(df.loc[3, 'best_params'], "\n\n", df.loc[4, 'best_params'])
df

{'n_estimators': 1216, 'criterion': 'log_loss', 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_samples': 0.9} 

 {'criterion': 'log_loss', 'max_depth': None, 'max_features': 0.33, 'max_samples': None, 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 1319}


Unnamed: 0,dataset,brier_adj,baseline_brier,best_params
0,heart,0.104411,0.1067,"{'n_estimators': 1449, 'criterion': 'entropy',..."
1,titanic,0.110454,0.152,"{'criterion': 'entropy', 'max_depth': None, 'm..."
2,heart,0.127779,0.1067,"{'criterion': 'entropy', 'max_depth': 10, 'max..."
3,titanic,0.152059,0.152,"{'n_estimators': 1216, 'criterion': 'log_loss'..."
4,gym,0.153414,0.1819,"{'criterion': 'log_loss', 'max_depth': None, '..."
5,gym,0.178621,0.1819,"{'n_estimators': 412, 'criterion': 'log_loss',..."
6,bank,0.184323,0.1892,"{'n_estimators': 1375, 'criterion': 'entropy',..."
7,bank,0.185842,0.1892,"{'criterion': 'entropy', 'max_depth': None, 'm..."


In [19]:
# For every dataset keep the single row with the smallest brier_adj
best_index = df.groupby("dataset")["brier_adj"].idxmin()
best_rows = df.loc[best_index].reset_index(drop=True)

# Expand the winning parameter dicts into a table: one row per dataset, one column per parameter;
# missing entries (NaN) are replaced with None
param_list = best_rows["best_params"].tolist()
param_df = pd.DataFrame(param_list).replace({np.nan: None})

param_df



Unnamed: 0,n_estimators,criterion,max_depth,min_samples_split,min_samples_leaf,max_features,max_samples
0,1375,entropy,,19,2,0.25,0.9
1,1319,log_loss,,8,5,0.33,
2,1449,entropy,10.0,2,1,0.25,0.9
3,996,entropy,,3,1,sqrt,


In [37]:
import pandas as pd
import numpy as np
from collections import Counter

# --- prepare param_df ---
# Work on a copy in which missing values (NaN) are written as None
param_df = param_df.copy().replace({np.nan: None})

# Parameters summarised with the median
numeric_params = ["n_estimators", "min_samples_split", "min_samples_leaf"]

# Parameters summarised with the mode (further down in this cell)
categorical_params = ["criterion", "max_samples", "max_depth", "max_features"]

# --- numeric: median ---
# int() truncates a fractional median (e.g. 5.5 -> 5)
final_grid = {}
for col in numeric_params:
    as_numbers = pd.to_numeric(param_df[col], errors="coerce")
    final_grid[col] = int(as_numbers.median())

# --- categorical: mode with full support for None ---
def mode_with_none(values):
    """Return the most frequent value, treating None as an ordinary value.

    Float NaN entries are counted as None: NaN never compares equal to itself,
    so each NaN would otherwise be tallied as its own distinct value and a
    missing value encoded as NaN could never be the mode.

    Parameters
    ----------
    values : iterable
        Hashable values; may contain None and float NaN.

    Returns
    -------
    object or None
        The most common value. Ties go to the value encountered first.
        None is returned for empty input and when None (or NaN) is the mode.
    """
    # `v != v` is true only for NaN; the isinstance guard keeps other types out of that test.
    normalized = [None if isinstance(v, float) and v != v else v for v in values]

    # Nothing to count: keep returning None for empty input.
    if not normalized:
        return None

    # Counter accepts None as a key, so no special-casing is needed.
    most_common_value, _ = Counter(normalized).most_common(1)[0]
    return most_common_value


# Fill in every categorical parameter with its most frequent value
for col in categorical_params:
    column_values = param_df[col]
    final_grid[col] = mode_with_none(column_values)

# max_samples is forced to None regardless of the mode computed just above
# NOTE(review): the reason for this override is not stated in the notebook — confirm it is intended
final_grid.update(max_samples=None)

final_grid


{'n_estimators': 1347,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'criterion': 'entropy',
 'max_samples': None,
 'max_depth': None,
 'max_features': 0.25}

In [42]:
import numpy as np
from sklearn.metrics import brier_score_loss
from sklearn.ensemble import RandomForestClassifier

# Evaluate the single shared configuration (final_grid) on every dataset and
# compare its test Brier score with the per-dataset baseline.

# Baseline Brier scores, in the order Bank, Gym, Heart, Titanic
baseline_brier_scores = np.array([0.1892, 0.1819, 0.1067, 0.1520])

dataset_names = ["Bank data", "Gym data", "Heart disease", "Titanic"]

# The three sequences are matched by position, so they must have equal length
assert len(datasets) == len(dataset_names) == len(baseline_brier_scores), \
    "datasets, dataset_names and baseline_brier_scores must have the same length"

# Results will be stored here (one entry per dataset)
rf_brier = []
rf_improvement = []
rf_relative = []

for name, (train_X, test_X, train_y, test_y), baseline in zip(
        dataset_names, datasets, baseline_brier_scores):
    model = RandomForestClassifier(**final_grid, random_state=42, n_jobs=-1)
    model.fit(train_X, train_y)

    # Column 1 of predict_proba is used as the positive-class probability
    # (assumes a binary target whose positive label is the second class — TODO confirm)
    brier = brier_score_loss(test_y, model.predict_proba(test_X)[:, 1])
    rf_brier.append(brier)

    # Positive improvement means a lower (better) Brier score than the baseline
    improvement = baseline - brier
    relative = improvement / baseline

    rf_improvement.append(improvement)
    rf_relative.append(relative)

    print(f"{name:<15} | Brier: {brier:.6f} | "
          f"Improvement: {improvement:+.6f} | Relative: {relative:+.2%}")

print("\n\n--- LATEX TABLE VALUES ---")
for name, imp, rel in zip(dataset_names, rf_improvement, rf_relative):
    # A bare '%' starts a comment in LaTeX and would swallow the rest of the row,
    # so the percent sign is emitted as '\%'.
    rel_tex = f"{rel:+.1%}".replace("%", "\\%")
    print(f"{name} & ${imp:+.4f}$ & ${rel_tex}$ \\\\")

# np.std uses ddof=0 by default, i.e. the population standard deviation
print("mean and std")
print(np.mean(rf_improvement), "     ", np.std(rf_improvement))

Bank data       | Brier: 0.186864 | Improvement: +0.002336 | Relative: +1.23%
Gym data        | Brier: 0.178418 | Improvement: +0.003482 | Relative: +1.91%
Heart disease   | Brier: 0.104385 | Improvement: +0.002315 | Relative: +2.17%
Titanic         | Brier: 0.151285 | Improvement: +0.000715 | Relative: +0.47%


--- LATEX TABLE VALUES ---
Bank data & $+0.0023$ & $+1.2%$ \\
Gym data & $+0.0035$ & $+1.9%$ \\
Heart disease & $+0.0023$ & $+2.2%$ \\
Titanic & $+0.0007$ & $+0.5%$ \\
mean and std
0.00221215876780299       0.0009847874452360336
