In [5]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Generate a synthetic dataset
data = pd.read_csv('pseudodata_præoperation.csv')

# # #make pd to np
X = data.to_numpy()
y = np.random.choice([0, 1], size=len(data))


# Split data into holdout set and remaining set
X_remain, X_holdout, y_remain, y_holdout = train_test_split(X, y, test_size=500, random_state=42)

# Hyperparameter grid for RandomForest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Function to train and evaluate the RandomForest model with hyperparameter tuning
def train_and_evaluate_model_with_hyperparameter_tuning(X_train, y_train, X_val, y_val):
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy, best_model

# Outer 10-Fold Cross-Validation
outer_kf = KFold(n_splits=2, shuffle=True, random_state=42)
outer_accuracies = []

for outer_train_index, outer_test_index in outer_kf.split(X_remain):
    X_outer_train, X_outer_test = X_remain[outer_train_index], X_remain[outer_test_index]
    y_outer_train, y_outer_test = y_remain[outer_train_index], y_remain[outer_test_index]

    # Inner 10-Fold Cross-Validation
    inner_kf = KFold(n_splits=2, shuffle=True, random_state=42)
    inner_accuracies = []

    for inner_train_index, inner_val_index in inner_kf.split(X_outer_train):
        X_inner_train, X_inner_val = X_outer_train[inner_train_index], X_outer_train[inner_val_index]
        y_inner_train, y_inner_val = y_outer_train[inner_train_index], y_outer_train[inner_val_index]

        accuracy, _ = train_and_evaluate_model_with_hyperparameter_tuning(X_inner_train, y_inner_train, X_inner_val, y_inner_val)
        inner_accuracies.append(accuracy)
        print(f'Inner Fold Accuracy: {accuracy * 100:.2f}%')

    mean_inner_accuracy = np.mean(inner_accuracies)
    print(f'Outer Fold Mean Inner Accuracy: {mean_inner_accuracy * 100:.2f}%')

    outer_accuracy, best_outer_model = train_and_evaluate_model_with_hyperparameter_tuning(X_outer_train, y_outer_train, X_outer_test, y_outer_test)
    outer_accuracies.append(outer_accuracy)
    print(f'Outer Fold Accuracy: {outer_accuracy * 100:.2f}%')

print(f'Mean Outer Accuracy: {np.mean(outer_accuracies) * 100:.2f}%')

# Evaluate the final model on the holdout set
final_model = best_outer_model
final_model.fit(X_remain, y_remain)
y_holdout_pred = final_model.predict(X_holdout)
holdout_accuracy = accuracy_score(y_holdout, y_holdout_pred)
print(f'Holdout Set Accuracy: {holdout_accuracy * 100:.2f}%')


Inner Fold Accuracy: 46.50%
Inner Fold Accuracy: 46.94%
Outer Fold Mean Inner Accuracy: 46.72%
Outer Fold Accuracy: 50.93%
Inner Fold Accuracy: 49.85%
Inner Fold Accuracy: 53.65%
Outer Fold Mean Inner Accuracy: 51.75%
Outer Fold Accuracy: 51.04%
Mean Outer Accuracy: 50.99%
Holdout Set Accuracy: 48.80%
