In [10]:
import pandas as pd
import os
import numpy as np
import scipy
from scipy.special import expit
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, confusion_matrix
from sklearn.metrics import classification_report, precision_score, recall_score
import pyspark
from pathlib import Path
import csv
import datetime
import statistics
import xgboost as xgb
from xgboost import XGBClassifier
from shiny import App, ui, render, reactive, run_app
from shinywidgets import output_widget, render_widget
import ipyleaflet as ipyl

LOKY_MAX_CPU_COUNT = 1

In [11]:
synthetic_nhanes = pd.read_csv("../Datasets/synthetic_nhanes.csv")
X = synthetic_nhanes.drop(columns=['diabetes_status'])
y = synthetic_nhanes['diabetes_status'].map({'No': 0, 'Yes': 1})

# initial run resulted in all perfect prediciton, so we need to clean the dataset of leakage
leak_cols = ['glucose_mean', 'glucose_score', 'glucose_weight',
    'DIQ050', 'DIQ070', 'DIQ160', 'DIQ180', 'DID040', 'SEQN', 'index']

X = X.drop(columns=[col for col in leak_cols if col in X.columns])

In [12]:
seed = 100

models = {
    'RandomForest (default)': RandomForestClassifier(random_state=seed),
    'RandomForest (max_depth=5)': RandomForestClassifier(max_depth=5, random_state=seed),
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=seed),
    'HistGradientBoost': HistGradientBoostingClassifier(random_state=seed)
}

results = []

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

for name, model in models.items():
    metrics = {
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1': [],
        'roc_auc': []
    }

    for train_idx, test_idx in cv.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        metrics['accuracy'].append(accuracy_score(y_test, y_pred))
        metrics['precision'].append(precision_score(y_test, y_pred, zero_division=0))
        metrics['recall'].append(recall_score(y_test, y_pred, zero_division=0))
        metrics['f1'].append(f1_score(y_test, y_pred, zero_division=0))
    results.append({
        'Model': name,
        'Accuracy (Mean)': np.mean(metrics['accuracy']),
        'Precision (Mean)': np.mean(metrics['precision']),
        'Recall (Mean)': np.mean(metrics['recall']),
        'F1 Score (Mean)': np.mean(metrics['f1'])
    })
    
results_df = pd.DataFrame(results)

In [13]:
results_df

Unnamed: 0,Model,Accuracy (Mean),Precision (Mean),Recall (Mean),F1 Score (Mean)
0,RandomForest (default),0.934771,0.978742,0.316359,0.476735
1,RandomForest (max_depth=5),0.934334,1.0,0.304344,0.465994
2,XGBoost,0.930667,0.772395,0.376493,0.505346
3,HistGradientBoost,0.93285,0.829379,0.363535,0.504685
