# Coding Block 4 - Automated model and hyperparameter tuning with AutoGluon

### Load the packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# AutoML
from autogluon.tabular import TabularPredictor
'''
...
'''

'\n...\n'

### Read the dataset 
You can also compare the results on processed and non-processed data; the AutoGluon library applies some preprocessing of its own as well.

In [2]:
# Read the cleaned diabetes CSV and, in the same chain, drop the two
# outlier_* columns; errors='ignore' means a CSV without them still loads.
diab_cleaned = (
    pd.read_csv('diabetes_cleaned.csv')
    .drop(columns=['outlier_z_score', 'outlier_Tukey'], errors='ignore')
)

In [3]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) as a quick sanity check
diab_cleaned.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.716146,72.522135,29.438802,155.583333,32.410677,0.471876,33.240885,0.348958
std,3.369578,30.649214,12.437674,10.488407,118.777435,6.930788,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,77.0,27.4,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.5,32.25,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.25,194.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


### Use the Autogluon library
Use the AutoGluon library for automated hyperparameter tuning and model benchmarking. The fit method of the TabularPredictor object accepts the option: <br>
<i>presets='medium_quality'</i> <br>
which limits the depth of the hyperparameter optimization.

In [4]:
# Define target variable
target = 'Outcome'

# Define time limit for model training (in seconds)
TIME_LIMIT = 600  # 10 minutes

# Split and ensembling settings, named so they can be tuned in one place
TEST_SIZE = 0.2        # fraction of rows held out as the test set
RANDOM_STATE = 42      # seed that makes the train/test split reproducible
NUM_BAG_FOLDS = 5      # number of folds used for bagging
NUM_STACK_LEVELS = 1   # number of stacking levels

# Split data into train/test sets, stratified on the target column
train_data, test_data = train_test_split(
    diab_cleaned,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=diab_cleaned[target],
)

# Train AutoGluon; ROC-AUC is the metric used to evaluate the models
predictor = TabularPredictor(label=target, eval_metric='roc_auc').fit(
    train_data,
    presets='medium_quality',  # Medium-quality preset (not the best-quality one)
    time_limit=TIME_LIMIT,  # Upper bound on the training time (in seconds)
    num_bag_folds=NUM_BAG_FOLDS,  # Enable ensembling by bagging with NUM_BAG_FOLDS folds
    num_stack_levels=NUM_STACK_LEVELS,  # Enable stacking to further improve model performance
    ag_args_fit={'use_gpu': False},  # Set to True to use GPU for model training
    verbosity=0  # Suppress most logging output (0: most silent, 4: most detailed)
)

# Evaluate model performance on the held-out test data

print("\n--- Model Performance on Test Data ---")
test_performance = predictor.evaluate(test_data, detailed_report=True)
print(f"Test ROC-AUC: {test_performance['roc_auc']:.4f}")
print(f"Test Accuracy: {test_performance['accuracy']:.4f}")
print(f"Test F1 Score: {test_performance['f1']:.4f}")


No path specified. Models will be saved in: "AutogluonModels/ag-20250319_223752"
		Exception occured in `AgSaveModelCallback` when calling event `after_fit`:
	load_model() got an unexpected keyword argument 'weights_only'
		'weights_only' is an invalid keyword argument for Unpickler()
		Exception occured in `AgSaveModelCallback` when calling event `after_fit`:
	load_model() got an unexpected keyword argument 'weights_only'
		'weights_only' is an invalid keyword argument for Unpickler()



--- Model Performance on Test Data ---
Test ROC-AUC: 0.8194
Test Accuracy: 0.7597
Test F1 Score: 0.6263


### Show the leaderboard
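The "predictor" object returned by fitting Autogluon's TabularPredictor has a method "leaderboard". Passing the test data adds a test score (score_test) for every trained model next to its validation score (score_val).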

In [5]:
# Get model leaderboard (scored on the held-out test data)
print("\n--- Model Leaderboard ---")
leaderboard = predictor.leaderboard(test_data)
# Leave the frame as the cell's last expression so Jupyter renders it as a
# table; print() on a wide DataFrame falls back to wrapped plain text.
leaderboard


--- Model Leaderboard ---
                      model  score_test  score_val eval_metric  \
0         LightGBMXT_BAG_L1    0.821481   0.843353     roc_auc   
1      LightGBMLarge_BAG_L1    0.821111   0.796110     roc_auc   
2       WeightedEnsemble_L2    0.819444   0.845140     roc_auc   
3       WeightedEnsemble_L3    0.819444   0.845140     roc_auc   
4   RandomForestEntr_BAG_L2    0.818519   0.815835     roc_auc   
5           LightGBM_BAG_L1    0.818519   0.826168     roc_auc   
6            XGBoost_BAG_L1    0.818333   0.827027     roc_auc   
7   RandomForestEntr_BAG_L1    0.818241   0.819842     roc_auc   
8   RandomForestGini_BAG_L1    0.815833   0.817518     roc_auc   
9           CatBoost_BAG_L2    0.815556   0.832401     roc_auc   
10        LightGBMXT_BAG_L2    0.814815   0.833458     roc_auc   
11    ExtraTreesGini_BAG_L1    0.814352   0.830333     roc_auc   
12    ExtraTreesGini_BAG_L2    0.813056   0.813814     roc_auc   
13    ExtraTreesEntr_BAG_L1    0.812685   0.82668

### Show the feature importance table
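The "predictor" object (an instance of the TabularPredictor class from Autogluon) also has a method "feature_importance". The cell below additionally prints a confusion matrix and a classification report for the predictions on the test data.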

In [6]:
# Metrics helpers used for the evaluation summary at the end of this cell
from sklearn.metrics import confusion_matrix, classification_report

# Feature importance
print("\n--- Feature Importance ---")
try:
    importance = predictor.feature_importance(test_data)
    print(importance)
except Exception as exc:
    # Catch Exception instead of using a bare `except:` so that
    # KeyboardInterrupt/SystemExit still propagate, and show the real cause
    # rather than assuming the best model lacks feature-importance support.
    importance = None
    print(f"Feature importance could not be computed: {exc!r}")

# Make predictions on test data (class labels and class probabilities)
y_pred = predictor.predict(test_data)
y_pred_proba = predictor.predict_proba(test_data)

# Get confusion matrix (true labels first, predicted labels second)
print("\n--- Confusion Matrix ---")
print(confusion_matrix(test_data[target], y_pred))

# Get classification report
print("\n--- Classification Report ---")
print(classification_report(test_data[target], y_pred))


--- Feature Importance ---
                          importance    stddev   p_value  n  p99_high  \
Glucose                     0.118037  0.045355  0.002171  5  0.211424   
BMI                         0.045148  0.027065  0.010148  5  0.100876   
Age                         0.034222  0.015587  0.003995  5  0.066316   
DiabetesPedigreeFunction    0.011741  0.008541  0.018580  5  0.029327   
Pregnancies                 0.008333  0.009862  0.065914  5  0.028639   
SkinThickness               0.004370  0.003011  0.015747  5  0.010569   
BloodPressure              -0.000593  0.003309  0.645372  5  0.006220   
Insulin                    -0.001333  0.002333  0.864812  5  0.003470   

                           p99_low  
Glucose                   0.024650  
BMI                      -0.010580  
Age                       0.002128  
DiabetesPedigreeFunction -0.005846  
Pregnancies              -0.011972  
SkinThickness            -0.001829  
BloodPressure            -0.007405  
Insulin           