# Coding Block 4 - Automated model and hyperparameter tuning with AutoGluon

### Load the packages

In [1]:
#!pip install autogluon.tabular  > /dev/null 2>&1
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# AutoML
from autogluon.tabular import TabularPredictor
'''
...
'''

'\n...\n'

### Read the dataset 
You can also compare processed and non-processed data. The AutoGluon library will do some preprocessing as well.

In [2]:
# Location of the cleaned diabetes CSV. The default is the original author's local
# Windows path; set the DIABETES_CSV environment variable to point at your own copy
# of diabetes_data_cleaned.csv instead of editing the notebook.
DIABETES_CSV = os.environ.get(
    'DIABETES_CSV',
    'C:\\Users\\v.weber\\Documents\\000 Master Wirtschaftsinformatik FU Berlin\\I\\Applied Analytics\\github stuff\\fork\\Applied-Analytics\\data\\diabetes_data_cleaned.csv',
)
diab = pd.read_csv(DIABETES_CSV)
# Remove the two outlier_* columns so they are not part of the modelling data.
diab = diab.drop(columns=['outlier_z_score', 'outlier_Tukey'])


### Use the Autogluon library
Use the AutoGluon library for automated hyperparameter tuning and model benchmarking. The fit function of the TabularPredictor object accepts the following option: <br>
<i>presets = {‘best_quality’, ‘high_quality’, ‘good_quality’, ‘medium_quality’, ‘experimental_quality’, ‘optimize_for_deployment’, ‘interpretable’, ‘ignore_text’}</i> <br>

medium_quality can limit the depth of hyperparameter optimization.

In [4]:
from autogluon.tabular import TabularPredictor

# Name of the label column the predictor is trained to predict
target = 'Outcome'

# Create the predictor, then run the automated training / hyperparameter tuning.
# fit() trains in place and returns the same predictor object.
predictor = TabularPredictor(label=target)
predictor.fit(
    train_data=diab,
    presets='good_quality',  # preset chosen for faster training
    time_limit=300,  # 5-minute time limit
)

print("Model training and hyperparameter tuning completed.")

No path specified. Models will be saved in: "AutogluonModels\ag-20250320_140702"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.9.21
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22631
CPU Count:          16
Memory Avail:       44.39 GB / 63.73 GB (69.6%)
Disk Space Avail:   1508.06 GB / 1888.04 GB (79.9%)
Presets specified: ['good_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs the risk of an out-of-memory error during model refit if memory is small relative to the data size.
	You can avoid this risk by setting `save_bag_folds=True`.
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is a

Model training and hyperparameter tuning completed.


### Show the leaderboard
TabularPredictor objects from AutoGluon provide a function "leaderboard".

In [5]:
# Fetch the leaderboard of trained models as a DataFrame; silent=True stops
# AutoGluon from printing it itself.
leaderboard = predictor.leaderboard(silent=True)
# End the cell with the bare expression so the notebook renders the full table
# instead of a column-wrapped plain-text print.
leaderboard

                           model  score_val eval_metric  pred_time_val  \
0            WeightedEnsemble_L2   0.791667    accuracy       0.006721   
1                CatBoost_BAG_L1   0.791667    accuracy       0.006721   
2              LightGBMXT_BAG_L1   0.789062    accuracy       0.034144   
3          NeuralNetTorch_BAG_L1   0.789062    accuracy       0.084738   
4         NeuralNetFastAI_BAG_L1   0.785156    accuracy       0.086087   
5                LightGBM_BAG_L1   0.779948    accuracy       0.025446   
6                 XGBoost_BAG_L1   0.773438    accuracy       0.034424   
7          ExtraTreesGini_BAG_L1   0.770833    accuracy       0.066921   
8           LightGBMLarge_BAG_L1   0.760417    accuracy       0.033513   
9          ExtraTreesEntr_BAG_L1   0.757812    accuracy       0.066660   
10       RandomForestGini_BAG_L1   0.752604    accuracy       0.068281   
11       RandomForestEntr_BAG_L1   0.742188    accuracy       0.067895   
12    ExtraTreesEntr_BAG_L1_FULL      

### Show the feature importance table
The TabularPredictor class from AutoGluon also provides a function "feature_importance".

In [8]:
# Display permutation feature importance for each trained model.
# `model_names()` is used because `get_model_names()` is deprecated in AutoGluon 1.x.
# NOTE(review): `diab` is the same data the models were fit on, so the scores may be
# optimistic; AutoGluon's documentation recommends held-out data for this call.
# This notebook has no hold-out split, so the training data is kept here.
for model in predictor.model_names():
    print(f"Feature importance for model: {model}")
    try:
        feature_importance = predictor.feature_importance(data=diab, model=model)
    except AttributeError as e:
        # Some models raise AttributeError when asked to predict (seen for the
        # LightGBM bagged models in the recorded output); report it and continue.
        print(f"Skipping model {model} due to error: {e}")
    else:
        print(feature_importance)

  for model in predictor.get_model_names():
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	3.51s	= Expected runtime (0.7s per shuffle set)


Feature importance for model: LightGBMXT_BAG_L1
Skipping model LightGBMXT_BAG_L1 due to error: 'NoneType' object has no attribute 'predict'
Feature importance for model: LightGBM_BAG_L1
Skipping model LightGBM_BAG_L1 due to error: 'NoneType' object has no attribute 'predict'
Feature importance for model: RandomForestGini_BAG_L1


	0.51s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	3.77s	= Expected runtime (0.75s per shuffle set)


                          importance    stddev       p_value  n  p99_high  \
Glucose                     0.200260  0.009040  4.969023e-07  5  0.218874   
BMI                         0.089323  0.008214  8.487297e-06  5  0.106237   
Age                         0.072917  0.008079  1.779405e-05  5  0.089552   
DiabetesPedigreeFunction    0.051302  0.007400  5.054148e-05  5  0.066539   
Insulin                     0.044271  0.006107  4.238092e-05  5  0.056846   
Pregnancies                 0.032292  0.006536  1.909038e-04  5  0.045750   
SkinThickness               0.029687  0.003613  2.581633e-05  5  0.037127   
BloodPressure               0.014583  0.002140  5.403493e-05  5  0.018989   

                           p99_low  
Glucose                   0.181647  
BMI                       0.072409  
Age                       0.056281  
DiabetesPedigreeFunction  0.036065  
Insulin                   0.031696  
Pregnancies               0.018833  
SkinThickness             0.022248  
BloodPress

	0.53s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	3.05s	= Expected runtime (0.61s per shuffle set)


                          importance    stddev       p_value  n  p99_high  \
Glucose                     0.199479  0.010871  1.054130e-06  5  0.221862   
Age                         0.085417  0.008163  9.887380e-06  5  0.102224   
BMI                         0.083854  0.009326  1.806108e-05  5  0.103057   
DiabetesPedigreeFunction    0.055208  0.004924  7.512731e-06  5  0.065347   
Insulin                     0.047135  0.005779  2.658423e-05  5  0.059035   
Pregnancies                 0.029167  0.006419  2.641593e-04  5  0.042383   
SkinThickness               0.027604  0.001426  8.524242e-07  5  0.030541   
BloodPressure               0.012760  0.001931  6.109038e-05  5  0.016737   

                           p99_low  
Glucose                   0.177096  
Age                       0.068610  
BMI                       0.064652  
DiabetesPedigreeFunction  0.045070  
Insulin                   0.035236  
Pregnancies               0.015951  
SkinThickness             0.024667  
BloodPress

	0.57s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	3.55s	= Expected runtime (0.71s per shuffle set)


                          importance    stddev       p_value  n  p99_high  \
Glucose                     0.188542  0.011514  1.660978e-06  5  0.212250   
BMI                         0.079167  0.004640  1.409898e-06  5  0.088721   
Pregnancies                 0.073698  0.005493  3.677434e-06  5  0.085009   
Age                         0.071354  0.003949  1.121664e-06  5  0.079486   
DiabetesPedigreeFunction    0.064062  0.005779  7.862404e-06  5  0.075962   
SkinThickness               0.049740  0.003370  2.514336e-06  5  0.056679   
Insulin                     0.048698  0.001485  1.035246e-07  5  0.051755   
BloodPressure               0.041406  0.001698  3.383746e-07  5  0.044902   

                           p99_low  
Glucose                   0.164833  
BMI                       0.069612  
Pregnancies               0.062387  
Age                       0.063222  
DiabetesPedigreeFunction  0.052163  
SkinThickness             0.042800  
Insulin                   0.045641  
BloodPress

	0.62s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	0.23s	= Expected runtime (0.05s per shuffle set)
	0.11s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...


                          importance    stddev       p_value  n  p99_high  \
Glucose                     0.191927  0.015368  4.890914e-06  5  0.223570   
BMI                         0.081250  0.006678  5.425669e-06  5  0.094999   
Pregnancies                 0.068229  0.004179  1.680202e-06  5  0.076833   
Age                         0.067448  0.004992  3.575454e-06  5  0.077727   
DiabetesPedigreeFunction    0.059635  0.007503  2.943647e-05  5  0.075083   
SkinThickness               0.046875  0.002762  1.440086e-06  5  0.052562   
Insulin                     0.043750  0.001975  4.967079e-07  5  0.047816   
BloodPressure               0.035937  0.002365  2.239074e-06  5  0.040808   

                           p99_low  
Glucose                   0.160284  
BMI                       0.067501  
Pregnancies               0.059625  
Age                       0.057169  
DiabetesPedigreeFunction  0.044188  
SkinThickness             0.041188  
Insulin                   0.039684  
BloodPress

	0.37s	= Expected runtime (0.07s per shuffle set)
	0.12s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	5.29s	= Expected runtime (1.06s per shuffle set)


                          importance    stddev       p_value  n  p99_high  \
Glucose                     0.181250  0.006665  2.190001e-07  5  0.194973   
Age                         0.096094  0.006204  2.072993e-06  5  0.108867   
BMI                         0.088802  0.003370  2.485191e-07  5  0.095742   
DiabetesPedigreeFunction    0.045312  0.003729  5.452319e-06  5  0.052990   
Pregnancies                 0.023958  0.004377  1.279331e-04  5  0.032971   
Insulin                     0.018750  0.002998  7.579241e-05  5  0.024922   
SkinThickness               0.014844  0.005257  1.609119e-03  5  0.025668   
BloodPressure               0.013281  0.001698  3.135273e-05  5  0.016777   

                           p99_low  
Glucose                   0.167527  
Age                       0.083320  
BMI                       0.081862  
DiabetesPedigreeFunction  0.037635  
Pregnancies               0.014946  
Insulin                   0.012578  
SkinThickness             0.004020  
BloodPress

	0.59s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	3.64s	= Expected runtime (0.73s per shuffle set)


                          importance    stddev       p_value  n  p99_high  \
Glucose                     0.200260  0.009040  4.969023e-07  5  0.218874   
BMI                         0.089323  0.008214  8.487297e-06  5  0.106237   
Age                         0.072917  0.008079  1.779405e-05  5  0.089552   
DiabetesPedigreeFunction    0.051302  0.007400  5.054148e-05  5  0.066539   
Insulin                     0.044271  0.006107  4.238092e-05  5  0.056846   
Pregnancies                 0.032292  0.006536  1.909038e-04  5  0.045750   
SkinThickness               0.029687  0.003613  2.581633e-05  5  0.037127   
BloodPressure               0.014583  0.002140  5.403493e-05  5  0.018989   

                           p99_low  
Glucose                   0.181647  
BMI                       0.072409  
Age                       0.056281  
DiabetesPedigreeFunction  0.036065  
Insulin                   0.031696  
Pregnancies               0.018833  
SkinThickness             0.022248  
BloodPress

	0.55s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	0.0s	= Expected runtime (0.0s per shuffle set)
	0.07s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	2.99s	= Expected runtime (0.6s per shuffle set)


                          importance    stddev       p_value  n  p99_high  \
Glucose                     0.199479  0.010871  1.054130e-06  5  0.221862   
Age                         0.085417  0.008163  9.887380e-06  5  0.102224   
BMI                         0.083854  0.009326  1.806108e-05  5  0.103057   
DiabetesPedigreeFunction    0.055208  0.004924  7.512731e-06  5  0.065347   
Insulin                     0.047135  0.005779  2.658423e-05  5  0.059035   
Pregnancies                 0.029167  0.006419  2.641593e-04  5  0.042383   
SkinThickness               0.027604  0.001426  8.524242e-07  5  0.030541   
BloodPressure               0.012760  0.001931  6.109038e-05  5  0.016737   

                           p99_low  
Glucose                   0.177096  
Age                       0.068610  
BMI                       0.064652  
DiabetesPedigreeFunction  0.045070  
Insulin                   0.035236  
Pregnancies               0.015951  
SkinThickness             0.024667  
BloodPress

	0.57s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	3.73s	= Expected runtime (0.75s per shuffle set)


                          importance    stddev       p_value  n  p99_high  \
Glucose                     0.188542  0.011514  1.660978e-06  5  0.212250   
BMI                         0.079167  0.004640  1.409898e-06  5  0.088721   
Pregnancies                 0.073698  0.005493  3.677434e-06  5  0.085009   
Age                         0.071354  0.003949  1.121664e-06  5  0.079486   
DiabetesPedigreeFunction    0.064062  0.005779  7.862404e-06  5  0.075962   
SkinThickness               0.049740  0.003370  2.514336e-06  5  0.056679   
Insulin                     0.048698  0.001485  1.035246e-07  5  0.051755   
BloodPressure               0.041406  0.001698  3.383746e-07  5  0.044902   

                           p99_low  
Glucose                   0.164833  
BMI                       0.069612  
Pregnancies               0.062387  
Age                       0.063222  
DiabetesPedigreeFunction  0.052163  
SkinThickness             0.042800  
Insulin                   0.045641  
BloodPress

	0.58s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")
	0.74s	= Expected runtime (0.15s per shuffle set)
If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")
If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn(

                          importance    stddev       p_value  n  p99_high  \
Glucose                     0.191927  0.015368  4.890914e-06  5  0.223570   
BMI                         0.081250  0.006678  5.425669e-06  5  0.094999   
Pregnancies                 0.068229  0.004179  1.680202e-06  5  0.076833   
Age                         0.067448  0.004992  3.575454e-06  5  0.077727   
DiabetesPedigreeFunction    0.059635  0.007503  2.943647e-05  5  0.075083   
SkinThickness               0.046875  0.002762  1.440086e-06  5  0.052562   
Insulin                     0.043750  0.001975  4.967079e-07  5  0.047816   
BloodPressure               0.035937  0.002365  2.239074e-06  5  0.040808   

                           p99_low  
Glucose                   0.160284  
BMI                       0.067501  
Pregnancies               0.059625  
Age                       0.057169  
DiabetesPedigreeFunction  0.044188  
SkinThickness             0.041188  
Insulin                   0.039684  
BloodPress

If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")
If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Learner.load` instead.")
If you only need to load model weights and optimizer state, use the safe `Learner.load` instead.
  warn("load_learner` uses Python's insecure pickle module, which can execute malicious arbitrary code when loading. Only load files you trust.\nIf you only need to load model weights and optimizer state, use the safe `Le

                          importance    stddev   p_value  n  p99_high  \
Glucose                     0.150260  0.010917  0.000003  5  0.172739   
BMI                         0.045833  0.006728  0.000054  5  0.059687   
Age                         0.039063  0.007012  0.000119  5  0.053500   
Pregnancies                 0.038281  0.005938  0.000067  5  0.050509   
Insulin                     0.030729  0.004179  0.000040  5  0.039333   
SkinThickness               0.020573  0.006976  0.001369  5  0.034936   
DiabetesPedigreeFunction    0.017708  0.003518  0.000177  5  0.024952   
BloodPressure               0.016667  0.006204  0.001933  5  0.029440   

                           p99_low  
Glucose                   0.127781  
BMI                       0.031980  
Age                       0.024625  
Pregnancies               0.026054  
Insulin                   0.022125  
SkinThickness             0.006210  
DiabetesPedigreeFunction  0.010465  
BloodPressure             0.003893  
Feature i

Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	0.84s	= Expected runtime (0.17s per shuffle set)
	0.22s	= Actual runtime (Completed 5 of 5 shuffle sets)
Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	0.38s	= Expected runtime (0.08s per shuffle set)
	0.18s	= Actual runtime (Completed 5 of 5 shuffle sets)


                          importance    stddev   p_value  n  p99_high  \
Glucose                     0.144271  0.013700  0.000010  5  0.172479   
BMI                         0.030208  0.007389  0.000397  5  0.045422   
DiabetesPedigreeFunction    0.022917  0.008266  0.001722  5  0.039936   
Age                         0.019010  0.007457  0.002340  5  0.034365   
Insulin                     0.017448  0.001485  0.000006  5  0.020505   
Pregnancies                 0.014844  0.012026  0.025424  5  0.039605   
BloodPressure               0.008073  0.006976  0.030411  5  0.022436   
SkinThickness               0.005469  0.004358  0.024251  5  0.014441   

                           p99_low  
Glucose                   0.116063  
BMI                       0.014995  
DiabetesPedigreeFunction  0.005897  
Age                       0.003656  
Insulin                   0.014391  
Pregnancies              -0.009917  
BloodPressure            -0.006290  
SkinThickness            -0.003504  
Feature i

Computing feature importance via permutation shuffling for 8 features using 768 rows with 5 shuffle sets...
	0.44s	= Expected runtime (0.09s per shuffle set)
	0.11s	= Actual runtime (Completed 5 of 5 shuffle sets)


                          importance    stddev       p_value  n  p99_high  \
Glucose                     0.254688  0.006678  5.665254e-08  5  0.268437   
BMI                         0.110677  0.005676  8.269733e-07  5  0.122363   
Age                         0.100781  0.005416  9.968717e-07  5  0.111932   
DiabetesPedigreeFunction    0.057031  0.005321  8.988636e-06  5  0.067987   
Insulin                     0.037500  0.004548  2.546018e-05  5  0.046864   
SkinThickness               0.019271  0.003613  1.415889e-04  5  0.026710   
Pregnancies                 0.015885  0.004640  7.825060e-04  5  0.025440   
BloodPressure               0.010156  0.001698  9.030075e-05  5  0.013652   

                           p99_low  
Glucose                   0.240938  
BMI                       0.098991  
Age                       0.089630  
DiabetesPedigreeFunction  0.046075  
Insulin                   0.028136  
SkinThickness             0.011831  
Pregnancies               0.006331  
BloodPress