In [1]:
from autogluon.tabular import TabularPredictor
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class AutoMLWithAutoGluon:
    """Small convenience wrapper around an AutoGluon ``TabularPredictor``.

    The predictor is created by :meth:`train_automl_model`. Every other
    method needs a trained predictor and raises ``RuntimeError`` when it is
    called before training.
    """

    def __init__(self):
        # Stays None until train_automl_model() has completed successfully.
        self.predictor:TabularPredictor|None = None

    def _require_predictor(self) -> "TabularPredictor":
        """Return the stored predictor, or raise if nothing was trained yet."""
        if self.predictor is None:
            raise RuntimeError(
                "No trained predictor available; call train_automl_model() first."
            )
        return self.predictor

    def train_automl_model(self, dataset:pd.DataFrame, target:str, time_limit=120):
        """Fit a new ``TabularPredictor`` on ``dataset`` and store it.

        Args:
            dataset: Training data; must contain the ``target`` column.
            target: Name of the label column to predict.
            time_limit: Forwarded to ``TabularPredictor.fit`` as its
                ``time_limit`` (wall-clock budget in seconds).
        """
        predictor = TabularPredictor(label=target)
        predictor.fit(dataset, time_limit=time_limit)
        # Publish only after fit() succeeded so that a failed training run
        # never leaves an unfitted predictor on the instance.
        self.predictor = predictor

    def test_automl_model(self, dataset:pd.DataFrame, target:str):
        """Score the trained predictor on ``dataset``.

        Args:
            dataset: Evaluation data; must contain the ``target`` column.
            target: Name of the column holding the true labels.

        Returns:
            Whatever the predictor's ``eval_metric`` returns when called with
            the true labels and the predictions.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        predictor = self._require_predictor()
        y_true = dataset[target]
        y_pred = predictor.predict(dataset)
        return predictor.eval_metric(y_true, y_pred)

    def print_leaderboard(self):
        """Print the predictor's leaderboard.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        print(self._require_predictor().leaderboard())

    def print_feature_importance(self, dataset):
        """Print the predictor's feature importance computed on ``dataset``.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        print(self._require_predictor().feature_importance(dataset))

In [3]:
# Read the loan CSV into a DataFrame, then preview its first five rows.
dataset = pd.read_csv(filepath_or_buffer="datasets/loan_data.csv")

dataset.head(n=5)

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed so that the
# train/test partition is identical on every Restart & Run All.
train, test = train_test_split(dataset, test_size=0.2, random_state=42)
# Re-wrap both parts as DataFrames that carry the original column order.
train, test = pd.DataFrame(train, columns=dataset.columns), pd.DataFrame(test, columns=dataset.columns)

# DataFrame.size is rows * columns, so these are element counts, not row counts.
print(train.size, test.size, dataset.size)

504000 126000 630000


In [5]:
# Build the wrapper and fit it on the training split with "loan_status" as the label.
model = AutoMLWithAutoGluon()
model.train_automl_model(dataset=train, target="loan_status")

No path specified. Models will be saved in: "AutogluonModels/ag-20241117_151758"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.2b20241117
Python Version:     3.11.9
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #48~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Mon Oct  7 11:24:13 UTC 2
CPU Count:          16
Memory Avail:       7.78 GB / 14.86 GB (52.3%)
Disk Space Avail:   9.38 GB / 118.32 GB (7.9%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium_quality'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference s

In [6]:
# Score the fitted model on the held-out split; the returned value is the cell output.
model.test_automl_model(dataset=test, target="loan_status")

0.9325555555555556

In [7]:
# Print the leaderboard of the fitted predictor.
model.print_leaderboard()

                  model  score_val eval_metric  pred_time_val   fit_time  \
0   WeightedEnsemble_L2     0.9364    accuracy       0.022827   1.167248   
1         LightGBMLarge     0.9360    accuracy       0.005808   0.996938   
2               XGBoost     0.9336    accuracy       0.008931   0.527533   
3              LightGBM     0.9332    accuracy       0.007315   0.586140   
4              CatBoost     0.9284    accuracy       0.002805   6.678607   
5      RandomForestEntr     0.9212    accuracy       0.036833   1.138601   
6      RandomForestGini     0.9208    accuracy       0.035609   1.091131   
7            LightGBMXT     0.9188    accuracy       0.006198   0.532670   
8        ExtraTreesEntr     0.9188    accuracy       0.037059   0.729455   
9       NeuralNetFastAI     0.9168    accuracy       0.018558  20.708769   
10       ExtraTreesGini     0.9160    accuracy       0.036160   0.736362   
11       NeuralNetTorch     0.9144    accuracy       0.014999  18.785256   
12       KNe

<p>
The best model is WeightedEnsemble_L2 (validation accuracy 0.9364). As the leaderboard shows, it outperforms even the neural networks
NeuralNetFastAI and NeuralNetTorch, while taking significantly less time to fit (about 1 second versus roughly 19–21 seconds).
</p>

In [8]:
# Print the feature importance of the fitted predictor, computed on the held-out split.
model.print_feature_importance(dataset=test)

Computing feature importance via permutation shuffling for 13 features using 5000 rows with 5 shuffle sets...
	3.78s	= Expected runtime (0.76s per shuffle set)
	2.08s	= Actual runtime (Completed 5 of 5 shuffle sets)


                                importance    stddev       p_value  n  \
previous_loan_defaults_on_file     0.10076  0.003910  2.715554e-07  5   
loan_int_rate                      0.04744  0.004297  7.992919e-06  5   
person_income                      0.04524  0.003877  6.406979e-06  5   
loan_percent_income                0.04076  0.002242  1.094674e-06  5   
person_home_ownership              0.02952  0.001869  1.916670e-06  5   
loan_intent                        0.01508  0.002524  9.079747e-05  5   
loan_amnt                          0.00924  0.001769  1.534985e-04  5   
credit_score                       0.00500  0.001594  1.087208e-03  5   
person_age                         0.00184  0.000829  3.852211e-03  5   
person_emp_exp                     0.00092  0.000769  2.779590e-02  5   
person_gender                      0.00048  0.000363  2.089734e-02  5   
cb_person_cred_hist_length         0.00028  0.000460  1.227460e-01  5   
person_education                  -0.00024  0.00060

<p>The most important feature is previous_loan_defaults_on_file. If an applicant has defaulted on a previous loan, they are much less likely to get a loan again.</p>