In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from src.models.tuning import tune_model, tune_and_evaluate, build_best_pipeline
from src.data_loader import load_data, prepare_data


# BASELINE

In [None]:
# Baseline load: every optional step exposed by load_data (outlier handling,
# added features, bureau merge, previous-application merge) is switched off.
df = load_data(
    '../home-credit-default-risk',
    handle_outliers=False,
    add_features=False,
    merge_bureau=False,
    merge_previous=False,
)

# Relative frequency of each TARGET class (shows how imbalanced the labels are).
df['TARGET'].value_counts(normalize=True)

df shape: (307511, 122)


TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64

In [3]:
# Features / target built from the baseline frame with encode=True.
# X1, y1 are the inputs of the logistic, random-forest and gradient-boosting
# tuning cells in this section.
X1, y1 = prepare_data(df, encode=True)

In [None]:
# Features / target built from the same baseline frame with encode=False.
# X2, y2 are the inputs of the XGBoost and LightGBM tuning cells in this section
# (presumably those models handle the un-encoded columns themselves — confirm in prepare_data).
X2, y2 = prepare_data(df, encode=False)

## Logistic Regression


In [None]:
# Logistic regression on the encoded baseline features (X1, y1): 10 tuning trials, n_folds=3.
results_lr = tune_and_evaluate(X1, y1, model_type='logistic', n_trials=10, n_folds=3)

[32m[I 2026-02-08 18:24:39,692][0m A new study created in memory with name: no-name-e54a8bd7-1701-4362-8261-9ff70e1ceade[0m
Best trial: 0. Best value: 0.743422:  10%|█         | 1/10 [00:12<01:55, 12.80s/it]

[32m[I 2026-02-08 18:24:52,494][0m Trial 0 finished with value: 0.7434221280117628 and parameters: {'C': 0.0074593432857265485, 'solver': 'lbfgs', 'class_weight': None}. Best is trial 0 with value: 0.7434221280117628.[0m


Best trial: 0. Best value: 0.743422:  20%|██        | 2/10 [01:20<06:01, 45.21s/it]

[32m[I 2026-02-08 18:26:00,392][0m Trial 1 finished with value: 0.7426671757199689 and parameters: {'C': 0.000602521573620386, 'solver': 'saga', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.7434221280117628.[0m


Best trial: 0. Best value: 0.743422:  30%|███       | 3/10 [01:31<03:25, 29.38s/it]

[32m[I 2026-02-08 18:26:10,931][0m Trial 2 finished with value: 0.7396760800241832 and parameters: {'C': 0.00012674255898937226, 'solver': 'lbfgs', 'class_weight': None}. Best is trial 0 with value: 0.7434221280117628.[0m


Best trial: 0. Best value: 0.743422:  40%|████      | 4/10 [02:53<05:01, 50.17s/it]

[32m[I 2026-02-08 18:27:32,985][0m Trial 3 finished with value: 0.7421311355786319 and parameters: {'C': 0.0008260808399079611, 'solver': 'saga', 'class_weight': None}. Best is trial 0 with value: 0.7434221280117628.[0m


Best trial: 4. Best value: 0.744255:  50%|█████     | 5/10 [08:08<12:08, 145.63s/it]

[32m[I 2026-02-08 18:32:47,871][0m Trial 4 finished with value: 0.7442553457855022 and parameters: {'C': 0.11462107403425033, 'solver': 'saga', 'class_weight': 'balanced'}. Best is trial 4 with value: 0.7442553457855022.[0m


Best trial: 4. Best value: 0.744255:  60%|██████    | 6/10 [13:49<14:08, 212.13s/it]

[32m[I 2026-02-08 18:38:29,096][0m Trial 5 finished with value: 0.7439157486719118 and parameters: {'C': 0.8431013932082461, 'solver': 'saga', 'class_weight': None}. Best is trial 4 with value: 0.7442553457855022.[0m


Best trial: 6. Best value: 0.74468:  70%|███████   | 7/10 [14:11<07:29, 149.91s/it] 

[32m[I 2026-02-08 18:38:50,889][0m Trial 6 finished with value: 0.7446802447124606 and parameters: {'C': 0.10907475835157694, 'solver': 'lbfgs', 'class_weight': 'balanced'}. Best is trial 6 with value: 0.7446802447124606.[0m


Best trial: 6. Best value: 0.74468:  80%|████████  | 8/10 [14:23<03:32, 106.25s/it]

[32m[I 2026-02-08 18:39:03,656][0m Trial 7 finished with value: 0.7440255207531398 and parameters: {'C': 1.1015056790269626, 'solver': 'lbfgs', 'class_weight': None}. Best is trial 6 with value: 0.7446802447124606.[0m


Best trial: 6. Best value: 0.74468:  90%|█████████ | 9/10 [14:34<01:16, 76.35s/it] 

[32m[I 2026-02-08 18:39:14,262][0m Trial 8 finished with value: 0.7417566370934523 and parameters: {'C': 0.0004075596440072873, 'solver': 'lbfgs', 'class_weight': None}. Best is trial 6 with value: 0.7446802447124606.[0m


## Random Forest

In [None]:
# Random forest on the encoded baseline features (X1, y1): 5 tuning trials, n_folds=3.
# NOTE(review): the full-feature section tunes random forest with n_trials=20, so the
# two search budgets differ — keep that in mind when comparing the scores.
results_rf = tune_and_evaluate(X1, y1, model_type='random_forest', n_trials=5, n_folds=3)

## Gradient Boosting

In [None]:
# Gradient boosting on the encoded baseline features (X1, y1): only 2 trials, n_folds=3,
# with timeout=1800 (presumably seconds for the whole search — confirm in tune_and_evaluate).
results_gb = tune_and_evaluate(
    X1, y1, model_type='gradient_boosting', n_trials=2, n_folds=3, timeout=1800
)

## XGBoost

In [None]:
# XGBoost on the un-encoded baseline features (X2, y2): 30 tuning trials, n_folds=3, GPU requested.
results_xgb = tune_and_evaluate(
    X2, y2, model_type='xgboost', n_trials=30, n_folds=3, use_gpu=True
)


[32m[I 2026-02-07 21:35:29,621][0m A new study created in memory with name: no-name-cc1d71a2-1556-4682-8cd5-8266110741b7[0m
Best trial: 0. Best value: 0.72058:   3%|▎         | 1/30 [00:19<09:19, 19.28s/it]

[32m[I 2026-02-07 21:35:48,904][0m Trial 0 finished with value: 0.7205804517643486 and parameters: {'n_estimators': 144, 'max_depth': 10, 'learning_rate': 0.1205712628744377, 'subsample': 0.8394633936788146, 'colsample_bytree': 0.6624074561769746, 'reg_alpha': 2.5348407664333426e-07, 'reg_lambda': 3.3323645788192616e-08, 'scale_pos_weight': 13.126466040849092}. Best is trial 0 with value: 0.7205804517643486.[0m


Best trial: 1. Best value: 0.752353:   7%|▋         | 2/30 [00:39<09:14, 19.81s/it]

[32m[I 2026-02-07 21:36:09,082][0m Trial 1 finished with value: 0.7523528682982107 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.010725209743171996, 'subsample': 0.9879639408647978, 'colsample_bytree': 0.9329770563201687, 'reg_alpha': 8.148018307012941e-07, 'reg_lambda': 4.329370014459266e-07, 'scale_pos_weight': 3.5676631379480734}. Best is trial 1 with value: 0.7523528682982107.[0m


Best trial: 2. Best value: 0.760426:  10%|█         | 3/30 [00:51<07:19, 16.26s/it]

[32m[I 2026-02-07 21:36:21,128][0m Trial 2 finished with value: 0.7604260798011128 and parameters: {'n_estimators': 126, 'max_depth': 6, 'learning_rate': 0.04345454109729477, 'subsample': 0.7164916560792167, 'colsample_bytree': 0.8447411578889518, 'reg_alpha': 1.8007140198129195e-07, 'reg_lambda': 4.258943089524393e-06, 'scale_pos_weight': 6.129065806111684}. Best is trial 2 with value: 0.7604260798011128.[0m


Best trial: 2. Best value: 0.760426:  13%|█▎        | 4/30 [01:12<07:48, 18.00s/it]

[32m[I 2026-02-07 21:36:41,794][0m Trial 3 finished with value: 0.7578289001294006 and parameters: {'n_estimators': 164, 'max_depth': 9, 'learning_rate': 0.019721610970574007, 'subsample': 0.8056937753654446, 'colsample_bytree': 0.836965827544817, 'reg_alpha': 2.6185068507773707e-08, 'reg_lambda': 0.0029369981104377003, 'scale_pos_weight': 3.3873377316220816}. Best is trial 2 with value: 0.7604260798011128.[0m


Best trial: 2. Best value: 0.760426:  17%|█▋        | 5/30 [01:25<06:45, 16.22s/it]

[32m[I 2026-02-07 21:36:54,854][0m Trial 4 finished with value: 0.7143534795846543 and parameters: {'n_estimators': 66, 'max_depth': 10, 'learning_rate': 0.26690431824362526, 'subsample': 0.9233589392465844, 'colsample_bytree': 0.7218455076693483, 'reg_alpha': 7.569183361880229e-08, 'reg_lambda': 0.014391207615728067, 'scale_pos_weight': 7.162134912354418}. Best is trial 2 with value: 0.7604260798011128.[0m


Best trial: 2. Best value: 0.760426:  20%|██        | 6/30 [01:35<05:40, 14.20s/it]

[32m[I 2026-02-07 21:37:05,143][0m Trial 5 finished with value: 0.743465005624151 and parameters: {'n_estimators': 80, 'max_depth': 6, 'learning_rate': 0.011240768803005551, 'subsample': 0.9637281608315128, 'colsample_bytree': 0.7035119926400067, 'reg_alpha': 0.009176996354542699, 'reg_lambda': 6.388511557344611e-06, 'scale_pos_weight': 8.280952296489351}. Best is trial 2 with value: 0.7604260798011128.[0m


Best trial: 6. Best value: 0.761814:  23%|██▎       | 7/30 [01:46<04:59, 13.02s/it]

[32m[I 2026-02-07 21:37:15,714][0m Trial 6 finished with value: 0.761814393430083 and parameters: {'n_estimators': 187, 'max_depth': 3, 'learning_rate': 0.27051668818999286, 'subsample': 0.9100531293444458, 'colsample_bytree': 0.9757995766256756, 'reg_alpha': 1.1309571585271483, 'reg_lambda': 0.002404915432737351, 'scale_pos_weight': 13.906239290323636}. Best is trial 6 with value: 0.761814393430083.[0m


Best trial: 6. Best value: 0.761814:  27%|██▋       | 8/30 [01:55<04:21, 11.90s/it]

[32m[I 2026-02-07 21:37:25,216][0m Trial 7 finished with value: 0.7231611275674487 and parameters: {'n_estimators': 72, 'max_depth': 3, 'learning_rate': 0.011662890273931383, 'subsample': 0.7301321323053057, 'colsample_bytree': 0.7554709158757928, 'reg_alpha': 2.7678419414850017e-06, 'reg_lambda': 0.28749982347407854, 'scale_pos_weight': 5.99454657371025}. Best is trial 6 with value: 0.761814393430083.[0m


Best trial: 6. Best value: 0.761814:  30%|███       | 9/30 [02:06<04:05, 11.71s/it]

[32m[I 2026-02-07 21:37:36,498][0m Trial 8 finished with value: 0.7498489136759625 and parameters: {'n_estimators': 120, 'max_depth': 6, 'learning_rate': 0.016149614799999188, 'subsample': 0.9208787923016158, 'colsample_bytree': 0.6298202574719083, 'reg_alpha': 7.620481786158549, 'reg_lambda': 0.08916674715636537, 'scale_pos_weight': 3.7820195414784137}. Best is trial 6 with value: 0.761814393430083.[0m


Best trial: 6. Best value: 0.761814:  33%|███▎      | 10/30 [02:18<03:52, 11.61s/it]

[32m[I 2026-02-07 21:37:47,904][0m Trial 9 finished with value: 0.7539136923272572 and parameters: {'n_estimators': 51, 'max_depth': 9, 'learning_rate': 0.11069143219393454, 'subsample': 0.8916028672163949, 'colsample_bytree': 0.9085081386743783, 'reg_alpha': 4.638759594322625e-08, 'reg_lambda': 1.683416412018213e-05, 'scale_pos_weight': 2.622166833351816}. Best is trial 6 with value: 0.761814393430083.[0m


Best trial: 10. Best value: 0.76268:  37%|███▋      | 11/30 [02:29<03:37, 11.44s/it]

[32m[I 2026-02-07 21:37:58,954][0m Trial 10 finished with value: 0.7626801174665564 and parameters: {'n_estimators': 268, 'max_depth': 2, 'learning_rate': 0.2704729722717776, 'subsample': 0.6071847502459279, 'colsample_bytree': 0.9935496864584777, 'reg_alpha': 1.475649304728376, 'reg_lambda': 4.3444691085504035, 'scale_pos_weight': 14.674749993026234}. Best is trial 10 with value: 0.7626801174665564.[0m


Best trial: 11. Best value: 0.76269:  40%|████      | 12/30 [02:40<03:22, 11.27s/it]

[32m[I 2026-02-07 21:38:09,827][0m Trial 11 finished with value: 0.7626899326077435 and parameters: {'n_estimators': 280, 'max_depth': 2, 'learning_rate': 0.29234877806013426, 'subsample': 0.6068744601075835, 'colsample_bytree': 0.9895680834513327, 'reg_alpha': 8.440370204782154, 'reg_lambda': 7.556243125116086, 'scale_pos_weight': 14.558904047418146}. Best is trial 11 with value: 0.7626899326077435.[0m


Best trial: 12. Best value: 0.764365:  43%|████▎     | 13/30 [02:51<03:09, 11.16s/it]

[32m[I 2026-02-07 21:38:20,733][0m Trial 12 finished with value: 0.7643647645884263 and parameters: {'n_estimators': 289, 'max_depth': 2, 'learning_rate': 0.1640133239971726, 'subsample': 0.6014562981232444, 'colsample_bytree': 0.9934384626352483, 'reg_alpha': 0.07352386889584703, 'reg_lambda': 9.418748792456528, 'scale_pos_weight': 11.490852870972876}. Best is trial 12 with value: 0.7643647645884263.[0m


Best trial: 12. Best value: 0.764365:  47%|████▋     | 14/30 [03:03<03:06, 11.64s/it]

[32m[I 2026-02-07 21:38:33,504][0m Trial 13 finished with value: 0.7611686830189832 and parameters: {'n_estimators': 298, 'max_depth': 4, 'learning_rate': 0.13811269358793526, 'subsample': 0.6038863909047579, 'colsample_bytree': 0.9052945985771664, 'reg_alpha': 0.018495296668810585, 'reg_lambda': 9.868745464853728, 'scale_pos_weight': 11.091584116998652}. Best is trial 12 with value: 0.7643647645884263.[0m


Best trial: 12. Best value: 0.764365:  50%|█████     | 15/30 [03:14<02:50, 11.35s/it]

[32m[I 2026-02-07 21:38:44,162][0m Trial 14 finished with value: 0.7587687898329811 and parameters: {'n_estimators': 241, 'max_depth': 2, 'learning_rate': 0.06693126356989657, 'subsample': 0.6638123593227387, 'colsample_bytree': 0.9483908569515749, 'reg_alpha': 0.028504274995185333, 'reg_lambda': 0.6124987461165775, 'scale_pos_weight': 11.193840136904178}. Best is trial 12 with value: 0.7643647645884263.[0m


Best trial: 12. Best value: 0.764365:  53%|█████▎    | 16/30 [03:26<02:40, 11.49s/it]

[32m[I 2026-02-07 21:38:55,996][0m Trial 15 finished with value: 0.7585842128160304 and parameters: {'n_estimators': 236, 'max_depth': 4, 'learning_rate': 0.17044537778321608, 'subsample': 0.6733089259263372, 'colsample_bytree': 0.8645057419353053, 'reg_alpha': 6.765582267028678e-05, 'reg_lambda': 0.00013205054528414914, 'scale_pos_weight': 11.620658224690139}. Best is trial 12 with value: 0.7643647645884263.[0m


Best trial: 16. Best value: 0.765821:  57%|█████▋    | 17/30 [03:39<02:34, 11.91s/it]

[32m[I 2026-02-07 21:39:08,861][0m Trial 16 finished with value: 0.7658212080681217 and parameters: {'n_estimators': 288, 'max_depth': 4, 'learning_rate': 0.06744580472836788, 'subsample': 0.6509544144073132, 'colsample_bytree': 0.9994794359349198, 'reg_alpha': 0.2340338077077156, 'reg_lambda': 1.4163391413823898, 'scale_pos_weight': 9.353346650423793}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  60%|██████    | 18/30 [03:50<02:21, 11.83s/it]

[32m[I 2026-02-07 21:39:20,497][0m Trial 17 finished with value: 0.7639885380806359 and parameters: {'n_estimators': 219, 'max_depth': 4, 'learning_rate': 0.05608954129727098, 'subsample': 0.7419591792699157, 'colsample_bytree': 0.777693726921876, 'reg_alpha': 0.0010877849299928224, 'reg_lambda': 0.04830842489993285, 'scale_pos_weight': 9.394971821874737}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  63%|██████▎   | 19/30 [04:04<02:14, 12.24s/it]

[32m[I 2026-02-07 21:39:33,711][0m Trial 18 finished with value: 0.762689222110312 and parameters: {'n_estimators': 261, 'max_depth': 5, 'learning_rate': 0.08224314089148582, 'subsample': 0.6680065384301368, 'colsample_bytree': 0.8771122534160314, 'reg_alpha': 0.18926470587942834, 'reg_lambda': 0.9281470765976744, 'scale_pos_weight': 9.541439342691735}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  67%|██████▋   | 20/30 [04:16<02:01, 12.16s/it]

[32m[I 2026-02-07 21:39:45,685][0m Trial 19 finished with value: 0.7601257565603272 and parameters: {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.036046969689776935, 'subsample': 0.7730982444721965, 'colsample_bytree': 0.8159759138038158, 'reg_alpha': 0.00017746110302410047, 'reg_lambda': 0.0011906175810361388, 'scale_pos_weight': 12.420217722333597}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  70%|███████   | 21/30 [04:29<01:52, 12.50s/it]

[32m[I 2026-02-07 21:39:58,987][0m Trial 20 finished with value: 0.7619483608222319 and parameters: {'n_estimators': 245, 'max_depth': 5, 'learning_rate': 0.027544153225023304, 'subsample': 0.6472523886630981, 'colsample_bytree': 0.9433652849644488, 'reg_alpha': 0.0015887754340882438, 'reg_lambda': 0.02211303158371505, 'scale_pos_weight': 9.856825498672457}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  73%|███████▎  | 22/30 [04:40<01:37, 12.21s/it]

[32m[I 2026-02-07 21:40:10,519][0m Trial 21 finished with value: 0.7635817886711601 and parameters: {'n_estimators': 214, 'max_depth': 4, 'learning_rate': 0.06269783760832018, 'subsample': 0.7178278149418893, 'colsample_bytree': 0.8075301549946484, 'reg_alpha': 0.001415584729065138, 'reg_lambda': 0.06785352587085913, 'scale_pos_weight': 8.98242456267861}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  77%|███████▋  | 23/30 [04:53<01:25, 12.29s/it]

[32m[I 2026-02-07 21:40:22,971][0m Trial 22 finished with value: 0.7637365092743439 and parameters: {'n_estimators': 225, 'max_depth': 5, 'learning_rate': 0.08417012109145329, 'subsample': 0.7601629220248443, 'colsample_bytree': 0.7615652409577959, 'reg_alpha': 0.1370734033377153, 'reg_lambda': 1.099581039062647, 'scale_pos_weight': 10.011921661972082}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  80%|████████  | 24/30 [05:11<01:23, 14.00s/it]

[32m[I 2026-02-07 21:40:40,972][0m Trial 23 finished with value: 0.7617507466821017 and parameters: {'n_estimators': 275, 'max_depth': 7, 'learning_rate': 0.046491066136025834, 'subsample': 0.6410204603741272, 'colsample_bytree': 0.7786722350006485, 'reg_alpha': 1.8296060741466316e-05, 'reg_lambda': 0.12030159805456588, 'scale_pos_weight': 7.262611200858355}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  83%|████████▎ | 25/30 [05:23<01:07, 13.46s/it]

[32m[I 2026-02-07 21:40:53,180][0m Trial 24 finished with value: 0.7568084690052963 and parameters: {'n_estimators': 257, 'max_depth': 3, 'learning_rate': 0.030537563629658187, 'subsample': 0.6951237039043777, 'colsample_bytree': 0.7247211750797107, 'reg_alpha': 0.0015787857536935321, 'reg_lambda': 1.8165521353102092, 'scale_pos_weight': 10.638712371722724}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  87%|████████▋ | 26/30 [05:36<00:53, 13.28s/it]

[32m[I 2026-02-07 21:41:06,040][0m Trial 25 finished with value: 0.7537949592244898 and parameters: {'n_estimators': 287, 'max_depth': 4, 'learning_rate': 0.18780075531180868, 'subsample': 0.635427332131232, 'colsample_bytree': 0.9630116542487013, 'reg_alpha': 0.10472967684077371, 'reg_lambda': 0.025366151972783267, 'scale_pos_weight': 12.373220416552446}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  90%|█████████ | 27/30 [05:46<00:37, 12.39s/it]

[32m[I 2026-02-07 21:41:16,361][0m Trial 26 finished with value: 0.7600812426827973 and parameters: {'n_estimators': 210, 'max_depth': 2, 'learning_rate': 0.09351083477945314, 'subsample': 0.7566902373066845, 'colsample_bytree': 0.9057783368959028, 'reg_alpha': 0.599733796779698, 'reg_lambda': 0.2944960256905996, 'scale_pos_weight': 8.458551757836911}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  93%|█████████▎| 28/30 [05:58<00:24, 12.24s/it]

[32m[I 2026-02-07 21:41:28,245][0m Trial 27 finished with value: 0.7636394597650445 and parameters: {'n_estimators': 171, 'max_depth': 5, 'learning_rate': 0.050787151544017986, 'subsample': 0.6924920476396864, 'colsample_bytree': 0.9998105759227905, 'reg_alpha': 0.005908457950968609, 'reg_lambda': 2.26021527332864, 'scale_pos_weight': 5.569828236529508}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821:  97%|█████████▋| 29/30 [06:14<00:13, 13.21s/it]

[32m[I 2026-02-07 21:41:43,731][0m Trial 28 finished with value: 0.7451439429967909 and parameters: {'n_estimators': 254, 'max_depth': 7, 'learning_rate': 0.1673775977369503, 'subsample': 0.833749891517062, 'colsample_bytree': 0.6907939095057634, 'reg_alpha': 0.03993986930302895, 'reg_lambda': 0.009345429635108583, 'scale_pos_weight': 1.3546003339046369}. Best is trial 16 with value: 0.7658212080681217.[0m


Best trial: 16. Best value: 0.765821: 100%|██████████| 30/30 [06:24<00:00, 12.83s/it]


[32m[I 2026-02-07 21:41:54,448][0m Trial 29 finished with value: 0.7644703625403508 and parameters: {'n_estimators': 228, 'max_depth': 3, 'learning_rate': 0.11731834739546346, 'subsample': 0.8473026957464376, 'colsample_bytree': 0.6477639824328912, 'reg_alpha': 0.000335864258771428, 'reg_lambda': 0.00028624767875454774, 'scale_pos_weight': 12.928484745628687}. Best is trial 16 with value: 0.7658212080681217.[0m
Best params: {'n_estimators': 288, 'max_depth': 4, 'learning_rate': 0.06744580472836788, 'subsample': 0.6509544144073132, 'colsample_bytree': 0.9994794359349198, 'reg_alpha': 0.2340338077077156, 'reg_lambda': 1.4163391413823898, 'scale_pos_weight': 9.353346650423793}
Results:
Train ROC-AUC: 0.8014
Test ROC-AUC:  0.7681


## LightGBM

In [None]:
# LightGBM on the un-encoded baseline features (X2, y2): 30 tuning trials, n_folds=3, GPU requested.
results_lgbm = tune_and_evaluate(
    X2, y2, model_type='lightgbm', n_trials=30, n_folds=3, use_gpu=True
)


## Compared Results

In [None]:
# Gather the result dict of every baseline model. `locals().get` yields None for a
# model whose tuning cell has not been run in this kernel session, so the lookup
# itself never fails even when only some of the cells above were executed.
results = {
    'Logistic': locals().get('results_lr'),
    'Random Forest': locals().get('results_rf'),
    'Gradient Boosting': locals().get('results_gb'),
    'XGBoost': locals().get('results_xgb'),
    'LightGBM': locals().get('results_lgbm'),
}

score_columns = ['Model', 'CV Score', 'Train Score', 'Test Score']

rows = []
for name, r in results.items():
    if r is None:
        # Model was not tuned in this session; subscripting None would raise TypeError.
        continue
    rows.append({
        'Model': name,
        'CV Score': r['cv_score'],
        'Train Score': r['train_score'],
        'Test Score': r['test_score']
    })

# Explicit `columns` keeps the frame well-formed (and sortable by 'Test Score')
# even when no model has been run yet and `rows` is empty.
comp = pd.DataFrame(rows, columns=score_columns).sort_values('Test Score', ascending=False)

# Put all score columns under a single top-level 'BASELINE' label.
comp.columns = pd.MultiIndex.from_tuples([
    ('BASELINE', col) for col in comp.columns
])

display(comp)

Unnamed: 0,Model,CV Score,Train Score,Test Score
0,XGBoost,0.765821,0.801385,0.768079


**TODO:** maybe add Optuna plots for the tuning runs — look at "warszat ai" for examples.

# NO OUTLIERS, ADD FEATURES, MERGE BUREAU, MERGE PREVIOUS_APP

In [None]:
# Full load: every optional step exposed by load_data (outlier handling,
# added features, bureau merge, previous-application merge) is switched on.
df_full = load_data(
    '../home-credit-default-risk',
    handle_outliers=True,
    add_features=True,
    merge_bureau=True,
    merge_previous=True,
)

# Relative frequency of each TARGET class in the full frame.
df_full['TARGET'].value_counts(normalize=True)

In [None]:
# NOTE: this rebinds X1, y1, X2, y2 — the baseline matrices created earlier in the
# notebook are overwritten, so any baseline tuning cell re-run after this point
# would silently train on the full-feature data instead.
X1, y1 = prepare_data(df_full, encode=True)   # inputs of the logistic / random-forest / gradient-boosting cells below
X2, y2 = prepare_data(df_full, encode=False)  # inputs of the XGBoost / LightGBM cells below

## Logistic Regression

In [None]:
# Logistic regression on the encoded full-feature data (X1, y1): 10 tuning trials, n_folds=3.
results_lr = tune_and_evaluate(X1, y1, model_type='logistic', n_trials=10, n_folds=3)

## Random Forest

In [None]:
# Random forest on the encoded full-feature data (X1, y1): 20 tuning trials, n_folds=3.
# NOTE(review): the baseline section tunes random forest with n_trials=5, so the
# two search budgets differ — keep that in mind when comparing the scores.
results_rf = tune_and_evaluate(X1, y1, model_type='random_forest', n_trials=20, n_folds=3)

## Gradient Boosting

In [None]:
# Gradient boosting on the encoded full-feature data (X1, y1): only 2 trials, n_folds=3,
# with timeout=1800 (presumably seconds for the whole search — confirm in tune_and_evaluate).
results_gb = tune_and_evaluate(
    X1, y1, model_type='gradient_boosting', n_trials=2, n_folds=3, timeout=1800
)

## XGBoost

In [None]:
# XGBoost on the un-encoded full-feature data (X2, y2): 30 tuning trials, n_folds=3, GPU requested.
results_xgb = tune_and_evaluate(
    X2, y2, model_type='xgboost', n_trials=30, n_folds=3, use_gpu=True
)


## LightGBM

In [None]:
# LightGBM on the un-encoded full-feature data (X2, y2): 30 tuning trials, n_folds=3, GPU requested.
results_lgbm = tune_and_evaluate(
    X2, y2, model_type='lightgbm', n_trials=30, n_folds=3, use_gpu=True
)


In [None]:
# Gather the result dict of every full-feature model. `locals().get` yields None for
# a name that is not bound in this kernel session.
# NOTE(review): the results_* names are the same ones the baseline section assigns,
# so if a full-feature tuning cell was skipped but its baseline counterpart was run,
# the stale baseline result will appear in this table under the 'FULL' label.
results = {
    'Logistic': locals().get('results_lr'),
    'Random Forest': locals().get('results_rf'),
    'Gradient Boosting': locals().get('results_gb'),
    'XGBoost': locals().get('results_xgb'),
    'LightGBM': locals().get('results_lgbm'),
}

score_columns = ['Model', 'CV Score', 'Train Score', 'Test Score']

rows = []
for name, r in results.items():
    if r is None:
        # Model was not tuned in this session; subscripting None would raise TypeError.
        continue
    rows.append({
        'Model': name,
        'CV Score': r['cv_score'],
        'Train Score': r['train_score'],
        'Test Score': r['test_score']
    })

# Explicit `columns` keeps the frame well-formed (and sortable by 'Test Score')
# even when no model has been run yet and `rows` is empty.
comp = pd.DataFrame(rows, columns=score_columns).sort_values('Test Score', ascending=False)

# Put all score columns under a single top-level 'FULL' label.
comp.columns = pd.MultiIndex.from_tuples([
    ('FULL', col) for col in comp.columns
])

display(comp)