## AutoML - Train

Reference notebook: https://www.kaggle.com/code/alexryzhkov/lightautoml-nn-test

### Configuration

In [1]:
# Dynamic module import: follow changes whenever an imported source file is updated.
%load_ext autoreload
%autoreload 2

In [2]:
! python -m pip install --no-index --find-links=/kaggle/input/autogluon-pkgs autogluon > /dev/null
! python -m pip install --no-index --find-links=/kaggle/input/ray-pkgs --upgrade --force-reinstall -q ray==2.6.3

Processing ./input/lightautoml-v0-3-8/lightautoml-0.3.8-py3-none-any.whl
lightautoml is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.


In [3]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import pickle

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
# LightAutoML presets, task and report generation
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer



In [4]:
# Competition dataset root; the train parquet files live in a fixed
# sub-directory below it.
ROOT = "/kaggle/input/home-credit-credit-risk-model-stability"
_TRAIN_SUBDIR = ("parquet_files", "train")

TRAIN_DIR = os.path.join(ROOT, *_TRAIN_SUBDIR)

In [5]:
# Dry-run switch. When True, the training setup cell further down sets
# `device` to "cpu", keeps only the first 50,000 rows of `train_df` and
# lowers `iterations` to 600.
DRY_RUN = True

### Import utility classes

In [6]:
from src.utils.utility import Utility
from src.utils.schema_gen import SchemaGen
from src.utils.pipeline import Pipeline
from src.utils.aggregator import Aggregator

### Read train data

In [7]:
%%time

data_store: dict = {
    "df_base": SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_base.parquet")),
    "depth_0": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_cb_0.parquet")),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_0_*.parquet")),
    ],
    "depth_1": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_applprev_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_a_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_c_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_other_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_person_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_deposit_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_debitcard_1.parquet"), 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_2_*.parquet"), 2),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_2.parquet"), 2),
    ],
}

train_df: pl.LazyFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.filter_cols)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .pipe(Utility.reduce_memory_usage, "train_df")
)

del data_store
gc.collect()

print(f"Train data shape: {train_df.shape}")
display(train_df.head(10))

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_static_0_0 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_8 loaded into memory.
File train_credit_bureau_a_2_7 loaded into memory.
File train_credit_bureau_a_2_5 loaded into memory.
File train_credit_bureau_a_2_0 loade

case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,…,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,last_collater_typofvalofguarant_298M,last_collater_typofvalofguarant_407M,last_collaterals_typeofguarante_359M,last_collaterals_typeofguarante_669M,last_num_group1_12,last_num_group2,last_pmts_month_158T,last_pmts_month_706T,last_pmts_year_1139T,last_pmts_year_507T,last_subjectroles_name_541M,last_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
u32,u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,…,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,str,str,str,str,u16,u8,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
1351709,201904,16,0,,,-13629.0,,-13629,2.0,4.0,0.0,11.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",6.0,8.0,"""a7fcb6e5""","""a55475b1""",11.0,,,,,6.0,10004.242188,,14.0,,,7.0,6.0,0.0,9996.600586,4000.0,…,,"""a55475b1""","""c7a5ad39""",0,35,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",0,35,1.0,,2020.0,,"""a55475b1""","""a55475b1""",0.0,,0.0,,0.0,,0.0,,2019,26
1391295,201906,22,0,,,-11572.0,,-11572,4.0,4.0,2.0,8.0,3.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",10.0,5.0,"""3439d993""","""a55475b1""",8.0,,,,,6.0,50244.800781,,14.0,,,9.0,4.0,0.0,0.0,3673.600098,…,0.0,"""c7a5ad39""","""c7a5ad39""",13,35,0.0,23.0,12.0,12.0,0.0,12109.618164,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",13,11,,1.0,,2019.0,"""a55475b1""","""a55475b1""",0.0,0.333333,0.0,248.332977,0.0,6.192,0.0,1848400.0,2019,7
1592107,201910,41,0,,14.0,,,-15602,0.0,1.0,0.0,2.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",1.0,1.0,"""3439d993""","""a55475b1""",2.0,,26043.400391,6.0,,,,"""PENSION_6""",,14.0,,2.0,0.0,0.0,12276.902344,1998.200073,…,0.0,"""c7a5ad39""","""c7a5ad39""",4,23,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",4,23,,1.0,,2019.0,"""a55475b1""","""a55475b1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,21
253257,202009,89,0,,,,1137200.0,-20743,1.0,1.0,1.0,2.0,1.0,"""2fc785b2""","""a55475b1""","""a55475b1""",0.0,2.0,"""3439d993""","""a55475b1""",2.0,,,,,,,,,,14.0,2.0,4.0,0.0,49844.0,1513.200073,…,250000.0,"""c7a5ad39""","""c7a5ad39""",6,35,9.0,12.0,12.0,12.0,11457.360352,1949.400024,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",6,23,,1.0,,2021.0,"""a55475b1""","""a55475b1""",0.315789,0.2,586.896851,25.92415,2.167852,1.67395,5114673.0,39758.21875,2020,15
736158,201907,26,0,,,-12455.0,,-12455,0.0,0.0,0.0,2.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,3.0,"""a55475b1""","""a55475b1""",2.0,,,,,6.0,5700.0,,14.0,,,0.0,1.0,,,4016.600098,…,,"""a55475b1""","""c7a5ad39""",1,35,1.0,,12.0,,2412.800049,,2020.0,,"""a55475b1""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",1,35,1.0,,2020.0,,"""a55475b1""","""a55475b1""",0.022222,,53.617779,,0.022222,,129368.976562,,2019,8
1798686,202002,60,0,,,,,-11592,5.0,7.0,2.0,12.0,4.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",9.0,6.0,"""3439d993""","""a55475b1""",12.0,,,,,,,"""DEDUCTION_6""",,14.0,,3.0,2.0,0.0,85319.804688,2509.600098,…,0.0,"""c7a5ad39""","""c7a5ad39""",10,35,13.0,1583.0,12.0,12.0,4727.503906,43247.296875,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",10,11,,1.0,,2021.0,"""a55475b1""","""a55475b1""",1.371429,717.837219,928.587891,17029.513672,11.416807,437757.71875,3385189.0,307756256.0,2020,26
1818875,202003,63,0,,,,,-14289,0.0,0.0,0.0,4.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",4.0,,,,,,,"""DEDUCTION_6""",,14.0,,0.0,5.0,4.0,36788.402344,1289.599976,…,0.0,"""c7a5ad39""","""c7a5ad39""",0,35,0.0,969.0,12.0,12.0,0.0,109792.203125,2021.0,2019.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",0,35,,1.0,,2019.0,"""a55475b1""","""a55475b1""",0.0,667.541687,0.0,60830.613281,0.0,37092.605469,0.0,1183300000.0,2020,17
711512,201906,23,0,,,-18120.0,,-18120,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",0.0,,,,,0.0,0.0,,14.0,,,0.0,0.0,,,2752.800049,…,6149203.0,"""c7a5ad39""","""c7a5ad39""",5,35,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2016.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",5,23,,1.0,,2012.0,"""a55475b1""","""a55475b1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,12
112214,201902,7,0,-1617.0,,-22817.0,,-22817,0.0,0.0,0.0,2.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",0.0,0.0,"""b6cabe76""","""a55475b1""",2.0,6907.133789,,,6.0,,,,14.0,,,0.0,6.0,0.0,,4683.399902,…,,"""a55475b1""","""c7a5ad39""",2,35,15.0,,12.0,,0.8,,2020.0,,"""a55475b1""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2,23,1.0,,2020.0,,"""a55475b1""","""a55475b1""",0.481481,,0.02963,,6.292103,,0.023256,,2019,20
1789254,202002,58,0,,,,,-10366,5.0,6.0,1.0,9.0,5.0,"""a55475b1""","""a55475b1""","""a55475b1""",5.0,7.0,"""a55475b1""","""a55475b1""",9.0,,,,,,,"""DEDUCTION_6""",,14.0,,2.0,7.0,0.0,36605.589844,2945.600098,…,0.0,"""c7a5ad39""","""c7a5ad39""",6,35,0.0,24.0,12.0,12.0,0.0,1281.800049,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",6,23,,1.0,,2020.0,"""a55475b1""","""a55475b1""",0.0,1.463768,0.0,80.356522,0.0,21.164108,0.0,88686.59375,2020,17


CPU times: user 3min 11s, sys: 47.8 s, total: 3min 58s
Wall time: 24.8 s


In [8]:
# Convert the prepared frame with the project helper and rebind `train_df`
# to the converted result (later cells use `.iloc` and `.drop(columns=...)`).
# NOTE(review): `cat_cols` is presumably the list of categorical column
# names -- confirm in Utility.to_pandas; it is not used in the cells below.
train_df, cat_cols = Utility.to_pandas(train_df)

### Train

In [9]:
# Full-run settings are "gpu" / 6000 iterations; a dry run swaps in the cheap
# CPU configuration and keeps only the first 50,000 training rows.
device: str = "cpu" if DRY_RUN else "gpu"
iterations: int = 600 if DRY_RUN else 6000

if DRY_RUN:
    train_df = train_df.iloc[:50000]

In [10]:
# Columns that are not model inputs: the label, the row identifier and the
# week number (the latter is used as the CV grouping key instead).
non_feature_cols = ["target", "case_id", "week_num"]

y = train_df["target"]
weeks = train_df["week_num"]
X = train_df.drop(columns=non_feature_cols)

# Drop the reference to the full frame now that the pieces are extracted.
del train_df
gc.collect()

# Stratified on the target, with groups supplied at split time.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

In [11]:
# AutoML run settings, collected in one place.
automl_params = dict(
    N_THREADS=8,
    RANDOM_STATE=42,
    TASK="binary",
    TIMEOUT=10000,
    ADVANCED_ROLES=False,
    USE_QNT=True,
    USE_PLR=False,
    TRAIN_BS=512,
)

In [16]:
%%time

fitted_models_automl = []
cv_scores_automl = []

iterator = 0
for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    train_data = TabularDataset(X_train)
    valid_data = TabularDataset(X_valid)

    ##########
    # AutoML #
    ##########
    model = TabularPredictor(
        label="target",
        problem_type="binary",
        eval_metric="roc_auc",
        path="predictor",
    )

    # モデルのトレーニングと予測
    oof_pred = model.fit_predict(X_train, y_train)
    y_pred_valid = model.predict(X_valid).data[:, 0]

    # AUCスコアの計算
    auc_score = roc_auc_score(y_valid, y_pred_valid)

    fitted_models_automl.append(model)
    cv_scores_automl.append(auc_score)

    iterator += 1

print("CV AUC scores for CatBoost: ", cv_scores_cat)
print("Maximum CV AUC score for CatBoost: ", max(cv_scores_cat))

print("CV AUC scores for LGBM: ", cv_scores_lgb)
print("Maximum CV AUC score for LGBM: ", max(cv_scores_lgb))

print("CV AUC scores for XGB: ", cv_scores_xgb)
print("Maximum CV AUC score for XGB: ", max(cv_scores_xgb))

TypeError: 'numpy.uint8' object is not iterable

### Ensemble

In [None]:
from src.models.voting_model import VotingModel

# Combine the per-fold models into a single ensemble. The training cell only
# builds `fitted_models_automl`; the former cat/lgb/xgb lists are not defined
# anywhere in this notebook.
# NOTE(review): VotingModel is project code not visible here -- confirm it can
# drive LightAutoML models, whose `predict` returns an object exposing `.data`
# rather than a bare array.
model = VotingModel(fitted_models_automl)

In [None]:
# Serialize the ensemble into the current working directory.
with open("voting_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)