## Baseline - Train

Reference notebook: https://www.kaggle.com/code/pxcai666/catboost-lightgbm-ensemble/notebook

### Configuration

In [1]:
# Dynamically re-import modules (picks up changes when an imported source file is updated)
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import pickle

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer



In [3]:
# Root directory of the competition dataset under /kaggle/input.
ROOT: str = "/kaggle/input/home-credit-credit-risk-model-stability"

# Directory that holds the train_*.parquet files read in the "Read train data" cell.
TRAIN_DIR: str = os.path.join(ROOT, "parquet_files", "train")

In [4]:
# When True, the training setup cell switches `device` to "cpu", keeps only the
# first 50000 rows of the training frame and lowers `est_cnt` from 6000 to 600.
DRY_RUN: bool = True

### Import utility classes

In [5]:
from src.utils.utility import Utility
from src.utils.schema_gen import SchemaGen
from src.utils.pipeline import Pipeline
from src.utils.aggregator import Aggregator

### Read train data

In [6]:
%%time

data_store: dict = {
    "df_base": SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_base.parquet")),
    "depth_0": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_cb_0.parquet")),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_0_*.parquet")),
    ],
    "depth_1": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_applprev_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_a_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_c_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_other_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_person_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_deposit_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_debitcard_1.parquet"), 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_2_*.parquet"), 2),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_2.parquet"), 2),
    ],
}

train_df: pl.LazyFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.filter_cols)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .pipe(Utility.reduce_memory_usage, "train_df")
)

del data_store
gc.collect()

print(f"Train data shape: {train_df.shape}")
display(train_df.head(10))

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_static_0_0 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_8 loaded into memory.
File train_credit_bureau_a_2_7 loaded into memory.
File train_credit_bureau_a_2_5 loaded into memory.
File train_credit_bureau_a_2_0 loade

case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,…,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,last_collater_typofvalofguarant_298M,last_collater_typofvalofguarant_407M,last_collaterals_typeofguarante_359M,last_collaterals_typeofguarante_669M,last_num_group1_12,last_num_group2,last_pmts_month_158T,last_pmts_month_706T,last_pmts_year_1139T,last_pmts_year_507T,last_subjectroles_name_541M,last_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
u32,u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,…,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,str,str,str,str,u16,u8,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
963182,202002,59,0,,,,,-9884.0,0.0,0.0,0.0,4.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",4.0,,,,,,,"""DEDUCTION_6""",,14.0,,2.0,2.0,,,1978.599976,…,0.0,"""c7a5ad39""","""c7a5ad39""",2.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2021.0,2019.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2.0,23.0,,1.0,,2016.0,"""a55475b1""","""a55475b1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020,24
882170,201911,47,0,,,,,-10805.0,2.0,3.0,0.0,5.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,2.0,"""a55475b1""","""a55475b1""",5.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,2.0,0.0,6000.0,1666.800049,…,0.0,"""c7a5ad39""","""c7a5ad39""",1.0,23.0,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",1.0,23.0,,1.0,,2020.0,"""a55475b1""","""a55475b1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,30
827906,201910,40,0,,,,,-8807.0,2.0,2.0,1.0,6.0,2.0,"""a55475b1""","""a55475b1""","""a55475b1""",5.0,10.0,"""a55475b1""","""a55475b1""",6.0,,,,,,,"""DEDUCTION_6""",,14.0,,2.0,5.0,,,1961.599976,…,0.0,"""c7a5ad39""","""c7a5ad39""",0.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",0.0,35.0,1.0,,2020.0,,"""a55475b1""","""a55475b1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,12
1240543,201901,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,3153.199951,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,4
12466,201904,15,0,-461.0,,-23487.0,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,10173.600586,,,6.0,,,,14.0,,,,,,,2067.0,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,22
929619,202001,53,0,,,,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,,,"""DEDUCTION_6""",,14.0,,,,,,2405.199951,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2020,7
1024139,202009,90,0,,,,1110100.0,-11866.0,3.0,3.0,2.0,9.0,2.0,"""2fc785b2""","""6b2ae0fa""","""a55475b1""",0.0,6.0,"""38c061ee""","""a55475b1""",9.0,,,,,,,,,,14.0,3.0,3.0,,,6561.600098,…,0.0,"""c7a5ad39""","""c7a5ad39""",4.0,35.0,0.0,1081.0,12.0,12.0,0.0,88006.898438,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",4.0,23.0,,1.0,,2021.0,"""a55475b1""","""a55475b1""",0.0,410.817078,0.0,21811.820312,0.0,358.697052,0.0,21661.007812,2020,26
1503881,201908,33,0,-1815.0,,-23003.0,,-23003.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,0.0,"""a55475b1""","""a55475b1""",0.0,6440.200195,,,11.0,,,,14.0,,,0.0,0.0,0.0,0.0,3578.400146,…,0.0,"""c7a5ad39""","""a55475b1""",1.0,35.0,,1023.0,,12.0,,51015.800781,,2020.0,"""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",1.0,35.0,,1.0,,2020.0,"""a55475b1""","""a55475b1""",,337.604156,,16085.975586,,402.070099,,21391.449219,2019,25
1555004,201909,38,0,,,,,-13997.0,0.0,3.0,0.0,5.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,0.0,"""a55475b1""","""a55475b1""",5.0,,,,,6.0,12780.200195,"""DEDUCTION_6""",14.0,14.0,,3.0,0.0,0.0,21858.548828,3018.600098,…,0.0,"""c7a5ad39""","""c7a5ad39""",1.0,23.0,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",1.0,11.0,1.0,1.0,2020.0,2020.0,"""a55475b1""","""a55475b1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,27
1703648,201912,50,0,,,,,-14906.0,1.0,3.0,0.0,4.0,0.0,"""a55475b1""","""717ddd49""","""a55475b1""",1.0,0.0,"""3439d993""","""a55475b1""",4.0,,,,,,,"""DEDUCTION_6""",,14.0,,0.0,3.0,0.0,0.0,7499.0,…,0.0,"""c7a5ad39""","""c7a5ad39""",2.0,35.0,25.0,78.0,12.0,12.0,519.370056,10398.600586,2020.0,2013.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2.0,11.0,1.0,,2020.0,,"""a55475b1""","""a55475b1""",1.705882,24.85,65.074768,4072.820068,5.413392,25.984358,159.783585,3482.141846,2019,22


CPU times: user 3min 8s, sys: 38.2 s, total: 3min 46s
Wall time: 21.4 s


In [7]:
# Rebinds `train_df` to the first value returned by Utility.to_pandas; the second value,
# `cat_cols`, is later passed as `cat_features` to CatBoost and used to pick the columns
# cast to "category" for LightGBM.
# NOTE(review): presumably converts the polars frame to pandas (`.iloc` is used on the
# result in the next cell) — confirm against Utility.to_pandas.
train_df, cat_cols = Utility.to_pandas(train_df)

### Train

In [8]:
# Full-run values vs. dry-run overrides, both selected by the DRY_RUN flag.
device: str = "cpu" if DRY_RUN else "gpu"
est_cnt: int = 600 if DRY_RUN else 6000

if DRY_RUN:
    # A dry run only keeps the first 50000 rows.
    train_df = train_df.iloc[:50000]

In [9]:
# Label and CV grouping key are pulled out first; the feature matrix is what remains
# after dropping them together with the row identifier.
y = train_df["target"]
weeks = train_df["week_num"]
X = train_df.drop(columns=["target", "case_id", "week_num"])

# The combined frame is not referenced again in this cell, so release it.
del train_df
gc.collect()

# 5-fold stratified, grouped splitter (`weeks` is supplied as `groups` at split time).
cv = StratifiedGroupKFold(random_state=42, shuffle=True, n_splits=5)

In [10]:
# Keyword arguments for CatBoostClassifier; `iterations` follows `est_cnt`.
# NOTE(review): `task_type` is fixed to "GPU" and does not follow `device`, which a
# dry run sets to "cpu" — confirm this is intended.
# NOTE(review): `best_model_min_trees` (1200) is larger than the dry-run `est_cnt`
# (600) — confirm `use_best_model` still has the intended effect in that mode.
catboost_params = dict(
    best_model_min_trees=1200,
    boosting_type="Plain",
    eval_metric="AUC",
    iterations=est_cnt,
    learning_rate=0.05,
    l2_leaf_reg=10,
    max_leaves=64,
    random_seed=42,
    task_type="GPU",
    use_best_model=True,
)

# First LightGBM parameter set (used on even-numbered folds of the CV loop).
# `device` comes from the training setup cell.
lgb_params = dict(
    boosting_type="gbdt",
    colsample_bytree=0.8,
    colsample_bynode=0.8,
    device=device,
    extra_trees=True,
    learning_rate=0.05,
    reg_alpha=0.1,
    reg_lambda=10,
    max_depth=20,
    metric="auc",
    n_estimators=2000,
    num_leaves=64,
    objective="binary",
    random_state=42,
    verbose=-1,
)

# Second LightGBM parameter set (used on odd-numbered folds of the CV loop).
# It shares every setting with `lgb_params` except the three overridden here;
# overriding existing keys keeps the original key order.
lgb_params_2 = {
    **lgb_params,
    "learning_rate": 0.03,
    "max_depth": 16,
    "num_leaves": 72,
}

In [11]:
%%time

fitted_models_cat = []
fitted_models_lgb = []

cv_scores_cat = []
cv_scores_lgb = []

iterator = 0
for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    #######
    # cat #
    #######
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

    clf = CatBoostClassifier(**catboost_params)

    clf.fit(train_pool, eval_set=val_pool, verbose=300)
    y_pred_valid = clf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)

    fitted_models_cat.append(clf)
    cv_scores_cat.append(auc_score)

    #######
    # lgb #
    #######
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")

    if iterator % 2 == 0:
        model = lgb.LGBMClassifier(**lgb_params)
    else:
        model = lgb.LGBMClassifier(**lgb_params_2)

    model.fit(
        X_train,
        y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)],
    )

    y_pred_valid = model.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)

    fitted_models_lgb.append(model)
    cv_scores_lgb.append(auc_score)

    iterator += 1

print("CV AUC scores for CatBoost: ", cv_scores_cat)
print("Maximum CV AUC score for CatBoost: ", max(cv_scores_cat))

print("CV AUC scores for LGBM: ", cv_scores_lgb)
print("Maximum CV AUC score for LGBM: ", max(cv_scores_lgb))

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5878539	best: 0.5878539 (0)	total: 28.9ms	remaining: 17.3s
300:	test: 0.8055062	best: 0.8055062 (300)	total: 7.51s	remaining: 7.46s
599:	test: 0.8146160	best: 0.8148476 (590)	total: 15.1s	remaining: 0us
bestTest = 0.8148475885
bestIteration = 590


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.83725
Early stopping, best iteration is:
[196]	valid_0's auc: 0.837452


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5250000	best: 0.5250000 (0)	total: 25.3ms	remaining: 15.1s
300:	test: 0.7815741	best: 0.7818953 (295)	total: 7.42s	remaining: 7.38s
599:	test: 0.7906193	best: 0.7907371 (585)	total: 14.8s	remaining: 0us
bestTest = 0.7907370925
bestIteration = 585


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.803611
[400]	valid_0's auc: 0.806993
Early stopping, best iteration is:
[388]	valid_0's auc: 0.807278


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6280893	best: 0.6280893 (0)	total: 25.9ms	remaining: 15.5s
300:	test: 0.8065309	best: 0.8065309 (300)	total: 7.63s	remaining: 7.58s
599:	test: 0.8181958	best: 0.8181958 (599)	total: 15.1s	remaining: 0us
bestTest = 0.8181958199
bestIteration = 599


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.830071
Early stopping, best iteration is:
[229]	valid_0's auc: 0.831172


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6176998	best: 0.6176998 (0)	total: 25.9ms	remaining: 15.5s
300:	test: 0.8080177	best: 0.8080177 (300)	total: 7.51s	remaining: 7.46s
599:	test: 0.8130322	best: 0.8131912 (575)	total: 15.1s	remaining: 0us
bestTest = 0.8131911755
bestIteration = 575


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.828785
[400]	valid_0's auc: 0.832085
Early stopping, best iteration is:
[442]	valid_0's auc: 0.832515


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5794048	best: 0.5794048 (0)	total: 25.7ms	remaining: 15.4s
300:	test: 0.8084282	best: 0.8085112 (295)	total: 7.56s	remaining: 7.51s
599:	test: 0.8167679	best: 0.8167917 (590)	total: 15.1s	remaining: 0us
bestTest = 0.8167917132
bestIteration = 590


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.816645
Early stopping, best iteration is:
[231]	valid_0's auc: 0.819727
CV AUC scores for CatBoost:  [0.8146160330194163, 0.7906193299116822, 0.81819608102194, 0.8130323551258799, 0.8167682781575546]
Maximum CV AUC score for CatBoost:  0.81819608102194
CV AUC scores for LGBM:  [0.8374520404604116, 0.8072776047613028, 0.8311722110588677, 0.8325152801710323, 0.8197270864855896]
Maximum CV AUC score for LGBM:  0.8374520404604116
CPU times: user 17min 41s, sys: 6.8 s, total: 17min 48s
Wall time: 2min 3s


### Ensemble

In [12]:
from src.models.voting_model import VotingModel

# Hand every fold's CatBoost model, followed by every fold's LightGBM model,
# to a single VotingModel.
model = VotingModel([*fitted_models_cat, *fitted_models_lgb])

In [13]:
# Serialize the VotingModel to voting_model.pkl in the current working directory.
with Path("voting_model.pkl").open("wb") as model_file:
    pickle.dump(model, model_file)