## Baseline - Train

https://www.kaggle.com/code/pxcai666/catboost-lightgbm-ensemble/notebook

### Configuration

In [4]:
# モジュールの動的import(import先のファイルが更新されたときに追従する)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import pickle

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import xgboost as xgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer

In [6]:
ROOT = "/kaggle/input/home-credit-credit-risk-model-stability"

TRAIN_DIR = os.path.join(ROOT, "parquet_files", "train")

In [7]:
DRY_RUN = False

### Def utility classes

In [8]:
from src.utils.utility import Utility
from src.utils.schema_gen import SchemaGen
from src.utils.pipeline import Pipeline
from src.utils.aggregator import Aggregator

### Read train data

In [9]:
%%time

data_store: dict = {
    "df_base": SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_base.parquet")),
    "depth_0": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_cb_0.parquet")),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_0_*.parquet")),
    ],
    "depth_1": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_applprev_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_a_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_c_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_other_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_person_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_deposit_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_debitcard_1.parquet"), 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_2_*.parquet"), 2),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_2.parquet"), 2),
    ],
}

train_df: pl.LazyFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.filter_cols)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .pipe(Utility.reduce_memory_usage, "train_df")
)

del data_store
gc.collect()

print(f"Train data shape: {train_df.shape}")
display(train_df.head(10))

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_static_0_0 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_8 loaded into memory.
File train_credit_bureau_a_2_7 loaded into memory.
File train_credit_bureau_a_2_5 loaded into memory.
File train_credit_bureau_a_2_0 loade

case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,…,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,last_collater_typofvalofguarant_298M,last_collater_typofvalofguarant_407M,last_collaterals_typeofguarante_359M,last_collaterals_typeofguarante_669M,last_num_group1_12,last_num_group2,last_pmts_month_158T,last_pmts_month_706T,last_pmts_year_1139T,last_pmts_year_507T,last_subjectroles_name_541M,last_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
u32,u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,…,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,str,str,str,str,u16,u8,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
1731706,202001,52,0,,,,,-18995.0,2.0,3.0,1.0,4.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",1.0,2.0,"""3439d993""","""a55475b1""",4.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,2.0,0.0,43146.601562,7059.800293,…,0.0,"""c7a5ad39""","""c7a5ad39""",3.0,23.0,35.0,0.0,12.0,12.0,955.113953,0.0,2020.0,2018.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",3.0,11.0,1.0,,2020.0,,"""a55475b1""","""a55475b1""",0.825,0.0,25.591074,0.0,24.855062,0.0,21808.175781,0.0,2020,3
1765394,202001,56,0,,,,,-18169.0,1.0,6.0,0.0,8.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",2.0,2.0,"""3439d993""","""a55475b1""",8.0,,,,,,,"""DEDUCTION_6""",,14.0,,3.0,8.0,0.0,26900.0,7781.399902,…,60000.0,"""c7a5ad39""","""a55475b1""",18.0,35.0,,1053.0,,12.0,,42079.800781,,2020.0,"""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",18.0,11.0,,1.0,,2020.0,"""a55475b1""","""a55475b1""",,82.89035,,4450.711426,,54499.941406,,100171416.0,2020,28
1779158,202002,57,0,,,,,-12426.0,4.0,5.0,1.0,10.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",13.0,8.0,"""a7fcb6e5""","""a55475b1""",10.0,,,,,,,"""DEDUCTION_6""",,14.0,,2.0,11.0,0.0,31871.037109,1672.400024,…,0.0,"""c7a5ad39""","""c7a5ad39""",10.0,35.0,3593.0,689.0,12.0,12.0,47793.402344,114547.679688,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",10.0,11.0,,1.0,,2019.0,"""a55475b1""","""a55475b1""",225.151901,102.307083,3024.898926,10223.30957,759948.0625,30409.230469,137156352.0,590612928.0,2020,9
1848755,202005,73,0,,,,,-10405.0,1.0,1.0,0.0,3.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",4.0,10.0,"""3439d993""","""a55475b1""",3.0,,,,,,,,,,14.0,3.0,3.0,0.0,11265.600586,2652.0,…,0.0,"""c7a5ad39""","""c7a5ad39""",10.0,35.0,21.0,24.0,12.0,12.0,17673.6875,6297.0,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",10.0,11.0,,1.0,,2020.0,"""a55475b1""","""a55475b1""",1.16129,0.603175,1536.624023,128.440506,17.206451,8.86527,23382714.0,588557.75,2020,27
1812840,202003,62,0,,,,,-11911.0,1.0,1.0,0.0,4.0,0.0,"""a55475b1""","""39a0853f""","""a55475b1""",4.0,6.0,"""a7fcb6e5""","""a55475b1""",4.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,3.0,0.0,0.0,6754.600098,…,0.0,"""c7a5ad39""","""c7a5ad39""",16.0,35.0,0.0,11.0,12.0,12.0,0.0,14179.580078,2021.0,2020.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",16.0,11.0,,1.0,,2020.0,"""a55475b1""","""a55475b1""",0.0,0.182156,0.0,360.772827,0.0,1.164456,0.0,3649100.0,2020,11
1278161,201902,5,0,,,-17541.0,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,2.0,1653.0,,14.0,,,,,0.0,,1115.599976,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,10
1903543,202008,86,0,,,,0.0,-17403.0,0.0,0.0,0.0,1.0,0.0,"""2fc785b2""","""6b2ae0fa""","""a55475b1""",3.0,6.0,"""a7fcb6e5""","""a55475b1""",1.0,,,,,,,,,,14.0,1.0,4.0,0.0,44833.667969,8521.600586,…,108022.0,"""c7a5ad39""","""c7a5ad39""",6.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",6.0,23.0,,1.0,,2021.0,"""a55475b1""","""a55475b1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2020,25
2539158,201902,4,0,,,-17809.0,,-17809.0,3.0,3.0,1.0,4.0,3.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,2.0,"""a55475b1""","""a55475b1""",4.0,,,,,7.0,9010.720703,,14.0,,,1.0,0.0,0.0,,9253.0,…,,"""a55475b1""","""c7a5ad39""",1.0,23.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",1.0,11.0,1.0,,2020.0,,"""a55475b1""","""a55475b1""",0.0,,0.0,,0.0,,0.0,,2019,2
2604700,201908,30,0,,,-19756.0,,-19756.0,2.0,5.0,0.0,7.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",8.0,2.0,"""3439d993""","""a55475b1""",7.0,,,,,6.0,13230.963867,,14.0,,,3.0,3.0,0.0,54934.707031,7356.200195,…,0.0,"""c7a5ad39""","""c7a5ad39""",12.0,35.0,0.0,9.0,12.0,12.0,0.0,4065.582031,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",12.0,11.0,,1.0,,2019.0,"""a55475b1""","""a55475b1""",0.0,0.126374,0.0,46.617485,0.0,0.983941,0.0,143078.4375,2019,3
110246,201902,5,0,,,,,-13404.0,1.0,1.0,0.0,1.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,1.0,"""a55475b1""","""a55475b1""",1.0,,,,,,,,,,,0.0,0.0,0.0,,1218.0,…,,"""a55475b1""","""c7a5ad39""",0.0,23.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",0.0,23.0,1.0,,2020.0,,"""a55475b1""","""a55475b1""",0.0,,0.0,,0.0,,0.0,,2019,11


CPU times: user 3min 7s, sys: 35.3 s, total: 3min 42s
Wall time: 21.4 s


In [10]:
train_df, cat_cols = Utility.to_pandas(train_df)

### Train

In [11]:
device: str = "gpu"
iterations: int = 6000

if DRY_RUN:
    train_df = train_df.iloc[:50000]
    iterations: int = 600

In [12]:
X = train_df.drop(columns=["target", "case_id", "week_num"])
y = train_df["target"]
weeks = train_df["week_num"]

del train_df
gc.collect()

cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
catboost_params = {
    "best_model_min_trees": 1200,
    "boosting_type": "Plain",
    "eval_metric": "AUC",
    "iterations": iterations,
    "learning_rate": 0.05,
    "l2_leaf_reg": 10,
    "max_leaves": 64,
    "random_seed": 42,
    "task_type": "GPU",
    "use_best_model": True,
}

lgb_params = {
    "boosting_type": "gbdt",
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "device": device,
    "extra_trees":True,
    "learning_rate": 0.05,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "max_depth": 20,
    "metric": "auc",
    "n_estimators": 2000,
    "num_leaves": 64,
    "objective": "binary",
    "random_state": 42,
    "verbose": -1,
}

lgb_params_2 = {
    "boosting_type": "gbdt",
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "device": device,
    "extra_trees":True,
    "learning_rate": 0.03,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "max_depth": 16,
    "metric": "auc",
    "n_estimators": 2000,
    "num_leaves": 72,
    "objective": "binary",
    "random_state": 42,
    "verbose": -1,
}

xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "learning_rate": 0.05,
    "max_depth": 6,
    "n_estimators": 512,
    # "device": "cuda",
    "enable_categorical": True,
    "tree_method": 'hist',
    "early_stopping_rounds": 100,
}

In [16]:
%%time

fitted_models_cat = []
fitted_models_lgb = []
fitted_models_xgb = []

cv_scores_cat = []
cv_scores_lgb = []
cv_scores_xgb = []

iterator = 0
for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    #######
    # cat #
    #######
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

    clf = CatBoostClassifier(**catboost_params)

    clf.fit(train_pool, eval_set=val_pool, verbose=300)
    y_pred_valid = clf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)

    fitted_models_cat.append(clf)
    cv_scores_cat.append(auc_score)

    #######
    # lgb #
    #######
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")

    if iterator % 2 == 0:
        model = lgb.LGBMClassifier(**lgb_params)
    else:
        model = lgb.LGBMClassifier(**lgb_params_2)

    model.fit(
        X_train,
        y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.early_stopping(100)],
    )

    y_pred_valid = model.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)

    fitted_models_lgb.append(model)
    cv_scores_lgb.append(auc_score)

    # 時間がかかりすぎるのでコメントアウト
    # #######
    # # xgb #
    # #######
    # model = xgb.XGBClassifier(**xgb_params)

    # model.fit(
    #     X_train,
    #     y_train,
    #     eval_set=[(X_valid, y_valid)],
    #     verbose=200,
    # )

    # y_pred_valid = model.predict_proba(X_valid)[:, 1]
    # auc_score = roc_auc_score(y_valid, y_pred_valid)

    # fitted_models_xgb.append(model)
    # cv_scores_xgb.append(auc_score)

print("CV AUC scores for CatBoost: ", cv_scores_cat)
print("Maximum CV AUC score for CatBoost: ", max(cv_scores_cat))

print("CV AUC scores for LGBM: ", cv_scores_lgb)
print("Maximum CV AUC score for LGBM: ", max(cv_scores_lgb))

# print("CV AUC scores for XGB: ", cv_scores_xgb)
# print("Maximum CV AUC score for XGB: ", max(cv_scores_xgb))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[0]	validation_0-auc:0.74097
[200]	validation_0-auc:0.85064


KeyboardInterrupt: 

### Ensemble

In [None]:
from src.models.voting_model import VotingModel

model = VotingModel(fitted_models_cat+fitted_models_lgb+fitted_models_xgb)

In [None]:
with open('voting_model.pkl', 'wb') as f:
    pickle.dump(model, f)