## Baseline - Train

https://www.kaggle.com/code/pxcai666/catboost-lightgbm-ensemble/notebook

### Configuration

In [1]:
# Dynamic module import: re-import modules automatically so that edits to the imported
# source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import pickle

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import xgboost as xgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer



In [3]:
# Root folder of the competition dataset as mounted on Kaggle.
ROOT = "/kaggle/input/home-credit-credit-risk-model-stability"

# Sub-path (relative to ROOT) of the folder containing the training parquet files.
_TRAIN_SUBPATH = ("parquet_files", "train")
TRAIN_DIR = os.path.join(ROOT, *_TRAIN_SUBPATH)

In [4]:
# Dry-run switch. When True, the training-setup cell switches `device` to "cpu",
# keeps only the first 50,000 rows of the training frame and lowers `iterations`
# from 6000 to 600.
DRY_RUN = True

### Import utility classes

In [5]:
from src.utils.utility import Utility
from src.utils.schema_gen import SchemaGen
from src.utils.pipeline import Pipeline
from src.utils.aggregator import Aggregator

### Read train data

In [6]:
%%time

data_store: dict = {
    "df_base": SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_base.parquet")),
    "depth_0": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_cb_0.parquet")),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_0_*.parquet")),
    ],
    "depth_1": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_applprev_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_a_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_c_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_other_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_person_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_deposit_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_debitcard_1.parquet"), 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_2_*.parquet"), 2),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_2.parquet"), 2),
    ],
}

train_df: pl.LazyFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.filter_cols)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .pipe(Utility.reduce_memory_usage, "train_df")
)

del data_store
gc.collect()

print(f"Train data shape: {train_df.shape}")
display(train_df.head(10))

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_static_0_0 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_8 loaded into memory.
File train_credit_bureau_a_2_7 loaded into memory.
File train_credit_bureau_a_2_5 loaded into memory.
File train_credit_bureau_a_2_0 loade

case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,…,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,last_collater_typofvalofguarant_298M,last_collater_typofvalofguarant_407M,last_collaterals_typeofguarante_359M,last_collaterals_typeofguarante_669M,last_num_group1_12,last_num_group2,last_pmts_month_158T,last_pmts_month_706T,last_pmts_year_1139T,last_pmts_year_507T,last_subjectroles_name_541M,last_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
u32,u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,…,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,str,str,str,str,u16,u8,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
125713,201904,16,0,-4651.0,,-24104.0,,-24104.0,0.0,1.0,0.0,1.0,0.0,"""a55475b1""","""717ddd49""","""a55475b1""",0.0,1.0,"""3439d993""","""a55475b1""",1.0,7348.133789,,,6.0,,,,14.0,,,0.0,2.0,0.0,151786.734375,2663.800049,…,,"""a55475b1""","""c7a5ad39""",0.0,23.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",0.0,23.0,1.0,,2020.0,,"""a55475b1""","""a55475b1""",0.0,,0.0,,0.0,,0.0,,2019,29
1632875,201911,45,0,,,,,-17608.0,1.0,1.0,0.0,3.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,1.0,"""3439d993""","""a55475b1""",3.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,3.0,0.0,81513.265625,3952.800049,…,80249.0,"""c7a5ad39""","""c7a5ad39""",10.0,35.0,0.0,791.0,12.0,12.0,0.0,10240.400391,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",10.0,11.0,,1.0,,2020.0,"""a55475b1""","""a55475b1""",0.0,86.554619,0.0,1955.092773,0.0,38832.707031,0.0,15152669.0,2019,16
750568,201907,29,0,,,-12502.0,,-12502.0,0.0,1.0,0.0,3.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",3.0,2.0,"""3439d993""","""a55475b1""",3.0,,,,,0.0,0.0,,14.0,,,1.0,0.0,0.0,5538.399902,1697.0,…,0.0,"""c7a5ad39""","""c7a5ad39""",3.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",3.0,11.0,,1.0,,2019.0,"""a55475b1""","""a55475b1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,24
599828,201901,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1497.599976,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,2
615694,201901,2,0,,,-11648.0,,-11648.0,3.0,3.0,2.0,3.0,2.0,"""a55475b1""","""a55475b1""","""a55475b1""",3.0,4.0,"""a55475b1""","""a55475b1""",3.0,,,,,6.0,13539.637695,,14.0,,,1.0,2.0,,,1258.599976,…,,"""a55475b1""","""c7a5ad39""",1.0,35.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",1.0,23.0,1.0,,2020.0,,"""a55475b1""","""a55475b1""",0.0,,0.0,,0.0,,0.0,,2019,20
1374777,201905,20,0,,,-12287.0,,,,,,,,"""a55475b1""","""a55475b1""","""a55475b1""",,,"""a55475b1""","""a55475b1""",,,,,,3.0,9078.286133,,14.0,,,,,0.0,0.0,1136.200073,…,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,23
1707450,201912,51,0,,,,,-17343.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,0.0,"""a55475b1""","""a55475b1""",0.0,,,,,,,"""DEDUCTION_6""",,14.0,,0.0,0.0,0.0,45359.800781,6385.0,…,0.0,"""c7a5ad39""","""a55475b1""",2.0,35.0,,23.0,,12.0,,10621.799805,,2020.0,"""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2.0,23.0,,1.0,,2020.0,"""a55475b1""","""a55475b1""",,1.02,,916.372375,,16.060816,,6032121.0,2019,25
784384,201908,34,0,-4516.0,,-25717.0,,-25717.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",0.0,0.0,"""3439d993""","""a55475b1""",0.0,22808.800781,,,7.0,,,,14.0,,,0.0,0.0,,,2199.800049,…,50920.0,"""a55475b1""","""a55475b1""",1.0,23.0,,0.0,,12.0,,0.0,,2015.0,"""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",1.0,11.0,,1.0,,2015.0,"""a55475b1""","""a55475b1""",,0.0,,0.0,,0.0,,0.0,2019,29
1472822,201908,30,0,-2488.0,,-23681.0,,-23681.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",0.0,1.0,"""ecd83604""","""a55475b1""",0.0,16153.400391,,,6.0,,,,14.0,,,0.0,0.0,0.0,0.0,3474.400146,…,0.0,"""c7a5ad39""","""c7a5ad39""",1.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2015.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",1.0,23.0,,1.0,,2015.0,"""a55475b1""","""a55475b1""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,2
1787673,202002,58,0,,,,,-18127.0,8.0,8.0,4.0,10.0,5.0,"""a55475b1""","""39a0853f""","""a55475b1""",4.0,6.0,"""3439d993""","""a55475b1""",10.0,,,,,,,"""DEDUCTION_6""",,14.0,,1.0,1.0,0.0,1648.0,3059.800049,…,0.0,"""c7a5ad39""","""c7a5ad39""",2.0,35.0,3.0,647.0,12.0,12.0,9084.064453,48301.238281,2021.0,2018.0,"""ab3c25cf""","""ab3c25cf""","""a55475b1""","""a55475b1""","""a55475b1""","""a55475b1""",2.0,35.0,,1.0,,2018.0,"""a55475b1""","""a55475b1""",0.454545,151.419357,1645.241943,15742.929688,1.072727,41083.917969,13399012.0,372559232.0,2020,16


CPU times: user 3min 10s, sys: 37.9 s, total: 3min 48s
Wall time: 22.8 s


In [7]:
# Convert the polars frame via the project helper. `train_df` is rebound to the helper's
# first return value and `cat_cols` to the second; `cat_cols` is later passed to CatBoost
# as `cat_features` and used to cast columns to the "category" dtype.
# (Presumably the first value is a pandas DataFrame -- `.iloc` / `.drop(columns=...)` are
# used on it afterwards.)
train_df, cat_cols = Utility.to_pandas(train_df)

### Train

In [8]:
# Device and boosting-iteration budget: full-run values unless DRY_RUN is enabled.
device: str = "cpu" if DRY_RUN else "gpu"
iterations: int = 600 if DRY_RUN else 6000

if DRY_RUN:
    # A dry run only trains on the first 50,000 rows.
    train_df = train_df.iloc[:50000]

In [9]:
# Target and CV grouping key are taken from the frame; the feature matrix is the frame
# without the target, the case identifier and the week number.
y = train_df["target"]
weeks = train_df["week_num"]
X = train_df.drop(["target", "case_id", "week_num"], axis="columns")

# The combined frame is not needed past this point.
del train_df
gc.collect()

# 5-fold stratified split; the training loop passes `weeks` as the group labels.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

In [10]:
# CatBoost hyperparameters.
# NOTE(review): "task_type" is fixed to "GPU" and does not follow `device`, so a dry run
# (device == "cpu") still trains CatBoost on the GPU -- confirm this is intended.
# NOTE(review): "best_model_min_trees" (1200) exceeds the 600 dry-run iterations --
# confirm "use_best_model" still behaves as wanted in that mode.
catboost_params = dict(
    best_model_min_trees=1200,
    boosting_type="Plain",
    eval_metric="AUC",
    iterations=iterations,
    learning_rate=0.05,
    l2_leaf_reg=10,
    max_leaves=64,
    random_seed=42,
    task_type="GPU",
    use_best_model=True,
)

# LightGBM hyperparameters, first variant.
lgb_params = dict(
    boosting_type="gbdt",
    colsample_bytree=0.8,
    colsample_bynode=0.8,
    device=device,
    extra_trees=True,
    learning_rate=0.05,
    reg_alpha=0.1,
    reg_lambda=10,
    max_depth=20,
    metric="auc",
    n_estimators=2000,
    num_leaves=64,
    objective="binary",
    random_state=42,
    verbose=-1,
)

# LightGBM hyperparameters, second variant: identical to the first except for a lower
# learning rate, a smaller maximum depth and more leaves.
lgb_params_2 = {
    **lgb_params,
    "learning_rate": 0.03,
    "max_depth": 16,
    "num_leaves": 72,
}

# XGBoost hyperparameters; shares the `iterations` budget with CatBoost and stops early
# after 100 rounds.
xgb_params = dict(
    objective="binary:logistic",
    eval_metric="auc",
    learning_rate=0.05,
    max_depth=6,
    n_estimators=iterations,
    enable_categorical=True,
    tree_method="hist",
    early_stopping_rounds=100,
)

In [11]:
%%time

fitted_models_cat = []
fitted_models_lgb = []
fitted_models_xgb = []

cv_scores_cat = []
cv_scores_lgb = []
cv_scores_xgb = []

iterator = 0
for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    #######
    # cat #
    #######
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

    clf = CatBoostClassifier(**catboost_params)

    clf.fit(train_pool, eval_set=val_pool, verbose=300)
    y_pred_valid = clf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)

    fitted_models_cat.append(clf)
    cv_scores_cat.append(auc_score)

    #######
    # lgb #
    #######
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")

    if iterator % 2 == 0:
        model = lgb.LGBMClassifier(**lgb_params)
    else:
        model = lgb.LGBMClassifier(**lgb_params_2)

    model.fit(
        X_train,
        y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)],
    )

    y_pred_valid = model.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)

    fitted_models_lgb.append(model)
    cv_scores_lgb.append(auc_score)

    #######
    # xgb #
    #######
    model = xgb.XGBClassifier(**xgb_params)

    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        verbose=200,
    )

    y_pred_valid = model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)

    fitted_models_xgb.append(model)
    cv_scores_xgb.append(auc_score)

    iterator += 1

print("CV AUC scores for CatBoost: ", cv_scores_cat)
print("Maximum CV AUC score for CatBoost: ", max(cv_scores_cat))

print("CV AUC scores for LGBM: ", cv_scores_lgb)
print("Maximum CV AUC score for LGBM: ", max(cv_scores_lgb))

print("CV AUC scores for XGB: ", cv_scores_xgb)
print("Maximum CV AUC score for XGB: ", max(cv_scores_xgb))

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5792489	best: 0.5792489 (0)	total: 28.9ms	remaining: 17.3s
300:	test: 0.7982118	best: 0.7982118 (300)	total: 7.4s	remaining: 7.35s
599:	test: 0.8002114	best: 0.8002114 (599)	total: 14.8s	remaining: 0us
bestTest = 0.80021137
bestIteration = 599


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.821393
Early stopping, best iteration is:
[174]	valid_0's auc: 0.821741
[0]	validation_0-auc:0.69459
[200]	validation_0-auc:0.80346
[275]	validation_0-auc:0.79606


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6592906	best: 0.6592906 (0)	total: 25.6ms	remaining: 15.3s
300:	test: 0.8254297	best: 0.8254297 (300)	total: 7.38s	remaining: 7.33s
599:	test: 0.8306101	best: 0.8306613 (595)	total: 14.9s	remaining: 0us
bestTest = 0.8306613266
bestIteration = 595


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.828704
Early stopping, best iteration is:
[230]	valid_0's auc: 0.829777
[0]	validation_0-auc:0.72885
[200]	validation_0-auc:0.81393
[239]	validation_0-auc:0.81122


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5970322	best: 0.5970322 (0)	total: 25.5ms	remaining: 15.3s
300:	test: 0.7833580	best: 0.7834563 (295)	total: 7.44s	remaining: 7.39s
599:	test: 0.7970613	best: 0.7970613 (599)	total: 14.9s	remaining: 0us
bestTest = 0.7970613241
bestIteration = 599


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.810414
Early stopping, best iteration is:
[146]	valid_0's auc: 0.811443
[0]	validation_0-auc:0.68811
[200]	validation_0-auc:0.79126
[268]	validation_0-auc:0.78853


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.5963892	best: 0.5963892 (0)	total: 25.3ms	remaining: 15.2s
300:	test: 0.8061437	best: 0.8061437 (300)	total: 7.46s	remaining: 7.41s
599:	test: 0.8157944	best: 0.8157944 (599)	total: 14.9s	remaining: 0us
bestTest = 0.8157944083
bestIteration = 599


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.828813
[400]	valid_0's auc: 0.831974
Early stopping, best iteration is:
[367]	valid_0's auc: 0.832375
[0]	validation_0-auc:0.71373
[200]	validation_0-auc:0.80395
[341]	validation_0-auc:0.80490


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6246235	best: 0.6246235 (0)	total: 25.2ms	remaining: 15.1s
300:	test: 0.8183791	best: 0.8183791 (300)	total: 7.37s	remaining: 7.32s
599:	test: 0.8281862	best: 0.8281862 (599)	total: 14.8s	remaining: 0us
bestTest = 0.8281861544
bestIteration = 599


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.841322
Early stopping, best iteration is:
[170]	valid_0's auc: 0.843204
[0]	validation_0-auc:0.45030
[200]	validation_0-auc:0.80350
[231]	validation_0-auc:0.80234
CV AUC scores for CatBoost:  [0.8002113708912086, 0.8306101058094949, 0.797061204798253, 0.8157947883590198, 0.8281865431103949]
Maximum CV AUC score for CatBoost:  0.8306101058094949
CV AUC scores for LGBM:  [0.8217412935323383, 0.8297770681531076, 0.8114433893146618, 0.8323751565171731, 0.8432042707493956]
Maximum CV AUC score for LGBM:  0.8432042707493956
CV AUC scores for XGB:  [0.8059505938515515, 0.8157876919543303, 0.7942771505593643, 0.8069164469343894, 0.8116128122481869]
Maximum CV AUC score for XGB:  0.8157876919543303
CPU times: user 20h 15min 5s, sys: 20.2 s, total: 20h 15min 25s
Wall time: 48min 51s


### Ensemble

In [12]:
from src.models.voting_model import VotingModel

# Wrap every fitted fold model in a single VotingModel, ordered CatBoost, then LightGBM,
# then XGBoost.
model = VotingModel([*fitted_models_cat, *fitted_models_lgb, *fitted_models_xgb])

In [13]:
# Serialise the ensemble to 'voting_model.pkl' in the current working directory.
with Path('voting_model.pkl').open('wb') as model_file:
    pickle.dump(model, model_file)