## Baseline - Train

Reference notebook: https://www.kaggle.com/code/pxcai666/catboost-lightgbm-ensemble/notebook

### Configuration

In [1]:
# Dynamically re-import modules (follows updates made to the imported files)
%load_ext autoreload
%autoreload 2

In [2]:
import sys
from pathlib import Path
import subprocess
import os
import gc
from glob import glob
import pickle

import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit, GroupKFold, StratifiedGroupKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
import lightgbm as lgb

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer



In [3]:
# Root directory of the competition data (an absolute Kaggle input path).
ROOT = "/kaggle/input/home-credit-credit-risk-model-stability"

# Directory that holds the training parquet files read by the load cell.
TRAIN_DIR = os.path.join(ROOT, "parquet_files", "train")

In [4]:
# When True, the training section switches to a reduced run: `device` becomes
# "cpu", only the first 50,000 rows are kept, and `est_cnt` drops from 6000 to 600.
DRY_RUN = False

### Import utility classes

In [5]:
from src.utils.utility import Utility
from src.utils.schema_gen import SchemaGen
from src.utils.pipeline import Pipeline
from src.utils.aggregator import Aggregator

### Read train data

In [6]:
%%time

data_store: dict = {
    "df_base": SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_base.parquet")),
    "depth_0": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_cb_0.parquet")),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_static_0_*.parquet")),
    ],
    "depth_1": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_applprev_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_a_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_tax_registry_c_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_1_*.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_other_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_person_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_deposit_1.parquet"), 1),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_debitcard_1.parquet"), 1),
    ],
    "depth_2": [
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_a_2_*.parquet"), 2),
        SchemaGen.scan_files(os.path.join(TRAIN_DIR, "train_credit_bureau_b_2.parquet"), 2),
    ],
}

train_df: pl.LazyFrame = (
    SchemaGen.join_dataframes(**data_store)
    .pipe(Pipeline.filter_cols)
    .pipe(Pipeline.transform_cols)
    .pipe(Pipeline.handle_dates)
    .pipe(Utility.reduce_memory_usage, "train_df")
)

del data_store
gc.collect()

print(f"Train data shape: {train_df.shape}")
display(train_df.head(10))

File train_base loaded into memory.
File train_static_cb_0 loaded into memory.
File train_static_0_1 loaded into memory.
File train_static_0_0 loaded into memory.
File train_applprev_1_0 loaded into memory.
File train_applprev_1_1 loaded into memory.
File train_tax_registry_a_1 loaded into memory.
File train_tax_registry_b_1 loaded into memory.
File train_tax_registry_c_1 loaded into memory.
File train_credit_bureau_a_1_3 loaded into memory.
File train_credit_bureau_a_1_1 loaded into memory.
File train_credit_bureau_a_1_0 loaded into memory.
File train_credit_bureau_a_1_2 loaded into memory.
File train_credit_bureau_b_1 loaded into memory.
File train_other_1 loaded into memory.
File train_person_1 loaded into memory.
File train_deposit_1 loaded into memory.
File train_debitcard_1 loaded into memory.
File train_credit_bureau_a_2_8 loaded into memory.
File train_credit_bureau_a_2_7 loaded into memory.
File train_credit_bureau_a_2_5 loaded into memory.
File train_credit_bureau_a_2_0 loade

case_id,month,week_num,target,assignmentdate_238D,assignmentdate_4527235D,birthdate_574D,contractssum_5085716L,dateofbirth_337D,days120_123L,days180_256L,days30_165L,days360_512L,days90_310L,description_5085714M,education_1103M,education_88M,firstquarter_103L,fourthquarter_440L,maritalst_385M,maritalst_893M,numberofqueries_373L,pmtaverage_3A,pmtaverage_4527227A,pmtcount_4527229L,pmtcount_693L,pmtscount_423L,pmtssum_45A,requesttype_4525192L,responsedate_1012D,responsedate_4527233D,responsedate_4917613D,secondquarter_766L,thirdquarter_1082L,actualdpdtolerance_344P,amtinstpaidbefduel24m_4187115A,annuity_780A,…,mean_mainoccupationinc_384A,max_amount_416A,max_num_group1_10,max_openingdate_313D,mean_amount_416A,mean_openingdate_313D,max_num_group1_11,max_openingdate_857D,mean_openingdate_857D,max_collater_typofvalofguarant_298M,max_collater_typofvalofguarant_407M,max_collater_valueofguarantee_1124L,max_collater_valueofguarantee_876L,max_collaterals_typeofguarante_359M,max_collaterals_typeofguarante_669M,max_num_group1_12,max_num_group2,max_pmts_dpd_1073P,max_pmts_dpd_303P,max_pmts_month_158T,max_pmts_month_706T,max_pmts_overdue_1140A,max_pmts_overdue_1152A,max_pmts_year_1139T,max_pmts_year_507T,max_subjectroles_name_541M,max_subjectroles_name_838M,mean_pmts_dpd_1073P,mean_pmts_dpd_303P,mean_pmts_overdue_1140A,mean_pmts_overdue_1152A,var_pmts_dpd_1073P,var_pmts_dpd_303P,var_pmts_overdue_1140A,var_pmts_overdue_1152A,year,day
u32,u32,u8,u8,i16,u8,i16,f32,i32,f32,f32,f32,f32,f32,str,str,str,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,str,i8,u8,i8,f32,f32,f32,f32,f32,…,f32,f32,u8,i16,f32,i16,u8,i16,i16,str,str,f32,f32,str,str,u16,u8,f32,f32,f32,f32,f32,f32,f32,f32,str,str,f32,f32,f32,f32,f32,f32,f32,f32,u16,u8
103291,201901,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,4929.399902,…,42000.0,203.518005,0.0,-1176.0,203.518005,-1176.0,0.0,-1176.0,-1176.0,,,,,,,,,,,,,,,,,,,,,,,,,,,2019,4
136598,201906,24,0,-2202.0,,-23393.0,,-23393.0,1.0,1.0,0.0,3.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",4.0,3.0,"""3439d993""","""46b968c3""",3.0,7222.200195,,,6.0,,,,14.0,,,3.0,0.0,0.0,162879.0,1808.200073,…,40000.0,204.03801,0.0,-1449.0,204.03801,-1449.0,0.0,-1449.0,-1449.0,"""a55475b1""","""a55475b1""",0.0,100000.0,"""c7a5ad39""","""c7a5ad39""",10.0,35.0,0.0,29.0,12.0,12.0,0.0,6849.600098,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.511765,0.0,177.059875,0.0,8.073825,0.0,673480.3125,2019,18
25700,201908,31,0,-2589.0,,-25634.0,,-25634.0,0.0,0.0,0.0,0.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",2.0,3.0,"""a55475b1""","""a55475b1""",0.0,18237.201172,,,6.0,,,,14.0,,,1.0,1.0,,,5344.600098,…,76000.0,,,,,,,,,"""a55475b1""","""a55475b1""",9023000.0,100000.0,"""c7a5ad39""","""a55475b1""",4.0,35.0,1.0,333.0,12.0,12.0,18062.712891,9214.400391,2020.0,2018.0,"""ab3c25cf""","""daf49a8a""",0.041667,20.054054,752.613037,1626.900024,0.041667,5557.695801,13594233.0,8351803.5,2019,7
114706,201903,9,0,-3035.0,,-24230.0,,-24230.0,2.0,3.0,0.0,4.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",1.0,2.0,"""b6cabe76""","""a55475b1""",4.0,17400.5,,,6.0,,,,14.0,,,2.0,1.0,0.0,,6169.800293,…,60000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",1.0,23.0,0.0,,12.0,,0.0,,2020.0,,"""a55475b1""","""ab3c25cf""",0.0,,0.0,,0.0,,0.0,,2019,5
655664,201903,11,0,-250.0,,-23275.0,,-23275.0,1.0,2.0,1.0,2.0,1.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,1.0,"""a55475b1""","""a55475b1""",2.0,5960.800293,,,6.0,,,,14.0,,,0.0,0.0,,,1650.200073,…,30000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",0.0,23.0,3.0,,12.0,,51.897999,,2020.0,,"""a55475b1""","""ab3c25cf""",0.6,,10.3796,,1.8,,538.680481,,2019,22
729992,201907,25,0,,,-15156.0,,-15156.0,1.0,1.0,0.0,5.0,0.0,"""a55475b1""","""a55475b1""","""a55475b1""",1.0,3.0,"""a55475b1""","""a55475b1""",5.0,,,,,0.0,0.0,,14.0,,,5.0,3.0,0.0,,5446.0,…,30000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",1.0,35.0,0.0,0.0,12.0,12.0,0.0,0.0,2020.0,2019.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2019,1
1343447,201904,15,1,,,-17182.0,,-17182.0,6.0,6.0,0.0,8.0,2.0,"""a55475b1""","""a55475b1""","""a55475b1""",14.0,8.0,"""3439d993""","""a55475b1""",8.0,,,,,6.0,6576.800293,,14.0,,,4.0,9.0,0.0,62892.601562,8899.799805,…,99800.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,,"""a55475b1""","""c7a5ad39""",1.0,35.0,45.0,,12.0,,4001.446045,,2020.0,,"""a55475b1""","""ab3c25cf""",3.958333,,509.882629,,69.828011,,745252.0625,,2019,17
1603772,201910,43,1,,,,,-17134.0,0.0,0.0,0.0,3.0,0.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",6.0,1.0,"""3439d993""","""a55475b1""",3.0,,,,,,,"""DEDUCTION_6""",,14.0,,3.0,1.0,0.0,14270.400391,3923.600098,…,40000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""a55475b1""",10.0,35.0,0.0,165.0,12.0,12.0,0.0,28821.816406,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,12.529851,0.0,1701.282227,0.0,852.296082,0.0,21192540.0,2019,30
1772343,202002,56,0,,,,,-12512.0,4.0,6.0,2.0,7.0,3.0,"""a55475b1""","""39a0853f""","""a55475b1""",11.0,3.0,"""3439d993""","""a55475b1""",7.0,,,,,,,"""DEDUCTION_6""",,14.0,,3.0,7.0,0.0,0.0,3256.199951,…,80000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,0.0,"""c7a5ad39""","""c7a5ad39""",34.0,35.0,0.0,0.0,12.0,12.0,5199.304199,0.0,2021.0,2021.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.0,118.166,0.0,0.0,0.0,614380.9375,0.0,2020,3
917966,201912,51,0,,,,,-19112.0,1.0,2.0,0.0,4.0,1.0,"""a55475b1""","""6b2ae0fa""","""a55475b1""",2.0,3.0,"""3439d993""","""a55475b1""",4.0,,,,,,,"""DEDUCTION_6""",,14.0,,2.0,2.0,,,3061.199951,…,70000.0,,,,,,,,,"""a55475b1""","""a55475b1""",0.0,807395.0,"""c7a5ad39""","""c7a5ad39""",14.0,35.0,0.0,5.0,12.0,12.0,0.0,8281.0,2020.0,2020.0,"""ab3c25cf""","""ab3c25cf""",0.0,0.172185,0.0,174.318222,0.0,0.463488,0.0,960854.4375,2019,29


CPU times: user 2min 56s, sys: 40.6 s, total: 3min 37s
Wall time: 18.7 s


In [7]:
# Hand the joined frame to Utility.to_pandas; later cells use the result with
# pandas indexing (`.iloc`, `.drop(columns=...)`).
# `cat_cols` is later passed to CatBoost as `cat_features` and cast to
# "category" for LightGBM (presumably the categorical column names; confirm
# against Utility.to_pandas).
# NOTE: `train_df` is rebound here, so re-running this cell alone feeds the
# already converted frame back into the helper.
train_df, cat_cols = Utility.to_pandas(train_df)

### Train

In [8]:
# Run settings derived from DRY_RUN: a dry run uses the CPU and a tenth of the
# iteration budget.
device: str = "cpu" if DRY_RUN else "gpu"
est_cnt: int = 600 if DRY_RUN else 6000

# A dry run also keeps only the first 50,000 rows of the training frame.
if DRY_RUN:
    train_df = train_df.iloc[:50000]

In [9]:
# Columns excluded from the feature matrix: the label, the case identifier and
# the week number (the week number is used as the CV group key).
non_feature_cols = ["target", "case_id", "week_num"]

y = train_df["target"]
weeks = train_df["week_num"]
X = train_df.drop(columns=non_feature_cols)

# Everything needed has been extracted; free the full frame.
del train_df
gc.collect()

# 5 shuffled, seeded folds; `weeks` is passed as the group key when splitting.
cv = StratifiedGroupKFold(shuffle=True, random_state=42, n_splits=5)

In [10]:
# CatBoost settings. `iterations` follows `est_cnt` and `task_type` follows
# `device`, so a DRY_RUN (which selects "cpu") no longer requests GPU training.
catboost_params = {
    "best_model_min_trees": 1200,
    "boosting_type": "Plain",
    "eval_metric": "AUC",
    "iterations": est_cnt,
    "learning_rate": 0.05,
    "l2_leaf_reg": 10,
    "max_leaves": 64,
    "random_seed": 42,
    # CatBoost spells the device in upper case ("CPU" / "GPU").
    "task_type": device.upper(),
    "use_best_model": True,
}

# First LightGBM configuration (used on even-numbered folds).
lgb_params = {
    "boosting_type": "gbdt",
    "colsample_bytree": 0.8,
    "colsample_bynode": 0.8,
    "device": device,
    "extra_trees": True,
    "learning_rate": 0.05,
    "reg_alpha": 0.1,
    "reg_lambda": 10,
    "max_depth": 20,
    "metric": "auc",
    "n_estimators": 2000,
    "num_leaves": 64,
    "objective": "binary",
    "random_state": 42,
    "verbose": -1,
}

# Second LightGBM configuration (used on odd-numbered folds): identical to
# `lgb_params` except for a lower learning rate, shallower trees and more leaves.
lgb_params_2 = {
    **lgb_params,
    "learning_rate": 0.03,
    "max_depth": 16,
    "num_leaves": 72,
}

In [11]:
%%time

fitted_models_cat = []
fitted_models_lgb = []

cv_scores_cat = []
cv_scores_lgb = []

iterator = 0
for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    #######
    # cat #
    #######
    train_pool = Pool(X_train, y_train, cat_features=cat_cols)
    val_pool = Pool(X_valid, y_valid, cat_features=cat_cols)

    clf = CatBoostClassifier(**catboost_params)

    clf.fit(train_pool, eval_set=val_pool, verbose=300)
    y_pred_valid = clf.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)

    fitted_models_cat.append(clf)
    cv_scores_cat.append(auc_score)

    #######
    # lgb #
    #######
    X_train[cat_cols] = X_train[cat_cols].astype("category")
    X_valid[cat_cols] = X_valid[cat_cols].astype("category")

    if iterator % 2 == 0:
        model = lgb.LGBMClassifier(**lgb_params)
    else:
        model = lgb.LGBMClassifier(**lgb_params_2)

    model.fit(
        X_train,
        y_train,
        eval_set = [(X_valid, y_valid)],
        callbacks = [lgb.log_evaluation(200), lgb.early_stopping(100)],
    )

    y_pred_valid = model.predict_proba(X_valid)[:,1]
    auc_score = roc_auc_score(y_valid, y_pred_valid)

    fitted_models_lgb.append(model)
    cv_scores_lgb.append(auc_score)

    iterator += 1

print("CV AUC scores for CatBoost: ", cv_scores_cat)
print("Maximum CV AUC score for CatBoost: ", max(cv_scores_cat))

print("CV AUC scores for LGBM: ", cv_scores_lgb)
print("Maximum CV AUC score for LGBM: ", max(cv_scores_lgb))

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6907933	best: 0.6907933 (0)	total: 157ms	remaining: 15m 41s
300:	test: 0.8506913	best: 0.8506913 (300)	total: 35.6s	remaining: 11m 13s
600:	test: 0.8544757	best: 0.8544757 (600)	total: 1m 9s	remaining: 10m 27s
900:	test: 0.8562239	best: 0.8562239 (900)	total: 1m 43s	remaining: 9m 47s
1200:	test: 0.8574800	best: 0.8574848 (1195)	total: 2m 17s	remaining: 9m 9s
1500:	test: 0.8584509	best: 0.8584509 (1500)	total: 2m 51s	remaining: 8m 34s
1800:	test: 0.8592788	best: 0.8592788 (1800)	total: 3m 25s	remaining: 7m 59s
2100:	test: 0.8600082	best: 0.8600082 (2100)	total: 3m 59s	remaining: 7m 24s
2400:	test: 0.8604954	best: 0.8604954 (2400)	total: 4m 33s	remaining: 6m 50s
2700:	test: 0.8608026	best: 0.8608087 (2670)	total: 5m 8s	remaining: 6m 16s
3000:	test: 0.8612822	best: 0.8612822 (3000)	total: 5m 42s	remaining: 5m 42s
3300:	test: 0.8616092	best: 0.8616092 (3300)	total: 6m 16s	remaining: 5m 8s
3600:	test: 0.8618485	best: 0.8618513 (3580)	total: 6m 51s	remaining: 4m 33s
3900:	test: 0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.860981
[400]	valid_0's auc: 0.865324
[600]	valid_0's auc: 0.866337
[800]	valid_0's auc: 0.866734
[1000]	valid_0's auc: 0.866994
Early stopping, best iteration is:
[1010]	valid_0's auc: 0.86701


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6799294	best: 0.6799294 (0)	total: 127ms	remaining: 12m 41s
300:	test: 0.8377620	best: 0.8377620 (300)	total: 34.9s	remaining: 11m 1s
600:	test: 0.8423406	best: 0.8423406 (600)	total: 1m 8s	remaining: 10m 19s
900:	test: 0.8447096	best: 0.8447096 (900)	total: 1m 42s	remaining: 9m 40s
1200:	test: 0.8462603	best: 0.8462603 (1200)	total: 2m 16s	remaining: 9m 3s
1500:	test: 0.8473294	best: 0.8473294 (1500)	total: 2m 49s	remaining: 8m 27s
1800:	test: 0.8482442	best: 0.8482442 (1800)	total: 3m 23s	remaining: 7m 53s
2100:	test: 0.8490030	best: 0.8490030 (2100)	total: 3m 56s	remaining: 7m 19s
2400:	test: 0.8496845	best: 0.8496845 (2400)	total: 4m 30s	remaining: 6m 45s
2700:	test: 0.8501790	best: 0.8501790 (2700)	total: 5m 4s	remaining: 6m 11s
3000:	test: 0.8506145	best: 0.8506145 (3000)	total: 5m 37s	remaining: 5m 37s
3300:	test: 0.8511556	best: 0.8511556 (3300)	total: 6m 11s	remaining: 5m 3s
3600:	test: 0.8515446	best: 0.8515446 (3600)	total: 6m 44s	remaining: 4m 29s
3900:	test: 0.8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.844681
[400]	valid_0's auc: 0.85303
[600]	valid_0's auc: 0.856032
[800]	valid_0's auc: 0.857231
[1000]	valid_0's auc: 0.857655
[1200]	valid_0's auc: 0.858002
[1400]	valid_0's auc: 0.858176
[1600]	valid_0's auc: 0.85835
[1800]	valid_0's auc: 0.858482
[2000]	valid_0's auc: 0.858603
Did not meet early stopping. Best iteration is:
[1993]	valid_0's auc: 0.858615


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6747274	best: 0.6747274 (0)	total: 130ms	remaining: 12m 58s
300:	test: 0.8386381	best: 0.8386381 (300)	total: 35.9s	remaining: 11m 20s
600:	test: 0.8434204	best: 0.8434204 (600)	total: 1m 10s	remaining: 10m 37s
900:	test: 0.8455154	best: 0.8455154 (900)	total: 1m 45s	remaining: 9m 58s
1200:	test: 0.8466938	best: 0.8466938 (1200)	total: 2m 20s	remaining: 9m 19s
1500:	test: 0.8476221	best: 0.8476221 (1500)	total: 2m 54s	remaining: 8m 43s
1800:	test: 0.8485180	best: 0.8485180 (1800)	total: 3m 29s	remaining: 8m 7s
2100:	test: 0.8491917	best: 0.8491917 (2100)	total: 4m 4s	remaining: 7m 32s
2400:	test: 0.8498088	best: 0.8498088 (2400)	total: 4m 38s	remaining: 6m 57s
2700:	test: 0.8502643	best: 0.8502643 (2700)	total: 5m 13s	remaining: 6m 22s
3000:	test: 0.8507349	best: 0.8507349 (3000)	total: 5m 48s	remaining: 5m 48s
3300:	test: 0.8510727	best: 0.8510727 (3300)	total: 6m 22s	remaining: 5m 13s
3600:	test: 0.8514274	best: 0.8514274 (3600)	total: 6m 57s	remaining: 4m 38s
3900:	test: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.851288
[400]	valid_0's auc: 0.856172
[600]	valid_0's auc: 0.857409
[800]	valid_0's auc: 0.857982
[1000]	valid_0's auc: 0.858205
Early stopping, best iteration is:
[1004]	valid_0's auc: 0.858214


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6693533	best: 0.6693533 (0)	total: 132ms	remaining: 13m 10s
300:	test: 0.8390459	best: 0.8390459 (300)	total: 36.9s	remaining: 11m 38s
600:	test: 0.8440240	best: 0.8440240 (600)	total: 1m 12s	remaining: 10m 52s
900:	test: 0.8465752	best: 0.8465752 (900)	total: 1m 48s	remaining: 10m 13s
1200:	test: 0.8478985	best: 0.8478985 (1200)	total: 2m 23s	remaining: 9m 34s
1500:	test: 0.8490477	best: 0.8490477 (1500)	total: 2m 59s	remaining: 8m 57s
1800:	test: 0.8500168	best: 0.8500168 (1800)	total: 3m 34s	remaining: 8m 20s
2100:	test: 0.8507628	best: 0.8507628 (2100)	total: 4m 9s	remaining: 7m 43s
2400:	test: 0.8512424	best: 0.8512424 (2400)	total: 4m 45s	remaining: 7m 7s
2700:	test: 0.8518240	best: 0.8518240 (2700)	total: 5m 20s	remaining: 6m 31s
3000:	test: 0.8522235	best: 0.8522235 (3000)	total: 5m 56s	remaining: 5m 56s
3300:	test: 0.8526372	best: 0.8526400 (3295)	total: 6m 31s	remaining: 5m 20s
3600:	test: 0.8530054	best: 0.8530054 (3600)	total: 7m 7s	remaining: 4m 44s
3900:	test: 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.846194
[400]	valid_0's auc: 0.85455
[600]	valid_0's auc: 0.857472
[800]	valid_0's auc: 0.858663
[1000]	valid_0's auc: 0.859229
[1200]	valid_0's auc: 0.859668
[1400]	valid_0's auc: 0.860024
[1600]	valid_0's auc: 0.86021
[1800]	valid_0's auc: 0.860386
[2000]	valid_0's auc: 0.860513
Did not meet early stopping. Best iteration is:
[1994]	valid_0's auc: 0.860536


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.6781521	best: 0.6781521 (0)	total: 127ms	remaining: 12m 40s
300:	test: 0.8354783	best: 0.8354783 (300)	total: 35.2s	remaining: 11m 7s
600:	test: 0.8401458	best: 0.8401458 (600)	total: 1m 9s	remaining: 10m 24s
900:	test: 0.8422469	best: 0.8422469 (900)	total: 1m 43s	remaining: 9m 44s
1200:	test: 0.8435604	best: 0.8435604 (1200)	total: 2m 16s	remaining: 9m 7s
1500:	test: 0.8445185	best: 0.8445185 (1500)	total: 2m 50s	remaining: 8m 30s
1800:	test: 0.8454717	best: 0.8454717 (1800)	total: 3m 23s	remaining: 7m 55s
2100:	test: 0.8461337	best: 0.8461337 (2100)	total: 3m 57s	remaining: 7m 20s
2400:	test: 0.8468145	best: 0.8468145 (2400)	total: 4m 31s	remaining: 6m 47s
2700:	test: 0.8474751	best: 0.8474751 (2700)	total: 5m 5s	remaining: 6m 13s
3000:	test: 0.8479452	best: 0.8479452 (3000)	total: 5m 39s	remaining: 5m 39s
3300:	test: 0.8483309	best: 0.8483310 (3295)	total: 6m 13s	remaining: 5m 5s
3600:	test: 0.8487560	best: 0.8487560 (3600)	total: 6m 47s	remaining: 4m 31s
3900:	test: 0.8

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Training until validation scores don't improve for 100 rounds
[200]	valid_0's auc: 0.847878
[400]	valid_0's auc: 0.852602
[600]	valid_0's auc: 0.853677
[800]	valid_0's auc: 0.854149
[1000]	valid_0's auc: 0.854434
[1200]	valid_0's auc: 0.854665
Early stopping, best iteration is:
[1275]	valid_0's auc: 0.854768
CV AUC scores for CatBoost:  [0.8633729826457213, 0.8538613862399729, 0.853172044382759, 0.8548193356088387, 0.8509457174264036]
Maximum CV AUC score for CatBoost:  0.8633729826457213
CV AUC scores for LGBM:  [0.8670098449146831, 0.8586152430392099, 0.8582138299964486, 0.8605357568878694, 0.8547680495712842]
Maximum CV AUC score for LGBM:  0.8670098449146831
CPU times: user 3h 54min 50s, sys: 1min 36s, total: 3h 56min 27s
Wall time: 1h 8min


### Ensemble

In [12]:
from src.models.voting_model import VotingModel

# Gather every fold model into one list, CatBoost models first and LightGBM
# models after, and wrap them in a VotingModel. How the members are combined is
# defined by VotingModel itself (not visible in this notebook).
ensemble_members = [*fitted_models_cat, *fitted_models_lgb]
model = VotingModel(ensemble_members)

In [13]:
# Serialise the ensemble to the working directory for later use.
with open("voting_model.pkl", mode="wb") as model_file:
    pickle.dump(model, model_file)