- evaluate by mean gini and check false positives (fp) and false negatives (fn)
- go back to ver 47, but add a second lgb model and change the cb pos weight and the cb ensemble ratio *

In [1]:
from glob import glob
import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import xgboost as xgb
from catboost import  CatBoostClassifier, FeaturesData, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold



In [2]:
# Directory of the competition data; the csv paths below are built from it.
ROOT = '/kaggle/input/home-credit-credit-risk-model-stability'
# Label column name (NOTE(review): the visible cells use the literal "target" instead).
TARGET = 'target'
# Number of test rows scored per prediction batch in the submission cells.
batch_size = 1000

In [3]:
class TreeModelling:
    """Thin train / validate / predict helpers for LightGBM, XGBoost and CatBoost.

    Train and test data should contain the same selected features for ML models.
    Train, test data and target should be the same data type. (Pandas or Numpy)

    ``config`` is a dict keyed by library name ("lgb", "xgb", "cb"). Each
    method only reads the entries of its own library: ``num_round``,
    ``es_round``, ``verbose_eval`` and, for CatBoost, ``task_type``
    ("classification" selects a classifier, anything else a regressor).
    """
    def __init__(self, config):
        self.config = config

    def lgb_train(self, x_tr, y_tr, params, cat="auto"):
        """Fit a LightGBM booster on the full training data (no validation set).

        No number of rounds is passed, so LightGBM's own default applies.
        """
        return lgb.train(params,
                         lgb.Dataset(x_tr, y_tr),
                         categorical_feature=cat)

    def lgb_train_and_valid(self, x_tr, y_tr, x_val, y_val, params, feval=None, cat="auto",
                            train_week=None, valid_week=None):
        """Fit LightGBM with early stopping and return ``(model, valid_pred)``.

        ``train_week`` / ``valid_week`` are attached to the datasets as a
        ``week_num`` attribute so that custom ``feval`` functions can group
        rows by week. When they are not passed, the method falls back to the
        module-level ``train_week`` / ``valid_week`` variables, which is how
        existing callers provide them.
        """
        dtrain = lgb.Dataset(x_tr, y_tr)
        deval = lgb.Dataset(x_val, y_val, reference=dtrain)

        # Backward compatibility: older callers set these as globals before calling.
        module_scope = globals()
        if train_week is None:
            train_week = module_scope.get("train_week")
        if valid_week is None:
            valid_week = module_scope.get("valid_week")
        if train_week is not None:
            dtrain.week_num = train_week
        if valid_week is not None:
            deval.week_num = valid_week

        callbacks = [
            # Only the first metric returned by feval drives early stopping.
            lgb.early_stopping(stopping_rounds=self.config["lgb"]["es_round"],
                               first_metric_only=True),
            lgb.log_evaluation(self.config["lgb"]["verbose_eval"]),
        ]

        model = lgb.train(params, dtrain,
                          self.config["lgb"]["num_round"],
                          valid_sets=[deval],
                          feval=feval,
                          callbacks=callbacks,
                          categorical_feature=cat)
        valid_pred = model.predict(x_val)

        return model, valid_pred

    def lgb_test(self, models, test):
        """Return the element-wise mean of every model's prediction on ``test``."""
        test_pred = [model.predict(test) for model in models]
        return np.mean(test_pred, axis=0)

    def xgb_train_and_valid(self, x_tr, y_tr, x_val, y_val, params):
        """Fit XGBoost with early stopping and return ``(model, valid_pred)``."""
        xgb_eval = xgb.DMatrix(x_val, label=y_val)
        model = xgb.train(params,
                          xgb.DMatrix(x_tr, label=y_tr),
                          self.config["xgb"]["num_round"],
                          evals=[(xgb_eval, "eval")],
                          early_stopping_rounds=self.config["xgb"]["es_round"],
                          verbose_eval=self.config["xgb"]["verbose_eval"])
        # best_ntree_limit no longer exists in XGBoost >= 2.0; best_iteration
        # is 0-based and the range end is exclusive, hence the +1.
        valid_pred = model.predict(xgb_eval, iteration_range=(0, model.best_iteration + 1))

        return model, valid_pred

    def xgb_test(self, models, test):
        """Average the best-iteration predictions of early-stopped XGBoost models."""
        dtest = xgb.DMatrix(test)
        test_pred = [model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
                     for model in models]
        return np.mean(test_pred, axis=0)

    def _cb_is_classifier(self):
        """Tell whether the CatBoost config asks for a classifier."""
        return self.config["cb"]["task_type"] == "classification"

    def _new_cb_model(self, params):
        """Build an unfitted CatBoost model matching the configured task type."""
        if self._cb_is_classifier():
            return CatBoostClassifier(**params)
        # Imported here because the notebook's import cell does not bring it in.
        from catboost import CatBoostRegressor
        return CatBoostRegressor(**params)

    def cb_train(self, x_tr, y_tr, params, cat):
        """Fit a CatBoost model on the full training data (no validation set)."""
        train_pool = Pool(data=x_tr,
                          label=y_tr,
                          cat_features=cat)

        model = self._new_cb_model(params)
        model.fit(train_pool)

        return model

    def cb_train_and_valid(self, x_tr, y_tr, x_val, y_val, params, cat):
        """Fit CatBoost with early stopping and return ``(model, valid_pred)``.

        For classification ``valid_pred`` is the second column of
        ``predict_proba``; otherwise it is the output of ``predict``.
        """
        train_pool = Pool(data=x_tr,
                          label=y_tr,
                          cat_features=cat)

        valid_pool = Pool(data=x_val,
                          label=y_val,
                          cat_features=cat)

        model = self._new_cb_model(params)
        model.fit(train_pool,
                  eval_set=[valid_pool],
                  early_stopping_rounds=self.config["cb"]["es_round"],
                  verbose_eval=self.config["cb"]["verbose_eval"])

        if self._cb_is_classifier():
            valid_pred = model.predict_proba(x_val)[:, 1]
        else:
            valid_pred = model.predict(x_val)

        return model, valid_pred

    def cb_test(self, models, test):
        """Return the element-wise mean of every CatBoost model's prediction on ``test``."""
        if self._cb_is_classifier():
            test_pred = [model.predict_proba(test)[:, 1] for model in models]
        else:
            test_pred = [model.predict(test) for model in models]
        return np.mean(test_pred, axis=0)

------- edit from here -------------

# metric functions

In [4]:
def comp_metric(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Stability score: mean weekly gini plus trend and volatility terms.

    ``base`` needs the columns "WEEK_NUM", "target" and "score". A gini
    (2 * AUC - 1) is computed per week, a straight line is fitted through
    the weekly ginis, and the result is

        mean(gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals)
    """
    ordered = base.loc[:, ["WEEK_NUM", "target", "score"]].sort_values("WEEK_NUM")
    weekly = ordered.groupby("WEEK_NUM")[["target", "score"]].apply(
        lambda grp: 2*roc_auc_score(grp["target"], grp["score"])-1
    ).tolist()

    mean_gini = np.mean(weekly)
    positions = np.arange(len(weekly))
    ginis = np.array(weekly)

    # Linear trend of gini over the (ordered) weeks.
    slope, intercept = np.polyfit(positions, ginis, 1)
    fitted = slope*positions + intercept
    residual_std = np.std(ginis - fitted)

    # Only a falling trend (negative slope) is penalised.
    adjustment = w_fallingrate * min(0, slope) + w_resstd * residual_std
    return mean_gini + adjustment

def weekly_gini(base, opt):
    """Summarise the per-week gini (2 * AUC - 1) of ``base``.

    ``base`` needs the columns "WEEK_NUM", "target" and "score".
    ``opt`` selects the summary: "mean", "min" or "std".

    Raises:
        ValueError: if ``opt`` is not one of the supported summaries. The
            check runs before any computation so a typo fails immediately.
    """
    reducers = {"mean": np.mean, "min": np.min, "std": np.std}
    if opt not in reducers:
        raise ValueError(f"opt must be one of {sorted(reducers)}, got {opt!r}")

    gini_in_time = base.loc[:, ["WEEK_NUM", "target", "score"]]\
        .groupby("WEEK_NUM")[["target", "score"]]\
        .apply(lambda x: 2*roc_auc_score(x["target"], x["score"])-1).tolist()

    return reducers[opt](gini_in_time)

def lgb_avg_gini_metric(preds, eval_data):
    """LightGBM ``feval``: mean of the per-week gini on ``eval_data``.

    Labels come from ``eval_data.get_label()`` and the week of each row from
    the ``week_num`` attribute that must have been attached to the dataset.
    Returns ``("avg", score, True)``, i.e. higher is better.
    """
    targets = eval_data.get_label()
    weeks = eval_data.week_num

    frame = pd.DataFrame(weeks, columns=["WEEK_NUM"])
    frame["target"] = targets
    frame["score"] = preds.reshape(-1, 1)

    return 'avg', weekly_gini(frame, "mean"), True

def lgb_std_gini_metric(preds, eval_data):
    """LightGBM ``feval``: standard deviation of the per-week gini on ``eval_data``.

    Labels come from ``eval_data.get_label()`` and the week of each row from
    the ``week_num`` attribute that must have been attached to the dataset.
    Returns ``("std", score, False)``, i.e. lower is better.
    """
    targets = eval_data.get_label()
    weeks = eval_data.week_num

    frame = pd.DataFrame(weeks, columns=["WEEK_NUM"])
    frame["target"] = targets
    frame["score"] = preds.reshape(-1, 1)

    return 'std', weekly_gini(frame, "std"), False

def false_positive(base):
    """Count target == 0 rows whose score exceeds the label by at least 0.8."""
    negatives = base.loc[base["target"] == 0]
    overshoot = negatives["score"] - negatives["target"]
    return np.sum(overshoot >= 0.8)

def false_negative(base):
    """Count target == 1 rows whose score falls short of the label by at least 0.8."""
    positives = base.loc[base["target"] == 1]
    shortfall = positives["target"] - positives["score"]
    return np.sum(shortfall >= 0.8)

def lgb_avg_neg_metric(preds, eval_data):
    """LightGBM ``feval``: number of extreme false positives on ``eval_data``.

    Returns ``("ext_fp", count, False)``, i.e. lower is better.
    """
    frame = pd.DataFrame(eval_data.get_label(), columns=["target"])
    frame["score"] = preds.reshape(-1, 1)

    return 'ext_fp', false_positive(frame), False

def lgb_avg_pos_metric(preds, eval_data):
    """LightGBM ``feval``: number of extreme false negatives on ``eval_data``.

    Returns ``("ext_fn", count, False)``, i.e. lower is better.
    """
    frame = pd.DataFrame(eval_data.get_label(), columns=["target"])
    frame["score"] = preds.reshape(-1, 1)

    return "ext_fn", false_negative(frame), False

def metrics(preds, data):
    """Combined LightGBM ``feval``: one result tuple per custom metric.

    The mean weekly gini comes first, followed by its standard deviation and
    the extreme false-positive / false-negative counts.
    """
    evaluators = (
        lgb_avg_gini_metric,
        lgb_std_gini_metric,
        lgb_avg_neg_metric,
        lgb_avg_pos_metric,
    )
    return [evaluate(preds, data) for evaluate in evaluators]

# common fe functions

In [5]:
# fix data type
# Load the feature-definition table and reduce it to a {variable name: dtype
# string} lookup; set_table_dtypes consults it for columns whose name does
# not determine the dtype.
feature_def = pd.read_parquet("/kaggle/input/home-credit-enhanced-feature-definitions/feature_definitions_dtypes_tables.parquet")
feature_def = dict(zip(feature_def["Variable"], feature_def["dtype"]))

def set_table_dtypes(df):
    """Cast every column of a polars frame to its intended dtype.

    Rules are tried in this order:
      1. id / grouping columns -> Int64, "date_decision" -> Date,
         three person columns -> String;
      2. the last character of the name: P/A -> Float32, M -> String, D -> Date;
      3. otherwise the dtype string stored in ``feature_def`` for the column
         (String, Float, Date or Boolean; anything else leaves it untouched).

    Boolean columns end up as Int8; when they were read as strings, the
    values "true"/"false" (case-insensitive) are mapped first and every other
    value becomes null.
    """
    id_cols = ("case_id", "WEEK_NUM", "num_group1", "num_group2")
    forced_string_cols = ("persontype_1072L", "persontype_792L", "personindex_1023L")
    suffix_dtypes = {"P": pl.Float32, "A": pl.Float32, "M": pl.String, "D": pl.Date}

    for name in df.columns:
        if name in id_cols:
            wanted = pl.Int64
        elif name == "date_decision":
            wanted = pl.Date
        elif name in forced_string_cols:
            wanted = pl.String
        elif name[-1] in suffix_dtypes:
            wanted = suffix_dtypes[name[-1]]
        else:
            declared = feature_def[name]
            if "String" in declared:
                wanted = pl.String
            elif "Float" in declared:
                wanted = pl.Float32
            elif "Date" in declared:
                wanted = pl.Date
            elif "Boolean" in declared:
                if "String" in str(df[name].dtype):
                    # Textual booleans: normalise case, then map to real booleans.
                    df = df.with_columns(
                        pl.col(name).str.to_lowercase()
                        .replace({"false":False, "true":True}, default=None)
                    )
                wanted = pl.Int8
            else:
                # No rule applies: keep the column as it was read.
                continue

        df = df.with_columns(pl.col(name).cast(wanted))

    return df

def filter_cols(df):
    """Return the names of columns that are constant or entirely null."""
    return [
        name
        for name in df.columns
        if df[name].n_unique() == 1 or df[name].null_count() == df.shape[0]
    ]

In [6]:
# for numeric columns
def agg_float_cols(input_df, operation_dict):
    """Aggregate numeric columns per ``case_id``.

    ``operation_dict`` maps a column name to a list of aggregation names out
    of "min", "max", "mean", "median", "std" and "sum"; unknown names are
    ignored. Each result column is called ``f"{operation}_{column}"``.
    """
    reducers = {
        "min": pl.min,
        "max": pl.max,
        "mean": pl.mean,
        "median": pl.median,
        "std": pl.std,
        "sum": pl.sum,
    }

    aggregations = [
        reducers[option](column).alias(f"{option}_{column}")
        for column, requested in operation_dict.items()
        for option in requested
        if option in reducers
    ]

    return input_df.group_by("case_id").agg(aggregations)

In [7]:
# for string columns
def convert_strings(train_df, test_df, str_cols):
    """Replace string categories by integer codes in both frames.

    For each column, the non-null categories seen in train are numbered
    0..k-1 in sorted order. Every category that appears only in test shares
    the single code k. Values absent from the mapping (including nulls)
    become null.
    """
    def _categories(frame, name):
        # Distinct non-null values of one column as a Python set.
        values = frame.filter(pl.col(name).is_not_null()).unique(name).select(name)
        return set(values.to_numpy().reshape(-1))

    for name in str_cols:
        in_train = _categories(train_df, name)
        only_in_test = _categories(test_df, name) - in_train

        codes = {category: code for code, category in enumerate(sorted(in_train))}
        unseen_code = len(codes)
        codes.update(dict.fromkeys(sorted(only_in_test), unseen_code))

        train_df = train_df.with_columns(pl.col(name).replace(codes, default = None))
        test_df = test_df.with_columns(pl.col(name).replace(codes, default = None))

    return train_df, test_df

In [8]:
def load_base_data(prefix):
    """Read the ``{prefix}_base.csv`` table and parse ``date_decision`` as a date."""
    csv_path = ROOT + f'/csv_files/{prefix}/{prefix}_base.csv'
    return pl.read_csv(csv_path).with_columns(pl.col("date_decision").cast(pl.Date))

def unite_depth0_data(base, prefix):
    """Left-join the depth-0 static tables onto ``base`` by ``case_id``.

    The internal table is the concatenation of every ``static_0_*`` file; the
    external one is the single ``static_cb_0`` file. Both go through
    ``set_table_dtypes`` before joining.
    """
    # depth 0 internal: several csv parts stacked vertically
    internal_parts = [
        pl.read_csv(path)
        for path in glob(ROOT + f"/csv_files/{prefix}/{prefix}_static_0_*.csv")
    ]
    internal = set_table_dtypes(pl.concat(internal_parts, how="vertical_relaxed"))

    # depth 0 external
    external = set_table_dtypes(
        pl.read_csv(ROOT + f"/csv_files/{prefix}/{prefix}_static_cb_0.csv")
    )

    joined = base.join(internal, on = "case_id", how = "left")
    return joined.join(external, on = "case_id", how = "left")

In [9]:
def unite_depth1_data(base, prefix):
    """Left-join features derived from the depth-1 tables onto ``base``.

    Added, in this order:
      * person_1: birth date and low-cardinality string columns of the
        ``num_group1 == 0`` row, plus ``age`` in years at decision time;
      * other_1: per-case maximum of five amount columns;
      * tax_registry_a_1 / tax_registry_b_1: per-case sum of the amount column;
      * applprev_1_*: per-case mean and std of every float column.
    """
    # person
    person_tbl = set_table_dtypes(
        pl.read_csv(ROOT + f"/csv_files/{prefix}/{prefix}_person_1.csv")
    )
    low_card_strings = [
        name for name in person_tbl.columns
        if "String" in str(person_tbl[name].dtype) and person_tbl[name].n_unique() < 200
    ]
    first_person = person_tbl.filter(pl.col("num_group1")==0)\
        .select(["case_id", "birth_259D"] + low_card_strings)

    base = base.join(first_person, on = "case_id", how = "left")
    # Age in years: days between decision and birth, divided by 365.
    base = base.with_columns(
        ((pl.col("date_decision") - pl.col("birth_259D")).dt.total_days() / 365).alias("age")
    )

    # other
    other_tbl = set_table_dtypes(
        pl.read_csv(ROOT + f"/csv_files/{prefix}/{prefix}_other_1.csv")
    )
    amount_cols = [
        "amtdebitincoming_4809443A",
        "amtdebitoutgoing_4809440A",
        "amtdepositbalance_4809441A",
        "amtdepositincoming_4809444A",
        "amtdepositoutgoing_4809442A",
    ]
    other_max = agg_float_cols(other_tbl, {name: ["max"] for name in amount_cols})
    base = base.join(other_max, on = "case_id", how = "left")

    # tax
    tax_a_tbl = set_table_dtypes(
        pl.read_csv(ROOT + f"/csv_files/{prefix}/{prefix}_tax_registry_a_1.csv")
    )
    tax_b_tbl = set_table_dtypes(
        pl.read_csv(ROOT + f"/csv_files/{prefix}/{prefix}_tax_registry_b_1.csv")
    )
    tax_a_sum = agg_float_cols(tax_a_tbl, {"amount_4527230A": ["sum"]})
    tax_b_sum = agg_float_cols(tax_b_tbl, {"amount_4917619A": ["sum"]})

    base = base.join(tax_a_sum, on = "case_id", how = "left")
    base = base.join(tax_b_sum, on = "case_id", how = "left")

    # applprev1
    applprev_parts = [
        pl.read_csv(path)
        for path in glob(ROOT + f"/csv_files/{prefix}/{prefix}_applprev_1_*.csv")
    ]
    applprev_tbl = set_table_dtypes(
        pl.concat(applprev_parts, how="vertical_relaxed")
    ).drop("childnum_21L")

    float_ops = {
        name: ["mean", "std"]
        for name in applprev_tbl.columns
        if "Float" in str(applprev_tbl[name].dtype)
    }
    applprev_stats = agg_float_cols(applprev_tbl, float_ops)

    return base.join(applprev_stats, on = "case_id", how = "left")

In [10]:
def unite_depth2_data(base, prefix):
    """Left-join per-case sums of two overdue-payment columns from credit_bureau_a_2.

    Each ``credit_bureau_a_2_*`` file is aggregated on its own and the
    per-file results are stacked before the join.
    NOTE(review): a ``case_id`` present in more than one file would yield
    several rows after the join — presumably the files do not overlap; confirm.
    """
    # cb a2
    overdue_sums = [
        pl.sum(name).alias(f"sum_{name}")
        for name in ("pmts_overdue_1140A", "pmts_overdue_1152A")
    ]

    per_file = [
        set_table_dtypes(pl.read_csv(path)).group_by('case_id').agg(overdue_sums)
        for path in glob(str(ROOT + f"/csv_files/{prefix}/{prefix}_credit_bureau_a_2_*.csv"))
    ]

    return base.join(pl.concat(per_file), on = "case_id", how = "left")

# feature engineering

In [11]:
# Build the training feature table: base rows enriched depth by depth.
train = load_base_data("train")
train_overall = (
    train
    .pipe(unite_depth0_data, "train")
    .pipe(unite_depth1_data, "train")
    .pipe(unite_depth2_data, "train")
)
# Labels as a one-column pandas frame.
y_train = train_overall.select("target").to_pandas()

In [12]:
# Build the test feature table with the same steps as for train.
test = load_base_data("test")
test_overall = (
    test
    .pipe(unite_depth0_data, "test")
    .pipe(unite_depth1_data, "test")
    .pipe(unite_depth2_data, "test")
)
# Case ids in base-table order, used as the submission index.
test_case_ids = test["case_id"].to_numpy().reshape(-1)

# remove cols

In [13]:
# Columns to discard: constant / all-null ones, every date column, and a
# hand-picked list.
remove_cols = filter_cols(train_overall)
date_features = [
    name
    for name, dtype in zip(train_overall.columns, train_overall.dtypes)
    if "Date" in str(dtype)
]
unused_cols = [
    "mastercontrelectronic_519L",
    "mastercontrexist_109L",
    "clientscnt_136L",
    "previouscontdistrict_112M",
    "lastapprcommoditytypec_5251766M",
    "gender_992L",
    "maritalst_703L",
]
_dropped_cols = remove_cols + date_features + unused_cols
print(len(_dropped_cols))
# The same list is dropped from both frames so their columns stay aligned.
train_overall = train_overall.drop(_dropped_cols)
test_overall = test_overall.drop(_dropped_cols)

40


# boolean

In [14]:
# Integer-typed columns other than the id / time / label columns.
bool_features = [
    name
    for name, dtype in zip(train_overall.columns, train_overall.dtypes)
    if "Int" in str(dtype) and name not in ["case_id", "MONTH", "WEEK_NUM", "target"]
]
print(len(bool_features))

6


# float

In [15]:
# Float-typed columns.
float_features = [
    name
    for name, dtype in zip(train_overall.columns, train_overall.dtypes)
    if "Float" in str(dtype)
]
print(len(float_features))

204


# strings

In [16]:
# String-typed columns, integer-encoded consistently across train and test.
str_features = [
    name
    for name, dtype in zip(train_overall.columns, train_overall.dtypes)
    if "String" in str(dtype)
]
print(len(str_features))
train_overall, test_overall = convert_strings(train_overall, test_overall, str_features)

34


In [17]:
# Model inputs: every column except ids, time keys and the label, in sorted order.
selected_features = sorted(
    name for name in train_overall.columns
    if name not in ["case_id", "MONTH", "WEEK_NUM", "target"]
)
# Categorical inputs: the integer ("boolean") and encoded string columns.
cat_features = [
    name for name in selected_features
    if name in bool_features or name in str_features
]

# convert from polars

In [18]:
# Null-free pandas copies (used for the CatBoost cells below).
train_fill_pd = train_overall.select(selected_features).to_pandas()
test_fill_pd = test_overall.select(selected_features).to_pandas()

for col in train_fill_pd.columns:
    if col not in cat_features:
        continue
    # When either frame has missing values, shift the codes up by one so
    # that 0 is free to stand for "missing".
    has_missing = train_fill_pd[col].isnull().sum() or test_fill_pd[col].isnull().sum()
    for frame in (train_fill_pd, test_fill_pd):
        codes = frame[col] + 1 if has_missing else frame[col]
        frame[col] = codes.fillna(0).astype("int32")

In [19]:
# Pandas copies that keep missing values (used for the LightGBM cells below).
train_pd, test_pd = (
    frame.select(selected_features).to_pandas()
    for frame in (train_overall, test_overall)
)

print(train_pd.shape, test_pd.shape)

(1526659, 244) (10, 244)


# cv

In [20]:
# WEEK_NUM of every base row; passed as `groups` to the splitter below.
train_week_df = train.select("WEEK_NUM").to_pandas()
# 5-fold splitter; unshuffled, so the folds are reproducible across the model cells.
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

# treemodel config

In [21]:
# First LightGBM model. 'metric': 'None' leaves evaluation to the custom
# feval passed at training time.
lgb_params1 = {
            'objective': 'binary', 
            'metric': 'None',
            'learning_rate': 0.02,
            'scale_pos_weight': 10,
            'boosting': 'gbdt',
            'verbose': -1,
            'seed': 42,
            'num_leaves': 64, 
            "reg_alpha": 0.1,
            "reg_lambda": 10,
            "cat_smooth": 20,
        }

# Second LightGBM model: extra_trees enabled, a different scale_pos_weight,
# and library defaults for the settings not listed.
lgb_params2 = {
            'objective': 'binary', 
            'metric': 'None',
            'learning_rate': 0.02,
            'scale_pos_weight': 8,
            'boosting': 'gbdt',
            'verbose': -1,
            'seed': 42,
            'extra_trees': True,
        }

# CatBoost model. NOTE: "task_type" here is CatBoost's own parameter ("GPU");
# it is unrelated to config["cb"]["task_type"], which selects
# classifier vs regressor inside TreeModelling.
cb_params = { 
            'random_seed': 42,
            "learning_rate": 0.04,
            'use_best_model': True,
            'iterations': 10000,
            'reg_lambda': 10,
            "scale_pos_weight": 9,
            "task_type": "GPU",
            'loss_function': 'Logloss',
        }

In [22]:
# Settings read by TreeModelling: num_round = maximum boosting rounds,
# es_round = early-stopping patience, verbose_eval = logging period.
# There is no "xgb" entry, so the xgb_* methods cannot be used with this config.
config = {    
    "lgb": {
        "num_round": 10000,
        "es_round" : 20,
        "verbose_eval": 100,
    },
    "cb": {
        # Selects CatBoostClassifier and predict_proba in TreeModelling.
        "task_type": "classification",
        "es_round" : 20,
        "verbose_eval": 500,
    },
}

In [23]:
# Shared helper instance used by every training and prediction cell below.
treemodel = TreeModelling(config)

# modelling

In [24]:
def check_metric(output):
    """Print the evaluation summary of out-of-fold predictions.

    ``output`` holds one score per row of the global ``train`` frame, in the
    same order. Printed: competition metric, extreme false-positive and
    false-negative counts, and the standard deviation of the weekly gini.
    """
    scored = train.select("WEEK_NUM", "target").to_pandas()
    scored["score"] = output

    report = (
        ("comp metric: ", comp_metric),
        ("ext fp cnt : ", false_positive),
        ("ext fn cnt : ", false_negative),
        ("std of gini: ", lambda frame: weekly_gini(frame, "std")),
    )
    for label, compute in report:
        print(label, compute(scored))

In [25]:
# Out-of-fold training of the first LightGBM model (lgb_params1).
lgb_models = []
lgb_output = np.zeros(len(train_pd))
for fold, (tr_idx, val_idx) in enumerate(cv.split(train_pd, y_train, groups=train_week_df)):
    print("Fold :", fold)
    # NOTE: train_week / valid_week are module-level names that
    # TreeModelling.lgb_train_and_valid reads to attach week numbers to the
    # LightGBM datasets, so they must be assigned before every call.
    train_week = train_week_df.loc[tr_idx].values.reshape(-1)
    valid_week = train_week_df.loc[val_idx].values.reshape(-1)

    lgb_model, lgb_val_output = treemodel.lgb_train_and_valid(train_pd.loc[tr_idx], y_train.loc[tr_idx],
                                                              train_pd.loc[val_idx], y_train.loc[val_idx],
                                                              lgb_params1, cat = cat_features, feval = metrics)
    lgb_models.append(lgb_model)
    # Store this fold's validation predictions at their original row positions.
    lgb_output[val_idx] = lgb_val_output
    print("----")

# Evaluate the stitched out-of-fold predictions over the whole training set.
check_metric(lgb_output)

Fold : 0
Training until validation scores don't improve for 20 rounds
[100]	valid_0's avg: 0.646479	valid_0's std: 0.0387676	valid_0's ext_fp: 0	valid_0's ext_fn: 1744
[200]	valid_0's avg: 0.663201	valid_0's std: 0.0393006	valid_0's ext_fp: 66	valid_0's ext_fn: 1450
[300]	valid_0's avg: 0.671404	valid_0's std: 0.0385777	valid_0's ext_fp: 257	valid_0's ext_fn: 1461
[400]	valid_0's avg: 0.675572	valid_0's std: 0.0389212	valid_0's ext_fp: 435	valid_0's ext_fn: 1514
[500]	valid_0's avg: 0.67804	valid_0's std: 0.0378051	valid_0's ext_fp: 564	valid_0's ext_fn: 1551
[600]	valid_0's avg: 0.679347	valid_0's std: 0.0371962	valid_0's ext_fp: 641	valid_0's ext_fn: 1586
[700]	valid_0's avg: 0.680355	valid_0's std: 0.0368357	valid_0's ext_fp: 679	valid_0's ext_fn: 1616
Early stopping, best iteration is:
[769]	valid_0's avg: 0.680748	valid_0's std: 0.0364953	valid_0's ext_fp: 697	valid_0's ext_fn: 1627
Evaluated only: avg
----
Fold : 1
Training until validation scores don't improve for 20 rounds
[100

In [26]:
# Out-of-fold training of the second LightGBM model (lgb_params2), using the
# same splitter call as the first model.
lgb_models2 = []
lgb_output2 = np.zeros(len(train_pd))
for fold, (tr_idx, val_idx) in enumerate(cv.split(train_pd, y_train, groups=train_week_df)):
    print("Fold :", fold)
    # NOTE: train_week / valid_week are module-level names that
    # TreeModelling.lgb_train_and_valid reads to attach week numbers to the
    # LightGBM datasets, so they must be assigned before every call.
    train_week = train_week_df.loc[tr_idx].values.reshape(-1)
    valid_week = train_week_df.loc[val_idx].values.reshape(-1)

    lgb_model2, lgb_val_output2 = treemodel.lgb_train_and_valid(train_pd.loc[tr_idx], y_train.loc[tr_idx],
                                                              train_pd.loc[val_idx], y_train.loc[val_idx],
                                                              lgb_params2, cat = cat_features, feval = metrics)
    lgb_models2.append(lgb_model2)
    # Store this fold's validation predictions at their original row positions.
    lgb_output2[val_idx] = lgb_val_output2
    print("----")

# Evaluate the stitched out-of-fold predictions over the whole training set.
check_metric(lgb_output2)

Fold : 0
Training until validation scores don't improve for 20 rounds
[100]	valid_0's avg: 0.629087	valid_0's std: 0.0380655	valid_0's ext_fp: 0	valid_0's ext_fn: 2441
[200]	valid_0's avg: 0.649884	valid_0's std: 0.0389166	valid_0's ext_fp: 11	valid_0's ext_fn: 1928
[300]	valid_0's avg: 0.660546	valid_0's std: 0.0389085	valid_0's ext_fp: 76	valid_0's ext_fn: 1913
[400]	valid_0's avg: 0.667	valid_0's std: 0.0379439	valid_0's ext_fp: 178	valid_0's ext_fn: 1918
[500]	valid_0's avg: 0.670928	valid_0's std: 0.0377304	valid_0's ext_fp: 268	valid_0's ext_fn: 1920
[600]	valid_0's avg: 0.674218	valid_0's std: 0.03705	valid_0's ext_fp: 344	valid_0's ext_fn: 1931
[700]	valid_0's avg: 0.676229	valid_0's std: 0.0367589	valid_0's ext_fp: 398	valid_0's ext_fn: 1956
[800]	valid_0's avg: 0.677658	valid_0's std: 0.0364132	valid_0's ext_fp: 440	valid_0's ext_fn: 1956
[900]	valid_0's avg: 0.67861	valid_0's std: 0.0362439	valid_0's ext_fp: 468	valid_0's ext_fn: 1964
Early stopping, best iteration is:
[948]

In [27]:
# Out-of-fold training of the CatBoost model on the null-free frame
# (train_fill_pd), using the same splitter call as the LightGBM cells.
cb_models = []
cb_output =np.zeros(len(train_fill_pd))
for fold, (tr_idx, val_idx) in enumerate(cv.split(train_fill_pd, y_train, groups=train_week_df)):
    print("Fold :", fold)
    cb_model, cb_val_output = treemodel.cb_train_and_valid(
                                                            train_fill_pd.loc[tr_idx], y_train.loc[tr_idx],
                                                            train_fill_pd.loc[val_idx], y_train.loc[val_idx],
                                                            cb_params, cat = cat_features)
    cb_models.append(cb_model)
    # Store this fold's validation predictions at their original row positions.
    cb_output[val_idx] = cb_val_output
    print("----")
     
# Evaluate the stitched out-of-fold predictions over the whole training set.
check_metric(cb_output)

Fold : 0
0:	learn: 0.6731236	test: 0.6740160	best: 0.6740160 (0)	total: 7.9s	remaining: 21h 55m 59s
500:	learn: 0.3893288	test: 0.4099145	best: 0.4099145 (500)	total: 2m 17s	remaining: 43m 29s
1000:	learn: 0.3781421	test: 0.4046311	best: 0.4046311 (1000)	total: 4m 25s	remaining: 39m 45s
1500:	learn: 0.3706794	test: 0.4025859	best: 0.4025859 (1500)	total: 6m 32s	remaining: 37m 3s
2000:	learn: 0.3645671	test: 0.4014118	best: 0.4014096 (1998)	total: 8m 39s	remaining: 34m 37s
bestTest = 0.4007258993
bestIteration = 2464
Shrink model to first 2465 iterations.
----
Fold : 1
0:	learn: 0.6736221	test: 0.6733087	best: 0.6733087 (0)	total: 261ms	remaining: 43m 28s
500:	learn: 0.3938227	test: 0.3943253	best: 0.3943253 (500)	total: 2m 7s	remaining: 40m 18s
1000:	learn: 0.3823595	test: 0.3887546	best: 0.3887546 (1000)	total: 4m 13s	remaining: 37m 58s
1500:	learn: 0.3748550	test: 0.3866319	best: 0.3866319 (1500)	total: 6m 19s	remaining: 35m 46s
2000:	learn: 0.3686405	test: 0.3854976	best: 0.3854951 

In [28]:
print("ensemble result")
# Out-of-fold blend; the weights must match the ones applied to the test
# predictions in the submission cell.
check_metric((lgb_output * 0.5 + cb_output * 0.4 + lgb_output2 * 0.1))
# Pairwise correlation matrices between the models' out-of-fold predictions
# (shown as the cell's output).
np.corrcoef(cb_output, lgb_output), np.corrcoef(lgb_output, lgb_output2), np.corrcoef(cb_output, lgb_output2)

ensemble result
comp metric:  0.6725508076312778
ext fp cnt :  3318
ext fn cnt :  8351
std of gini:  0.037300100376715994


(array([[1.       , 0.9718013],
        [0.9718013, 1.       ]]),
 array([[1.       , 0.9832533],
        [0.9832533, 1.       ]]),
 array([[1.        , 0.97821932],
        [0.97821932, 1.        ]]))

# submission

In [29]:
# Fold-averaged predictions of the first LightGBM model, scored in batches
# of batch_size rows.
lgb_test_predictions = np.concatenate([
    treemodel.lgb_test(lgb_models, test_pd.iloc[start: start + batch_size])
    for start in range(0, len(test_pd), batch_size)
])

In [30]:
# Fold-averaged predictions of the second LightGBM model, scored in batches
# of batch_size rows.
lgb_test_predictions2 = np.concatenate([
    treemodel.lgb_test(lgb_models2, test_pd.iloc[start: start + batch_size])
    for start in range(0, len(test_pd), batch_size)
])

In [31]:
# Fold-averaged CatBoost predictions on the null-free test frame, scored in
# batches of batch_size rows.
cb_test_predictions = np.concatenate([
    treemodel.cb_test(cb_models, test_fill_pd.iloc[start: start + batch_size])
    for start in range(0, len(test_fill_pd), batch_size)
])

In [32]:
# Blend the three models with the same weights as the out-of-fold ensemble check.
test_predictions = (lgb_test_predictions * 0.5 + cb_test_predictions * 0.4 + lgb_test_predictions2 * 0.1)

# submission: one score per case_id, with case_id as the index column
submission = pd.DataFrame(
    {"score": test_predictions},
    index=pd.Index(test_case_ids, name="case_id"),
)
submission.to_csv("./submission.csv")