- Evaluate by the mean weekly Gini, and track the counts of extreme false positives (FP) and false negatives (FN).
- last commit to the first version *

In [1]:
from glob import glob

import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

In [2]:
# Root directory of the competition dataset; every CSV path below is built from it.
ROOT = '/kaggle/input/home-credit-credit-risk-model-stability'
# Name of the label column read from the training frame.
TARGET = 'target'
# When True, the final cell skips test inference and does not write a submission.
EVAL = True

In [3]:
class TreeModelling:
    """
    Training / inference helpers for LightGBM, XGBoost and CatBoost models.

    Train and test data should contain the same selected features for ML models.
    Train, test data and target should be the same data type. (Pandas or Numpy)

    Parameters
    ----------
    config : dict
        Nested settings keyed by library name ("lgb", "xgb", "cb"). The methods
        read ``num_round``, ``es_round`` and ``verbose_eval`` from the matching
        sub-dict, plus ``custom_metric`` for "lgb" and ``task_type`` for "cb".
    """

    def __init__(self, config):
        self.config = config

    # ------------------------------------------------------------------
    # internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _explicit_or_global(value, global_name):
        """Return ``value``; if it is None, fall back to the module-level
        variable ``global_name``.

        The fallback keeps callers working that still publish the week arrays
        as globals instead of passing them as arguments.

        Raises
        ------
        NameError
            If ``value`` is None and no such module-level variable exists.
        """
        if value is not None:
            return value
        try:
            return globals()[global_name]
        except KeyError:
            raise NameError(
                f"name '{global_name}' is not defined; pass it explicitly"
            ) from None

    @staticmethod
    def _predict_by_batch(predict_chunk, test, batch_size, positional):
        """Run ``predict_chunk`` on consecutive row slices of ``test`` and
        concatenate the results.

        ``positional=True`` slices with ``.iloc`` (pandas input); otherwise
        plain ``[]`` slicing is used (NumPy input).
        """
        outputs = []
        for start in range(0, len(test), batch_size):
            stop = start + batch_size
            chunk = test.iloc[start:stop] if positional else test[start:stop]
            outputs.append(predict_chunk(chunk))
        return np.concatenate(outputs)

    def _cb_is_classifier(self):
        """True when the CatBoost task type is exactly "classification"."""
        return self.config["cb"]["task_type"] == "classification"

    def _new_cb_model(self, params):
        """Build an unfitted CatBoost model for the configured task type.

        Any task type other than "classification" yields a regressor, which
        matches how ``cb_test`` chooses between predict_proba and predict.
        """
        if self._cb_is_classifier():
            return CatBoostClassifier(**params)
        return CatBoostRegressor(**params)

    # ------------------------------------------------------------------
    # LightGBM
    # ------------------------------------------------------------------
    def lgb_train(self, x_tr, y_tr, params, cat=None):
        """Train a LightGBM booster on the full data, without validation.

        The number of boosting rounds comes from ``config["lgb"]["num_round"]``
        (100, LightGBM's own default, when the key is absent).
        ``cat`` lists the categorical feature indices / names.
        """
        num_round = self.config.get("lgb", {}).get("num_round", 100)
        # Categorical features are declared on the Dataset rather than on train().
        dtrain = lgb.Dataset(x_tr, y_tr, categorical_feature=cat)
        return lgb.train(params, dtrain, num_round)

    def lgb_train_and_valid(self, x_tr, y_tr, x_val, y_val, params, cat=None,
                            train_week=None, valid_week=None):
        """Train a LightGBM booster with early stopping on a validation set.

        ``train_week`` / ``valid_week`` are attached to the datasets as the
        ``week_num`` attribute so that custom metrics can read them. When they
        are omitted, module-level variables of the same names are used.

        Returns
        -------
        (model, valid_pred) : the fitted booster and its predictions on ``x_val``.
        """
        train_week = self._explicit_or_global(train_week, "train_week")
        valid_week = self._explicit_or_global(valid_week, "valid_week")

        dtrain = lgb.Dataset(x_tr, y_tr, categorical_feature=cat)
        deval = lgb.Dataset(x_val, y_val, reference=dtrain)

        # Extra attribute consumed by the custom evaluation functions.
        dtrain.week_num = train_week
        deval.week_num = valid_week

        lgb_cfg = self.config["lgb"]
        callbacks = [
            lgb.early_stopping(stopping_rounds=lgb_cfg["es_round"],
                               first_metric_only=True),
            lgb.log_evaluation(lgb_cfg["verbose_eval"]),
        ]

        model = lgb.train(params, dtrain,
                          lgb_cfg["num_round"],
                          valid_sets=[deval],
                          feval=lgb_cfg["custom_metric"],
                          callbacks=callbacks)
        valid_pred = model.predict(x_val)

        return model, valid_pred

    def lgb_test(self, models, test):
        """Average the predictions of ``models`` on ``test``."""
        return np.mean([model.predict(test) for model in models], axis=0)

    def lgb_test_by_batch(self, models, test, batch_size):
        """Like ``lgb_test`` but over ``.iloc`` row batches of a pandas frame."""
        return self._predict_by_batch(
            lambda chunk: self.lgb_test(models, chunk), test, batch_size, positional=True)

    def lgb_numpy_test_by_batch(self, models, test, batch_size):
        """Like ``lgb_test`` but over row batches of a NumPy array."""
        return self._predict_by_batch(
            lambda chunk: self.lgb_test(models, chunk), test, batch_size, positional=False)

    # ------------------------------------------------------------------
    # XGBoost
    # ------------------------------------------------------------------
    def xgb_train_and_valid(self, x_tr, y_tr, x_val, y_val, params):
        """Train an XGBoost booster with early stopping on a validation set.

        Returns the booster and its validation predictions, limited to the
        rounds up to and including the best iteration.
        """
        xgb_cfg = self.config["xgb"]
        xgb_eval = xgb.DMatrix(x_val, label=y_val)
        model = xgb.train(params,
                          xgb.DMatrix(x_tr, label=y_tr),
                          xgb_cfg["num_round"],
                          evals=[(xgb_eval, "eval")],
                          early_stopping_rounds=xgb_cfg["es_round"],
                          verbose_eval=xgb_cfg["verbose_eval"])
        # iteration_range counts boosting rounds; best_iteration is zero-based.
        valid_pred = model.predict(xgb_eval, iteration_range=(0, model.best_iteration + 1))

        return model, valid_pred

    def xgb_test(self, models, test):
        """Average the best-iteration predictions of ``models`` on ``test``."""
        dtest = xgb.DMatrix(test)
        test_pred = [model.predict(dtest, iteration_range=(0, model.best_iteration + 1))
                     for model in models]
        return np.mean(test_pred, axis=0)

    # ------------------------------------------------------------------
    # CatBoost
    # ------------------------------------------------------------------
    def cb_train(self, x_tr, y_tr, params):
        """Fit a CatBoost model on the full data, without validation."""
        train_pool = Pool(data=x_tr, label=y_tr)
        model = self._new_cb_model(params)
        model.fit(train_pool)
        return model

    def cb_train_and_valid(self, x_tr, y_tr, x_val, y_val, params,
                           train_week_num=None, valid_week_num=None):
        """Fit a CatBoost model with early stopping on a validation set.

        ``train_week_num`` / ``valid_week_num`` are passed to the pools as
        ``weight``. When omitted, module-level variables of the same names
        are used.

        Returns
        -------
        (model, valid_pred) : the fitted model and its predictions on ``x_val``
        (positive-class probability for classification).
        """
        train_week_num = self._explicit_or_global(train_week_num, "train_week_num")
        valid_week_num = self._explicit_or_global(valid_week_num, "valid_week_num")

        train_pool = Pool(data=x_tr, label=y_tr, weight=train_week_num)
        valid_pool = Pool(data=x_val, label=y_val, weight=valid_week_num)

        model = self._new_cb_model(params)
        model.fit(train_pool,
                  eval_set=[valid_pool],
                  early_stopping_rounds=self.config["cb"]["es_round"],
                  verbose_eval=self.config["cb"]["verbose_eval"])

        if self._cb_is_classifier():
            valid_pred = model.predict_proba(x_val)[:, 1]
        else:
            valid_pred = model.predict(x_val)

        return model, valid_pred

    def cb_test(self, models, test):
        """Average the predictions of CatBoost ``models`` on ``test``.

        Classification uses the positive-class probability; any other task
        type uses ``predict``.
        """
        if self._cb_is_classifier():
            test_pred = [model.predict_proba(test)[:, 1] for model in models]
        else:
            test_pred = [model.predict(test) for model in models]
        return np.mean(test_pred, axis=0)

    def cb_test_by_batch(self, models, test, batch_size):
        """Like ``cb_test`` but over ``.iloc`` row batches of a pandas frame."""
        return self._predict_by_batch(
            lambda chunk: self.cb_test(models, chunk), test, batch_size, positional=True)

    def cb_numpy_test_by_batch(self, models, test, batch_size):
        """Like ``cb_test`` but over row batches of a NumPy array."""
        return self._predict_by_batch(
            lambda chunk: self.cb_test(models, chunk), test, batch_size, positional=False)

------- edit from here -------------

# metric functions

In [4]:
def comp_metric(base, w_fallingrate=88.0, w_resstd=-0.5):
    """Stability-adjusted Gini score over weeks.

    ``base`` must provide the columns "WEEK_NUM", "target" and "score".
    A Gini (2*AUC - 1) is computed for every week; a straight line is then
    fitted through the weekly Ginis (indexed 0..n-1 in week order) and the
    result is

        mean(gini) + w_fallingrate * min(0, slope) + w_resstd * std(residuals)
    """
    def week_gini(week_frame):
        return 2*roc_auc_score(week_frame["target"], week_frame["score"])-1

    ordered = base.loc[:, ["WEEK_NUM", "target", "score"]].sort_values("WEEK_NUM")
    gini_in_time = ordered.groupby("WEEK_NUM")[["target", "score"]].apply(week_gini).tolist()

    week_index = np.arange(len(gini_in_time))
    ginis = np.array(gini_in_time)

    # Linear trend of the weekly Gini and the spread around that trend.
    slope, intercept = np.polyfit(week_index, ginis, 1)
    fitted = slope*week_index + intercept
    res_std = np.std(ginis - fitted)
    stability_term = w_fallingrate * min(0, slope) + w_resstd * res_std

    return np.mean(gini_in_time) + stability_term

def weekly_gini(base):
    """Mean of the per-week Gini (2*AUC - 1).

    ``base`` must provide the columns "WEEK_NUM", "target" and "score".
    """
    def week_gini(week_frame):
        return 2*roc_auc_score(week_frame["target"], week_frame["score"])-1

    needed = base.loc[:, ["WEEK_NUM", "target", "score"]]
    gini_in_time = needed.groupby("WEEK_NUM")[["target", "score"]].apply(week_gini).tolist()
    return np.mean(gini_in_time)

def lgb_avg_gini_metric(preds, eval_data):
    """LightGBM custom metric: mean weekly Gini (higher is better).

    Reads the labels via ``eval_data.get_label()`` and the week numbers from
    the ``week_num`` attribute that must have been attached to ``eval_data``.
    Returns the ``(name, value, is_higher_better)`` triple.
    """
    score_df = pd.DataFrame(eval_data.week_num, columns = ["WEEK_NUM"])
    score_df["target"] = eval_data.get_label()
    score_df["score"] = preds.reshape(-1, 1)

    return 'avg_gini', weekly_gini(score_df), True

def false_positive(base):
    """Count rows with target == 0 whose score exceeds the target by >= 0.8."""
    negatives = base.loc[base["target"] == 0]
    overshoot = negatives["score"] - negatives["target"]
    return np.sum(overshoot >= 0.8)

def false_negative(base):
    """Count rows with target == 1 whose score falls short of the target by >= 0.8."""
    positives = base.loc[base["target"] == 1]
    shortfall = positives["target"] - positives["score"]
    return np.sum(shortfall >= 0.8)

def lgb_avg_neg_metric(preds, eval_data):
    """LightGBM custom metric: number of extreme false positives (lower is better).

    Returns the ``(name, value, is_higher_better)`` triple, with the value
    computed by ``false_positive`` on the labels and predictions.
    """
    score_df = pd.DataFrame(eval_data.get_label(), columns = ["target"])
    score_df["score"] = preds.reshape(-1, 1)

    return 'ext_fp_num', false_positive(score_df), False

def lgb_avg_pos_metric(preds, eval_data):
    """LightGBM custom metric: number of extreme false negatives (lower is better).

    Returns the ``(name, value, is_higher_better)`` triple, with the value
    computed by ``false_negative`` on the labels and predictions.
    """
    score_df = pd.DataFrame(eval_data.get_label(), columns = ["target"])
    score_df["score"] = preds.reshape(-1, 1)

    return "ext_fn_num", false_negative(score_df), False

def lgb_overall(preds, eval_data):
    """LightGBM custom metric: mean weekly Gini minus the extreme-FP count.

    Reads the labels via ``eval_data.get_label()`` and the week numbers from
    the ``week_num`` attribute attached to ``eval_data``. The week column is
    required because ``weekly_gini`` groups by "WEEK_NUM".

    Returns the ``(name, value, is_higher_better)`` triple.
    """
    score_df = pd.DataFrame(eval_data.week_num, columns = ["WEEK_NUM"])
    score_df["target"] = eval_data.get_label()
    score_df["score"] = preds.reshape(-1, 1)

    score = weekly_gini(score_df) - false_positive(score_df)

    return "overall", score, True

def metrics(preds, data):
    """Evaluate all LightGBM custom metrics and return their result triples.

    The order (average Gini, extreme FP count, extreme FN count) is fixed.
    """
    metric_fns = (lgb_avg_gini_metric, lgb_avg_neg_metric, lgb_avg_pos_metric)
    return [metric_fn(preds, data) for metric_fn in metric_fns]

In [5]:
class CatBoostEvalMetric(object):
    """Custom CatBoost evaluation metric: mean per-week Gini.

    The ``weight`` argument passed to ``evaluate`` is used as the week number
    that rows are grouped by.
    """

    @staticmethod
    def calc(preds, targets, weights):
        """Mean over weeks of 2*AUC - 1, grouping rows by ``weights``."""
        frame = pd.DataFrame(preds, columns = ["score"])
        frame["target"] = targets
        frame["WEEK_NUM"] = weights

        def week_gini(week_frame):
            return 2*roc_auc_score(week_frame["target"], week_frame["score"])-1

        needed = frame.loc[:, ["WEEK_NUM", "target", "score"]]
        gini_in_time = needed.groupby("WEEK_NUM")[["target", "score"]].apply(week_gini).tolist()
        return np.mean(gini_in_time)

    def get_final_error(self, error, weight):
        # The value from evaluate() is already final; the weight sum is ignored.
        return error

    def is_max_optimal(self):
        # the larger metric value the better
        return True

    def evaluate(self, approxes, target, weight):
        """Return ``(score, 0)`` for a single-dimensional approximation."""
        assert len(approxes) == 1
        raw_preds = approxes[0]
        assert len(target) == len(raw_preds)
        return self.calc(np.array(raw_preds), np.array(target), weight), 0

# common feature-engineering (FE) functions

In [6]:
def read_files(regex_path):
    """Read every CSV matching a glob pattern into one polars DataFrame.

    Each file is read, passed through ``set_table_dtypes`` and the chunks are
    concatenated with ``how="vertical_relaxed"``. Paths are processed in
    sorted order so that the row order of the result is reproducible.

    Parameters
    ----------
    regex_path : str or path-like
        A glob pattern (despite the name, not a regular expression).

    Raises
    ------
    FileNotFoundError
        If no file matches the pattern.
    """
    paths = sorted(glob(str(regex_path)))
    if not paths:
        raise FileNotFoundError(f"no files match pattern: {regex_path}")

    chunks = [pl.read_csv(path).pipe(set_table_dtypes) for path in paths]
    return pl.concat(chunks, how="vertical_relaxed")

In [7]:
# fix data type
# Lookup from feature name ("Variable") to its declared dtype string ("dtype"),
# loaded from the enhanced feature-definition table.
feature_def = (
    pd.read_parquet("/kaggle/input/home-credit-enhanced-feature-definitions/feature_definitions_dtypes_tables.parquet")
    .set_index("Variable")["dtype"]
    .to_dict()
)

def set_table_dtypes(df):
    """Cast the columns of a polars DataFrame to their intended dtypes.

    Rules, checked in this order for every column:
      * id / grouping columns                      -> Int64
      * "date_decision"                            -> Date
      * the three person type / index columns      -> String
      * name ends with "P" or "A"                  -> Float32
      * name ends with "M"                         -> String
      * name ends with "D"                         -> Date
      * otherwise the dtype string in ``feature_def`` decides
        (String / Float -> Float32 / Date / Boolean -> Int8).

    Boolean columns that arrive as strings are lower-cased and mapped
    "false"/"true" -> False/True (anything else becomes null) before the
    Int8 cast. Columns that match no rule, including columns that are not
    listed in ``feature_def``, are left unchanged.

    All casts are applied in a single ``with_columns`` call.
    """
    id_cols = ("case_id", "WEEK_NUM", "num_group1", "num_group2")
    forced_string_cols = ("persontype_1072L", "persontype_792L", "personindex_1023L")

    casts = []
    for col in df.columns:
        expr = pl.col(col)
        if col in id_cols:
            casts.append(expr.cast(pl.Int64))
        elif col == "date_decision":
            casts.append(expr.cast(pl.Date))
        elif col in forced_string_cols:
            casts.append(expr.cast(pl.String))
        elif col[-1] in ("P", "A"):
            casts.append(expr.cast(pl.Float32))
        elif col[-1] == "M":
            casts.append(expr.cast(pl.String))
        elif col[-1] == "D":
            casts.append(expr.cast(pl.Date))
        else:
            # Unknown columns get an empty declaration and therefore no cast.
            declared = feature_def.get(col, "")
            if "String" in declared:
                casts.append(expr.cast(pl.String))
            elif "Float" in declared:
                casts.append(expr.cast(pl.Float32))
            elif "Date" in declared:
                casts.append(expr.cast(pl.Date))
            elif "Boolean" in declared:
                if "String" in str(df[col].dtype):
                    # Textual booleans: normalise case, map to bool, rest -> null.
                    expr = expr.str.to_lowercase().replace({"false":False, "true":True}, default=None)
                casts.append(expr.cast(pl.Int8))

    return df.with_columns(casts) if casts else df

def filter_cols(df):
    """List the columns that carry no information.

    A column is reported when it has exactly one distinct value or when every
    one of its entries is null. The columns are returned in frame order.
    """
    def is_uninformative(name):
        return df[name].n_unique() == 1 or df[name].null_count() == df.shape[0]

    return [name for name in df.columns if is_uninformative(name)]

In [8]:
# for string columns
def convert_strings(train_df, test_df, str_cols):
    """Label-encode string columns consistently across train and test.

    For every column in ``str_cols``:
      * the non-null categories seen in ``train_df`` are numbered 1..n in
        sorted order, so the encoding is reproducible between runs;
      * categories that occur only in ``test_df`` all share the code n + 1;
      * every value absent from the mapping is replaced by 0 (``default=0``).

    Returns the encoded ``(train_df, test_df)`` pair.
    """
    def distinct_values(frame, column):
        # Distinct non-null values of one column as a Python set.
        non_null = frame.filter(pl.col(column).is_not_null())
        return set(non_null.unique(column).select(column).to_numpy().reshape(-1))

    for col in str_cols:
        train_categories = distinct_values(train_df, col)
        test_categories = distinct_values(test_df, col)

        # Sorting removes the dependence on set iteration order.
        encoding_dict = {category: code
                         for code, category in enumerate(sorted(train_categories), start=1)}

        # One shared bucket for everything the model never saw in training.
        unseen_code = len(train_categories) + 1
        for category in test_categories - train_categories:
            encoding_dict[category] = unseen_code

        train_df = train_df.with_columns(pl.col(col).replace(encoding_dict, default = 0))
        test_df = test_df.with_columns(pl.col(col).replace(encoding_dict, default = 0))
    return train_df, test_df

def ce_strings(train_df, test_df, str_cols):
    """Add count-encoding columns for the given categorical columns.

    For each ``col`` a new column ``"ce_" + col`` holds how often the row's
    value occurs among the non-null values of ``col`` in ``train_df``. The
    same train-derived counts are left-joined onto both frames.
    """
    for col in str_cols:
        count_name = "ce_"+col
        counts = train_df.filter(pl.col(col).is_not_null()).group_by(col).len()
        # The second column is the group size; give it the encoding name.
        counts = counts.rename({counts.columns[1]: count_name})

        train_df = train_df.join(counts, on = col, how = "left")
        test_df = test_df.join(counts, on = col, how = "left")

    return train_df, test_df

In [9]:
# for numeric columns
def agg_float_cols(input_df, options = ("max", "min", "mean", "std")):
    """Aggregate every float column of ``input_df`` per ``case_id``.

    Parameters
    ----------
    input_df : polars DataFrame with a "case_id" column.
    options : collection of str
        Which aggregations to compute; recognised names are "min", "max",
        "mean", "std" and "sum". Unrecognised names are ignored.

    Returns
    -------
    One row per ``case_id`` with columns named ``f"{agg}_{col}"``. For each
    float column the aggregations appear in the fixed order
    min, max, mean, std, sum.
    """
    # Looked up at call time; the tuple order defines the output column order.
    aggregators = (
        ("min", pl.min),
        ("max", pl.max),
        ("mean", pl.mean),
        ("std", pl.std),
        ("sum", pl.sum),
    )

    float_cols = [name for name in input_df.columns if "Float" in str(input_df[name].dtype)]
    agg_cols = [
        aggregate(col).alias(f"{agg_name}_{col}")
        for col in float_cols
        for agg_name, aggregate in aggregators
        if agg_name in options
    ]

    return input_df.group_by("case_id").agg(agg_cols)

In [10]:
def load_base_data(prefix):
    """Read ``{prefix}_base.csv`` and cast its "date_decision" column to Date.

    ``prefix`` selects the sub-directory and file prefix (the notebook passes
    "train" or "test").
    """
    base_path = ROOT + f'/csv_files/{prefix}/{prefix}_base.csv'
    decision_date = pl.col("date_decision").cast(pl.Date)
    return pl.read_csv(base_path).with_columns(decision_date)


def unite_depth0_data(base, prefix):
    """Left-join the depth-0 static tables onto ``base`` by ``case_id``.

    The internal table is the concatenation of all ``{prefix}_static_0_*.csv``
    chunks; the external table is ``{prefix}_static_cb_0.csv``. Both are
    passed through ``set_table_dtypes`` before joining.
    """
    # depth 0 internal: several chunk files, concatenated before type fixing
    internal_pattern = ROOT + f"/csv_files/{prefix}/{prefix}_static_0_*.csv"
    internal_chunks = []
    for path in glob(internal_pattern):
        internal_chunks.append(pl.read_csv(path))
    depth0_internal = set_table_dtypes(pl.concat(internal_chunks, how="vertical_relaxed"))

    # depth 0 external: a single file
    depth0_external = set_table_dtypes(
        pl.read_csv(ROOT + f"/csv_files/{prefix}/{prefix}_static_cb_0.csv"))

    for table in (depth0_internal, depth0_external):
        base = base.join(table, on = "case_id", how = "left")

    return base

In [11]:
def unite_depth1_data(base, prefix):
    """Left-join the depth-1 feature tables onto ``base`` and add "age".

    Joined, in order: person features, deposit features, tax-registry
    features. The previous-application and "other" feature builders
    (``make_applprev1_feats``, ``make_other_feats``) are currently not used.
    """
    base_date = base.select("case_id", "date_decision")

    feature_tables = (
        make_person_feats(prefix),            # internal
        make_depo_feats(prefix, base_date),   # internal
        make_tax_feats(prefix, base_date),    # external
    )
    for table in feature_tables:
        base = base.join(table, on = "case_id", how = "left")

    return calc_age(base)

def calc_age(df):
    """Add an "age" column: whole days between "birth_259D" and "date_decision"."""
    days_since_birth = (pl.col("date_decision") - pl.col("birth_259D")).dt.total_days()
    return df.with_columns(days_since_birth.alias("age"))

def make_applprev1_feats(prefix, base_date):
    """Count the rows of the ``applprev_1`` tables per ``case_id``.

    Returns a frame with the columns "case_id" and "appl_count".
    ``base_date`` is accepted but not used.
    """
    applprev1 = read_files(ROOT + f"/csv_files/{prefix}/{prefix}_applprev_1_*.csv").pipe(set_table_dtypes)

    rows_per_case = applprev1.group_by("case_id").len()
    # The second column holds the group size.
    return rows_per_case.rename({rows_per_case.columns[1]: "appl_count"})

def make_person_feats(prefix):
    """Select birth date and sex from the ``person_1`` table.

    Only rows with ``num_group1 == 0`` are kept (presumably the applicant's
    own record; confirm against the data description). Returns the columns
    "case_id", "birth_259D" and "sex_738L".
    """
    person = set_table_dtypes(pl.read_csv(ROOT + f"/csv_files/{prefix}/{prefix}_person_1.csv"))
    first_person = person.filter(pl.col("num_group1")==0)
    return first_person.select(["case_id", "birth_259D", "sex_738L"])

def make_other_feats(prefix):
    """Per-``case_id`` maximum of every float column of the ``other_1`` table."""
    other_raw = set_table_dtypes(pl.read_csv(ROOT + f"/csv_files/{prefix}/{prefix}_other_1.csv"))
    return agg_float_cols(other_raw, options = ["max"])

def make_depo_feats(prefix, base_date):
    """Per-``case_id`` default aggregations of the ``deposit_1`` float columns.

    ``base_date`` is accepted but not used.
    """
    deposit_raw = set_table_dtypes(pl.read_csv(ROOT + f"/csv_files/{prefix}/{prefix}_deposit_1.csv"))
    return agg_float_cols(deposit_raw)

def make_tax_feats(prefix, base_date):
    """Per-``case_id`` sums of the float columns of tax registries A and B.

    Both registries are aggregated separately and then combined with an
    "outer_coalesce" join on ``case_id``. ``base_date`` is accepted but not used.
    """
    registry_paths = (
        ROOT + f"/csv_files/{prefix}/{prefix}_tax_registry_a_1.csv",
        ROOT + f"/csv_files/{prefix}/{prefix}_tax_registry_b_1.csv",
    )
    tax_a1, tax_b1 = (
        agg_float_cols(pl.read_csv(path).pipe(set_table_dtypes), options = ["sum"])
        for path in registry_paths
    )

    return tax_a1.join(tax_b1, on = "case_id", how = "outer_coalesce")

In [12]:
def unite_depth2_data(base, prefix):
    """Left-join the depth-2 feature tables onto ``base`` by ``case_id``.

    Joined, in order: previous-application features, then credit-bureau A
    features.
    """
    for build_feats in (make_applprev2_feats, load_and_make_cb_a2_feats):
        base = base.join(build_feats(prefix), on = "case_id", how = "left")
    return base
    
def make_applprev2_feats(prefix):
    """Features from the ``applprev_2`` table.

    Keeps the rows with ``num_group1 == 0`` and ``num_group2 == 0``, drops
    the group indices and "credacc_cards_status_52L", and attaches the total
    number of ``applprev_2`` rows of each ``case_id``.
    """
    applprev2 = set_table_dtypes(pl.read_csv(ROOT + f"/csv_files/{prefix}/{prefix}_applprev_2.csv"))
    rows_per_case = applprev2.group_by("case_id").len()

    is_first_entry = (pl.col("num_group1")==0) & (pl.col("num_group2")==0)
    first_entries = applprev2.filter(is_first_entry).drop(
        "num_group1", "num_group2", "credacc_cards_status_52L")

    return first_entries.join(rows_per_case, on = "case_id", how = "left")

def load_and_make_cb_a2_feats(prefix):
    """Sum the overdue-payment columns of ``credit_bureau_a_2`` per ``case_id``.

    Every chunk file is aggregated on its own and the per-file results are
    concatenated.

    NOTE(review): this assumes a ``case_id`` never spans two chunk files;
    otherwise the result holds several rows for it. Confirm against the data.
    """
    overdue_cols = ["pmts_overdue_1140A", "pmts_overdue_1152A"]
    chunk_pattern = str(ROOT + f"/csv_files/{prefix}/{prefix}_credit_bureau_a_2_*.csv")

    per_file_feats = [
        pl.read_csv(path)
        .pipe(set_table_dtypes)
        .group_by('case_id')
        .agg([pl.sum(col).alias(f"sum_{col}") for col in overdue_cols])
        for path in glob(chunk_pattern)
    ]
    return pl.concat(per_file_feats)

# feature engineering

In [13]:
# Training frame: the base table joined with the depth-0 static tables.
train = load_base_data("train")
train_date = train.select("case_id", "date_decision")
train_overall = unite_depth0_data(train, "train")
# Depth-1 and depth-2 features are currently switched off.
#train_overall = unite_depth1_data(train_overall, "train")
#train_overall = unite_depth2_data(train_overall, "train")
# Labels as a float32 NumPy vector, taken from the joined frame itself.
y_train = train_overall[TARGET].to_pandas().astype(np.float32).values

In [14]:
# Test frame: built with the same steps as the training frame.
test = load_base_data("test")
test_overall = unite_depth0_data(test, "test")
# Depth-1 and depth-2 features are currently switched off (same as for train).
#test_overall = unite_depth1_data(test_overall, "test")
#test_overall = unite_depth2_data(test_overall, "test")
# case_id values of the test base table, used as the submission index.
test_case_ids = test["case_id"].to_numpy().reshape(-1)

In [15]:
# Sorted distinct WEEK_NUM values of the training base table.
all_train_week_nums = train.unique("WEEK_NUM").sort("WEEK_NUM").select("WEEK_NUM").to_numpy().reshape(-1)
# Per-row WEEK_NUM as a pandas frame; the modelling loop uses it as the CV group key.
train_week_df = train.select("WEEK_NUM").to_pandas()

# 5-fold CV, unshuffled. StratifiedGroupKFold is imported in the notebook's import cell.
cv = StratifiedGroupKFold(n_splits=5, shuffle=False)

# remove cols

In [16]:
# Columns that filter_cols reports for the training frame (a single distinct
# value, or all null) are dropped from both frames so that train and test keep
# the same schema.
remove_cols = filter_cols(train_overall)
print(remove_cols)
train_overall = train_overall.drop(remove_cols)
test_overall = test_overall.drop(remove_cols)
print(len(remove_cols))

['deferredmnthsnum_166L']
1


# deal with strings

In [17]:
# Names of all String-typed columns of the training frame.
str_features= [i for i in train_overall.columns if "String" in str(train_overall[i].dtype)]
print(len(str_features))
# Replace the strings by integer codes, then add "ce_<col>" count-encoding columns.
# NOTE: the cell is not idempotent; after one run the columns are no longer String.
train_overall, test_overall = convert_strings(train_overall, test_overall, str_features)
train_overall, test_overall = ce_strings(train_overall, test_overall, str_features)

25


# convert to numpy

In [18]:
# Date-typed columns are removed from both frames before building the matrices.
date_features = [
    name
    for name, dtype in zip(train_overall.columns, train_overall.dtypes)
    if "Date" in str(dtype)
]
train_overall = train_overall.drop(date_features)
test_overall = test_overall.drop(date_features)

# Model inputs: every remaining column except identifiers, time keys and the label.
selected_features = [
    name for name in train_overall.columns
    if name not in ['case_id', 'MONTH', 'WEEK_NUM', 'target']
]
# Positions (within selected_features) of the label-encoded string columns.
cat_feature_idx = [
    position for position, name in enumerate(selected_features)
    if name in str_features
]

# Uninitialised float32 matrices; every column is filled in the loop below.
train_numpy = np.empty((len(train_overall), len(selected_features)), dtype=np.float32)
test_numpy = np.empty((len(test_overall), len(selected_features)), dtype=np.float32)

for idx, feature in enumerate(selected_features):
    train_numpy[:, idx] = train_overall[feature].to_pandas().astype(np.float32).values
    test_numpy[:, idx] = test_overall[feature].to_pandas().astype(np.float32).values

print(train_numpy.shape, test_numpy.shape)

(1526659, 219) (10, 219)


# treemodel config

In [19]:
# Settings handed to TreeModelling. Only the LightGBM section is defined here;
# "num_round" is added in the next cell.
config = {    
    "lgb": {
        "es_round" : 100,       # early-stopping patience (rounds)
        "verbose_eval": 100,    # evaluation log period (rounds)
        # Parameters forwarded to lgb.train.
        "params" : {
            "learning_rate": 0.02,
            'objective': 'binary', 
            # Presumably disables LightGBM's built-in metrics so that only
            # "custom_metric" below is evaluated; confirm in the LightGBM docs.
            'metric': 'None',
            'is_unbalance': True,
            'boosting': 'gbdt',
            'verbose': -1,
            'seed': 42,
            'num_leaves': 64, 
            'max_depth': 10,
            'extra_trees': True, #
            "reg_alpha": 0.1,  #
            "reg_lambda": 10,  #   
        },   
        # feval callable; returns the avg-Gini, extreme-FP and extreme-FN triples.
        "custom_metric": metrics,
    },
}

In [20]:
# Number of boosting rounds for the CV models. The trailing "#000" is a quick
# toggle: remove the "#" to turn the debug value 10 into 10000.
config["lgb"]["num_round"] = 10#000
treemodel = TreeModelling(config)

# modelling

In [21]:
# Cross-validated LightGBM training, grouped by WEEK_NUM.
lgb_models = []
# Out-of-fold predictions, filled fold by fold at the validation indices.
lgb_output =np.zeros(len(train_numpy))
for fold, (tr_idx, val_idx) in enumerate(cv.split(train_numpy, y_train, groups=train_week_df)):
    print("Fold :", fold)
    # NOTE: train_week / valid_week are module-level variables that
    # TreeModelling.lgb_train_and_valid reads; they must be set before the call.
    train_week = train_week_df.loc[tr_idx].values
    valid_week = train_week_df.loc[val_idx].values
    lgb_model, lgb_val_output = treemodel.lgb_train_and_valid(train_numpy[tr_idx], y_train[tr_idx],
                                                              train_numpy[val_idx], y_train[val_idx],
                                                              config["lgb"]["params"], cat = cat_feature_idx)
    lgb_models.append(lgb_model)
    lgb_output[val_idx] = lgb_val_output
    print("----")

Fold : 0
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[10]	valid_0's avg_gini: 0.517293	valid_0's ext_fp_num: 0	valid_0's ext_fn_num: 10237
Evaluated only: avg_gini
----
Fold : 1
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[10]	valid_0's avg_gini: 0.527509	valid_0's ext_fp_num: 0	valid_0's ext_fn_num: 9236
Evaluated only: avg_gini
----
Fold : 2
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[10]	valid_0's avg_gini: 0.53173	valid_0's ext_fp_num: 0	valid_0's ext_fn_num: 9363
Evaluated only: avg_gini
----
Fold : 3
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[10]	valid_0's avg_gini: 0.534693	valid_0's ext_fp_num: 0	valid_0's ext_fn_num: 9579
Evaluated only: avg_gini
----
Fold : 4
Training until validation scores don't improve for 100 

In [22]:
# Score the out-of-fold predictions.
tmp = train_overall.select("WEEK_NUM", "target").to_pandas()
tmp["score"] = lgb_output
# Absolute error between label and predicted score.
tmp["score_diff"] = np.abs(tmp["target"] - tmp["score"])
print("comp metric: ", comp_metric(tmp))
# Rows whose prediction is off by at least 0.8, split by true class.
print("ext fp cnt : ", np.sum(tmp[tmp["target"]==0]["score_diff"] >= 0.8))
print("ext fn cnt : ", np.sum(tmp[tmp["target"]==1]["score_diff"] >= 0.8))

comp metric:  0.5061259634124069
ext fp cnt :  0
ext fn cnt :  47848


In [23]:
# Inference and submission; skipped while EVAL is True.
if not EVAL:
    # Average of the CV fold models on the test matrix.
    cv_predictions = treemodel.lgb_numpy_test_by_batch(lgb_models, test_numpy, batch_size = 1000)
    
    # Retrain on all training rows. "#00" is a quick toggle (12 -> 1200).
    # NOTE(review): this value only takes effect if lgb_train reads
    # config["lgb"]["num_round"]; verify.
    config["lgb"]["num_round"] = 12#00
    treemodel = TreeModelling(config)
    lgb_full_model  = treemodel.lgb_train(train_numpy, y_train, config["lgb"]["params"], cat=cat_feature_idx)
    # NOTE: lgb_models is rebound here; the fold models are no longer reachable by that name.
    lgb_models = [lgb_full_model]
    full_predictions = treemodel.lgb_numpy_test_by_batch(lgb_models, test_numpy, batch_size = 1000)
  
    # Equal-weight blend of the CV ensemble and the full-data model.
    test_predictions = (cv_predictions + full_predictions) / 2
    # submission
    submission = pd.DataFrame({
            "case_id": test_case_ids,
            "score": test_predictions
        }).set_index('case_id')
    submission.to_csv("./submission.csv")