In [None]:
# ====================================================
# Library
# ====================================================
import gc
import os
import warnings

warnings.filterwarnings("ignore")
import itertools
import random

import joblib
import numpy as np
import pandas as pd
import scipy as sp

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
import gc
import warnings
from itertools import combinations

import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import scipy as sp

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
import itertools

from tqdm.auto import tqdm


# ====================================================
# Read & preprocess data and save it to disk
# ====================================================
def _aggregate_by_customer(df, num_features, cat_features):
    """Collapse per-row records into one feature row per ``customer_ID``.

    Numeric columns are summarised with mean/std/min/max/last and categorical
    columns with count/last/nunique. The resulting columns are named
    ``<column>_<aggregation>``.

    Args:
        df: Frame containing ``customer_ID`` plus every listed feature column.
        num_features: Names of the columns aggregated as numeric.
        cat_features: Names of the columns aggregated as categorical.

    Returns:
        A DataFrame with ``customer_ID`` as a regular column followed by the
        numeric and then the categorical aggregates.
    """
    grouped = df.groupby("customer_ID")

    # NOTE(review): "last" follows the row order of the input file; this
    # presumably relies on rows being sorted by statement date -- confirm.
    num_agg = grouped[num_features].agg(["mean", "std", "min", "max", "last"])
    num_agg.columns = ["_".join(x) for x in num_agg.columns]
    num_agg.reset_index(inplace=True)

    cat_agg = grouped[cat_features].agg(["count", "last", "nunique"])
    cat_agg.columns = ["_".join(x) for x in cat_agg.columns]
    cat_agg.reset_index(inplace=True)

    return num_agg.merge(cat_agg, how="inner", on="customer_ID")


def read_preprocess_data(
    path,
    labels_path="../input/amex-default-prediction/train_labels.csv",
):
    """Build customer-level train/test feature tables and save them to disk.

    Reads ``train.parquet`` and ``test.parquet`` from ``path``, aggregates
    every feature per ``customer_ID``, joins the training aggregates with the
    labels, and writes ``train_fe.parquet`` and ``test_fe.parquet`` to the
    current working directory.

    Args:
        path: Directory containing ``train.parquet`` and ``test.parquet``.
        labels_path: CSV file with the training labels, joined on
            ``customer_ID``. Defaults to the location that was previously
            hard-coded in this function.

    Returns:
        None. The results are the two parquet files written to disk.
    """
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]

    raw_train = pd.read_parquet(os.path.join(path, "train.parquet"))
    features = raw_train.drop(["customer_ID", "S_2"], axis=1).columns.to_list()
    # Everything that is not explicitly categorical is treated as numeric.
    num_features = [col for col in features if col not in cat_features]

    print("Starting training feature engineer...")
    train = _aggregate_by_customer(raw_train, num_features, cat_features)
    # Release the raw rows before loading anything else.
    del raw_train
    gc.collect()
    train_labels = pd.read_csv(labels_path)
    train = train.merge(train_labels, how="inner", on="customer_ID")
    # Persist and free the train features before test is loaded so both
    # engineered tables never have to be resident at the same time.
    train.to_parquet("train_fe.parquet")
    del train, train_labels
    gc.collect()

    raw_test = pd.read_parquet(os.path.join(path, "test.parquet"))
    print("Starting test feature engineer...")
    test = _aggregate_by_customer(raw_test, num_features, cat_features)
    del raw_test
    gc.collect()
    test.to_parquet("test_fe.parquet")


# Run the feature-engineering step once; its output is the pair of
# train_fe.parquet / test_fe.parquet files written to the working directory.
read_preprocess_data(path="../input/amex-data-integer-dtypes-parquet-format/")

Starting training feature engineer...


In [None]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Run-wide settings read by the data-loading and training code."""

    # Name of the label column in the training frame.
    target = "target"
    # Number of cross-validation folds.
    n_folds = 6
    # Seed used for the RNGs, the fold split and the model parameters.
    seed = 42
    # Prefix prepended to the engineered parquet file names when reading them.
    input_dir = "./"


# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed Python's and NumPy's global RNGs and export ``PYTHONHASHSEED``.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in the ``PYTHONHASHSEED`` environment
            variable.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)


# ====================================================
# Read data
# ====================================================
def read_data():
    """Load the engineered train and test tables.

    Both files are looked up under ``CFG.input_dir``.

    Returns:
        A ``(train, test)`` tuple of DataFrames read from ``train_fe.parquet``
        and ``test_fe.parquet`` respectively.
    """
    train, test = (
        pd.read_parquet(CFG.input_dir + file_name)
        for file_name in ("train_fe.parquet", "test_fe.parquet")
    )
    return train, test


# ====================================================
# Amex metric
# ====================================================
def amex_metric(y_true, y_pred):
    """Return the mean of the normalized weighted Gini and the top-4% capture.

    Rows whose label is 0 carry weight 20, all other rows weight 1.

    Args:
        y_true: 1-D sequence of labels.
        y_pred: 1-D sequence of scores, same length as ``y_true``.

    Returns:
        ``0.5 * (gini_by_prediction / gini_by_label + top_four)``.
    """
    # Column 0 holds the label, column 1 the prediction.
    pairs = np.array([y_true, y_pred]).T

    # Top-4% capture: share of all positives found among the highest-scored
    # rows whose cumulative weight stays within 4% of the total weight.
    by_pred = pairs[np.argsort(pairs[:, 1])[::-1]]
    row_weights = np.where(by_pred[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(row_weights))
    captured = by_pred[np.cumsum(row_weights) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    def _weighted_gini(sort_col):
        # Weighted Gini of the labels when rows are ranked, descending, by
        # the given column (1 = prediction, 0 = label itself).
        ranked = pairs[np.argsort(pairs[:, sort_col])[::-1]]
        weight = np.where(ranked[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(ranked[:, 0] * weight)
        lorentz = np.cumsum(ranked[:, 0] * weight) / total_pos
        return np.sum((lorentz - weight_random) * weight)

    gini_by_pred = _weighted_gini(1)
    gini_by_label = _weighted_gini(0)
    return 0.5 * (gini_by_pred / gini_by_label + top_four)


# ====================================================
# LGBM amex metric
# ====================================================
def lgb_amex_metric(y_pred, y_true):
    """Adapt ``amex_metric`` to the custom-eval signature used with ``feval``.

    Args:
        y_pred: Predictions for the dataset being evaluated.
        y_true: Object exposing ``get_label()`` that yields the true labels
            (despite the name, this is not the raw label array).

    Returns:
        A ``(name, value, is_higher_better)`` tuple: the metric name
        ``"amex_metric"``, its value, and ``True``.
    """
    labels = y_true.get_label()
    metric_value = amex_metric(labels, y_pred)
    return "amex_metric", metric_value, True


import pickle

class DartEarlyStopping:
    """Training callback that snapshots the best model and stops on a stall.

    On every call it looks up one (dataset, metric) pair in the evaluation
    results. When the score improves, a copy of the current model is stored
    in ``best_model``; when it has not improved for more than
    ``stopping_round`` iterations, ``lgb.callback.EarlyStopException`` is
    raised.
    """

    def __init__(self, data_name, monitor_metric, stopping_round):
        """Store the monitored pair and reset the best-so-far state.

        Args:
            data_name: Dataset name to match in the evaluation results.
            monitor_metric: Metric name to match in the evaluation results.
            stopping_round: Stop once the gap between the current iteration
                and the best one exceeds this value.
        """
        # What to watch and how patient to be.
        self.data_name = data_name
        self.monitor_metric = monitor_metric
        self.stopping_round = stopping_round
        # Best-so-far bookkeeping, filled in by __call__.
        self.best_iter = 0
        self.best_score = None
        self.best_score_list = []
        self.best_model = None

    def _is_higher_score(self, metric_score, is_higher_better):
        """Return whether ``metric_score`` beats the stored best score."""
        if self.best_score is None:
            # Nothing recorded yet, so the first score always counts.
            return True
        if is_higher_better:
            return self.best_score < metric_score
        return self.best_score > metric_score

    def _deepcopy(self, x):
        """Return an independent copy of ``x`` via a pickle round trip."""
        serialized = pickle.dumps(x)
        return pickle.loads(serialized)

    def __call__(self, env):
        """Process one iteration's evaluation results.

        Args:
            env: Callback environment; ``evaluation_result_list``,
                ``iteration`` and ``model`` are read from it.

        Raises:
            ValueError: If the monitored dataset/metric pair is absent.
            lgb.callback.EarlyStopException: When the patience is exhausted.
        """
        results = env.evaluation_result_list
        # First entry matching the monitored dataset and metric, if any.
        monitored = next(
            (
                (score, is_higher_better)
                for data, metric, score, is_higher_better in results
                if data == self.data_name and metric == self.monitor_metric
            ),
            None,
        )
        if monitored is None:
            raise ValueError("monitoring metric not found")

        score, is_higher_better = monitored
        if self._is_higher_score(score, is_higher_better):
            # New best: keep a detached copy of the model as it is right now.
            self.best_model = self._deepcopy(env.model)
            self.best_iter = env.iteration
            self.best_score_list = results
            self.best_score = score
            return

        if env.iteration - self.best_iter > self.stopping_round:
            eval_result_str = '\t'.join(
                lgb.callback._format_eval_result(item)
                for item in self.best_score_list
            )
            print(f"Early stopping, best iteration is:\n[{self.best_iter+1}]\t{eval_result_str}")
            print('You can get best model by "DartEarlyStopping.best_model"')
            raise lgb.callback.EarlyStopException(self.best_iter, self.best_score_list)



# ====================================================
# Train & Evaluate
# ====================================================
def train_and_evaluate(train, test):
    """Train one DART LightGBM model per fold and write OOF/test predictions.

    Steps: label-encode the ``*_last`` categorical columns, add ``*_round2``
    copies of the float ``last`` columns, run stratified K-fold training with
    ``CFG.n_folds`` folds, then write the fold models plus the out-of-fold
    and fold-averaged test predictions to the working directory.

    Both input frames are modified in place (encoded columns are overwritten
    and the rounded columns are appended).

    Args:
        train: Engineered training frame containing ``customer_ID``, the
            feature columns and the ``CFG.target`` column.
        test: Engineered test frame with ``customer_ID`` and the same
            feature columns.

    Returns:
        None. Outputs are ``lgbm_fold{fold}_seed{seed}.pkl`` per fold and the
        ``oof_lgbm_baseline_*.csv`` / ``test_lgbm_baseline_*.csv`` files.
    """
    # Label encode categorical features
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    cat_features = [f"{cf}_last" for cf in cat_features]
    for cat_col in cat_features:
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])
        # NOTE(review): LabelEncoder.transform raises on values that were not
        # present in train; this assumes test has no unseen categories.
        test[cat_col] = encoder.transform(test[cat_col])
    # Round last float features to 2 decimal place
    num_cols = list(
        train.dtypes[(train.dtypes == "float32") | (train.dtypes == "float64")].index
    )
    num_cols = [col for col in num_cols if "last" in col]
    for col in num_cols:
        train[col + "_round2"] = train[col].round(2)
        test[col + "_round2"] = test[col].round(2)
    # Get feature list
    features = [col for col in train.columns if col not in ["customer_ID", CFG.target]]
    params = {
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting': 'dart',
        'seed': CFG.seed,
        'num_leaves': 100,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 2,
        'min_data_in_leaf': 40
        }

    # Create a numpy array to store test predictions
    test_predictions = np.zeros(len(test))
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train))
    kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        print(" ")
        print("-" * 50)
        print(f"Training fold {fold} with {len(features)} features...")
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = (
            train[CFG.target].iloc[trn_ind],
            train[CFG.target].iloc[val_ind],
        )
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature=cat_features)

        es = DartEarlyStopping("valid_0", "amex_metric", stopping_round=500)
        # The deprecated `verbose_eval` / `early_stopping_rounds` arguments are
        # replaced by callbacks. LightGBM's built-in early stopping is not
        # available in dart mode, so stopping is handled solely by `es`.
        model = lgb.train(
            params=params,
            train_set=lgb_train,
            num_boost_round=5000,
            valid_sets=[lgb_valid],
            callbacks=[es, lgb.log_evaluation(period=10)],
            feval=lgb_amex_metric,
        )
        # With DART, later iterations rescale earlier trees, so the final
        # booster is not the best-scoring one; use the snapshot the callback
        # took at the best iteration (fall back to the final booster if no
        # snapshot exists).
        best_model = es.best_model if es.best_model is not None else model

        # Save best model
        joblib.dump(
            best_model,
            f"lgbm_fold{fold}_seed{CFG.seed}.pkl",
        )
        # Predict validation
        val_pred = best_model.predict(x_val)
        # Add to out of folds array
        oof_predictions[val_ind] = val_pred
        # Predict the test set
        test_pred = best_model.predict(test[features])
        test_predictions += test_pred / CFG.n_folds
        # Compute fold metric
        score = amex_metric(y_val, val_pred)
        print(f"Our fold {fold} CV score is {score}")
        del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
        gc.collect()

    # Compute out of folds metric
    score = amex_metric(train[CFG.target], oof_predictions)
    print(f"Our out of folds CV score is {score}")
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame(
        {
            "customer_ID": train["customer_ID"],
            "target": train[CFG.target],
            "prediction": oof_predictions,
        }
    )
    oof_df.to_csv(
        f"oof_lgbm_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv",
        index=False,
    )
    # Create a dataframe to store test prediction
    test_df = pd.DataFrame(
        {"customer_ID": test["customer_ID"], "prediction": test_predictions}
    )
    test_df.to_csv(
        f"test_lgbm_baseline_{CFG.n_folds}fold_seed{CFG.seed}.csv",
        index=False,
    )

# Entry point: fix the RNG seeds first, then load the engineered tables and
# run the cross-validated training on them.
seed_everything(seed=CFG.seed)
train, test = read_data()
train_and_evaluate(train=train, test=test)