In [1]:
import numpy as np
import pandas as pd
import os
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
import gc
from math import sqrt
from sklearn.metrics import roc_auc_score
import pickle

import lightgbm as lgb
import math as mt
from math import *
import matplotlib as mlp
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import seaborn as sns
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict


import joblib
import random
import itertools
import scipy as sp

from itertools import combinations
import warnings
warnings.filterwarnings('ignore')

In [3]:
class CFG:
    """Run-wide settings read by the other cells of this notebook."""
    # Seed for the RNGs, the fold split and LightGBM; number of CV folds.
    seed, n_folds = 42, 9
    # Name of the label column.
    target = 'target'
    # Folder the parquet inputs are read from.
    input_dir = '../src/data/processed/'
    # Version tags embedded in model / prediction file names.
    # NOTE(review): the original comment on dpv read "v2 with date feats" although the value is 'v5' — confirm.
    dpv, mv = 'v5', 'v1'

In [4]:
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def read_data():
    """Load the train and test feature tables from `CFG.input_dir`.

    In the test table, `D_86_last` values equal to -1 are replaced by 0.
    Every column whose name contains '_isFirstEq' or '_isLastEq' (as found in
    the train table) is dropped from both tables.

    Returns:
        (train, test) as pandas DataFrames.
    """
    train = pd.read_parquet(CFG.input_dir + 'train_fe_public.parquet')
    test = pd.read_parquet(CFG.input_dir + 'test_fe_public.parquet')

    d86 = test['D_86_last']
    test['D_86_last'] = np.where(d86 == -1, 0, d86)

    # The drop list is derived from the train columns and applied to both frames.
    first_eq = [name for name in train.columns if '_isFirstEq' in name]
    last_eq = [name for name in train.columns if '_isLastEq' in name]
    dcols = first_eq + last_eq
    return train.drop(dcols, axis=1), test.drop(dcols, axis=1)

def amex_metric(y_true, y_pred):
    """Return 0.5 * (normalized weighted Gini + top-4% capture rate).

    Rows with label 0 carry weight 20, all other rows weight 1.

    Args:
        y_true: array-like of labels.
        y_pred: array-like of scores, same length as `y_true`.

    Returns:
        The combined score as a float.
    """
    pairs = np.transpose(np.array([y_true, y_pred]))

    # Capture rate: share of positives inside the first 4% of cumulative weight
    # when rows are ranked by descending prediction.
    by_pred = pairs[pairs[:, 1].argsort()[::-1]]
    row_w = np.where(by_pred[:, 0] == 0, 20, 1)
    head = by_pred[np.cumsum(row_w) <= int(0.04 * np.sum(row_w))]
    top_four = np.sum(head[:, 0]) / np.sum(by_pred[:, 0])

    def _weighted_gini(sort_col):
        # Weighted Gini when rows are ranked by column `sort_col`, descending.
        ranked = pairs[pairs[:, sort_col].argsort()[::-1]]
        weight = np.where(ranked[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(ranked[:, 0] * weight)
        cum_pos_found = np.cumsum(ranked[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        return np.sum((lorentz - weight_random) * weight)

    gini_by_pred = _weighted_gini(1)   # ranking given by the predictions
    gini_by_label = _weighted_gini(0)  # ranking given by the labels themselves
    return 0.5 * (gini_by_pred / gini_by_label + top_four)

def amex_metric_np(preds, target):
    """Single-pass variant of the validation metric.

    Args:
        preds: array of scores.
        target: array of labels, same length as `preds`.

    Returns:
        0.5 * (g + d), where d is the share of positives within the first 4% of
        cumulative weight and g is the weighted Gini divided by a closed-form maximum.
    """
    order = np.argsort(preds)[::-1]
    preds, target = preds[order], target[order]

    # Label 0 -> weight 20, label 1 -> weight 1.
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()

    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos

    in_top_four = cum_norm_weight <= 0.04
    d = np.sum(target[in_top_four]) / n_pos

    weighted_target = target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)

    return 0.5 * (gini / gini_max + d)

In [5]:
# Seed the RNGs first, then load the train/test feature tables.
seed_everything(CFG.seed)
df_train, df_test = read_data()
# Cell output: (rows, columns) of both frames.
df_train.shape, df_test.shape

((458913, 1650), (924621, 1649))

In [6]:
# Columns stored as float32 / float64 form the numeric feature list.
num_cols = [name for name, dtype in df_train.dtypes.items() if dtype in ('float32', 'float64')]
len(num_cols)

1173

In [7]:
# D_* = Delinquency variables
# S_* = Spend variables
# P_* = Payment variables
# B_* = Balance variables
# R_* = Risk variables

In [8]:
# Scols = [col for col in df_train.columns if col.startswith('S_') and col in num_cols]
# Dcols = [col for col in df_train.columns if col.startswith('D_') and col in num_cols]
# Bcols = [col for col in df_train.columns if col.startswith('B_') and col in num_cols]
# Rcols = [col for col in df_train.columns if col.startswith('R_') and col in num_cols]

# len(Scols), len(Dcols), len(Bcols), len(Rcols)

In [9]:
from sklearn.preprocessing import QuantileTransformer

# Each numeric column gets its own transformer, fitted on the train values only
# and then applied to both train and test. +/-inf is turned into NaN beforehand.
for col in num_cols:
    transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
    train_vals = df_train[col].replace([np.inf, -np.inf], np.nan).values.reshape(-1, 1)
    test_vals = df_test[col].replace([np.inf, -np.inf], np.nan).values.reshape(-1, 1)
    transformer.fit(train_vals)
    df_train[col] = transformer.transform(train_vals).ravel()
    df_test[col] = transformer.transform(test_vals).ravel()

In [10]:
# Base names of the categorical variables; the "_last" column of each one is
# what gets label-encoded and passed to LightGBM as a categorical feature.
cat_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]
cat_features = [f"{base}_last" for base in cat_cols]

In [11]:
# Cell output: (rows, columns) of the train and test frames.
df_train.shape, df_test.shape

((458913, 1650), (924621, 1649))

In [12]:
# Base variables whose "_last" column also gets a copy rounded to 2 decimals.
# The list contains duplicates ('D_41', 'D_48'); they are collapsed below.
rcols = ['P_2','B_9','D_44','D_48','B_2','B_1','B_11','B_37','B_3','B_18','R_1','D_41','S_23','S_3','B_4',
         'R_2','B_7','D_41','D_39','D_48']

for col in dict.fromkeys(f"{cf}_last" for cf in rcols):
    # Only create the feature when it can be created in BOTH frames. The previous
    # bare `except: pass` could add it to train and then silently skip test.
    if col not in df_train.columns or col not in df_test.columns:
        print(f"Skipping round_{col}: source column missing from train or test")
        continue
    df_train['round_'+col] = df_train[col].round(2)
    df_test['round_'+col] = df_test[col].round(2)

In [13]:
# Cell output: frame shapes after the round_* columns were added.
df_train.shape, df_test.shape

((458913, 1668), (924621, 1667))

In [14]:
def add_stats(data, feats, type_col):
    """Return a copy of `data` extended with row-wise statistics over `feats`.

    Added columns (all prefixed with ``type_col``):
      * ``_f_min/_f_max/_f_mean/_f_sd/_f_sum/_f_skew`` — statistics over the
        selected columns with NaN replaced by 0, rounded to 2 decimals.
      * ``_f_mean{one,two,four,eight,sixteen}_meandiff`` — rounded row sum divided
        by the number of (unfilled) values greater than 1/2/4/8/16, minus the
        rounded row mean. A zero count gives inf or NaN, as in plain pandas division.

    Args:
        data: input DataFrame; it is not modified.
        feats: list of column names to aggregate.
        type_col: prefix for the new column names.

    Returns:
        A new DataFrame with the statistic columns added.
    """
    cols = feats
    raw = data[cols]
    # Fill once instead of once per statistic.
    filled = raw.fillna(0)

    stats = {
        type_col + '_f_min': filled.min(axis=1).round(2),
        type_col + '_f_max': filled.max(axis=1).round(2),
        type_col + '_f_mean': filled.mean(axis=1).round(2),
        type_col + '_f_sd': filled.std(axis=1).round(2),
        type_col + '_f_sum': filled.sum(axis=1).round(2),
        type_col + '_f_skew': filled.skew(axis=1).round(2),
    }
    row_sum = stats[type_col + '_f_sum']
    row_mean = stats[type_col + '_f_mean']

    # The counts stay local, so no temporary columns are written to the frame.
    thresholds = (('one', 1), ('two', 2), ('four', 4), ('eight', 8), ('sixteen', 16))
    for label, threshold in thresholds:
        n_above = raw.gt(threshold).sum(axis=1)
        stats[type_col + '_f_mean' + label + '_meandiff'] = row_sum / n_above - row_mean

    # assign() copies the frame, so the caller's object is left untouched.
    return data.assign(**stats)

In [15]:
# Row-wise statistics over the numeric columns whose name contains 'last'.
typecol = 'last'
stat_feats = [col for col in num_cols if 'last' in col and col not in cat_features]
df_train = add_stats(df_train, stat_feats, typecol)
df_test = add_stats(df_test, stat_feats, typecol)

df_train.shape, df_test.shape

((458913, 1679), (924621, 1678))

In [16]:
typecol = 'round'
# `num_cols` is built before any round_* column is created, so filtering it for
# 'round_' selects nothing (unless the raw data already has such columns) and the
# round_f_* statistics end up computed over zero columns. Take the rounded columns
# from the frame instead; round_f_* is excluded so a re-run does not feed the
# statistics back into themselves.
# NOTE: models fitted before this fix saw the degenerate round_f_* values and
# should be retrained.
stat_feats = [col for col in df_train.columns
              if col.startswith('round_')
              and not col.startswith('round_f_')
              and col not in cat_features]
df_train = add_stats(df_train, stat_feats, typecol)
df_test = add_stats(df_test, stat_feats, typecol)

df_train.shape, df_test.shape

((458913, 1690), (924621, 1689))

In [17]:
# Row-wise statistics over the numeric columns whose name contains 'mean'.
typecol = 'mean'
stat_feats = [col for col in num_cols if 'mean' in col and col not in cat_features]
df_train = add_stats(df_train, stat_feats, typecol)
df_test = add_stats(df_test, stat_feats, typecol)

df_train.shape, df_test.shape

((458913, 1701), (924621, 1700))

In [18]:
# Row-wise statistics over the numeric columns whose name contains 'min'.
typecol = 'min'
stat_feats = [col for col in num_cols if 'min' in col and col not in cat_features]
df_train = add_stats(df_train, stat_feats, typecol)
df_test = add_stats(df_test, stat_feats, typecol)

df_train.shape, df_test.shape

((458913, 1712), (924621, 1711))

In [19]:
# Row-wise statistics over the numeric columns whose name contains 'max'.
typecol = 'max'
stat_feats = [col for col in num_cols if 'max' in col and col not in cat_features]
df_train = add_stats(df_train, stat_feats, typecol)
df_test = add_stats(df_test, stat_feats, typecol)

df_train.shape, df_test.shape

((458913, 1723), (924621, 1722))

In [20]:
# Optional step: left-join the date feature tables on customer_ID.
merge_date = False

if merge_date:

    dt_trn = pd.read_parquet(CFG.input_dir + 'dt_trn.parquet')
    df_train = df_train.merge(dt_trn, on='customer_ID', how='left')

    del dt_trn
    gc.collect()

    dt_tst = pd.read_parquet(CFG.input_dir + 'dt_tst.parquet')
    df_test = df_test.merge(dt_tst, on='customer_ID', how='left')

    # The original also deleted `fcols`, a name that is never defined in the
    # visible notebook, which raised NameError as soon as merge_date was enabled.
    del dt_tst
    gc.collect()

df_train.shape, df_test.shape

((458913, 1723), (924621, 1722))

In [21]:
# Label vector, plus feature frames without the customer key.
# X_train still contains the 'target' column; it is removed inside the fold loop.
Y_train = df_train['target']
X_train = df_train.drop(columns=['customer_ID'])
X_test = df_test.drop(columns=['customer_ID'])

In [22]:
# Integer-encode each categorical "_last" column. The encoder is fitted on the
# train values, so a test value that never occurs in train makes transform() raise.
for cat_col in cat_features:
    encoder = LabelEncoder().fit(X_train[cat_col])
    X_train[cat_col] = encoder.transform(X_train[cat_col])
    X_test[cat_col] = encoder.transform(X_test[cat_col])

In [23]:
# Difference / ratio features between the last, first, mean and max aggregates.
# Candidate base names: strip the trailing "_last" (5 characters) from every
# column containing 'last' that is not a lag, rounded or row-statistic column.
num_cols = [col[:-5] for col in df_train.columns
            if 'last' in col
            and 'lag' not in col
            and 'round' not in col
            and 'last_f_' not in col]

print(len(num_cols))
l = 0
for col in num_cols:
    # Require every aggregate in BOTH frames up front. The previous bare
    # `except: pass` hid any error and could leave a base variable with only
    # some of its six features, or with features in train but not in test.
    needed = [f'{col}_{agg}' for agg in ('last', 'mean', 'first', 'max')]
    if not all(name in X_train.columns and name in X_test.columns for name in needed):
        continue

    for frame in (X_train, X_test):
        last, mean = frame[f'{col}_last'], frame[f'{col}_mean']
        first, peak = frame[f'{col}_first'], frame[f'{col}_max']
        # Creation order matches the original so the column layout is unchanged.
        frame[f'{col}_last_mean_diff'] = last - mean
        frame[f'{col}_last_first_R'] = last / first
        frame[f'{col}_last_mean_R'] = last / mean
        frame[f'{col}_last_max_diff'] = last - peak
        frame[f'{col}_last_max_R'] = last / peak
        frame[f'{col}_max_mean_R'] = peak / mean
    l = l+1

# Number of base variables for which the six features were created.
print(l)

188
177


In [24]:
# Cell output: shapes of the final feature frames (X_train still includes 'target').
X_train.shape, X_test.shape

((458913, 2784), (924621, 2783))

In [25]:
class DartEarlyStopping:
    """Training callback that tracks the best iteration of one monitored metric.

    Each time the monitored score improves, a pickle round-trip copy of
    ``env.model`` is stored in ``best_model``. When the score has not improved
    and more than ``stopping_round`` iterations have passed since the best one,
    ``lgb.callback.EarlyStopException`` is raised.
    """

    def __init__(self, data_name, monitor_metric, stopping_round):
        # (dataset name, metric name) pair to look for in the evaluation results.
        self.data_name, self.monitor_metric = data_name, monitor_metric
        # Allowed number of iterations without improvement.
        self.stopping_round = stopping_round
        # State of the best iteration seen so far.
        self.best_score = None
        self.best_model = None
        self.best_score_list = []
        self.best_iter = 0

    def _is_higher_score(self, metric_score, is_higher_better):
        """Return True if `metric_score` beats the stored best, or if none is stored yet."""
        if self.best_score is None:
            return True
        if is_higher_better:
            return self.best_score < metric_score
        return self.best_score > metric_score

    def _deepcopy(self, x):
        """Clone `x` by pickling and unpickling it."""
        return pickle.loads(pickle.dumps(x))

    def __call__(self, env):
        """Inspect the evaluation results of the current iteration.

        Raises:
            lgb.callback.EarlyStopException: patience exhausted.
            ValueError: the monitored (dataset, metric) pair is not in the results.
        """
        results = env.evaluation_result_list
        for dataset, metric_name, score, higher_is_better in results:
            if dataset != self.data_name or metric_name != self.monitor_metric:
                continue

            if self._is_higher_score(score, higher_is_better):
                # New best: snapshot the model before recording the bookkeeping.
                self.best_model = self._deepcopy(env.model)
                self.best_iter = env.iteration
                self.best_score_list = results
                self.best_score = score
                return

            stalled_for = env.iteration - self.best_iter
            if stalled_for > self.stopping_round:
                eval_result_str = '\t'.join([lgb.callback._format_eval_result(x) for x in self.best_score_list])
                lgb.basic._log_info(f"Early stopping, best iteration is:\n[{self.best_iter+1}]\t{eval_result_str}")
                lgb.basic._log_info(f"You can get best model by \"DartEarlyStopping.best_model\"")
                raise lgb.callback.EarlyStopException(self.best_iter, self.best_score_list)
            return

        raise ValueError("monitoring metric not found")
        

In [26]:
# Parameter dict passed to lgb.train for every fold.
params = {
    'objective': 'binary',
    'boosting': 'dart',
    'seed': CFG.seed,  # same seed that drives the fold split
    'num_leaves': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.20,
    'bagging_freq': 8,
    'bagging_fraction': 0.40,
    'n_jobs': -1,
    'lambda_l2': 2,
    'min_data_in_leaf': 40,
    'verbose': -1,
#     'metric': "binary_logloss", 
    'metric': 'amex_metric',  # same name that lgb_amex_metric reports for its score
#     'first_metric_only': True,
    }

In [30]:
def lgb_amex_metric(y_pred, y_true):
    """Custom evaluation function handed to lgb.train via `feval`.

    Args:
        y_pred: predicted scores.
        y_true: object exposing `get_label()`; the labels are read from it.

    Returns:
        ('amex_metric', score, True) — metric name, value, and a flag saying higher is better.
    """
    labels = y_true.get_label()
    score = amex_metric(labels, y_pred)
    return 'amex_metric', score, True

# Create a numpy array to store test predictions
test_predictions = np.zeros(len(X_test))
# Create a numpy array to store out of folds predictions
oof_predictions = np.zeros(len(X_train))

# Set to True to ignore model files on disk and fit every fold again.
force_retrain = False

# Drop the label once here instead of copying the full frame twice per fold.
train_features = X_train.drop('target', axis=1)

kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(X_train, Y_train)):

    print(' ')
    print('-'*50)
    print(f'Training fold {fold}:-')

    x_train, x_val = train_features.iloc[trn_ind], train_features.iloc[val_ind]
    y_train, y_val = Y_train.iloc[trn_ind], Y_train.iloc[val_ind]

    print('X_train shape:', x_train.shape)
    print('X_valid shape:', x_val.shape)
    print('X_test shape:', X_test.shape)

    model_path = f'../src/models/BinaryModels/lgb_{CFG.mv}_dp{CFG.dpv}_fold{fold}_seed{CFG.seed}.pkl'

    # Reuse a saved fold model when one exists, otherwise train and save it.
    # This replaces the hand-edited `if fold>8:` switch, which never trained
    # with 9 folds and failed with FileNotFoundError when a file was missing.
    if os.path.exists(model_path) and not force_retrain:
        # Models are written with joblib.dump, so read them back with joblib.load.
        # NOTE: unpickling runs arbitrary code — only load files produced by this notebook.
        model = joblib.load(model_path)
    else:
        lgb_train = lgb.Dataset(x_train, y_train, categorical_feature = cat_features)
        lgb_valid = lgb.Dataset(x_val, y_val, categorical_feature = cat_features)

        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 15000,
            valid_sets = [lgb_train, lgb_valid],
            verbose_eval = 1000,
            feval = lgb_amex_metric,
            # Early stopping stays disabled, as before. To enable it:
#             callbacks=[DartEarlyStopping("valid_1", "amex_metric", 750)],
            )

        # Save the fitted model
        joblib.dump(model, model_path)
        del lgb_train, lgb_valid

    # Predict validation
    val_pred = model.predict(x_val)
    # Add to out of folds array
    oof_predictions[val_ind] = val_pred
    # Predict the test set; each fold contributes 1/n_folds of the average
    test_pred = model.predict(X_test)
    test_predictions += test_pred / CFG.n_folds
    # Compute fold metric
    score = amex_metric(y_val, val_pred)
    print(f'Our fold {fold} CV score is {score}')
    del x_train, x_val, y_train, y_val
    gc.collect()

del train_features
gc.collect()

 
--------------------------------------------------
Training fold 0:-
X_train shape: (407922, 2783)
X_valid shape: (50991, 2783)
X_test shape: (924621, 2783)
Our fold 0 CV score is 0.8035082131354067
 
--------------------------------------------------
Training fold 1:-
X_train shape: (407922, 2783)
X_valid shape: (50991, 2783)
X_test shape: (924621, 2783)
Our fold 1 CV score is 0.8002827155503132
 
--------------------------------------------------
Training fold 2:-
X_train shape: (407922, 2783)
X_valid shape: (50991, 2783)
X_test shape: (924621, 2783)
Our fold 2 CV score is 0.8016446846574319
 
--------------------------------------------------
Training fold 3:-
X_train shape: (407923, 2783)
X_valid shape: (50990, 2783)
X_test shape: (924621, 2783)
Our fold 3 CV score is 0.7846536069416695
 
--------------------------------------------------
Training fold 4:-
X_train shape: (407923, 2783)
X_valid shape: (50990, 2783)
X_test shape: (924621, 2783)
Our fold 4 CV score is 0.803011904016

In [31]:
# Compute out of folds metric
score = amex_metric(Y_train, oof_predictions)
print(f'Our out of folds CV score is {score}')

oof_filename = f'OOFs/oof_{CFG.mv}_dp{CFG.dpv}{CFG.n_folds}fold_seed{CFG.seed}.csv'
tst_filename = f'PREDs/lgb_{CFG.mv}_dp{CFG.dpv}{CFG.n_folds}fold_seed{CFG.seed}.csv'

# to_csv does not create missing folders, so make sure both targets exist.
for out_path in (oof_filename, tst_filename):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Create a dataframe to store out of folds predictions
oof_df = pd.DataFrame({'customer_ID': df_train['customer_ID'], 'target': Y_train, 'prediction': oof_predictions})
oof_df.to_csv(oof_filename, index = False)

# Create a dataframe to store test prediction
test_df = pd.DataFrame({'customer_ID': df_test['customer_ID'], 'prediction': test_predictions})
test_df.to_csv(tst_filename, index = False)
print(tst_filename)

Our out of folds CV score is 0.7973207780207017
PREDs/lgb_v1_dpv59fold_seed42.csv


In [29]:
# !kaggle competitions submit -c amex-default-prediction -f cb_v2_dpv25fold_seed802.csv -m "catboost with \
# 2008 feats CV 7952"