In [1]:
import numpy as np
import pandas as pd
import os
# Let pandas render up to 500 rows and 500 columns when a DataFrame is displayed.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [2]:
import gc
from math import sqrt
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import pickle

import math as mt
from math import *
import matplotlib as mlp
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import seaborn as sns
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict


import joblib
import random
import itertools
import scipy as sp

from itertools import combinations
import warnings
warnings.filterwarnings('ignore')

In [3]:
class CFG:
    """Run-wide settings read by the cells of this notebook."""

    # --- data ---
    input_dir = '../src/data/processed/'  # directory holding the processed parquet files
    target = 'target'                     # name of the label column

    # --- cross-validation / reproducibility ---
    n_folds = 5   # number of StratifiedKFold splits; also the divisor when averaging test predictions
    seed = 42     # passed to seed_everything, StratifiedKFold and the LightGBM params

    # --- tags embedded in saved model / prediction file names ---
    dpv = 'v4'    # presumably "data processing version" -- TODO confirm
    mv = 'v1'     # presumably "model version" -- TODO confirm

In [4]:
def seed_everything(seed):
    """Seed the global Python and NumPy random generators with ``seed``.

    Also exports ``PYTHONHASHSEED=<seed>`` into the process environment.
    NOTE: Python reads that variable at interpreter start-up, so the export
    only influences processes launched after this call.

    Args:
        seed: Integer seed applied to ``random`` and ``numpy.random``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

def read_data(version=None):
    """Load the processed train and test tables from ``CFG.input_dir``.

    Args:
        version: Suffix of the parquet files to read, i.e.
            ``train_<version>.parquet`` and ``test_<version>.parquet``.
            Defaults to ``CFG.dpv`` so the data that is loaded always matches
            the ``dp<version>`` tag written into model and prediction file
            names (with ``CFG.dpv == 'v4'`` this reads the same files as the
            previously hard-coded names).

    Returns:
        tuple: ``(train, test)`` pandas DataFrames.
    """
    if version is None:
        version = CFG.dpv
    # os.path.join keeps the path valid whether or not input_dir ends with a separator.
    train = pd.read_parquet(os.path.join(CFG.input_dir, f'train_{version}.parquet'))
    test = pd.read_parquet(os.path.join(CFG.input_dir, f'test_{version}.parquet'))
    return train, test

def amex_metric(y_true, y_pred):
    """Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows with label 0 get weight 20, all other rows weight 1.

    Args:
        y_true: 1-D sequence of labels (0 / 1).
        y_pred: 1-D sequence of predicted scores, same length as ``y_true``.

    Returns:
        float: ``0.5 * (gini_of_predictions / gini_of_perfect_ranking + top_four)``,
        where ``top_four`` is the share of all positives found among the
        highest-scored rows whose cumulative weight stays within 4% of the
        total weight.
    """
    def _ranked_by(column):
        # Column 0 holds the labels, column 1 the predictions; sort rows descending on `column`.
        pairs = np.transpose(np.array([y_true, y_pred]))
        return pairs[pairs[:, column].argsort()[::-1]]

    def _weighted_gini(column):
        # Weighted area between the Lorentz curve of positives and the cumulative weight share.
        ranked = _ranked_by(column)
        row_weight = np.where(ranked[:, 0] == 0, 20, 1)
        cum_weight_share = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = ranked[:, 0] * row_weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - cum_weight_share) * row_weight)

    # Capture rate among the top 4% (by cumulative weight) of the prediction ranking.
    by_pred = _ranked_by(1)
    pred_weights = np.where(by_pred[:, 0] == 0, 20, 1)
    cutoff = int(0.04 * np.sum(pred_weights))
    captured = by_pred[np.cumsum(pred_weights) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    # Gini of the model ranking, normalized by the Gini of a ranking on the true labels.
    gini_model = _weighted_gini(1)
    gini_perfect = _weighted_gini(0)
    return 0.5 * (gini_model / gini_perfect + top_four)

def amex_metric_np(preds, target):
    """Vectorised variant of the metric using a closed-form maximum Gini.

    Note the argument order is ``(preds, target)``, the reverse of
    ``amex_metric(y_true, y_pred)``.

    Args:
        preds: 1-D NumPy array of predicted scores.
        target: 1-D NumPy array of 0 / 1 labels, same length as ``preds``.

    Returns:
        float: ``0.5 * (normalized_gini + top_4pct_capture_rate)``.
    """
    # Rank every row from highest to lowest predicted score.
    order = np.argsort(preds)[::-1]
    preds = preds[order]
    target = target[order]

    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos

    # Weight 20 for label 0, weight 1 for label 1.
    weight = 20.0 - target * 19.0
    cum_weight_share = (weight / weight.sum()).cumsum()

    # Share of positives captured within the top 4% of cumulative weight.
    in_top_four_pct = cum_weight_share <= 0.04
    capture_rate = np.sum(target[in_top_four_pct]) / np.sum(target)

    # Weighted Gini of the ranking, normalized by its closed-form maximum.
    positive_weight = target * weight
    lorentz = (positive_weight / positive_weight.sum()).cumsum()
    gini = ((lorentz - cum_weight_share) * weight).sum()
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)
    normalized_gini = gini / gini_max

    return 0.5 * (normalized_gini + capture_rate)

In [5]:
# Seed the random generators first, then load the train and test tables.
seed_everything(CFG.seed)
df_train, df_test = read_data()
# Cell output: (rows, columns) of the train and test tables.
df_train.shape, df_test.shape

((458913, 2446), (924621, 2445))

In [6]:
# Base names of the categorical columns.
cat_cols = ['B_30','B_38','D_114','D_116','D_117','D_120','D_126','D_63','D_64','D_66','D_68']
# One feature name per (base column, slot) pair, slots zero-padded 01..13, e.g. 'B_30_01'.
cat_features = [f'{base}_{slot:02d}' for base in cat_cols for slot in range(1, 14)]

In [23]:
# Labels used for training and for stratifying the folds.
Y_train = df_train['target']
# Feature frames: every column except the customer identifier.
# NOTE: X_train still contains the 'target' column at this point; it is
# removed per fold inside the training loop.
X_train = df_train.drop(columns=['customer_ID'])
X_test = df_test.drop(columns=['customer_ID'])

In [24]:
# X_test['B_38_02'].value_counts()

In [25]:
# X_train['B_38_02'].value_counts()

In [26]:
# Replace the value -1 in two X_test columns with a fixed substitute
# (presumably -1 is a category code that the training data does not contain
# in these columns -- TODO confirm with the value_counts checks).
for column, substitute in {'B_30_02': 0, 'B_38_02': 2}.items():
    X_test[column] = np.where(X_test[column] == -1, substitute, X_test[column])

# Label encode categorical features
# for cat_col in tqdm(cat_features):
#     encoder = LabelEncoder()    
#     X_train[cat_col] = encoder.fit_transform(X_train[cat_col])
#     X_test[cat_col] = encoder.transform(X_test[cat_col])

In [40]:
def lgb_amex_metric(y_pred, y_true):
    """Adapter that exposes ``amex_metric`` as a LightGBM ``feval`` callback.

    Args:
        y_pred: Predictions handed over by LightGBM.
        y_true: Dataset object whose ``get_label()`` returns the true labels.

    Returns:
        tuple: ``('amex_metric', value, True)`` -- metric name, metric value,
        and the flag telling LightGBM that higher values are better.
    """
    labels = y_true.get_label()
    metric_value = amex_metric(labels, y_pred)
    return 'amex_metric', metric_value, True

# LightGBM hyper-parameters shared by every fold.
params = {
    'objective': 'binary',
    'metric': "binary_logloss",
    'boosting': 'dart',
    'seed': CFG.seed,
    'num_leaves': 100,
    'learning_rate': 0.01,
    'feature_fraction': 0.20,
    'bagging_freq': 10,
    'bagging_fraction': 0.50,
    'n_jobs': -1,
    'lambda_l2': 2,
    'min_data_in_leaf': 40,
    'verbose': -1,
    }

# Create a numpy array to store test predictions (average over folds)
test_predictions = np.zeros(len(X_test))
# Create a numpy array to store out of folds predictions
oof_predictions = np.zeros(len(X_train))

# Folds whose model was trained in an earlier session and should be reused.
# A fold listed here is still trained from scratch when its file is missing,
# so the cell also works on a fresh checkout.
pretrained_folds = {0}

kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(X_train, Y_train)):

    print(' ')
    print('-'*50)
    print(f'Training fold {fold}:-')
    # Select the fold rows first and drop the label afterwards, so only the
    # row subset is copied instead of the whole frame twice per fold.
    x_train = X_train.iloc[trn_ind].drop(columns='target')
    x_val = X_train.iloc[val_ind].drop(columns='target')
    y_train, y_val = Y_train.iloc[trn_ind], Y_train.iloc[val_ind]

    print('X_train shape:', x_train.shape)
    print('X_valid shape:', x_val.shape)
    print('X_test shape:', X_test.shape)

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_valid = lgb.Dataset(x_val, y_val)

    # Single source of truth for where this fold's model lives.
    model_path = f'../src/models/BinaryModels/lgb_{CFG.mv}_dp{CFG.dpv}_fold{fold}_seed{CFG.seed}.pkl'

    if fold in pretrained_folds and os.path.exists(model_path):
        # Read with joblib because the file was written by joblib.dump below.
        # joblib.load unpickles the file: only load model files you produced yourself.
        model = joblib.load(model_path)
    else:
        # NOTE(review): LightGBM does not apply early stopping when
        # boosting='dart', so all num_boost_round rounds are run and the model
        # saved below is the final one -- confirm against the installed version.
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            num_boost_round = 7500,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 100,
            verbose_eval = 500,
            feval = lgb_amex_metric
            )

        # Persist the trained model for later reuse
        joblib.dump(model, model_path)

    # Predict validation
    val_pred = model.predict(x_val)
    # Add to out of folds array
    oof_predictions[val_ind] = val_pred
    # Predict the test set and accumulate the fold average
    test_pred = model.predict(X_test)
    test_predictions += test_pred / CFG.n_folds
    # Compute fold metric
    score = amex_metric(y_val, val_pred)
    print(f'Our fold {fold} CV score is {score}')
    # Release the per-fold copies before the next iteration allocates new ones.
    del x_train, x_val, y_train, y_val, lgb_train, lgb_valid
    gc.collect()

 
--------------------------------------------------
Training fold 0:-
X_train shape: (367130, 2444)
X_valid shape: (91783, 2444)
X_test shape: (924621, 2444)
Our fold 0 CV score is 0.7928680464958229
 
--------------------------------------------------
Training fold 1:-
X_train shape: (367130, 2444)
X_valid shape: (91783, 2444)
X_test shape: (924621, 2444)
[500]	training's binary_logloss: 0.347133	training's amex_metric: 0.760543	valid_1's binary_logloss: 0.350417	valid_1's amex_metric: 0.742846
[1000]	training's binary_logloss: 0.256225	training's amex_metric: 0.780681	valid_1's binary_logloss: 0.263792	valid_1's amex_metric: 0.757842
[1500]	training's binary_logloss: 0.231022	training's amex_metric: 0.796553	valid_1's binary_logloss: 0.242829	valid_1's amex_metric: 0.767007
[2000]	training's binary_logloss: 0.216061	training's amex_metric: 0.811688	valid_1's binary_logloss: 0.233098	valid_1's amex_metric: 0.773669
[2500]	training's binary_logloss: 0.208394	training's amex_metric: 0.

[3500]	training's binary_logloss: 0.194398	training's amex_metric: 0.84343	valid_1's binary_logloss: 0.223495	valid_1's amex_metric: 0.784861
[4000]	training's binary_logloss: 0.188423	training's amex_metric: 0.852925	valid_1's binary_logloss: 0.222364	valid_1's amex_metric: 0.786692
[4500]	training's binary_logloss: 0.182605	training's amex_metric: 0.862625	valid_1's binary_logloss: 0.221464	valid_1's amex_metric: 0.788072
[5000]	training's binary_logloss: 0.176888	training's amex_metric: 0.872399	valid_1's binary_logloss: 0.220728	valid_1's amex_metric: 0.78828
[5500]	training's binary_logloss: 0.171831	training's amex_metric: 0.881393	valid_1's binary_logloss: 0.220313	valid_1's amex_metric: 0.788633
[6000]	training's binary_logloss: 0.167496	training's amex_metric: 0.889388	valid_1's binary_logloss: 0.219989	valid_1's amex_metric: 0.788517
[6500]	training's binary_logloss: 0.162927	training's amex_metric: 0.897436	valid_1's binary_logloss: 0.219721	valid_1's amex_metric: 0.788713
[

In [41]:
# Compute out of folds metric
score = amex_metric(Y_train, oof_predictions)
print(f'Our out of folds CV score is {score}')

# Create a dataframe to store out of folds predictions
oof_df = pd.DataFrame({'customer_ID': df_train['customer_ID'], 'target': Y_train, 'prediction': oof_predictions})
oof_df.to_csv(f'oof_{CFG.mv}_dp{CFG.dpv}{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)

# Create a dataframe to store test prediction
test_df = pd.DataFrame({'customer_ID': df_test['customer_ID'], 'prediction': test_predictions})
test_df.to_csv(f'lgb_{CFG.mv}_dp{CFG.dpv}{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)

Our out of folds CV score is 0.7874231500547724


In [42]:
# Upload the test-prediction CSV to the amex-default-prediction competition via the Kaggle CLI.
# NOTE(review): the file name is hard-coded; it must equal the name written by
# test_df.to_csv (mv='v1', dpv='v4', n_folds=5, seed=42).
# NOTE(review): the message quotes "CV 7875" while the printed out-of-folds score is 0.7874 -- confirm.
!kaggle competitions submit -c amex-default-prediction -f lgb_v1_dpv45fold_seed42.csv -m "all pivoted \
2444 feats CV 7875"

100%|██████████████████████████████████████| 75.2M/75.2M [00:07<00:00, 10.5MB/s]
Successfully submitted to American Express - Default Prediction