In [1]:
import numpy as np
import pandas as pd
import os
# Raise pandas' display limits so up to 500 rows / 500 columns are shown in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [18]:
import gc
from math import sqrt
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import roc_auc_score
import pickle

import catboost as cb 

import math as mt
from math import *
import matplotlib as mlp
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import seaborn as sns
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict


import joblib
import random
import itertools
import scipy as sp

from itertools import combinations
import warnings
warnings.filterwarnings('ignore')

In [6]:
class CFG:
    """Run-wide constants shared by the cells of this notebook."""
    # Seed passed to seed_everything() and to the StratifiedKFold splitter;
    # also embedded in the model and prediction file names.
    seed = 42
    # Number of cross-validation folds; also the divisor used when averaging test predictions.
    n_folds = 5
    # Presumably the label column name; the visible cells use the literal 'target' instead — TODO confirm.
    target = 'target'
    # Directory prefix that read_data() prepends to the parquet file names.
    input_dir = '../src/data/processed/'
    # Version tag embedded in model / prediction file names after "dp"
    # (presumably the data-processing version — verify).
    dpv = 'v4'
    # Version tag embedded in model / prediction file names after "cb_"
    # (presumably the model version — verify).
    mv = 'v1'

In [7]:
def seed_everything(seed):
    """Seed Python's and NumPy's global RNGs and export the seed as PYTHONHASHSEED.

    Args:
        seed: integer seed applied to ``random``, ``numpy.random`` and the
            ``PYTHONHASHSEED`` environment variable.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so this
    # likely only affects processes spawned afterwards — confirm if hash stability matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def read_data():
    """Load the processed train and test feature tables.

    The file names are derived from ``CFG.input_dir`` and the data version
    ``CFG.dpv`` (``train_<dpv>.parquet`` / ``test_<dpv>.parquet``), so the data
    that is read always matches the version tag used in the output file names.

    Returns:
        tuple: ``(train, test)`` DataFrames as returned by ``pd.read_parquet``.
    """
    train = pd.read_parquet(f'{CFG.input_dir}train_{CFG.dpv}.parquet')
    test = pd.read_parquet(f'{CFG.input_dir}test_{CFG.dpv}.parquet')
    return train, test

def amex_metric(y_true, y_pred):
    """Return the mean of a normalized weighted Gini and the top-4% capture rate.

    Rows whose label equals 0 are weighted 20, all other rows are weighted 1.

    Args:
        y_true: 1-D sequence of labels.
        y_pred: 1-D sequence of scores, same length as ``y_true``.

    Returns:
        ``0.5 * (gini_by_score / gini_by_label + top_four)``.
    """
    # Column 0 holds the labels, column 1 the scores.
    pairs = np.array([y_true, y_pred]).T

    def _sorted_desc(column):
        # Rows reordered by the given column, largest value first.
        return pairs[pairs[:, column].argsort()[::-1]]

    def _row_weights(rows):
        return np.where(rows[:, 0] == 0, 20, 1)

    def _weighted_gini(column):
        rows = _sorted_desc(column)
        w = _row_weights(rows)
        random_curve = np.cumsum(w / np.sum(w))
        weighted_labels = rows[:, 0] * w
        lorentz_curve = np.cumsum(weighted_labels) / np.sum(weighted_labels)
        return np.sum((lorentz_curve - random_curve) * w)

    # Share of all label mass found in the highest-scored rows whose cumulative
    # weight stays within 4% of the total weight.
    ranked = _sorted_desc(1)
    ranked_w = _row_weights(ranked)
    within_cut = np.cumsum(ranked_w) <= int(0.04 * np.sum(ranked_w))
    top_four = np.sum(ranked[within_cut][:, 0]) / np.sum(ranked[:, 0])

    # Gini when sorting by score, normalized by the Gini when sorting by the labels themselves.
    score_gini = _weighted_gini(1)
    label_gini = _weighted_gini(0)
    return 0.5 * (score_gini / label_gini + top_four)

def amex_metric_np(preds, target):
    """Vectorized metric: mean of a normalized weighted Gini and the top-4% capture rate.

    Note the argument order is ``(preds, target)``, the reverse of ``amex_metric``.

    Inputs are converted with ``np.asarray`` so that the reordering below is
    always positional. Without this, a pandas Series would be indexed by label
    (failing or mis-ordering on a non-default index) and plain lists would fail.

    Args:
        preds: 1-D array-like of scores.
        target: 1-D array-like of 0/1 labels, same length as ``preds``.

    Returns:
        ``0.5 * (g + d)`` where ``g`` is the Gini divided by its closed-form
        maximum and ``d`` is the share of positives captured within the first
        4% of cumulative weight.
    """
    preds = np.asarray(preds)
    target = np.asarray(target)

    # Sort both arrays by score, highest first.
    indices = np.argsort(preds)[::-1]
    preds, target = preds[indices], target[indices]

    # Weight is 20 for target == 0 and 1 for target == 1.
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()

    # Capture rate within the first 4% of cumulative weight.
    four_pct_mask = cum_norm_weight <= 0.04
    d = np.sum(target[four_pct_mask]) / np.sum(target)

    # Weighted Gini of the score ordering.
    weighted_target = target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # Closed-form Gini normalizer computed from the class counts.
    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)
    g = gini / gini_max
    return 0.5 * (g + d)

In [8]:
# Seed the global RNGs before anything stochastic runs.
seed_everything(CFG.seed)
# Read the processed train / test tables from CFG.input_dir.
df_train, df_test = read_data()
# Last expression of the cell: displays both shapes as the cell output.
df_train.shape, df_test.shape

((458913, 2446), (924621, 2445))

In [9]:
# Base names of the categorical columns.
cat_cols = ['B_30','B_38','D_114','D_116','D_117','D_120','D_126','D_63','D_64','D_66','D_68']
# Every base name expanded with the zero-padded suffixes _01 .. _13, grouped by base name.
cat_features = [
    base + '_' + str(step).zfill(2)
    for base in cat_cols
    for step in range(1, 14)
]

In [10]:
# Base names of the columns earmarked for a log transform
# (the transform itself is only present as commented-out code in this notebook).
log_cols = ["B_4","D_62","D_53","D_55","D_132"]
# Every base name expanded with the zero-padded suffixes _01 .. _13, grouped by base name.
log_transform = [
    base + '_' + str(step).zfill(2)
    for base in log_cols
    for step in range(1, 14)
]

In [27]:
# Feature frames: both tables without the customer identifier.
# Note: X_train still contains the 'target' column at this point.
X_train, X_test = (frame.drop(columns=['customer_ID']) for frame in (df_train, df_test))
# Labels for the training rows.
Y_train = df_train['target']

In [28]:
# Recode the value -1 in two test-set columns (B_30_02 -> 0, B_38_02 -> 2).
# NOTE(review): the same recode is not applied to X_train — confirm this is intentional.
for feature, replacement in (('B_30_02', 0), ('B_38_02', 2)):
    X_test[feature] = np.where(X_test[feature] == -1, replacement, X_test[feature])

# For every column that is float16 in the test frame, round the values to two
# decimals in both frames (rounded as float32, then stored back as float16).
for col in X_test.columns:
    if X_test[col].dtype != 'float16':
        continue
    for frame in (X_train, X_test):
        frame[col] = frame[col].astype('float32').round(decimals=2).astype('float16')

In [29]:
# for col in tqdm(log_transform):
#     X_train[col] = np.log(X_train[col]+1)
#     X_test[col] = np.log(X_test[col]+1)

In [41]:
def cb_amex_metric(y_pred, y_true):
    """Evaluate ``amex_metric`` for a dataset object exposing ``get_label()``.

    Not referenced by the visible training cell, which evaluates with 'Logloss'.

    Args:
        y_pred: predicted scores.
        y_true: object whose ``get_label()`` returns the true labels.

    Returns:
        tuple: ``('amex_metric', value, True)``; the trailing ``True`` is
        presumably a "higher is better" flag — TODO confirm against the consumer.
    """
    labels = y_true.get_label()
    metric_value = amex_metric(labels, y_pred)
    return 'amex_metric', metric_value, True

# CatBoost training parameters passed to cb.train().
params = {
 'nan_mode': 'Min',
 'eval_metric': 'Logloss',
 'iterations': 7500,
 'grow_policy': 'SymmetricTree', #['Depthwise','Lossguide']
 'boosting_type': 'Plain', #['Ordered']
 'l2_leaf_reg': 3,
 'subsample': 0.66,
 'use_best_model': True,
 # Taken from CFG so the seed used for training always matches the seed
 # embedded in the saved model / prediction file names.
 'random_seed': CFG.seed,
 'depth': 8,
 'best_model_min_trees': 4200,
 'min_data_in_leaf': 31,
 'loss_function': 'Logloss',
 'learning_rate': 0.05}

# Create a numpy array to store test predictions
test_predictions = np.zeros(len(X_test))
# Create a numpy array to store out of folds predictions
oof_predictions = np.zeros(len(X_train))

# X_train still carries the label column. Strip it once here instead of
# copying the whole frame twice per fold inside the loop.
train_features = X_train.drop('target', axis=1)

kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(X_train, Y_train)):

    print(' ')
    print('-'*50)
    print(f'Training fold {fold}:-')
    x_train, x_val = train_features.iloc[trn_ind], train_features.iloc[val_ind]
    y_train, y_val = Y_train.iloc[trn_ind], Y_train.iloc[val_ind]

    print('X_train shape:', x_train.shape)
    print('X_valid shape:', x_val.shape)
    print('X_test shape:', X_test.shape)

    model_path = f'../src/models/BinaryModels/cb_{CFG.mv}_dp{CFG.dpv}_fold{fold}_seed{CFG.seed}.pkl'

    if os.path.exists(model_path):
        # Reuse a model saved by an earlier run instead of a hand-edited
        # "if fold > 0" switch. Delete the file or bump CFG.mv to force retraining.
        # Loaded with joblib because it was written with joblib.dump.
        # NOTE: loading unpickles arbitrary objects — only load files you created.
        model = joblib.load(model_path)
    else:
        # Pools are only needed when training, so build them here.
        cb_train = Pool(data=x_train, label=y_train)
        cb_valid = Pool(data=x_val, label=y_val)

        model = cb.train(
            params=params,
            dtrain=cb_train,
            num_boost_round=params['iterations'],
            eval_set=[cb_train, cb_valid],
            verbose_eval=500,
            early_stopping_rounds=100)

        # Save best model
        joblib.dump(model, model_path)
        del cb_train, cb_valid

    # NOTE(review): for a model returned by cb.train, predict() is believed to
    # default to raw scores rather than probabilities — confirm against the
    # CatBoost docs if calibrated probabilities are required downstream.
    # Predict validation
    val_pred = model.predict(x_val)
    # Add to out of folds array
    oof_predictions[val_ind] = val_pred
    # Predict the test set
    test_pred = model.predict(X_test)
    test_predictions += test_pred / CFG.n_folds
    # Compute fold metric
    score = amex_metric(y_val, val_pred)
    print(f'Our fold {fold} CV score is {score}')
    del x_train, x_val, y_train, y_val
    gc.collect()

# Release the label-free copy of the training frame.
del train_features
gc.collect()

 
--------------------------------------------------
Training fold 0:-
X_train shape: (367130, 2444)
X_valid shape: (91783, 2444)
X_test shape: (924621, 2444)
Our fold 0 CV score is 0.788466956500341
 
--------------------------------------------------
Training fold 1:-
X_train shape: (367130, 2444)
X_valid shape: (91783, 2444)
X_test shape: (924621, 2444)
0:	learn: 0.6317612	test: 0.6318188	test1: 0.6319571	best: 0.6319571 (0)	total: 334ms	remaining: 41m 46s
500:	learn: 0.2089413	test: 0.2089646	test1: 0.2277766	best: 0.2277766 (500)	total: 2m 38s	remaining: 36m 52s
1000:	learn: 0.1891066	test: 0.1891290	test1: 0.2247715	best: 0.2247715 (1000)	total: 5m 9s	remaining: 33m 29s
1500:	learn: 0.1733262	test: 0.1733483	test1: 0.2236477	best: 0.2236298 (1489)	total: 7m 40s	remaining: 30m 38s
2000:	learn: 0.1595947	test: 0.1596165	test1: 0.2233507	best: 0.2233507 (2000)	total: 10m 13s	remaining: 28m 6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2232298252
bestIterati

In [42]:
# Score the stitched-together out-of-fold predictions against all training labels
score = amex_metric(Y_train, oof_predictions)
print(f'Our out of folds CV score is {score}')

# Shared tail of both output file names
run_tag = f'{CFG.mv}_dp{CFG.dpv}{CFG.n_folds}fold_seed{CFG.seed}.csv'

# Out-of-fold predictions next to the customer id and the true label
oof_df = df_train[['customer_ID']].assign(target=Y_train, prediction=oof_predictions)
oof_df.to_csv(f'cboof_{run_tag}', index = False)

# Fold-averaged test predictions next to the customer id
test_df = df_test[['customer_ID']].assign(prediction=test_predictions)
test_df.to_csv(f'cb_{run_tag}', index = False)

Our out of folds CV score is 0.7845458563047485


In [42]:
# !kaggle competitions submit -c amex-default-prediction -f cb_v1_dpv45fold_seed42.csv -m "all pivoted \
# 2444 feats CV 7875"

100%|██████████████████████████████████████| 75.2M/75.2M [00:07<00:00, 10.5MB/s]
Successfully submitted to American Express - Default Prediction