In [1]:
import numpy as np
import pandas as pd
import os
# Let wide/long frames render (up to 500 rows and 500 columns) in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
import gc
from math import sqrt
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import pickle

import math as mt
from math import *
import matplotlib as mlp
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import seaborn as sns
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict


import joblib
import random
import itertools
import scipy as sp

from itertools import combinations
import warnings
# Blanket-suppress warnings for the whole session: no category, module or
# message filter is given, so every warning raised after this line is hidden.
warnings.filterwarnings('ignore')

In [3]:
# Base names of the categorical variables; train_and_evaluate appends "_last"
# to each of them to find the matching feature column.
cat_feats = [
    'B_30',
    'B_38',
    'D_114',
    'D_116',
    'D_117',
    'D_120',
    'D_126',
    'D_63',
    'D_64',
    'D_66',
    'D_68',
]

In [4]:
class CFG:
    """Run-wide settings read by the data-loading and modelling functions."""
    # Seed passed to seed_everything, StratifiedKFold and the LightGBM params.
    seed = 42
    # Number of cross-validation folds (also the divisor when averaging test predictions).
    n_folds = 5
    # Name of the label column in the training frame.
    target = 'target'
    # Directory prefix for the processed parquet feature files (string-concatenated, keep the trailing slash).
    input_dir = '../src/data/processed/'
    # Version tags interpolated into model and prediction file names
    # (presumably "data processing version" and "model version" — TODO confirm).
    dpv = 'v2'
    mv = 'v4'

In [5]:
def seed_everything(seed):
    """Seed the global Python and NumPy RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and written
            (as a string) to the ``PYTHONHASHSEED`` environment variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # NOTE(review): the environment variable is only set here; the hash seed of
    # the already-running interpreter is presumably unaffected — confirm.
    os.environ.update({'PYTHONHASHSEED': str(seed)})

def read_data():
    """Load the engineered train/test feature tables.

    Reads ``train_fe_public.parquet`` and ``test_fe_public.parquet`` from
    ``CFG.input_dir``, replaces ``-1`` with ``0`` in the test frame's
    ``D_86_last`` column, and removes from both frames every column whose
    name (as found in the train frame) contains ``_isFirstEq`` or
    ``_isLastEq``.

    Returns:
        tuple: ``(train, test)`` DataFrames.
    """
    train_path = CFG.input_dir + 'train_fe_public.parquet'
    test_path = CFG.input_dir + 'test_fe_public.parquet'
    train = pd.read_parquet(train_path)
    test = pd.read_parquet(test_path)

    d86_last = test['D_86_last']
    test['D_86_last'] = np.where(d86_last == -1, 0, d86_last)

    # Same ordering as before: all "_isFirstEq" matches, then all "_isLastEq".
    markers = ('_isFirstEq', '_isLastEq')
    dcols = [col for marker in markers for col in train.columns if marker in col]

    return train.drop(columns=dcols), test.drop(columns=dcols)

def amex_metric(y_true, y_pred):
    """Return the mean of the normalized weighted Gini and the top-4% capture rate.

    Rows with label 0 get weight 20, all other rows weight 1.

    Args:
        y_true: Sequence of labels (compared against 0 to pick the weight).
        y_pred: Sequence of scores, same length as ``y_true``.

    Returns:
        ``0.5 * (gini_by_prediction / gini_by_label + top_four)`` where
        ``top_four`` is the share of the label sum found in the rows whose
        cumulative weight (highest score first) stays within 4% of the total.
    """
    pairs = np.transpose(np.array([y_true, y_pred]))

    def _weighted_gini(sort_col):
        # Weighted Gini of the labels after ordering rows by `sort_col`, descending.
        ranked = pairs[pairs[:, sort_col].argsort()[::-1]]
        row_weight = np.where(ranked[:, 0] == 0, 20, 1)
        random_share = np.cumsum(row_weight / np.sum(row_weight))
        weighted_pos = ranked[:, 0] * row_weight
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_share) * row_weight)

    # Capture rate inside the top 4% of cumulative weight.
    by_score = pairs[pairs[:, 1].argsort()[::-1]]
    score_weight = np.where(by_score[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(score_weight))
    captured = by_score[np.cumsum(score_weight) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    gini_by_prediction = _weighted_gini(1)
    gini_by_label = _weighted_gini(0)
    return 0.5 * (gini_by_prediction / gini_by_label + top_four)

def amex_metric_np(preds, target):
    """Vectorised variant of the metric using a closed-form maximum Gini.

    Args:
        preds: NumPy array of scores.
        target: NumPy array of labels, same length as ``preds``; the weight
            formula ``20 - 19 * target`` gives 20 for 0 and 1 for 1.

    Returns:
        ``0.5 * (gini / gini_max + d)`` where ``d`` is the share of the target
        sum found in rows whose cumulative normalized weight is <= 0.04.
    """
    order = np.argsort(preds)[::-1]
    preds, target = (arr[order] for arr in (preds, target))

    weight = 20.0 - target * 19.0
    cum_norm_weight = np.cumsum(weight / np.sum(weight))

    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos

    # Top-4% capture rate.
    in_top_four_pct = cum_norm_weight <= 0.04
    d = np.sum(target[in_top_four_pct]) / n_pos

    # Weighted Gini of the prediction ordering.
    weighted_target = target * weight
    lorentz = np.cumsum(weighted_target / np.sum(weighted_target))
    gini = np.sum((lorentz - cum_norm_weight) * weight)

    # Closed-form Gini of a perfect ordering, used as the normalizer.
    total_weight = n_pos + 20 * n_neg
    gini_max = 10 * n_neg * (total_weight - 19) / total_weight

    return 0.5 * (gini / gini_max + d)

In [6]:
# Seed the RNGs before anything else runs.
seed_everything(CFG.seed)
# Load the processed train/test feature tables.
df_train, df_test = read_data()
# Last expression of the cell: display both shapes.
df_train.shape, df_test.shape

((458913, 1650), (924621, 1649))

In [7]:
def lgb_amex_metric(y_pred, y_true):
    """Custom LightGBM ``feval`` wrapper around ``amex_metric``.

    Args:
        y_pred: Predicted scores.
        y_true: Object exposing ``get_label()`` that returns the labels.

    Returns:
        tuple: ``('amex_metric', score, True)``; the final ``True`` is the
        "higher is better" flag.
    """
    labels = y_true.get_label()
    score = amex_metric(labels, y_pred)
    return 'amex_metric', score, True

def _add_last_value_features(train, X_test):
    """Add rounded and last-vs-mean features to both frames, in place.

    Columns are created in the same order on both frames so the resulting
    feature matrices stay aligned column-for-column.
    """
    # Round float "last" features to 2 decimal places.
    is_float = (train.dtypes == 'float32') | (train.dtypes == 'float64')
    float_last_cols = [col for col in train.dtypes[is_float].index if 'last' in col]
    for col in float_last_cols:
        train[col + '_round2'] = train[col].round(2)
        X_test[col + '_round2'] = X_test[col].round(2)

    # Absolute and squared difference between the last value and the mean.
    last_cols = [col for col in train.columns if 'last' in col and 'round' not in col]
    for col in last_cols:
        # Strip the 5-character "_last" suffix. Names that merely contain
        # "last" yield a base with no matching columns and are skipped below.
        base = col[:-5]
        last_col, mean_col = f'{base}_last', f'{base}_mean'
        has_both = all(
            name in frame.columns
            for frame in (train, X_test)
            for name in (last_col, mean_col)
        )
        if not has_both:
            continue
        try:
            train_gap = train[last_col] - train[mean_col]
            test_gap = X_test[last_col] - X_test[mean_col]
            train_new = (abs(train_gap), train_gap ** 2)
            test_new = (abs(test_gap), test_gap ** 2)
        except TypeError:
            # Non-numeric pair: keep the best-effort behaviour and skip it,
            # but only for this specific error instead of swallowing everything.
            continue
        train[f'{base}_last_mean_absdiff'] = train_new[0]
        X_test[f'{base}_last_mean_absdiff'] = test_new[0]
        train[f'{base}_last_mean_sq'] = train_new[1]
        X_test[f'{base}_last_mean_sq'] = test_new[1]


def _target_encode_fold(x_train, y_train, apply_frames, cat_features, features, kfold):
    """Target-encode the categorical columns for one fold and drop the raw columns.

    Args:
        x_train: Training rows of the fold (must still contain ``CFG.target``).
        y_train: Labels for ``x_train``.
        apply_frames: Dict of name -> frame to transform with the encodings
            learned on ``x_train``.
        cat_features: Categorical column names to encode.
        features: Feature columns to keep (before the ``CVTE_*`` columns are added).
        kfold: Splitter passed to ``cross_val_predict`` for the training rows.

    Returns:
        tuple: ``(x_train, apply_frames)`` restricted to ``features`` plus the
        ``CVTE_*`` columns, with the raw categorical columns removed.
    """
    encoded_cols = []
    for cc in cat_features:
        te_col = 'CVTE_' + cc
        # Mean target per category level, learned on the fold's training rows.
        gg = x_train.groupby(cc)[CFG.target].mean().reset_index()
        gg.columns = [cc, te_col]
        x_train = x_train.merge(gg, on=cc, how='left')
        apply_frames = {
            key: frame.merge(gg, on=cc, how='left')
            for key, frame in apply_frames.items()
        }
        encoded_cols.append(te_col)

    all_features = list(features) + encoded_cols
    x_train = x_train[all_features]
    apply_frames = {key: frame[all_features] for key, frame in apply_frames.items()}

    for cc in cat_features:
        te_col = 'CVTE_' + cc
        raw_encoding = np.array(x_train[te_col])[:, None]
        lr = LinearRegression(n_jobs=-1)
        lr.fit(raw_encoding, y_train)
        # Training rows get out-of-fold predictions of the regression.
        x_train[te_col] = cross_val_predict(lr, raw_encoding, y_train, cv=kfold, n_jobs=-1)
        # Order matters: the fill value is the mean of the column *after* it
        # was replaced by the cross-validated predictions.
        fill_value = x_train[te_col].mean()
        for frame in apply_frames.values():
            frame[te_col] = lr.predict(np.array(frame[te_col].fillna(fill_value))[:, None])

        x_train = x_train.drop([cc], axis=1)
        apply_frames = {key: frame.drop([cc], axis=1) for key, frame in apply_frames.items()}

    return x_train, apply_frames


def train_and_evaluate(df_1, df_2, cat_feats, train_from_fold=None):
    """Build features, obtain one LightGBM model per fold and write test predictions.

    Both input frames are copied, then extended with rounded "last" features
    and last-minus-mean features. For every stratified fold the categorical
    ``*_last`` columns are target-encoded on that fold's training rows, a model
    is trained or loaded, and its test predictions are averaged into
    ``lgb_{CFG.mv}_dp{CFG.dpv}{CFG.n_folds}fold_seed{CFG.seed}.csv``.

    Args:
        df_1: Training frame containing ``customer_ID`` and ``CFG.target``.
        df_2: Test frame containing ``customer_ID``.
        cat_feats: Base names of categorical variables ("_last" is appended).
        train_from_fold: ``None`` (default) loads every fold model from
            ``../src/models/BinaryModels/`` and skips validation, which is what
            the previous hard-coded ``fold > 4`` check did for 5 folds. An
            integer ``k`` trains and saves folds with index ``>= k`` and loads
            the earlier ones; in that mode every fold is also scored on its
            validation rows and an out-of-fold CSV is written.

    Returns:
        None. Results are written to CSV files in the working directory.
    """
    train = df_1.copy()
    X_test = df_2.copy()

    cat_features = [f"{cf}_last" for cf in cat_feats]
    print('categorical cols: ',len(cat_features))

    _add_last_value_features(train, X_test)

    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    print('numeric cols: ',len(num_cols))

    params = {
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting': 'dart',
        'seed': CFG.seed,
        'learning_rate': 0.01,
        'feature_fraction': 0.20,
        'bagging_freq': 10,
        'bagging_fraction': 0.50,
        'n_jobs': -1,
        'lambda_l2': 2,
        'num_leaves': 100,
        'min_data_in_leaf': 40,
        'verbose': -1,
        }

    run_validation = train_from_fold is not None

    # Averaged test predictions and out-of-fold predictions.
    test_predictions = np.zeros(len(X_test))
    oof_predictions = np.zeros(len(train))

    # `train` is not modified inside the fold loop, so these are loop-invariant.
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]
    feats = [col for col in features if col not in cat_features]
    feats += ['CVTE_' + cc for cc in cat_features]

    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        print('all cols: ',len(features))
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')

        x_train, x_val = train.iloc[trn_ind], train.iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]

        apply_frames = {'test': X_test.reset_index(drop=True)}
        if run_validation:
            apply_frames['val'] = x_val
        x_train, apply_frames = _target_encode_fold(
            x_train, y_train, apply_frames, cat_features, features, kfold
        )
        test = apply_frames['test']
        x_val = apply_frames.get('val', x_val)

        print('X_train shape:', x_train.shape)
        if run_validation:
            print('X_valid shape:', x_val.shape)
        print('X_test shape:', test.shape)

        model_path = f'../src/models/BinaryModels/lgbm_{CFG.mv}_dp{CFG.dpv}_fold{fold}_seed{CFG.seed}.pkl'
        if train_from_fold is not None and fold >= train_from_fold:
            lgb_train = lgb.Dataset(x_train[feats], y_train)
            lgb_valid = lgb.Dataset(x_val[feats], y_val)

            # NOTE(review): `early_stopping_rounds` / `verbose_eval` were removed
            # from lgb.train in LightGBM 4.0 (callbacks replace them), and
            # LightGBM documents early stopping as unavailable in dart mode —
            # confirm the installed version before relying on either.
            model = lgb.train(
                params = params,
                train_set = lgb_train,
                num_boost_round = 8000,
                valid_sets = [lgb_train, lgb_valid],
                early_stopping_rounds = 100,
                verbose_eval = 500,
                feval = lgb_amex_metric
                )

            # Save the fold model so later runs can load it instead of retraining.
            joblib.dump(model, model_path)
            del lgb_train, lgb_valid
        else:
            # Models are written with joblib.dump, so read them back with
            # joblib.load. This unpickles the file: only load trusted models.
            model = joblib.load(model_path)

        if run_validation:
            val_pred = model.predict(x_val[feats])
            oof_predictions[val_ind] = val_pred
            score = amex_metric(y_val, val_pred)
            print(f'Our fold {fold} CV score is {score}')

        # Predict the test set and accumulate the fold average.
        test_pred = model.predict(test[feats])
        test_predictions += test_pred / CFG.n_folds

        del x_train, x_val, y_train, y_val, test, apply_frames
        gc.collect()

    if run_validation:
        score = amex_metric(train[CFG.target], oof_predictions)
        print(f'Our out of folds CV score is {score}')
        oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train[CFG.target], 'prediction': oof_predictions})
        oof_df.to_csv(f'oof_{CFG.mv}_dp{CFG.dpv}{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)

    # Create a dataframe to store test prediction
    test_df = pd.DataFrame({'customer_ID': X_test['customer_ID'], 'prediction': test_predictions})
    test_df.to_csv(f'lgb_{CFG.mv}_dp{CFG.dpv}{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)
    

In [8]:
# Build the features, score the test set with the per-fold models and write
# the prediction CSV to the working directory.
train_and_evaluate(df_train, df_test, cat_feats)

# Our fold 0 CV score is 0.802154 7k trees miss by 11
# Our fold 1 CV score is 0.793524 8k trees miss by 04
# Our fold 2 CV score is 0.798584 8k trees plus by 03
# Our fold 3 CV score is 0.792205 8k trees miss by 18
# Our fold 4 CV score is 0.799698 8k trees plus by 10

# Our out of folds CV score is 0.7971650432465842 miss by 04

categorical cols:  11
numeric cols:  1890
all cols:  2365
 
--------------------------------------------------
Training fold 0 with 2365 features...
X_train shape: (367130, 2365)
X_test shape: (924621, 2365)
all cols:  2365
 
--------------------------------------------------
Training fold 1 with 2365 features...
X_train shape: (367130, 2365)
X_test shape: (924621, 2365)
all cols:  2365
 
--------------------------------------------------
Training fold 2 with 2365 features...
X_train shape: (367130, 2365)
X_test shape: (924621, 2365)
all cols:  2365
 
--------------------------------------------------
Training fold 3 with 2365 features...
X_train shape: (367131, 2365)
X_test shape: (924621, 2365)
all cols:  2365
 
--------------------------------------------------
Training fold 4 with 2365 features...
X_train shape: (367131, 2365)
X_test shape: (924621, 2365)


In [10]:
# Upload the CSV written by train_and_evaluate; the file name must match
# f'lgb_{CFG.mv}_dp{CFG.dpv}{CFG.n_folds}fold_seed{CFG.seed}.csv' for the current CFG.
!kaggle competitions submit -c amex-default-prediction -f lgb_v4_dpv25fold_seed42.csv -m "replicate public with \
CVTE 2365 feats v1 CV 797165"

100%|██████████████████████████████████████| 75.3M/75.3M [00:06<00:00, 11.6MB/s]
Successfully submitted to American Express - Default Prediction

In [9]:
# !kaggle competitions submit -c amex-default-prediction -f lgb_v4_dpv25fold_seed42.csv -m "replicate public with \
# CVTE 2365 feats v1 CV 797165"