In [1]:
import numpy as np
import pandas as pd
import os
# Allow wide/tall frames to render in full (up to 500 rows and 500 columns).
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [2]:
import gc
from math import sqrt
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
import pickle

import math as mt
from math import *
import matplotlib as mlp
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold, StratifiedKFold, train_test_split
import seaborn as sns
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_predict


import joblib
import random
import itertools
import scipy as sp

from itertools import combinations
import warnings
# Installs a blanket 'ignore' filter: no warning category is displayed after this line.
warnings.filterwarnings('ignore')

In [3]:
# Base names of the categorical raw features; train_and_evaluate appends
# '_last' / '_first' to these to find the aggregated columns.
cat_feats = ['B_30', 'B_38'] + [
    f'D_{suffix}' for suffix in (114, 116, 117, 120, 126, 63, 64, 66, 68)
]

In [4]:
class CFG:
    """Static run configuration shared by the cells below."""

    # Cross-validation / reproducibility
    seed, n_folds = 42, 5

    # Name of the label column in the training frame
    target = 'target'

    # Directory holding the processed parquet feature files
    input_dir = '../src/data/processed/'

    # Version tags embedded in saved model / prediction file names
    mv = 'v3'
    dpv = 'v2'

In [5]:
def seed_everything(seed):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    Args:
        seed: integer seed applied to ``random``, ``numpy.random`` and written
            (as a string) to the ``PYTHONHASHSEED`` environment variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

def read_data():
    """Load the processed train and test feature tables from ``CFG.input_dir``.

    In the test table, every ``D_86_last`` value equal to -1 is replaced by 0.

    Returns:
        (train, test): the two DataFrames read from parquet.
    """
    train_path = CFG.input_dir + 'train_fe_public.parquet'
    test_path = CFG.input_dir + 'test_fe_public.parquet'

    train = pd.read_parquet(train_path)
    test = pd.read_parquet(test_path)

    d86_last = test['D_86_last']
    test['D_86_last'] = np.where(d86_last == -1, 0, d86_last)
    return train, test
#     return train

def amex_metric(y_true, y_pred):
    """Compute 0.5 * (normalised weighted gini + weighted top-4% capture).

    Rows with label 0 carry weight 20, all other rows weight 1.

    Args:
        y_true: array-like of labels.
        y_pred: array-like of scores, same length as ``y_true``.

    Returns:
        float: the mean of the gini ratio (gini when ranked by prediction
        divided by gini when ranked by label) and the share of positives found
        within the first 4% of cumulative weight when ranked by prediction.
    """
    # Column 0 = label, column 1 = prediction.
    pairs = np.array([y_true, y_pred]).T

    def _weighted_gini(sort_col):
        # Rank rows descending by the requested column, then accumulate.
        ranked = pairs[pairs[:, sort_col].argsort()[::-1]]
        w = np.where(ranked[:, 0] == 0, 20, 1)
        cum_weight_share = np.cumsum(w / np.sum(w))
        weighted_pos = ranked[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - cum_weight_share) * w)

    # Top-4% capture rate, ranked by prediction.
    by_pred = pairs[pairs[:, 1].argsort()[::-1]]
    row_weights = np.where(by_pred[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(row_weights))
    captured = by_pred[np.cumsum(row_weights) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_pred[:, 0])

    gini_by_pred = _weighted_gini(1)
    gini_by_true = _weighted_gini(0)
    return 0.5 * (gini_by_pred / gini_by_true + top_four)

def amex_metric_np(preds, target):
    """Vectorised variant of the metric using a closed-form maximum gini.

    Args:
        preds: NumPy array of scores.
        target: NumPy array of labels, same length as ``preds``.

    Returns:
        float: 0.5 * (normalised gini + top-4% capture rate).
    """
    # Rank everything by descending prediction.
    order = np.argsort(preds)[::-1]
    preds, target = preds[order], target[order]

    # Weight is 20 where target == 0 and 1 where target == 1.
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()

    # Share of positives inside the first 4% of cumulative weight.
    in_top_four_pct = cum_norm_weight <= 0.04
    capture_rate = np.sum(target[in_top_four_pct]) / np.sum(target)

    # Weighted gini of this ranking.
    weighted_target = target * weight
    lorentz = (weighted_target / weighted_target.sum()).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # Normalising constant computed from the class counts.
    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos
    gini_max = 10 * n_neg * (n_pos + 20 * n_neg - 19) / (n_pos + 20 * n_neg)

    normalised_gini = gini / gini_max
    return 0.5 * (normalised_gini + capture_rate)

In [6]:
# Seed the RNGs first, then load the processed train/test feature tables.
seed_everything(CFG.seed)
df_train, df_test = read_data()
# Bare last expression so the notebook displays both shapes as the cell output.
df_train.shape, df_test.shape

# df_train = read_data()
# df_train.shape

((458913, 2358), (924621, 2357))

In [7]:
def lgb_amex_metric(y_pred, y_true):
    """Adapter passed as ``feval`` to ``lgb.train``.

    Args:
        y_pred: predicted scores.
        y_true: object exposing ``get_label()`` (the evaluation dataset).

    Returns:
        tuple: ('amex_metric', score, True) - the trailing True marks the
        metric as higher-is-better.
    """
    score = amex_metric(y_true.get_label(), y_pred)
    return ('amex_metric', score, True)

def _add_last_mean_diff_features(train, X_test):
    """Add ``|last - mean|`` and ``(last - mean)**2`` columns to both frames in place.

    Candidate base names come from ``train`` columns whose name contains 'last'
    (and not 'round') with the final five characters dropped. A candidate is
    skipped when either frame lacks the ``_last`` / ``_mean`` pair or the
    subtraction is not defined for the column dtype. All four new columns are
    computed before any is assigned, so train and test always gain the same
    set of columns.
    """
    base_names = [col[:-5] for col in train.columns
                  if 'last' in col and 'round' not in col]

    for base in base_names:
        last_col, mean_col = f'{base}_last', f'{base}_mean'
        try:
            train_delta = train[last_col] - train[mean_col]
            test_delta = X_test[last_col] - X_test[mean_col]
            train_new = (train_delta.abs(), train_delta ** 2)
            test_new = (test_delta.abs(), test_delta ** 2)
        except (KeyError, TypeError):
            # Missing column pair or non-numeric dtype: no feature for this base.
            continue

        train[f'{base}_last_mean_diff'] = train_new[0]
        train[f'{base}_last_mean_diff_sq'] = train_new[1]
        X_test[f'{base}_last_mean_diff'] = test_new[0]
        X_test[f'{base}_last_mean_diff_sq'] = test_new[1]


def _target_encode_in_place(x_train, y_train, x_val, test, columns, cv):
    """Replace each column in ``columns`` by a regression-based target encoding.

    For every column: label-encode using the categories seen in train and
    validation, one-hot encode the labels, and fit a LinearRegression of the
    target on the one-hot matrix. The training rows receive cross-validated
    predictions (``cross_val_predict`` with ``cv``); validation and test rows
    receive predictions of the model fitted on all training rows.
    The three frames are modified in place.
    """
    for cc in columns:
        le = LabelEncoder()
        le.fit(list(set(x_val[cc]) | set(x_train[cc])))
        x_train[cc] = le.transform(x_train[cc])
        x_val[cc] = le.transform(x_val[cc])
        # NOTE(review): raises if the test set holds a category unseen in train/val.
        test[cc] = le.transform(test[cc])

        # Default output is sparse; the explicit `sparse=` keyword is avoided
        # because newer scikit-learn releases renamed it.
        ohe = OneHotEncoder()
        ohe.fit(np.arange(len(le.classes_)).reshape(-1, 1))
        x_ohe_train = ohe.transform(x_train[[cc]])
        x_ohe_val = ohe.transform(x_val[[cc]])
        x_ohe_test = ohe.transform(test[[cc]])

        lr = LinearRegression(n_jobs=-1)
        lr.fit(x_ohe_train, y_train)

        x_train[cc] = cross_val_predict(lr, x_ohe_train, y_train, cv=cv, n_jobs=-1)
        x_val[cc] = lr.predict(x_ohe_val)
        test[cc] = lr.predict(x_ohe_test)


def train_and_evaluate(df_1, df_2, cat_feats, train_from_fold=3):
    """Run stratified K-fold LightGBM training/inference and write predictions.

    Steps: copy both frames, add last-vs-mean difference features, then for
    each fold target-encode the categorical columns, train (or reload) a
    LightGBM model, store out-of-fold predictions and accumulate the averaged
    test prediction. Finally the OOF and test predictions are written as CSV
    files in the current working directory.

    Args:
        df_1: training DataFrame; must contain 'customer_ID' and ``CFG.target``.
        df_2: test DataFrame; must contain 'customer_ID'.
        cat_feats: base names of categorical features; their ``_last`` and
            ``_first`` columns are target-encoded, together with every column
            whose name contains '_isFirstEqMin' or '_isLastEqMin'.
        train_from_fold: folds with index >= this value are trained and saved;
            lower-indexed folds are reloaded from the saved model file.
            Defaults to 3 (the previously hard-coded threshold).

    Returns:
        None. Results are printed and written to disk.
    """
    train = df_1.copy()
    X_test = df_2.copy()

    cat_features = ([f"{cf}_last" for cf in cat_feats]+[f"{cf}_first" for cf in cat_feats])
    add_cats = ([col for col in train.columns.tolist() if f"_isFirstEqMin" in col]+
                [col for col in train.columns.tolist() if f"_isLastEqMin" in col])
    encoded_cols = cat_features + add_cats

    # Get the difference between last and mean
    _add_last_mean_diff_features(train, X_test)

    num_cols = list(train.dtypes[(train.dtypes == 'float32') | (train.dtypes == 'float64')].index)
    print('numeric cols: ',len(num_cols))

    # Get feature list
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]
    print('all cols: ',len(features))

    params = {
        'objective': 'binary',
        'metric': "binary_logloss",
        'boosting': 'dart',
        'seed': CFG.seed,
        'num_leaves': 227,
        'max_bin': 1023,
        'learning_rate': 0.02,
        'feature_fraction': 0.50,
        'bagging_freq': 10,
        'bagging_fraction': 0.80,
        'n_jobs': -1,
        'verbose': -1,
        'lambda_l2': 2,
        'min_data_in_leaf': 40,
        }

    # Accumulators for the fold-averaged test prediction and the OOF prediction
    test_predictions = np.zeros(len(X_test))
    oof_predictions = np.zeros(len(train))

    kfold = StratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.seed)

    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):

        # Fresh copy per fold: the categorical columns are overwritten below.
        test = X_test.reset_index(drop=True)
        print(' ')
        print('-'*50)
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]

        print(f'Training fold {fold} with {len(features)} features...')
        _target_encode_in_place(x_train, y_train, x_val, test, encoded_cols, kfold)

        print('X_train shape:', x_train.shape)
        print('X_valid shape:', x_val.shape)
        print('X_test shape:', test.shape)

        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_valid = lgb.Dataset(x_val, y_val)

        model_path = f'../src/models/BinaryModels/lgbm_{CFG.mv}_dp{CFG.dpv}_fold{fold}_seed{CFG.seed}.pkl'

        if fold >= train_from_fold:
            # NOTE(review): LightGBM reports early stopping as unavailable in
            # dart mode, so early_stopping_rounds probably has no effect with
            # these params - confirm against the installed version.
            model_lgb = lgb.train(
                params = params,
                train_set = lgb_train,
                num_boost_round = 7000,
                valid_sets = [lgb_train, lgb_valid],
                early_stopping_rounds = 100,
                verbose_eval = 500,
                feval = lgb_amex_metric
                )

            # Save the trained model
            joblib.dump(model_lgb, model_path)
        else:
            # Reload with the same library that wrote the file.
            # NOTE: loading executes pickled code; only load trusted files.
            model_lgb = joblib.load(model_path)

        # Predict validation
        val_pred = model_lgb.predict(x_val)
        # Add to out of folds array
        oof_predictions[val_ind] = val_pred

        del lgb_train, lgb_valid
        gc.collect()

        # Predict the test set
        test_pred = model_lgb.predict(test[features])
        test_predictions += test_pred / CFG.n_folds
        # Compute fold metric
        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')
        del x_train, x_val, y_train, y_val
        gc.collect()

    # Compute out of folds metric
    score = amex_metric(train[CFG.target], oof_predictions)
    print(f'Our out of folds CV score is {score}')

    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train[CFG.target], 'prediction': oof_predictions})
    oof_df.to_csv(f'oof_{CFG.mv}_dp{CFG.dpv}{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)

    # Create a dataframe to store test prediction; IDs come from X_test rather
    # than the loop-local per-fold copy.
    test_df = pd.DataFrame({'customer_ID': X_test['customer_ID'].values, 'prediction': test_predictions})
    test_df.to_csv(f'lgb_{CFG.mv}_dp{CFG.dpv}{CFG.n_folds}fold_seed{CFG.seed}.csv', index = False)
    

In [8]:
# train_and_evaluate(df_train, cat_feats)
# Run the full CV pipeline; it prints per-fold scores and writes the OOF and
# test prediction CSVs to the current working directory.
train_and_evaluate(df_train, df_test, cat_feats)

numeric cols:  1527
all cols:  2710
 
--------------------------------------------------
Training fold 0 with 2710 features...
X_train shape: (367130, 2710)
X_valid shape: (91783, 2710)
X_test shape: (924621, 2711)
Our fold 0 CV score is 0.8021394084312112
 
--------------------------------------------------
Training fold 1 with 2710 features...
X_train shape: (367130, 2710)
X_valid shape: (91783, 2710)
X_test shape: (924621, 2711)
Our fold 1 CV score is 0.7947088905157497
 
--------------------------------------------------
Training fold 2 with 2710 features...
X_train shape: (367130, 2710)
X_valid shape: (91783, 2710)
X_test shape: (924621, 2711)
Our fold 2 CV score is 0.7974814143549587
 
--------------------------------------------------
Training fold 3 with 2710 features...
X_train shape: (367131, 2710)
X_valid shape: (91782, 2710)
X_test shape: (924621, 2711)
[500]	training's binary_logloss: 0.242297	training's amex_metric: 0.824676	valid_1's binary_logloss: 0.26117	valid_1's ame

In [10]:
!kaggle competitions submit -c amex-default-prediction -f lgb_v3_dpv25fold_seed42.csv -m "add \
cats 2710 new feats v1 CV 7967"

100%|██████████████████████████████████████| 75.5M/75.5M [00:06<00:00, 11.3MB/s]
Successfully submitted to American Express - Default Prediction