In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

from datetime import datetime, date, time

import gc
import copy

import pyarrow.parquet as pq
import pyarrow as pa

In [2]:
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, KFold

In [3]:
# Let pandas render up to 100 rows and 100 columns before truncating a display.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

import warnings
# Suppress every warning for the remainder of the session.
warnings.filterwarnings("ignore")

import pytorch_lightning as pl
# Single seed for the notebook; seed_everything() echoes the seed it applied.
random_seed = 1234
pl.seed_everything(random_seed)

Global seed set to 1234


1234

In [4]:
%%time
# Kaggle location of the same file, kept for reference:
# train_file = r'/kaggle/input/amex-agg-data-rev2/agg_train_all_rev2_rev.parquet'
# Build the local path with pathlib so the separator is correct on every OS;
# the previous r'amex\...' literal only resolved on Windows.
train_file = Path('amex') / 'agg_train_all_rev2_rev.parquet'
df = pd.read_parquet(train_file, engine='pyarrow')

Wall time: 18.1 s


In [5]:
x_cols = ['log_D_45|mean', 'R_28|last', 'D_49|last', 'D_42|min', 'D_130|max', 'B_20|max', 'B_11|min', 'R_16|last', 'R_5|mean', 'D_62|min', 'D_133|last', 'R_20|max', 'B_17|last', 'D_139|last', 'R_3|max', 'log_B_40|last', 'S_22|last', 'B_23|min', 'log_B_40|max', 'B_14|min', 'R_7|max', 'B_22|last', 'B_23|mean', 'D_83|last', 'D_123|mean', 'log_B_4|last', 'log_D_49|last', 'D_114|last', 'B_24|last', 'log_S_5|mean', 'D_127|last', 'log_B_4|max', 'R_1|last', 'D_70|min', 'D_62|last', 'R_9_mean2std', 'D_105|max', 'B_7|min', 'R_5|last', 'D_120=1.0', 'D_129|max', 'R_22|max', 'B_7|last', 'S_25|mean', 'D_120|nunique', 'log_D_131|min', 'S_27_mean2std', 'P_2|max', 'R_17|last', 'R_2|last', 'D_142|last', 'B_38|nunique', 'D_52|mean', 'log_B_22|last', 'B_18|last', 'D_58|last', 'S_8|min', 'R_20|last', 'S_20|last', 'B_26|last', 'D_70_mean2std', 'B_11|mean', 'D_138|last', 'R_6|max', 'B_3|mean', 'log_D_107|last', 'log_D_43|last', 'D_43|max', 'D_45|min', 'D_58|min', 'B_15|mean', 'log_B_24|last', 'log_B_18|max', 'D_113|last', 'D_56|last', 'B_31|last', 'D_88|last', 'S_8|mean', 'D_139|mean', 'log_D_133|last', 'S_15|max', 'D_143|last', 'D_102|last', 'B_19|last', 'R_4|min', 'B_3|last', 'S_3|mean', 'D_60|last', 'log_D_115|last', 'D_44|last', 'D_131|max', 'R_23|last', 'D_48|last', 'S_12|last', 'D_79|last', 'S_22|mean', 'D_109|last', 'B_1|min', 'log_D_131|last', 'D_113|max', 'S_7|max', 'log_B_4|mean', 'log_B_27|last', 'D_46|min', 'D_43|min', 'S_17|last', 'B_10|mean', 'B_15|last', 'log_B_3|mean', 'log_D_39|last', 'B_12|last', 'B_11|last', 'S_22|max', 'B_20|mean', 'R_26|last', 'log_B_3|min', 'B_39|last', 'log_B_23|last', 'R_19|last', 'D_122|min', 'D_84|mean', 'B_4_mean2std', 'S_7|last', 'P_2|last', 'D_39|max', 'B_9|min', 'D_48|min', 'D_61|max', 'D_73|last', 'R_13|mean', 'log_B_11|min', 'log_B_12|max', 'log_D_118|last', 'log_B_40|mean', 'D_111|last', 'R_1|mean', 'R_13|last', 'S_7|min', 'log_D_118|mean', 'log_D_135|last', 'B_38|last', 'D_132|min', 'log_D_51|last', 'D_102|max', 'B_22|mean', 'R_8|last', 
'log_D_119|last', 'B_25|last', 'B_30=1.0', 'D_56|min', 'D_71|last', 'B_25|min', 'B_38=5.0', 'D_140|last', 'D_52_mean2std', 'R_12|max', 'R_14|last', 'D_62|max', 'R_26_mean2std', 'log_B_29|last', 'D_89|last', 'R_18|last', 'S_23|mean', 'D_44|min', 'B_8|min', 'D_75|max', 'D_45|max', 'B_38=7.0', 'R_6|last', 'log_B_18|mean', 'S_3|min', 'P_3|max', 'D_114=0.0', 'log_B_22|min', 'D_108|last', 'D_129|mean', 'B_20|min', 'D_87|last', 'D_126|last', 'log_D_106|last', 'log_B_3|last', 'D_70|max', 'D_115|last', 'D_39|last', 'D_44|mean', 'log_B_23|max', 'B_17|mean', 'B_6|min', 'log_B_26|mean', 'B_23|last', 'log_B_24|mean', 'R_27|max', 'D_53|min', 'log_D_45|max', 'B_3|max', 'B_33|mean', 'log_D_137|last', 'S_27|last', 'S_6|max', 'B_37|mean', 'D_89|max', 'log_D_125|min', 'D_74|mean', 'log_B_32|last', 'P_4|last', 'D_110|last', 'D_77|min', 'D_128|min', 'D_106|last', 'R_10|max', 'log_D_60|min', 'D_122|max', 'D_120|last', 'D_133|max', 'log_B_21|last', 'B_10|last', 'D_49_mean2std', 'B_40|min', 'D_84|max', 'log_B_11|last', 'S_26|last', 'log_S_26|last', 'D_45|last', 'D_91|last', 'D_61|min', 'R_16|max', 'B_24|mean', 'R_10|last', 'B_8|last', 'D_109|mean', 'R_2|mean', 'R_4|mean', 'S_15|mean', 'S_25|min', 'B_38=6.0', 'B_4|max', 'D_105|last', 'log_B_9|min', 'log_D_123|last', 'P_2|mean', 'D_43|last', 'B_31|nunique', 'R_27|last', 'R_9|last', 'S_9|max', 'D_75|mean', 'B_30|nunique', 'R_24|last', 'D_42|last', 'R_13|max', 'D_86|last', 'R_10|mean', 'log_R_28|last', 'B_7|max', 'log_B_9|last', 'D_104|last', 'P_2|min', 'D_117|last', 'D_68=1.0', 'R_8|max', 'S_3|last', 'D_137|last', 'D_59|max', 'log_B_12|last', 'D_47|last', 'D_132|last', 'R_3|last', 'R_2|min', 'R_11|last', 'B_1|last', 'B_21|last', 'D_41|mean', 'D_119|last', 'D_93|last', 'D_46|mean', 'D_51|mean', 'B_33|min', 'R_13_mean2std', 'D_132_mean2std', 'log_B_3|max', 'log_D_44|min', 'R_25|max', 'D_107|max', 'B_30=0.0', 'S_19|last', 'log_D_45|min', 'D_81|max', 'S_15|last', 'R_8|mean', 'D_82|last', 'log_B_13|last', 'B_22|max', 'B_3|min', 'log_D_41|last', 
'D_123|last', 'D_81|mean', 'R_1|max', 'R_27|mean', 'D_121|last', 'D_74|max', 'R_7|last', 'D_41|last', 'log_B_11|max', 'B_8|max', 'B_5|last', 'log_D_113|last', 'P_3|min', 'R_7|mean', 'D_53_mean2std', 'S_7|mean', 'D_133|min', 'D_41|min', 'log_D_133|min', 'log_B_26|max', 'D_96|last', 'D_48|max', 'B_2|mean', 'D_61|mean', 'log_B_36|last', 'D_60|max', 'D_125|last', 'log_D_39|min', 'D_75|last', 'B_9|last', 'log_D_44|max', 'D_65|last', 'B_8|mean', 'D_62|mean', 'log_D_43|min', 'D_114|nunique', 'P_4|max', 'B_28|last', 'D_128|max', 'D_131|min', 'R_27|min', 'D_144|last', 'B_16|min', 'log_B_22|max', 'R_4|last', 'S_3|max', 'B_36|last', 'D_69|min', 'R_22|last', 'log_D_39|max', 'log_B_41|max', 'D_135_mean2std', 'D_77|max', 'B_21|max', 'D_118|last', 'D_53|last', 'log_B_42|last', 'D_112|last', 'B_33|last', 'S_12|max', 'log_B_26|last', 'log_D_43|max', 'R_11|max', 'log_B_22|mean', 'R_6|mean', 'B_9|max', 'D_41|max', 'log_B_9|max', 'S_11|mean', 'B_27|last', 'S_8|last', 'D_68|last', 'R_5|max', 'log_D_109|last', 'B_33|max', 'B_16|last', 'D_43|mean', 'D_124|last', 'D_52|min', 'B_17|min', 'D_134|min', 'D_45|mean', 'D_140|max', 'D_131|last', 'D_48|mean', 'S_5|last', 'B_19|min', 'D_94|last', 'log_D_45|last', 'R_3|mean', 'S_17|min', 'R_12|last', 'D_46_mean2std', 'D_72|min', 'D_81|last', 'D_77|last', 'B_32|last', 'log_B_11|mean', 'B_37|last', 'D_53|max', 'log_B_18|last', 'log_B_40|min', 'log_D_107|min', 'D_54|last', 'D_52|last', 'R_18|max', 'D_135|last', 'D_64=U', 'log_D_51|max', 'B_41|last', 'B_7|mean', 'log_B_5|last', 'D_103|last', 'S_9|last', 'D_107|last', 'log_B_41|last', 'D_122|last', 'D_46|last', 'D_78|max', 'P_3|last', 'D_70|last', 'R_4|max', 'P_3|mean', 'log_D_44|mean', 'S_16|last', 'D_80|last', 'log_B_18|min', 'B_29|last', 'D_128|last', 'log_D_44|last', 'S_20|max', 'D_52|max', 'D_77|mean', 'S_25|last', 'B_37|max', 'log_D_41|min', 'B_9|mean', 'log_B_23|min', 'log_B_21|mean', 'R_15|max', 'D_59|last', 'D_129|last', 'S_6|min', 'B_5|mean', 'D_65|mean', 'R_21|last', 'D_42|mean', 
'D_130|last', 'D_69|last', 'D_42_mean2std', 'D_65|max', 'D_141|last', 'B_13|last', 'S_13|last', 'D_42|max', 'D_78|last', 'log_D_102|last', 'B_30=2.0', 'B_20|last', 'D_55|min', 'log_D_140|last', 'R_24|max', 'P_4|mean', 'log_D_107|mean', 'D_50|last', 'D_72|max', 'R_1|min', 'B_17|max', 'B_30|last', 'B_18|min', 'D_134|last', 'D_76|last', 'log_D_125|last', 'log_S_5|last', 'R_17|max', 'log_B_4|min', 'B_42|last', 'log_D_138|last', 'B_1|max', 'B_40|last', 'D_74|last', 'D_84|last', 'R_24|mean', 'log_D_60|last', 'R_19|min', 'S_11|last', 'B_17_mean2std', 'B_2|last', 'S_18|last', 'B_16|max', 'log_D_41|max', 'B_32|max', 'B_2|min', 'D_44|max', 'S_23|last', 'D_61|last', 'P_4|min', 'log_D_51|min', 'R_3|min', 'D_136|last', 'S_6|last', 'S_24|last', 'log_D_108|last', 'R_2|max', 'D_89|mean', 'R_16|mean', 'log_D_102|max', 'D_39|mean', 'log_B_9|mean', 'B_1|mean', 'D_145|last', 'R_25|last', 'S_23|max', 'D_51|last', 'D_72|last', 'log_D_113|min', 'R_12|mean', 'R_15|mean', 'B_38=4.0', 'D_92|last', 'B_6|last', 'D_55|last', 'log_D_136|last', 'D_79|max', 'log_D_43|mean', 'R_20|mean', 'B_4|last', 'B_22|min', 'D_116|last', 'B_14|last', 'R_11|mean', 'D_78|mean', 'R_15|last']


len(x_cols)

527

## hyperopt parameters

In [6]:
# Candidate learning rates: mantissas 1-9 in each of the decades 1e-4, 1e-3, 1e-2.
# The grid is parsed from decimal literals rather than generated with np.arange
# and a float step, whose element count and values depend on floating-point
# round-off. Each entry is therefore the closest double to the intended decimal.
# Add 5 to the exponent tuple to search the 1e-5 decade as well.
learn_rates = np.array([
    float(f"{mantissa}e-{exponent}")
    for exponent in (4, 3, 2)
    for mantissa in range(1, 10)
])


len(learn_rates)

27

In [7]:
#https://lightgbm.readthedocs.io/en/latest/Parameters.html
#https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html?highlight=classifier#lightgbm.LGBMClassifier

from hyperopt import hp
import numpy as np
# Hyperopt search space: every hyperparameter is a categorical hp.choice whose
# label equals its dictionary key. Trailing comments carry over the original
# annotations (the LightGBM-native name of each parameter).
space = {
    label: hp.choice(label, options)
    for label, options in {
        'n_estimators': range(500, 2000, 50),  # num_boost_round
        'boosting_type': ['gbdt'],  # boosting
        'objective': ['binary'],
        'metric': ['auc'],
        'learning_rate': learn_rates,
        'colsample_bytree': np.round(np.arange(0.1, 0.86, 0.05), 3),  # feature_fraction
        'max_depth': range(7, 36, 1),  # int type
        'min_child_samples': range(100, 5001, 50),  # min_data_in_leaf
        'reg_alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 15],  # lambda_l1
        'reg_lambda': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 15],  # lambda_l2
        'max_bin': range(500, 10000, 50),
        'min_data_in_bin': range(500, 9000, 50),
        'subsample': np.round(np.arange(0.1, 0.96, 0.05), 3),  # bagging_fraction
        'subsample_freq': range(1, 100, 2),  # bagging_freq
        # max number of leaves in one tree: 1 < num_leaves <= 131072,
        # classes < num_leaves < 2^max_depth
        'num_leaves': range(31, 300, 5),  # max_leaves
        'random_state': [1234],
        'n_jobs': [4],  # nthread
        # Not part of the search: min_split_gain, min_child_weight,
        # subsample_for_bin, importance_type.
    }.items()
}
                  
    


### lightgbm

In [8]:
import lightgbm as lgb
def train_trees(X_train, y_train, num_round=100, params=None):
    """Fit a LightGBM booster on the given training data.

    Parameters
    ----------
    X_train
        Feature matrix handed to ``lgb.Dataset``.
    y_train
        Labels handed to ``lgb.Dataset``.
    num_round : int, default 100
        Number of boosting rounds (passed as ``num_boost_round``).
    params : dict, optional
        LightGBM training parameters. The mapping is copied before use, so
        the caller's dict is never modified. ``'verbosity'`` is always
        forced to ``-1`` in the copy.

    Returns
    -------
    The trained model returned by ``lgb.train``.
    """
    # Work on a private copy: writing 'verbosity' straight into `params` would
    # leak into the caller's dict (or into a shared mutable default).
    train_params = dict(params) if params else {}
    train_params['verbosity'] = -1

    dtrain = lgb.Dataset(X_train, y_train)
    tree_model = lgb.train(train_params, dtrain, num_boost_round=num_round)

    # Release the Dataset before returning; only the model is needed afterwards.
    del dtrain
    gc.collect()

    return tree_model

In [9]:
# @yunchonggan's fast metric implementation
# From https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
# https://www.kaggle.com/code/ambrosm/amex-lightgbm-quickstart
def amex_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the AmEx score: the mean of a normalized weighted Gini
    coefficient and the share of positives captured in the top 4%.

    Rows are ranked by descending prediction. Each row with label 0 weighs
    20 and each row with label 1 weighs 1; the "top 4%" cut-off is taken on
    the cumulative share of that weight.

    Parameters
    ----------
    y_true : array-like
        Binary labels (0/1). Any array-like is accepted (NumPy array, list,
        pandas Series/one-column frame); it is flattened to 1-D.
    y_pred : array-like
        Predicted scores, one per label; higher means more likely positive.

    Returns
    -------
    float
        ``0.5 * (g + d)`` with ``g`` the normalized weighted Gini and ``d``
        the capture rate at 4%.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of elements.
    """
    # Coerce to flat NumPy arrays so every index below is positional. A pandas
    # Series would otherwise be indexed by label, which fails (or misaligns)
    # as soon as its index is not 0..n-1, e.g. for a cross-validation fold.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.shape[0]} and {y_pred.shape[0]}"
        )

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # labels sorted by descending prediction value
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)


In [10]:
# Feature matrix limited to the selected columns; the label stays a
# one-column DataFrame and is read back through its 'target' column.
X = df.loc[:, x_cols]
y = df.loc[:, ['target']]

In [11]:
# X and y were extracted in the previous cell; drop the reference to the full table.
del df
# Run a full collection (all generations), as train_trees does;
# gc.collect(0) would only examine the youngest generation.
gc.collect()

36

In [12]:
# Plain 3-fold splitter: consecutive, unshuffled folds. Despite the name `skf`
# this is KFold, not StratifiedKFold.
skf = KFold(n_splits=3, shuffle=False, random_state=None)
skf.get_n_splits(X, y)

3

In [13]:
print(skf)

# For every fold, print the class balance of the held-out rows first and of the
# training rows second. The loop variables keep their names because a later
# cell reads train_index / test_index after the loop has finished.
for train_index, test_index in skf.split(X, y):
    for fold_rows in (test_index, train_index):
        print(y.iloc[fold_rows]['target'].value_counts() / len(fold_rows))

KFold(n_splits=3, random_state=None, shuffle=False)
0    0.740565
1    0.259435
Name: target, dtype: float64
0    0.741317
1    0.258683
Name: target, dtype: float64
0    0.739552
1    0.260448
Name: target, dtype: float64
0    0.741824
1    0.258176
Name: target, dtype: float64
0    0.743082
1    0.256918
Name: target, dtype: float64
0    0.740059
1    0.259941
Name: target, dtype: float64


In [14]:
# Sizes of the last fold's split: train_index / test_index are the loop
# variables left over from the fold loop in the previous cell.
len(train_index), len(test_index)

(305942, 152971)

In [15]:
# One record per evaluated parameter set; appended to by score().
loss_dict = []


def score(params):
    """Hyperopt objective: cross-validated AmEx metric for one parameter set.

    ``params`` is one sample from the search space. Its ``'n_estimators'``
    entry is used as the number of boosting rounds and removed from the
    parameters handed to ``train_trees``. For every fold of the module-level
    ``skf`` splitter a model is trained on the training rows of ``X``/``y``
    and evaluated with ``amex_metric`` on the held-out rows.

    Side effects: appends a record (sampled params, per-fold scores, mean
    score) to ``loss_dict`` and rewrites the Excel log whenever the number of
    records is a multiple of 10.

    Returns the dict hyperopt expects; the mean score is negated because
    hyperopt minimises ``'loss'``.
    """
    # Keep the caller's dict untouched; the round count travels separately.
    booster_params = copy.deepcopy(params)
    n_rounds = booster_params.pop('n_estimators')

    fold_scores = []
    for fit_rows, holdout_rows in skf.split(X, y):
        booster = train_trees(
            X.iloc[fit_rows],
            y.iloc[fit_rows]['target'].values,
            num_round=n_rounds,
            params=booster_params,
        )

        holdout_pred = booster.predict(
            X.iloc[holdout_rows],
            num_iteration=booster.best_iteration,
        )
        fold_scores.append(
            amex_metric(y.iloc[holdout_rows]['target'].values, holdout_pred)
        )

    mean_score = np.mean(fold_scores)
    loss_dict.append({'params': params, 'losses': fold_scores, 'mean_loss': mean_score})

    # Checkpoint the running log every 10th evaluation.
    if not len(loss_dict) % 10:
        pd.DataFrame(data=loss_dict).to_excel('amex-hyperopt-lgb-rev2-spsfr-3kfold-corr75-527feats.xlsx', index=False)
    return {'loss': -mean_score, 'status': STATUS_OK}

In [16]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, anneal, rand
from functools import partial
def optimize(space, evals, cores, trials, optimizer=tpe.suggest, random_state=1234, n_startup_jobs=10):
    """Run a hyperopt search that minimises ``score`` over ``space``.

    Parameters
    ----------
    space
        Search space passed to ``fmin``.
    evals : int
        Maximum number of evaluations (``max_evals``).
    cores
        Unused. Accepted only so existing calls keep working.
    trials
        ``Trials`` object that ``fmin`` fills with every evaluation.
    optimizer
        Suggestion algorithm; it must accept an ``n_startup_jobs`` keyword.
    random_state : int or None
        Seed for the search. It is turned into a NumPy ``Generator`` and
        handed to ``fmin`` as ``rstate``; ``None`` leaves the search unseeded.
    n_startup_jobs : int
        Forwarded to ``optimizer``.

    Returns
    -------
    dict
        The raw result of ``fmin``. For ``hp.choice`` dimensions these are
        option *indices*, not option values; the decoded parameter values are
        printed as well.
    """
    from hyperopt import space_eval  # local import: only needed here

    algo = partial(optimizer, n_startup_jobs=n_startup_jobs)
    # Previously random_state was accepted but never used, so every run sampled
    # a different sequence. NOTE(review): hyperopt >= 0.2.7 expects a
    # numpy Generator here; older releases expect np.random.RandomState.
    rstate = np.random.default_rng(random_state)
    best = fmin(score, space, algo=algo, max_evals=evals, trials=trials, rstate=rstate)
    print(best)
    # Decode hp.choice indices into the actual parameter values.
    print(space_eval(space, best))
    return best

In [17]:
# Settings for the hyperopt run launched in the next cell.
# cores: forwarded to optimize() as its `cores` argument.
cores = 4
# n: evaluation budget, passed to optimize() as `evals`.
n=500
# verbose: not referenced anywhere else in the visible notebook.
verbose = False
# trials: hyperopt Trials object handed to optimize().
trials = Trials()

In [None]:
# Launch the search with the settings defined in the previous cell.
best_param = optimize(
    space,
    evals=n,
    cores=cores,
    trials=trials,
    optimizer=tpe.suggest,
    random_state=1234,
    n_startup_jobs=10,
)

In [19]:
# Final flush: score() only rewrites the Excel log on every 10th evaluation,
# so persist the complete record list once more here.
pd.DataFrame(data=loss_dict).to_excel('amex-hyperopt-lgb-rev2-spsfr-3kfold-corr75-527feats.xlsx', index=False)