In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

from datetime import datetime, date, time

import gc
import copy

import pyarrow.parquet as pq
import pyarrow as pa

In [2]:
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, KFold

In [3]:
# Let pandas display up to 100 rows / 100 columns before truncating output.
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

# Install a blanket "ignore" filter for Python warnings.
import warnings
warnings.filterwarnings("ignore")

# pytorch_lightning is used here only for its seed_everything helper.
# NOTE(review): `random_seed` is not referenced again in the visible cells;
# later cells hard-code 1234 instead of reusing this variable.
import pytorch_lightning as pl
random_seed=1234
pl.seed_everything(random_seed)

Global seed set to 1234


1234

In [4]:
%%time
# Alternative input path on Kaggle (a different file; kept for reference):
# train_file = r'/kaggle/input/amex-agg-data-rev2/agg_train_all_rev2_rev.parquet'
# Path actually read below; it is relative to the current working directory.
train_file = r'amex/agg_v3/agg_train_all_small.parquet'

# Read the parquet file into a DataFrame using the pyarrow engine.
df=pd.read_parquet(train_file, engine='pyarrow')

Wall time: 17.3 s


In [7]:
x_cols = ['D_41__std', 'B_33__mean__log', 'R_3__last__log', 'D_70__std', 'D_61__min', 'B_33__min', 'B_20__max', 'S_8__mean', 'D_62__mean', 'B_3__last2max', 'B_16__min', 'B_11__mean', 'D_84__std', 'D_41__mean__log', 'B_2__mean', 'D_84__range', 'S_7__std', 'D_45__mean', 'R_1__std', 'R_4__range', 'B_16__last', 'R_8__std', 'B_28__last2max', 'D_43__last', 'P_2__max', 'S_23__std', 'R_3__mean', 'D_51__mean__log', 'B_3__mean__log', 'B_19__std', 'S_8__min', 'R_7__mean', 'R_1__last', 'R_3__std', 'R_2__last__log', 'R_5__mean', 'D_45__first', 'D_44__last', 'B_19__mean', 'R_1__max', 'D_58__first', 'D_75__range', 'R_2__mean__log', 'B_9__last', 'B_11__mean__log', 'D_74__max', 'B_37__max', 'R_2__range', 'B_9__mean__log', 'B_40__last__log', 'D_84__mean__log', 'R_8__mean__log', 'D_78__last', 'B_10__first', 'D_43__mean', 'D_74__first', 'R_10__range', 'R_15__std', 'P_3__mean', 'B_18__last__log', 'B_25__last2max', 'D_75__last2max', 'D_48__max', 'B_16__last__log', 'S_3__mean', 'B_23__last2max', 'D_61__last', 'D_75__mean', 'D_44__range', 'B_4__last', 'B_10__mean', 'D_77__first', 'D_74__last2max', 'B_3__std', 'R_2__std', 'B_4__mean__log', 'D_55__mean__log', 'D_62__first', 'B_40__last', 'D_70__last__log', 'D_42__mean', 'D_44__last__log', 'D_74__std', 'R_6__mean', 'B_18__max', 'R_4__max', 'R_2__last', 'R_6__mean__log', 'B_11__min', 'B_38__nunique', 'B_7__mean', 'P_3__min', 'D_41__last', 'D_41__last__log', 'D_84__mean', 'D_42__max', 'B_2__last2max', 'B_9__last2max', 'B_11__last', 'B_2__mean__log', 'D_62__min', 'B_16__mean', 'B_33__first', 'R_15__mean__log', 'B_1__last', 'D_44__mean__log', 'D_65__mean__log', 'P_2__std', 'B_7__std', 'D_75__std', 'S_8__last', 'R_5__last', 'B_16__max', 'B_37__mean', 'B_38__last', 'B_7__max', 'S_25__std', 'D_52__min', 'B_1__min', 'B_22__range', 'S_25__mean', 'D_78__std', 'B_8__last__log', 'D_44__first', 'B_1__max', 'D_70__max', 'S_3__std', 'B_23__max', 'D_53__last__log', 'B_19__max', 'B_8__mean__log', 'D_74__last', 'D_52__first', 'R_3__range', 'D_55__max', 
'D_58__std', 'D_61__first', 'R_1__mean__log', 'B_9__first', 'B_23__min', 'D_84__max', 'P_2__range', 'B_18__min', 'D_78__max', 'B_4__mean', 'B_20__range', 'B_11__last2max', 'D_45__min', 'D_53__max', 'D_77__max', 'B_9__last__log', 'D_75__mean__log', 'B_11__max', 'B_38=2.0', 'B_37__min', 'D_75__last__log', 'D_58__mean__log', 'D_52__last', 'D_52__mean', 'B_30__nunique', 'D_78__last__log', 'B_38=4.0', 'D_77__last__log', 'D_77__mean', 'B_4__last2max', 'B_20__last2max', 'B_20__mean__log', 'P_2__min', 'B_23__mean', 'D_77__mean__log', 'D_58__min', 'D_42__last', 'D_45__max', 'R_15__max', 'B_9__min', 'R_5__std', 'R_10__mean', 'B_3__max', 'B_10__min', 'R_6__last__log', 'B_16__mean__log', 'R_6__std', 'D_45__last__log', 'B_7__last', 'B_33__max', 'D_44__min', 'P_2__mean', 'S_15__max', 'S_23__last', 'B_30=1.0', 'R_2__mean', 'R_1__last__log', 'D_44__last2max', 'D_48__first', 'R_1__range', 'S_15__range', 'R_3__mean__log', 'R_5__range', 'R_6__max', 'R_4__std', 'B_19__mean__log', 'R_7__std', 'S_7__max', 'R_3__min', 'B_17__min', 'B_8__last', 'B_6__last', 'B_33__last', 'B_7__range', 'D_55__range', 'B_22__last', 'B_37__last2max', 'S_22__last', 'B_7__min', 'D_75__last', 'D_74__mean', 'B_4__last__log', 'D_112__last2max', 'B_18__last2max', 'B_5__last__log', 'D_55__mean', 'D_55__last__log', 'B_6__last2max', 'D_61__mean', 'D_70__mean', 'B_4__min', 'B_3__last__log', 'B_2__min', 'D_41__range', 'D_78__mean', 'B_23__range', 'D_55__std', 'D_70__last', 'R_5__max', 'B_9__mean', 'B_6__mean', 'B_28__last', 'D_44__mean', 'D_52__max', 'S_23__range', 'D_75__max', 'S_7__mean', 'D_48__last2max', 'D_75__first', 'P_3__last', 'P_2__last2max', 'B_22__last__log', 'B_4__max', 'D_78__range', 'D_70__mean__log', 'D_43__max', 'D_42__min', 'D_48__mean', 'B_2__last__log', 'D_112__last__log', 'D_44__max', 'R_16__mean__log', 'S_3__min', 'B_22__mean', 'B_30=0.0', 'D_75__min', 'D_58__max', 'D_48__min', 'B_33__last__log', 'B_18__first', 'B_7__first', 'R_3__max', 'R_10__std', 'D_74__last__log', 'B_22__std', 'D_62__last', 
'D_48__last', 'D_43__mean__log', 'R_4__last', 'D_62__max', 'B_23__first', 'B_18__mean', 'B_19__range', 'B_3__last', 'B_20__mean', 'D_65__last__log', 'D_55__min', 'D_61__max', 'B_11__last__log', 'B_6__min', 'R_1__mean', 'D_45__mean__log', 'B_9__max', 'B_23__last', 'B_6__max', 'R_8__max', 'B_14__last2max', 'B_3__range', 'D_55__last', 'D_58__range', 'B_26__last__log', 'B_38=5.0', 'D_58__last__log', 'B_20__last', 'D_58__last2max', 'B_33__last2max', 'D_39__last__log', 'R_2__max', 'B_1__mean', 'D_53__mean__log', 'S_25__last2max', 'R_4__mean', 'D_58__mean', 'B_2__max', 'B_1__last2max', 'P_2__last', 'B_10__max', 'B_23__last__log', 'R_10__max', 'S_7__last', 'B_23__std', 'R_5__mean__log', 'B_3__min', 'S_25__min', 'B_22__mean__log', 'B_19__last', 'B_10__last', 'D_77__min', 'B_22__max', 'R_8__range', 'S_25__range', 'D_42__first', 'D_44__std', 'B_23__mean__log', 'D_41__max', 'S_3__range', 'B_18__last', 'R_8__mean', 'S_7__range', 'B_19__last__log', 'D_74__range', 'R_15__range', 'B_40__min', 'D_39__last', 'P_2__first', 'D_78__mean__log', 'D_74__min', 'B_40__mean', 'R_4__mean__log', 'S_3__max', 'D_45__last', 'B_20__last__log', 'B_8__first', 'B_37__last', 'B_16__last2max', 'B_40__mean__log', 'B_7__last2max', 'R_6__range', 'D_39__max', 'D_39__std', 'D_70__range', 'D_112__last', 'B_8__min', 'D_58__last', 'D_77__last', 'R_7__range', 'B_2__first', 'B_33__mean', 'R_7__mean__log', 'B_18__mean__log', 'S_3__last', 'R_10__mean__log', 'B_8__mean', 'D_39__range', 'D_74__mean__log', 'B_3__mean', 'S_15__mean', 'B_2__last']



# Number of selected feature columns.
len(x_cols)

373

## hyperopt parameters

In [8]:
# Candidate learning rates: 1..9 times 1e-4, 1e-3 and 1e-2 (27 values).
# Each decade is built from an integer range and divided once, rather than
# using np.arange with a fractional step. That avoids accumulated float error
# (values such as 0.00030000000000000003 ending up in the search log) and the
# endpoint-dependent length that a float-step arange can produce.
learn_rates = np.concatenate(
    [
        # np.arange(1, 10) / 100000,  # optional 1e-5 decade (disabled)
        np.arange(1, 10) / 10000,
        np.arange(1, 10) / 1000,
        np.arange(1, 10) / 100,
    ],
    axis=0,
)


len(learn_rates)

27

In [9]:
#https://lightgbm.readthedocs.io/en/latest/Parameters.html
#https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html?highlight=classifier#lightgbm.LGBMClassifier

from hyperopt import hp
import numpy as np
# Candidate values for every LightGBM parameter in the search. Keys use the
# sklearn-style names; the trailing comment gives the native LightGBM alias.
# Entries with a single option are effectively constants.
_param_options = {
    'n_estimators': range(500, 2000, 50),  # num_boost_round
    'boosting_type': ['gbdt'],  # boosting
    'objective': ['binary'],
    'metric': ['auc'],
    'learning_rate': learn_rates,
    'colsample_bytree': np.round(np.arange(0.1, 0.86, 0.05), 3),  # feature_fraction
    'max_depth': range(7, 36, 1),  # int type
    'min_child_samples': range(100, 5001, 50),  # min_data_in_leaf
    'reg_alpha': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 15],  # lambda_l1
    'reg_lambda': [0.0001, 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 15],  # lambda_l2
    'max_bin': range(500, 10000, 50),
    'min_data_in_bin': range(500, 9000, 50),
    'subsample': np.round(np.arange(0.1, 0.96, 0.05), 3),  # bagging_fraction
    'subsample_freq': range(1, 100, 2),  # bagging_freq
    # max number of leaves in one tree. 1 < num_leaves <= 131072. classes < num_leaves < 2^max_depth
    'num_leaves': range(31, 300, 5),  # max_leaves
    'random_state': [1234],
    'n_jobs': [4],  # nthread
    # Not searched (kept commented out):
    # 'min_split_gain': [0.0],        # min_gain_to_split
    # 'min_child_weight': [0.001],    # min_sum_hessian_in_leaf
    # 'subsample_for_bin': [200000],  # bin_construct_sample_cnt
    # 'importance_type': ['split'],
}

# One hp.choice node per parameter, labelled with the parameter's own name.
space = {
    param_name: hp.choice(param_name, options)
    for param_name, options in _param_options.items()
}
                  
    


### lightgbm

In [10]:
import lightgbm as lgb
def train_trees(X_train, y_train, num_round=100, params=None):
    """Fit a LightGBM model on the given training data.

    Args:
        X_train: feature data handed to ``lgb.Dataset``.
        y_train: labels handed to ``lgb.Dataset``.
        num_round: number of boosting rounds (``num_boost_round``).
        params: LightGBM parameter mapping, or ``None`` for no extra
            parameters. The mapping is copied, so the caller's dict is never
            modified.

    Returns:
        The object returned by ``lgb.train``.
    """
    # Work on a private copy: previously 'verbosity' was written into the
    # caller's dict and into a shared mutable default argument.
    train_params = dict(params) if params else {}
    # Always forced to -1, overriding any caller-supplied value.
    train_params['verbosity'] = -1

    dtrain = lgb.Dataset(X_train, y_train)
    tree_model = lgb.train(train_params, dtrain, num_boost_round=num_round)

    # Drop the Dataset reference and run a collection pass before returning.
    del dtrain
    gc.collect()

    return tree_model

In [11]:
# @yunchonggan's fast metric implementation
# From https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
# https://www.kaggle.com/code/ambrosm/amex-lightgbm-quickstart
def amex_metric(y_true: np.array, y_pred: np.array) -> float:
    """Average of the normalized weighted Gini and the default rate captured at 4%.

    Args:
        y_true: array of 0/1 targets.
        y_pred: array of predicted scores, aligned with ``y_true``.

    Returns:
        ``0.5 * (g + d)`` where ``g`` is the normalized weighted Gini
        coefficient and ``d`` is the share of positives found within the first
        4% of cumulative row weight when rows are sorted by descending score.
    """
    total_pos = y_true.sum()
    total_neg = y_true.shape[0] - total_pos

    # Rank rows from highest to lowest prediction.
    order = np.argsort(y_pred)[::-1]
    ranked_target = y_true[order]

    # Row weights: 20 for a negative, 1 for a positive.
    row_weight = 20.0 - ranked_target * 19.0
    cum_weight_share = np.cumsum(row_weight / row_weight.sum())

    # d: positives captured inside the top 4% of cumulative weight.
    in_top_4pct = cum_weight_share <= 0.04
    capture_rate = ranked_target[in_top_4pct].sum() / total_pos

    # Weighted Gini from the cumulative share of positives.
    cum_pos_share = np.cumsum(ranked_target / total_pos)
    weighted_gini = np.sum((cum_pos_share - cum_weight_share) * row_weight)

    # Largest attainable weighted Gini, used for normalization.
    max_gini = 10 * total_neg * (1 - 19 / (total_pos + 20 * total_neg))
    normalized_gini = weighted_gini / max_gini

    return 0.5 * (normalized_gini + capture_rate)


In [12]:
# Feature matrix: only the columns listed in x_cols.
X = df[x_cols]
# Target kept as a one-column DataFrame; later cells select ['target'] from it.
y = df[['target']]

In [13]:
# X and y have been taken from df; drop the name and run a garbage-collection pass.
del df
gc.collect()

64

In [14]:
# 3-fold splitter used by the fold check and by score().
# NOTE(review): despite the name `skf`, this is a plain KFold with default
# settings (no shuffling, no stratification on the target) — confirm intended.
skf = KFold(n_splits=3)

In [15]:
# Show the splitter configuration, then for each fold print the class
# proportions of `target` in the held-out part followed by the training part.
print(skf)

# NOTE: train_index / test_index stay defined after the loop; a later cell
# reads their lengths, so these loop variable names must not be renamed.
for train_index, test_index in skf.split(X, y):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     print("TRAIN:", len(train_index), "TEST:", len(test_index), len(test_index)/(len(test_index)+len(train_index)))
    print(y.iloc[test_index]['target'].value_counts()/len(test_index)) 
    print(y.iloc[train_index]['target'].value_counts()/len(train_index))

KFold(n_splits=3, random_state=None, shuffle=False)
0    0.740565
1    0.259435
Name: target, dtype: float64
0    0.741317
1    0.258683
Name: target, dtype: float64
0    0.739552
1    0.260448
Name: target, dtype: float64
0    0.741824
1    0.258176
Name: target, dtype: float64
0    0.743082
1    0.256918
Name: target, dtype: float64
0    0.740059
1    0.259941
Name: target, dtype: float64


In [16]:
# Train / test sizes of the last fold, read from the loop variables that the
# preceding fold loop left defined.
len(train_index), len(test_index)

(305942, 152971)

In [18]:
# Excel workbook that receives the hyperopt evaluation log (written via DataFrame.to_excel).
log_file = 'amex/agg_v3/lgb-hyperopt-v3data-3kfold-373feats.xlsx'

In [19]:
# One record per evaluation: sampled params, per-fold scores and their mean.
loss_dict = []


def score(params):
    """Hyperopt objective: cross-validated amex_metric for one parameter set.

    Trains one model per fold of the module-level ``skf`` splitter on the
    module-level ``X`` / ``y``, scores each held-out part with
    ``amex_metric``, appends a record to ``loss_dict`` and writes the log to
    ``log_file`` on every 10th evaluation.

    Args:
        params: sampled parameter dict; must contain ``'n_estimators'``,
            which is used as the number of boosting rounds and removed from
            the copy handed to the trainer. ``params`` itself is not modified.

    Returns:
        ``{'loss': -mean_score, 'status': STATUS_OK}``; the mean score is
        negated because the optimiser minimises ``'loss'``.
    """
    booster_params = copy.deepcopy(params)
    n_rounds = booster_params.pop('n_estimators')

    fold_scores = []
    for fit_idx, holdout_idx in skf.split(X, y):
        # Fit on the training part of the fold.
        booster = train_trees(
            X.iloc[fit_idx],
            y.iloc[fit_idx]['target'].values,
            num_round=n_rounds,
            params=booster_params,
        )

        # Score the held-out part of the fold.
        holdout_preds = booster.predict(
            X.iloc[holdout_idx], num_iteration=booster.best_iteration
        )
        fold_scores.append(
            amex_metric(y.iloc[holdout_idx]['target'].values, holdout_preds)
        )

    mean_score = np.mean(fold_scores)
    loss_dict.append(
        {'params': params, 'losses': fold_scores, 'mean_loss': mean_score}
    )

    # Checkpoint the log after every 10th evaluation.
    if not len(loss_dict) % 10:
        pd.DataFrame(data=loss_dict).to_excel(log_file, index=False)

    return {'loss': -mean_score, 'status': STATUS_OK}

In [20]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, anneal, rand
from functools import partial
def optimize(space, evals, cores, trials, optimizer=tpe.suggest, random_state=1234, n_startup_jobs=10):
    """Run a hyperopt search that minimises ``score`` over ``space``.

    Args:
        space: hyperopt search space passed to ``fmin``.
        evals: maximum number of evaluations (``max_evals``).
        cores: accepted for interface compatibility; not used by this function.
        trials: hyperopt ``Trials`` object that accumulates the results.
        optimizer: suggest function; it is wrapped with ``partial`` so it must
            accept an ``n_startup_jobs`` keyword.
        random_state: seed for the search; ``None`` leaves seeding to hyperopt.
        n_startup_jobs: forwarded to ``optimizer``.

    Returns:
        The dict returned by ``fmin`` (also printed).
        NOTE(review): for ``hp.choice`` dimensions hyperopt is believed to
        report the index of the chosen option rather than its value — confirm,
        and decode with ``hyperopt.space_eval`` if actual values are needed.
    """
    algo = partial(optimizer, n_startup_jobs=n_startup_jobs)

    # random_state used to be accepted but never forwarded, so the seed had no
    # effect on the search. It is now handed to fmin as `rstate`.
    # NOTE(review): hyperopt >= 0.2.7 expects a numpy Generator here; older
    # releases expect np.random.RandomState — confirm the installed version.
    rstate = None if random_state is None else np.random.default_rng(random_state)

    best = fmin(score, space, algo=algo, max_evals=evals, trials=trials, rstate=rstate)
    print(best)
    return best

In [21]:
cores = 4  # passed to optimize() as `cores`
n=500  # number of hyperopt evaluations (passed to optimize() as `evals`)
verbose = False  # not referenced in the visible cells
trials = Trials()  # hyperopt Trials object handed to optimize()

In [None]:
# Launch the search: up to `n` evaluations of score() over `space` using TPE.
# The value returned by optimize() is kept as best_param.
best_param = optimize(space,
                      evals = n,
                      optimizer=tpe.suggest,
                      cores = cores,
                      trials = trials, random_state=1234, 
                      n_startup_jobs=10)

In [23]:
# Final write of the full evaluation log; score() itself only saves on every
# 10th evaluation, so the last few records may not be on disk yet.
pd.DataFrame(data=loss_dict).to_excel(log_file, index=False)