# Kaggle Santander competition

In [1]:
# general & data analysis imports
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
# from imblearn.over_sampling import SMOTE

import lightgbm as lgb
import time

from hyperopt import hp, tpe
from hyperopt.fmin import fmin




## data load

In [2]:
# Read the competition train and test CSVs from the working directory.
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')

## data analysis

In [None]:
# Print the pandas summary (column dtypes, non-null counts) of the training frame.
train_dataset.info()

In [None]:
# Descriptive statistics for the columns of the training frame.
train_dataset.describe()

In [None]:
# Column labels of the training frame.
train_dataset.columns

In [None]:
# Preview the first rows of the training frame.
train_dataset.head()

In [None]:
# Total number of missing values across the whole training frame
# (per-column counts summed into a single number).
train_dataset.isnull().sum().sum()

## data preprocessing

In [3]:
# Split the label off and drop the identifier column so that the train and
# test feature frames end up with the same structure.
df_target = train_dataset['target'].copy()
df_train = train_dataset.drop(columns=['ID_code', 'target'])
df_test = test_dataset.drop(columns='ID_code')

In [None]:
#sm = SMOTE(random_state=1)
#X_train, y_train = sm.fit_resample(X_train, y_train)
#X_val, y_val = sm.fit_resample(X_val, y_val)

## training, tuning and evaluation - random search
Random hyperparameter search on a single train/validation split (no k-fold).

In [None]:
# Hold out 20% of the training rows for validation; the seed is fixed so the
# split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    df_train.values,
    df_target.values,
    test_size=0.2,
    random_state=1,
    shuffle=True,
)
X_test = df_test.values

In [None]:
# Candidate values for every LightGBM parameter used by the random search.
# Single-element lists pin a parameter to a fixed value; arrays are the pools
# that a value is drawn from.
param_grid = {
    'objective': ['binary'],
    'learning_rate': np.logspace(-3, -1, num=50, base=10.0),
    'feature_fraction': np.logspace(-2, -1, num=50, base=10.0),
    'num_leaves': np.arange(10, 30, 2),
    'min_data_in_leaf': np.arange(30, 150, 50),
    'bagging_fraction': np.arange(0.3, 0.95, 0.01),
    'bagging_freq': np.arange(3, 30, 5),
    'max_depth': [-1],
    'boosting_type': ['gbdt'],
    'metric': ['auc'],
    'min_sum_hessian_in_leaf': np.logspace(-4, 2, num=50, base=10.0),
    'n_jobs': [-1],
    'num_round': [2500],
}

In [None]:
def random_search(param_grid, X_train, X_val, y_train, y_val, iterations,
                  output_path='hp_search.csv', random_state=None):
    """Run a random hyperparameter search for LightGBM on one train/validation split.

    On every iteration one value per key is drawn from ``param_grid``, a model
    is trained with early stopping (400 rounds) while monitoring both the
    training and the validation set, and ROC AUC is computed on both sets.

    Parameters
    ----------
    param_grid : dict
        Maps a LightGBM parameter name to a sequence of candidate values.
    X_train, X_val : array-like
        Feature matrices for training and validation.
    y_train, y_val : array-like
        Labels matching ``X_train`` / ``X_val``.
    iterations : int
        Number of random parameter draws to evaluate.
    output_path : str, optional
        CSV file the results table is written to (default ``'hp_search.csv'``).
    random_state : int or None, optional
        Seed for the parameter sampling. ``None`` (default) keeps using the
        global ``np.random`` state, as before.

    Returns
    -------
    pandas.DataFrame
        One row per completed iteration: the sampled parameters followed by
        ``ROC_train``, ``ROC_val`` and ``ROC_diff`` (train minus validation).

    Notes
    -----
    The results collected so far are written to ``output_path`` even when the
    search is interrupted from the keyboard or a training run raises; in the
    latter case the exception is re-raised after the file has been saved.
    """
    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val)
    metrics_list = ['ROC_train', 'ROC_val', 'ROC_diff']
    logging_list = list(param_grid.keys()) + metrics_list
    # Fall back to the module-level generator so an existing np.random.seed()
    # call keeps its effect when no explicit seed is given.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    results = []
    try:
        for i in range(iterations):
            print(f'iteration {i+1} of {iterations}')
            # randomly select parameters
            param = {key: rng.choice(values) for key, values in param_grid.items()}
            print(f'selected params:{param}')
            # Train on a copy of the sampled dict, so the values logged below
            # are exactly what was drawn even if the library edits its input.
            clf = lgb.train(dict(param), train_set, valid_sets=[train_set, val_set],
                            verbose_eval=500, early_stopping_rounds=400)
            # calculate & log statistics
            roc_train = roc_auc_score(y_train, clf.predict(X_train))
            roc_val = roc_auc_score(y_val, clf.predict(X_val))
            row = dict(param, ROC_train=roc_train, ROC_val=roc_val,
                       ROC_diff=roc_train - roc_val)
            results.append([row[key] for key in logging_list])
    except KeyboardInterrupt:
        # Deliberate: a manual stop ends the search early but keeps the
        # iterations that already finished.
        print(f'search interrupted after {len(results)} completed iterations')
    finally:
        # save results to file, whatever ended the loop
        result_df = pd.DataFrame(results, columns=logging_list)
        result_df.to_csv(output_path, index=False)
    return result_df

        

        
        

In [None]:
# Evaluate 400 random parameter draws on the hold-out split.
random_search(param_grid, X_train, X_val, y_train, y_val, iterations=400)

## training, tuning and evaluation - hyperopt
Hyperparameter search with hyperopt.

In [None]:
# Hold out 15% of the training rows for validation (fixed seed), then wrap
# both parts as LightGBM datasets for the hyperopt objective.
X_train, X_val, y_train, y_val = train_test_split(
    df_train.values,
    df_target.values,
    test_size=0.15,
    random_state=1,
    shuffle=True,
)
X_test = df_test.values
train_set = lgb.Dataset(X_train, label=y_train)
val_set = lgb.Dataset(X_val, label=y_val)

In [None]:
# hyperopt search space: plain values are fixed settings, hp.* expressions
# are the dimensions being tuned.
space = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'max_depth': -1,
    'learning_rate': 0.04,
    # 'max_depth': hp.quniform('max_depth', 4, 14, 1),
    'feature_fraction': hp.uniform('feature_fraction', 0.01, 0.2),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.2, 0.9),
    'num_leaves': hp.quniform('num_leaves', 10, 20, 1),
    'min_data_in_leaf': hp.quniform('min_data_in_leaf', 75, 85, 1),
    'bagging_freq': hp.quniform('bagging_freq', 3, 10, 1),
    'n_jobs': -1,
    'num_round': 3000,
}

In [None]:
def check_model(param):
    """Objective for hyperopt: train LightGBM with ``param`` and score it.

    Reads the notebook-level ``train_set``, ``val_set``, ``X_val`` and
    ``y_val``; they must be defined before this function is called.

    Parameters
    ----------
    param : dict
        LightGBM parameters for this trial. The caller's dict is not modified.

    Returns
    -------
    float
        ``1 - ROC AUC`` on the validation set, so that a lower value is
        better for the minimiser.
    """
    # Work on a copy so the dict handed in by the caller stays untouched.
    param = dict(param)
    # These parameters are cast to int before training (hp.quniform draws are
    # presumably floats; see the hyperopt docs). Keys missing from the search
    # space are skipped instead of raising KeyError.
    integer_params = ('max_depth', 'num_leaves', 'min_data_in_leaf', 'bagging_freq')
    for p in integer_params:
        if p in param:
            param[p] = int(param[p])
    print(f'corrected params: {param}')
    # train classifier
    clf = lgb.train(param, train_set, valid_sets=[train_set, val_set],
                    verbose_eval=1000, early_stopping_rounds=400)
    # calculate ROC AUC on the hold-out set (more is better)
    y_val_proba = clf.predict(X_val)
    roc = roc_auc_score(y_val, y_val_proba)
    # return optimization result (less is better)
    return 1 - roc


In [None]:
# Run 150 TPE-guided evaluations of check_model over the search space.
# Despite its name, best_classifier holds whatever fmin returns (per the
# hyperopt docs, the best parameter values found), not a fitted model.
best_classifier = fmin(
    fn=check_model,
    space=space,
    algo=tpe.suggest,
    max_evals=150,
)
print(best_classifier)


## final training
Stratified k-fold training with the best hyperparameters found. The predicted probabilities are the mean of the predictions from all folds.

In [4]:
# Test features as a plain array.
# NOTE: the final-training cells in this notebook call df_test.values
# directly, so they do not read X_test.
X_test=df_test.values

In [5]:
# Parameter set noted as 0.900 (from random search).
best_param = {
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.02,
    'feature_fraction': 0.023,
    'num_leaves': 10,
    'min_data_in_leaf': 80,
    'bagging_fraction': 0.46,
    'bagging_freq': 18,
    'min_sum_hessian_in_leaf': 1.45,
    'num_rounds': 5000,
    'verbose': 1,
}
fold_n = 4

# Alternative parameter set noted as 0.896 (from hyperopt); uncomment to use it.
# best_param = {'objective': 'binary', 'metric': 'auc', 'bagging_fraction': 0.7755883925843395, 'bagging_freq': 4,
#               'feature_fraction': 0.11619997622598302, 'min_data_in_leaf': 85, 'num_leaves': 19,
#               'learning_rate': 0.04, 'max_depth': -1, 'num_rounds': 5000, 'verbose': 1}
# fold_n = 5

# Accumulator for the predicted probabilities on the test set (competition
# set); every fold adds its prediction divided by the number of folds, so the
# final array is the mean over folds.
y_probs = np.zeros(len(df_test))
folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=30)
for i, (train_index, valid_index) in enumerate(folds.split(df_train, df_target)):
    tic = time.time()
    print(f'Calculating fold {i+1}/{fold_n}...')
    train_set = lgb.Dataset(
        df_train.iloc[train_index],
        label=df_target.iloc[train_index],
    )
    val_set = lgb.Dataset(
        df_train.iloc[valid_index],
        label=df_target.iloc[valid_index],
    )
    clf = lgb.train(
        best_param,
        train_set,
        valid_sets=[train_set, val_set],
        verbose_eval=500,
        early_stopping_rounds=400,
    )
    # y_probs += clf.predict(df_test.values, num_iteration=clf.best_iteration)/fold_n
    y_probs += clf.predict(df_test.values) / fold_n
    toc = time.time()
    print(f'Fold {i+1} calcutated in {toc-tic}.')


Calculating fold 1/4...




Training until validation scores don't improve for 400 rounds.
[500]	training's auc: 0.90442	valid_1's auc: 0.890074
[1000]	training's auc: 0.912202	valid_1's auc: 0.896432
[1500]	training's auc: 0.917283	valid_1's auc: 0.899101
[2000]	training's auc: 0.921287	valid_1's auc: 0.900481
[2500]	training's auc: 0.924929	valid_1's auc: 0.901141
[3000]	training's auc: 0.927809	valid_1's auc: 0.901922
[3500]	training's auc: 0.930756	valid_1's auc: 0.902281
[4000]	training's auc: 0.933523	valid_1's auc: 0.902529
[4500]	training's auc: 0.936257	valid_1's auc: 0.902815
[5000]	training's auc: 0.93878	valid_1's auc: 0.902951
Did not meet early stopping. Best iteration is:
[5000]	training's auc: 0.93878	valid_1's auc: 0.902951
Fold 1 calcutated in 206.73588609695435.
Calculating fold 2/4...
Training until validation scores don't improve for 400 rounds.
[500]	training's auc: 0.905695	valid_1's auc: 0.883989
[1000]	training's auc: 0.914279	valid_1's auc: 0.889088
[1500]	training's auc: 0.919304	valid_

### without kfold

In [8]:
# Final model fitted on the complete training set (no hold-out), using a
# lower learning rate and more rounds than the k-fold variant.
best_param = {'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.01, 'feature_fraction': 0.023,
              'num_leaves': 10, 'min_data_in_leaf': 80, 'bagging_fraction': 0.46, 'bagging_freq': 18,
              'min_sum_hessian_in_leaf': 1.45, 'num_rounds': 15000, 'verbose': 1}

train_set = lgb.Dataset(df_train.values, label=df_target.values)
# Only the training set is monitored here, so there is no held-out metric for
# early stopping to act on. The early_stopping_rounds argument was therefore
# dropped: training runs for the `num_rounds` given in best_param, and the
# training AUC is still printed every 500 rounds as a progress log.
clf = lgb.train(best_param, train_set, valid_sets=[train_set], verbose_eval=500)
# NOTE: this overwrites any y_probs produced by the k-fold cell.
y_probs = clf.predict(df_test.values)



Training until validation scores don't improve for 400 rounds.
[500]	training's auc: 0.899538
[1000]	training's auc: 0.905339
[1500]	training's auc: 0.90874
[2000]	training's auc: 0.911266
[2500]	training's auc: 0.913613
[3000]	training's auc: 0.915054
[3500]	training's auc: 0.916634
[4000]	training's auc: 0.918196
[4500]	training's auc: 0.919637
[5000]	training's auc: 0.920892
[5500]	training's auc: 0.922118
[6000]	training's auc: 0.923209
[6500]	training's auc: 0.924424
[7000]	training's auc: 0.925567
[7500]	training's auc: 0.92667
[8000]	training's auc: 0.927826
[8500]	training's auc: 0.92888
[9000]	training's auc: 0.929924
[9500]	training's auc: 0.930953
[10000]	training's auc: 0.931979
[10500]	training's auc: 0.932979
[11000]	training's auc: 0.933961
[11500]	training's auc: 0.934974
[12000]	training's auc: 0.935964
[12500]	training's auc: 0.936964
[13000]	training's auc: 0.937965
[13500]	training's auc: 0.938943
[14000]	training's auc: 0.939914
[14500]	training's auc: 0.940877
[15

## submission

In [9]:
# Build the submission table: one row per test ID with its predicted
# probability. y_probs is whatever the most recently executed final-training
# cell produced.
submission_df = pd.DataFrame(
    {
        "ID_code": test_dataset["ID_code"].values,
        "target": y_probs,
    }
)
submission_df.to_csv("submission.csv", index=False)