# Kaggle Santander competition

In [65]:
# general & data analysis imports
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
# from imblearn.over_sampling import SMOTE

import lightgbm as lgb
import time

from hyperopt import hp, tpe
from hyperopt.fmin import fmin




## data load

In [4]:
# Load the competition's training and test tables from CSV files in the
# current working directory.
train_dataset = pd.read_csv(filepath_or_buffer='train.csv')
test_dataset = pd.read_csv(filepath_or_buffer='test.csv')

## data analysis

In [None]:
# Print a concise summary of the training frame via DataFrame.info().
train_dataset.info()

In [None]:
# Descriptive statistics of the training frame's columns via DataFrame.describe().
train_dataset.describe()

In [None]:
# List the column labels of the training frame.
train_dataset.columns

In [None]:
# Preview the first rows of the training frame.
train_dataset.head()

In [None]:
# Total number of missing values: per-column null counts, summed over all columns.
train_dataset.isnull().sum().sum()

## data preprocessing

In [5]:
# Drop the identifier column from both frames and the label from the training
# frame so that the training features have the same structure as the test set;
# the label is kept separately as its own Series.
df_test = test_dataset.drop(columns='ID_code')
df_train = train_dataset.drop(columns=['ID_code', 'target'])
df_target = train_dataset['target'].copy()

In [7]:
#sm = SMOTE(random_state=1)
#X_train, y_train = sm.fit_resample(X_train, y_train)
#X_val, y_val = sm.fit_resample(X_val, y_val)

## training, tuning and evaluation - random search
Random hyperparameter search on a single train/validation split (no k-fold).

In [55]:
# Hold out 15% of the training data for validation.
# The split is stratified on the target so both parts keep the same class
# ratio, consistent with the StratifiedKFold used for the final training.
X_train, X_val, y_train, y_val = train_test_split(
    df_train.values,
    df_target.values,
    test_size=0.15,
    random_state=1,
    shuffle=True,
    stratify=df_target.values,
)
# Feature matrix of the competition test set as a plain numpy array.
X_test = df_test.values

In [56]:
# LightGBM parameter candidates for the random search.
# Every key maps to a sequence of candidate values; random_search() draws one
# value per key on each iteration. Key order is kept stable because it also
# defines the column order of the results file.
param_grid = dict(
    objective=['binary'],
    learning_rate=np.logspace(-3, -1, num=50, base=10.0),          # 0.001 .. 0.1
    feature_fraction=np.logspace(-2, -1, num=50, base=10.0),       # 0.01 .. 0.1
    num_leaves=np.arange(10, 30, 2),
    min_data_in_leaf=np.arange(30, 150, 50),                       # 30, 80, 130
    # 0.30 .. 0.94 in steps of 0.01. Built from integers because a float-step
    # np.arange accumulates rounding error (e.g. 0.8300000000000005) and its
    # length depends on how the end point rounds.
    bagging_fraction=np.arange(30, 95) / 100,
    bagging_freq=np.arange(3, 30, 5),
    max_depth=[-1],
    boosting_type=['gbdt'],
    metric=['auc'],
    min_sum_hessian_in_leaf=np.logspace(-4, 2, num=50, base=10.0),  # 1e-4 .. 100
    n_jobs=[-1],
    num_round=[2500],
)

In [57]:
def random_search(param_grid, X_train, X_val, y_train, y_val, iterations,
                  output_path='hp_search.csv'):
    """Random hyperparameter search for LightGBM on one train/validation split.

    On every iteration one value per key is drawn at random from ``param_grid``,
    a booster is trained with ``lgb.train`` and the ROC AUC is computed on both
    the training and the validation data. The sampled parameters and the scores
    of all finished iterations are written to ``output_path`` as CSV, also when
    the search is interrupted or fails part-way.

    Parameters
    ----------
    param_grid : dict
        Maps each LightGBM parameter name to a sequence of candidate values.
    X_train, X_val : array-like
        Feature matrices of the training and validation split.
    y_train, y_val : array-like
        Labels of the training and validation split.
    iterations : int
        Number of random parameter sets to evaluate.
    output_path : str, optional
        Destination of the CSV results file (default ``'hp_search.csv'``).

    Returns
    -------
    pandas.DataFrame
        One row per finished iteration: the sampled parameters followed by
        ``ROC_train``, ``ROC_val`` and ``ROC_diff`` (train minus validation).
    """
    metrics_list = ['ROC_train', 'ROC_val', 'ROC_diff']
    logging_list = list(param_grid.keys()) + metrics_list
    results = []
    try:
        for i in range(iterations):
            print(f'iteration {i+1} of {iterations}')
            # randomly select parameters
            param = {}
            for key in param_grid:
                value = np.random.choice(param_grid[key])
                # np.random.choice returns numpy scalars; store plain Python
                # values so the parameters print and serialise cleanly.
                param[key] = value.item() if isinstance(value, np.generic) else value
            print(f'selected params:{param}')
            # Build the Datasets anew for every trial: a constructed Dataset
            # keeps the settings it was first built with (LightGBM pre-filters
            # features using min_data_in_leaf), so reusing one object across
            # trials with different parameters is not reliable.
            train_set = lgb.Dataset(X_train, label=y_train)
            val_set = lgb.Dataset(X_val, label=y_val)
            # train the model
            clf = lgb.train(param, train_set, valid_sets=[train_set, val_set],
                            verbose_eval=500, early_stopping_rounds=400)
            # calculate statistics
            roc_train = roc_auc_score(y_train, clf.predict(X_train))
            roc_val = roc_auc_score(y_val, clf.predict(X_val))
            scores = {
                'ROC_train': roc_train,
                'ROC_val': roc_val,
                'ROC_diff': roc_train - roc_val,  # gap between train and validation score
            }
            # log parameters followed by metrics, in logging_list order
            results.append([param[key] for key in param_grid] +
                           [scores[name] for name in metrics_list])
    except KeyboardInterrupt:
        # Deliberate: a manual interrupt ends the search but keeps what finished.
        print(f'search interrupted, saving {len(results)} finished iteration(s)')
    finally:
        # save results to file, even if a trial raised
        result_df = pd.DataFrame(results, columns=logging_list)
        result_df.to_csv(output_path, index=False)
    return result_df

        

        
        

In [64]:
# Evaluate 100 random parameter sets on the hold-out split; results are written to hp_search.csv.
random_search(param_grid, X_train, X_val, y_train, y_val, 100)

iteration 1 of 100
selected params:{'objective': 'binary', 'learning_rate': 0.001, 'feature_fraction': 0.08685113737513525, 'num_leaves': 14, 'min_data_in_leaf': 80, 'bagging_fraction': 0.8300000000000005, 'bagging_freq': 23, 'max_depth': -1, 'boosting_type': 'gbdt', 'metric': 'auc', 'min_sum_hessian_in_leaf': 5.963623316594637, 'n_jobs': -1, 'num_round': 2500}




## training, tuning and evaluation - hyperopt
Hyperparameter search with hyperopt.

In [55]:
# Hold out 15% of the training data for validation.
# The split is stratified on the target so both parts keep the same class
# ratio, consistent with the StratifiedKFold used for the final training.
X_train, X_val, y_train, y_val = train_test_split(
    df_train.values,
    df_target.values,
    test_size=0.15,
    random_state=1,
    shuffle=True,
    stratify=df_target.values,
)
# Feature matrix of the competition test set as a plain numpy array.
X_test = df_test.values

In [72]:
# Search space for hyperopt: fixed LightGBM settings plus sampled hyperparameters.
space = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    # hp.loguniform draws exp(uniform(low, high)), so the bounds must be given
    # in log space; this samples learning rates between 0.01 and 0.1.
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-2), np.log(1e-1)),
    'max_depth': hp.choice('max_depth', np.arange(1, 14, dtype=int)),
    'feature_fraction': hp.uniform('feature_fraction', 0.01, 0.9),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.01, 0.9),
    'num_leaves': hp.choice('num_leaves', np.arange(10, 40, dtype=int)),
    'min_data_in_leaf': hp.choice('min_data_in_leaf', np.arange(30, 150, 10, dtype=int)),
    'bagging_freq': hp.choice('bagging_freq', np.arange(3, 10, dtype=int)),
    'n_jobs': -1,
    'num_round': 2500,
}

In [None]:
def check_model(hp_params):
    """Placeholder for what looks like the hyperopt objective (not implemented).

    The original cell contained only the ``def`` line, which is a syntax error.
    This stub keeps the notebook parseable and fails loudly if it is called.

    Parameters
    ----------
    hp_params : dict
        Presumably one parameter set sampled from ``space`` -- confirm once the
        body is written.

    Raises
    ------
    NotImplementedError
        Always, until the objective is implemented.
    """
    # TODO: train a LightGBM model with ``hp_params`` and return the loss that
    # hyperopt's ``fmin`` should minimise (e.g. the negative validation ROC AUC).
    raise NotImplementedError(
        'check_model is a placeholder: the hyperopt objective has not been written yet'
    )

## final training
Stratified k-fold training with the best hyperparameters found.
The predicted probabilities are the mean of the predictions from all folds.

In [59]:
# Feature matrix of the competition test set as a plain numpy array.
X_test=df_test.values

In [63]:
# Best hyperparameters found so far, used for the final models.
best_param = {'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.03, 'feature_fraction': 0.025,
              'num_leaves': 10, 'min_data_in_leaf': 130, 'bagging_fraction': 0.5, 'bagging_freq': 3,
              'num_rounds': 4000, 'verbose': 1}
# optional GPU settings, currently disabled: 'device': 'gpu', 'gpu_use_dp': False

fold_n = 3
folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=30)

# predicted probabilities on test set (competition set), averaged over the folds
y_probs = np.zeros(len(df_test))
# out-of-fold predictions: every training row is scored by the one model that
# did not see it, which gives a validation estimate for the whole procedure
oof_probs = np.zeros(len(df_train))

for i, (train_index, valid_index) in enumerate(folds.split(df_train, df_target)):
    tic = time.perf_counter()
    print(f'Calculating fold {i+1}/{fold_n}...')
    train_set = lgb.Dataset(df_train.iloc[train_index], label=df_target.iloc[train_index])
    val_set = lgb.Dataset(df_train.iloc[valid_index], label=df_target.iloc[valid_index])
    clf = lgb.train(best_param, train_set, valid_sets=[train_set, val_set],
                    verbose_eval=500, early_stopping_rounds=400)
    # score the held-out fold with the best iteration found by early stopping
    oof_probs[valid_index] = clf.predict(df_train.iloc[valid_index].values,
                                         num_iteration=clf.best_iteration)
    fold_auc = roc_auc_score(df_target.iloc[valid_index], oof_probs[valid_index])
    print(f'Fold {i+1} validation AUC: {fold_auc:.6f}')
    # each fold contributes 1/fold_n of the final test prediction
    y_probs += clf.predict(df_test.values, num_iteration=clf.best_iteration) / fold_n
    toc = time.perf_counter()
    print(f'Fold {i+1} calculated in {toc-tic:.1f} s.')

print(f'Out-of-fold AUC over all {fold_n} folds: {roc_auc_score(df_target, oof_probs):.6f}')


Calculating fold 1/3...




Training until validation scores don't improve for 400 rounds.
[500]	valid_0's auc: 0.888485
[1000]	valid_0's auc: 0.895666


KeyboardInterrupt: 

In [None]:
#y_val_probs=clf.predict(X_val)
#y_val_preds=np.where(y_val_probs>0.5,1,0)

In [None]:
#roc_auc_score(y_val, y_val_probs)

In [None]:
#y_probs=clf.predict(X_test)

In [61]:
# Display the accumulated test-set probabilities (complete only if every fold above finished).
y_probs


array([0.09905054, 0.21218829, 0.16893732, ..., 0.00809165, 0.1014492 ,
       0.06534386])

## submission

In [62]:
# Assemble the submission table (test IDs with their predicted probabilities)
# and write it to disk without the index column.
submission_df = pd.DataFrame({
    "ID_code": test_dataset["ID_code"].values,
    "target": y_probs,
})
submission_df.to_csv("submission.csv", index=False)