# Kaggle Santander competition

In [36]:
# general & data analysis imports
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
# from imblearn.over_sampling import SMOTE

import lightgbm as lgb
import time





## data load

In [4]:
train_dataset=pd.read_csv('train.csv')
test_dataset=pd.read_csv('test.csv')

## data analysis

In [None]:
train_dataset.info()

In [None]:
train_dataset.describe()

In [None]:
train_dataset.columns

In [None]:
train_dataset.head()

In [None]:
train_dataset.isnull().sum().sum()

## data preprocessing

In [5]:
# remove dependent variable from train set to have the same structure as test set
df_target=train_dataset['target'].copy()
df_train=train_dataset.drop(['ID_code','target'], axis=1)
df_test=test_dataset.drop('ID_code', axis=1)

In [7]:
#sm = SMOTE(random_state=1)
#X_train, y_train = sm.fit_resample(X_train, y_train)
#X_val, y_val = sm.fit_resample(X_val, y_val)

## training, tuning and evaluation
random hyperparameters search, without kfold

In [55]:
X_train,X_val,y_train,y_val=train_test_split(df_train.values,df_target.values,test_size=0.15,random_state=1, shuffle=True)
X_test=df_test.values

In [56]:
#lgbm parameters values for random search
param_grid = dict(
         objective =  ['binary'],
         learning_rate = np.logspace(-3, -1, num=50, base=10.0),
         feature_fraction = np.logspace(-2, -1, num=50, base=10.0),
         num_leaves = np.arange(10,30,2),
         min_data_in_leaf = np.arange(30,150,50),
         bagging_fraction = np.arange(0.3,0.95,0.01),
         bagging_freq = np.arange(3, 30, 5),
         max_depth = [-1],
         boosting_type = ['gbdt'],
         metric = ['auc'],
         min_sum_hessian_in_leaf = np.logspace(-4, 2, num=50, base=10.0),
         n_jobs = [-1],
         num_round = [2500]
)

In [57]:
def random_search(param_grid, X_train, X_val, y_train, y_val, iterations):
    train_set = lgb.Dataset(X_train, label=y_train)
    val_set = lgb.Dataset(X_val, label=y_val)
    param_list=list(param_grid.keys())
    metrics_list=['ROC_train','ROC_val','ROC_diff']
    logging_list=param_list+metrics_list
    results=[]
    try:
        for i in range(iterations):
            print(f'iteration {i+1} of {iterations}')
            # randomly select parameters
            param = dict()
            for key in param_grid:
                param[key] = np.random.choice(param_grid[key])
            print(f'selected params:{param}')
            # train the model
            clf = lgb.train(param, train_set, valid_sets=[train_set,val_set], verbose_eval=500,early_stopping_rounds = 400)
            # calculate & log statistics
            y_train_proba=clf.predict(X_train)
            y_val_proba=clf.predict(X_val)
            param['ROC_train']=roc_auc_score(y_train,y_train_proba)
            param['ROC_val']=roc_auc_score(y_val,y_val_proba)
            param['ROC_diff']=param['ROC_train']-param['ROC_val']
            logging_list
            # log results
            result_line=[]
            # log parameters
            for key in logging_list:
                result_line.append(param[key])
            results.append(result_line)
    except(KeyboardInterrupt):
        pass
    # save results to file
    result_df=pd.DataFrame(results, columns=logging_list)
    result_df.to_csv('hp_search.csv', index=False)

        

        
        

In [None]:
random_search(param_grid, X_train, X_val, y_train, y_val, 100)

iteration 1 of 100
selected params:{'objective': 'binary', 'learning_rate': 0.02442053094548651, 'feature_fraction': 0.0517947467923121, 'num_leaves': 12, 'min_data_in_leaf': 30, 'bagging_fraction': 0.49000000000000016, 'bagging_freq': 23, 'max_depth': -1, 'boosting_type': 'gbdt', 'metric': 'auc', 'min_sum_hessian_in_leaf': 56.89866029018293, 'n_jobs': -1, 'num_round': 2500}
Training until validation scores don't improve for 400 rounds.
[500]	training's auc: 0.898337	valid_1's auc: 0.881642
[1000]	training's auc: 0.911384	valid_1's auc: 0.891246
[1500]	training's auc: 0.91977	valid_1's auc: 0.896037
[2000]	training's auc: 0.92548	valid_1's auc: 0.898719
[2500]	training's auc: 0.930083	valid_1's auc: 0.899766
Did not meet early stopping. Best iteration is:
[2500]	training's auc: 0.930083	valid_1's auc: 0.899766
iteration 2 of 100
selected params:{'objective': 'binary', 'learning_rate': 0.015264179671752334, 'feature_fraction': 0.09102981779915217, 'num_leaves': 16, 'min_data_in_leaf': 1



[500]	training's auc: 0.890582	valid_1's auc: 0.872807
[1000]	training's auc: 0.907909	valid_1's auc: 0.885749
[1500]	training's auc: 0.917831	valid_1's auc: 0.892538
[2000]	training's auc: 0.924678	valid_1's auc: 0.896492
[2500]	training's auc: 0.929682	valid_1's auc: 0.898608
Did not meet early stopping. Best iteration is:
[2500]	training's auc: 0.929682	valid_1's auc: 0.898608
iteration 3 of 100
selected params:{'objective': 'binary', 'learning_rate': 0.04714866363457392, 'feature_fraction': 0.1, 'num_leaves': 14, 'min_data_in_leaf': 130, 'bagging_fraction': 0.8100000000000005, 'bagging_freq': 23, 'max_depth': -1, 'boosting_type': 'gbdt', 'metric': 'auc', 'min_sum_hessian_in_leaf': 0.0016768329368110084, 'n_jobs': -1, 'num_round': 2500}
Training until validation scores don't improve for 400 rounds.




[500]	training's auc: 0.91583	valid_1's auc: 0.887103
[1000]	training's auc: 0.935821	valid_1's auc: 0.896734
[1500]	training's auc: 0.94793	valid_1's auc: 0.898781
[2000]	training's auc: 0.95776	valid_1's auc: 0.899059
Early stopping, best iteration is:
[1879]	training's auc: 0.955522	valid_1's auc: 0.899271
iteration 4 of 100
selected params:{'objective': 'binary', 'learning_rate': 0.012648552168552958, 'feature_fraction': 0.011513953993264475, 'num_leaves': 28, 'min_data_in_leaf': 30, 'bagging_fraction': 0.5600000000000003, 'bagging_freq': 13, 'max_depth': -1, 'boosting_type': 'gbdt', 'metric': 'auc', 'min_sum_hessian_in_leaf': 0.00013257113655901095, 'n_jobs': -1, 'num_round': 2500}
Training until validation scores don't improve for 400 rounds.




[500]	training's auc: 0.926405	valid_1's auc: 0.873574


## final training
kfold with best found hyperparameters
predicted probabilities are mean of predictions from all folds

In [10]:
param = {'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.01, 'num_rounds': 3000, 'verbose': 1}
        # , 'device': 'gpu', 'gpu_use_dp': False}

# predicted probabilities on test set (competition set)
y_probs = np.zeros(len(df_test.values))
fold_n=3
folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=30)
for i, (train_index, valid_index) in enumerate(folds.split(df_train,df_target)):
    tic=time.time()
    print(f'Calculating fold {i+1}/{fold_n}...')
    train_set = lgb.Dataset(df_train.iloc[train_index], label=df_target.iloc[train_index])
    val_set = lgb.Dataset(df_train.iloc[valid_index], label=df_target.iloc[valid_index])
    clf = lgb.train(param, train_set, valid_sets=[train_set,val_set], verbose_eval=500,early_stopping_rounds = 400)
    y_probs += clf.predict(df_test.values, num_iteration=clf.best_iteration)/fold_n
    toc=time.time()
    print(f'Fold {i+1} calcutated in {toc-tic}.')


Calculating fold 1/3...




Training until validation scores don't improve for 400 rounds.
[500]	training's auc: 0.904735	valid_1's auc: 0.853201
[1000]	training's auc: 0.938294	valid_1's auc: 0.875071
[1500]	training's auc: 0.955639	valid_1's auc: 0.884141
[2000]	training's auc: 0.967035	valid_1's auc: 0.889215
[2500]	training's auc: 0.975449	valid_1's auc: 0.891839
[3000]	training's auc: 0.98191	valid_1's auc: 0.893351
Did not meet early stopping. Best iteration is:
[3000]	training's auc: 0.98191	valid_1's auc: 0.893351
Fold 1 calcutated in 663.8956127166748.
Calculating fold 2/3...
Training until validation scores don't improve for 400 rounds.
[500]	training's auc: 0.90724	valid_1's auc: 0.848393
[1000]	training's auc: 0.940216	valid_1's auc: 0.87054
[1500]	training's auc: 0.957037	valid_1's auc: 0.879951
[2000]	training's auc: 0.968067	valid_1's auc: 0.885308
[2500]	training's auc: 0.976115	valid_1's auc: 0.888255
[3000]	training's auc: 0.982512	valid_1's auc: 0.889822
Did not meet early stopping. Best iterat

In [None]:
#y_val_probs=clf.predict(X_val)
#y_val_preds=np.where(y_val_probs>0.5,1,0)

In [None]:
#roc_auc_score(y_val, y_val_probs)

In [None]:
#y_probs=clf.predict(X_test)

In [11]:
y_probs


array([0.08730093, 0.2432966 , 0.20597767, ..., 0.00640229, 0.07279869,
       0.05458183])

## submision

In [12]:
submission_df = pd.DataFrame({"ID_code":test_dataset["ID_code"].values})
submission_df["target"] = y_probs
submission_df.to_csv("submission.csv", index=False)