In [8]:
import pandas as pd
import numpy as np
import re
import gc
import xgboost as xgb
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, 
                              GradientBoostingClassifier, ExtraTreesClassifier)
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv('/nfsroot/data/home/2359B48/Santander Competition/datasets/train.csv')
test = pd.read_csv('/nfsroot/data/home/2359B48/Santander Competition/datasets/test.csv')

In [3]:
print(train.shape)
print(test.shape)
print(train.target.value_counts())
train.head(2)

(200000, 202)
(200000, 201)
0    179902
1     20098
Name: target, dtype: int64


Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518


In [4]:
X = train.iloc[:, 2:]
y = train[['target']]

In [5]:
folds = KFold(n_splits=5, shuffle=True, random_state=123)
oof_preds = np.zeros(train.shape[0])
sub_preds = np.zeros(test.shape[0])

In [9]:
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
    val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
    
    clf = LGBMClassifier(
        n_estimators=100,
        learning_rate=0.05,
        num_leaves=123,
        colsample_bytree=.8,
        subsample=.9,
        max_depth=15,
        reg_alpha=.1,
        reg_lambda=.1,
        min_split_gain=.01,
        min_child_weight=2,
        is_unbalance = True)
    
    clf.fit(trn_x, trn_y, eval_set= [(trn_x, trn_y), (val_x, val_y)], 
                             eval_metric='auc', verbose=-1, early_stopping_rounds=70)
    
    oof_preds[val_idx] = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(test.iloc[:, 1:], num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits
    
    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(val_y, oof_preds[val_idx])))
    del clf, trn_x, trn_y, val_x, val_y
    gc.collect()
    

Training until validation scores don't improve for 150 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.969102	training's binary_logloss: 0.38238	valid_1's auc: 0.859548	valid_1's binary_logloss: 0.425146
Fold  1 AUC : 0.859548
Training until validation scores don't improve for 150 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.967839	training's binary_logloss: 0.384522	valid_1's auc: 0.862727	valid_1's binary_logloss: 0.42763
Fold  2 AUC : 0.862727
Training until validation scores don't improve for 150 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.968742	training's binary_logloss: 0.381995	valid_1's auc: 0.856958	valid_1's binary_logloss: 0.427792
Fold  3 AUC : 0.856958
Training until validation scores don't improve for 150 rounds.
Did not meet early stopping. Best iteration is:
[100]	training's auc: 0.968353	training's binary_logloss: 0.383589	valid_1's auc: 0.863637	valid_1's binary_log

In [12]:
print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))   

test['target'] = sub_preds

test[['ID_code', 'target']].to_csv('/nfsroot/data/home/2359B48/Santander Competition/submissions/submission_mar_15th.csv', index=False, float_format='%.8f')

Full AUC score 0.859610


In [None]:
param = {'num_leaves': 50,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 5,
         'learning_rate': 0.006,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 27,
         "metric": 'auc',
         "verbosity": -1,
        'is_unbalance' : True}