# Kaggle Santander competition

In [9]:
# general & data analysis imports
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE

import lightgbm as lgb
import time





## data load

In [2]:
# Read the competition's train and test tables (in that order) from the working directory.
train_dataset, test_dataset = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

## data analysis

In [None]:
# Print column dtypes, non-null counts and memory usage of the training frame.
train_dataset.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the training columns.
train_dataset.describe()

In [None]:
# Column labels of the training frame.
train_dataset.columns

In [None]:
# Preview the first rows of the training frame (DataFrame.head default of five rows).
train_dataset.head()

In [None]:
# Total number of missing cells in the training frame: per-column null counts, summed again.
train_dataset.isnull().sum().sum()

## data preprocessing

In [3]:
# Separate the label from the training frame and strip the row identifier from both
# frames, so the training features have the same structure as the test features.
df_target = train_dataset.loc[:, 'target'].copy()
df_train = train_dataset.drop(columns=['ID_code', 'target'])
df_test = test_dataset.drop(columns='ID_code')

In [4]:
# Hold-out split left disabled; the training cell cross-validates on df_train with StratifiedKFold instead.
#X_train,X_val,y_train,y_val=train_test_split(df_train.values,df_target.values,test_size=0.15,random_state=1, shuffle=True)
# Test features as a plain NumPy array; this is what clf.predict() receives in the training cell.
X_test=df_test.values

In [5]:
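# SMOTE oversampling is disabled. NOTE: if it is re-enabled, resample the training split only;
# oversampling the validation split (last line below) would distort the validation metric.
#sm = SMOTE(random_state=1)
#X_train, y_train = sm.fit_resample(X_train, y_train)
#X_val, y_val = sm.fit_resample(X_val, y_val)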

In [None]:
# Shapes of the objects that actually exist at this point in the notebook.
# X_train / y_train / X_val / y_val are never created (no hold-out split is executed),
# so printing them raised NameError on a fresh kernel.
print(df_train.shape)
print(df_target.shape)
print(X_test.shape)

## training and evaluation

In [16]:
# LightGBM settings: binary objective, AUC as the evaluation metric.
# The boosting-round budget is passed to lgb.train() explicitly via num_boost_round instead
# of through the 'num_rounds' alias inside the params dict.
# Optional GPU settings that can be added to the dict: 'device': 'gpu', 'gpu_use_dp': False
param = {'objective': 'binary', 'metric': 'auc', 'learning_rate': 0.01, 'verbose': 1}

NUM_ROUNDS = 2000             # maximum number of boosting rounds per fold
EARLY_STOPPING_ROUNDS = 400   # stop when the validation AUC has not improved for this many rounds
LOG_PERIOD = 200              # print the evaluation metrics every LOG_PERIOD rounds

# Newer LightGBM releases no longer accept verbose_eval= / early_stopping_rounds= in lgb.train();
# the equivalent callbacks are used instead. log_evaluation is the current name of the logging
# callback, print_evaluation the older one -- use whichever this installation provides.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# predicted probabilities on test set (competition set), averaged over the folds
y_probs = np.zeros(len(X_test))
fold_n=2
folds = StratifiedKFold(n_splits=fold_n, shuffle=True, random_state=30)
for i, (train_index, valid_index) in enumerate(folds.split(df_train,df_target)):
    tic=time.time()
    print(f'Calculating fold {i+1}/{fold_n}...')
    train_set = lgb.Dataset(df_train.iloc[train_index], label=df_target.iloc[train_index])
    # reference=train_set keeps the validation data aligned with the training Dataset
    val_set = lgb.Dataset(df_train.iloc[valid_index], label=df_target.iloc[valid_index], reference=train_set)
    clf = lgb.train(
        param,
        train_set,
        num_boost_round=NUM_ROUNDS,
        valid_sets=[train_set, val_set],
        # callbacks keep per-run state, so fresh instances are created for every fold
        callbacks=[lgb.early_stopping(EARLY_STOPPING_ROUNDS), log_evaluation(LOG_PERIOD)],
    )
    # each fold contributes 1/fold_n of the final prediction, taken at its best iteration
    y_probs += clf.predict(X_test, num_iteration=clf.best_iteration)/fold_n
    toc=time.time()
    print(f'Fold {i+1} calculated in {toc-tic}.')


Calculating fold 1/2...




Training until validation scores don't improve for 400 rounds.
[200]	training's auc: 0.863846	valid_1's auc: 0.811589
[400]	training's auc: 0.905308	valid_1's auc: 0.843182
[600]	training's auc: 0.926754	valid_1's auc: 0.858061
[800]	training's auc: 0.940904	valid_1's auc: 0.867434
[1000]	training's auc: 0.951521	valid_1's auc: 0.873778
[1200]	training's auc: 0.959486	valid_1's auc: 0.878296
[1400]	training's auc: 0.965913	valid_1's auc: 0.881625
[1600]	training's auc: 0.971285	valid_1's auc: 0.884177
[1800]	training's auc: 0.975624	valid_1's auc: 0.886154
[2000]	training's auc: 0.979317	valid_1's auc: 0.887588
Did not meet early stopping. Best iteration is:
[2000]	training's auc: 0.979317	valid_1's auc: 0.887588
Fold 1 calcutated in 158.90864825248718.
Calculating fold 2/2...
Training until validation scores don't improve for 400 rounds.
[200]	training's auc: 0.864172	valid_1's auc: 0.810007
[400]	training's auc: 0.906164	valid_1's auc: 0.840448
[600]	training's auc: 0.928207	valid_1'

In [None]:
#y_val_probs=clf.predict(X_val)
#y_val_preds=np.where(y_val_probs>0.5,1,0)

In [None]:
#roc_auc_score(y_val, y_val_probs)

In [None]:
#y_probs=clf.predict(X_test)

In [None]:
# Display the fold-averaged test-set probabilities accumulated by the training cell.
y_probs


## submission

In [None]:
# Build the two-column submission table (ID_code, target) and write it out without the index.
id_codes = test_dataset["ID_code"].values
submission_df = pd.DataFrame({"ID_code": id_codes}).assign(target=y_probs)
submission_df.to_csv("submission.csv", index=False)