## This notebook is the modeling part of the Santander Customer Transaction Prediction competition

## Tried:
 - Random seed (did not help)
 - Variable bins
 - Narrowing possibilities
 - Adding the count in test for each instance
 - Resampling
 - Hyper-parameter tuning

## Package Load

In [8]:
import numpy as np # linear algebra
import pandas as pd 
# Any results you write to the current directory are saved as output.
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb
import xgboost as xgb
from tqdm import tqdm_notebook
from scipy.signal import savgol_filter
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
from sklearn.cluster import KMeans
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

plt.style.use('seaborn')
sns.set(font_scale=1)

## Data Load

In [9]:
# Load the data and set the random seed.
# Only Python's built-in `random` module is seeded here; the same value is reused as
# `random_state` by the CV splitters further down.
random_state = 42
random.seed(random_state)
# NOTE(review): unpickling can execute arbitrary code; only load pickle files you produced yourself.
df_train = pd.read_pickle('./train.pkl')
df_test = pd.read_pickle('./test.pkl')
# The target is kept as a pandas Series (the `.values` conversion is commented out).
y = df_train['target']#.values

In [10]:
# Feature columns: every column of the training frame except the row id and the label.
feature = list(filter(lambda name: name not in ('ID_code', 'target'), df_train.columns))

In [11]:
# Split the test set into "real" rows (the public + private leaderboard indices loaded
# from disk) and the remaining "fake" rows.
public = np.load('public_LB.npy')
private = np.load('private_LB.npy')
real_idx = np.hstack([public,private])
real_test = df_test.iloc[real_idx,:]
# Complement of real_idx over all test rows. np.setdiff1d returns a sorted array, so the
# row order is deterministic (iterating a Python set has no guaranteed order), and the
# row count is taken from df_test instead of a hard-coded 200000.
fake_idx = np.setdiff1d(np.arange(len(df_test)), real_idx)
fake_test = df_test.iloc[fake_idx,:]

## Get Features

### Raw Features

In [12]:
# Raw feature columns for the training rows, the real test rows and the fake test rows.
train_X_raw = df_train[feature]
test_X_raw = real_test[feature]
fake_X_raw = fake_test[feature]

### Noise Features

In [13]:
# Precomputed "noise" feature arrays loaded from disk (they are not built in this notebook).
train_X_noise = np.load('train_0408_noise_raw.npy')
test_X_noise = np.load('test_0408_noise_raw.npy')
# NOTE(review): the fake-row features come from a file named 'train_fake_raw.npy' — confirm this is the intended file.
fake_X_noise = np.load('train_fake_raw.npy')

In [14]:
# Stack train and test noise features row-wise so they can be processed together by
# denoise()/smooth(); those calls are currently commented out further down.
noise_merge = np.vstack([train_X_noise, test_X_noise])

In [15]:
# x = noise_merge[:,1].copy()
# n_unique = len(np.unique(x, return_counts=True)[1])
# # cluster = int(n_unique*2/3)
# fake_idx = np.where(np.isnan(x))[0]
# mask = np.ones(x.shape[0], dtype=bool)
# mask[fake_idx] = False
# clean_data = x[mask]

I tried different ways to denoise and smooth the data, but they took too much time, so they are not used in the final model. The idea is to denoise with a clustering method (KMeans) and then smooth the values with a polynomial (Savitzky-Golay) filter.

In [16]:
def denoise(df):
    """Replace each column's non-NaN values with 1-D KMeans cluster labels, in place.

    For every column the non-NaN entries are clustered with
    ``int(n_unique * 2 / 3)`` clusters (at least one), where ``n_unique`` is
    the number of distinct non-NaN values. Each clustered entry is overwritten
    with the integer label of its cluster; NaN entries are left untouched.

    Parameters
    ----------
    df : numpy.ndarray
        2-D numeric array (rows x features). It is modified in place.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in.
    """
    for feature in tqdm_notebook(range(df.shape[1])):
        x = df[:,feature].copy()
        mask = ~np.isnan(x)
        clean_data = x[mask]
        # Nothing to cluster in an all-NaN column.
        if clean_data.size == 0:
            continue
        # Count distinct values on the NaN-free data only: NaNs are not clustered, and
        # depending on the NumPy version np.unique may report every NaN as its own value,
        # which would inflate the cluster count.
        n_unique = np.unique(clean_data).size
        # KMeans needs at least one cluster (int(1*2/3) would be 0 for a constant column).
        cluster = max(int(n_unique*2/3), 1)
        # n_jobs is intentionally not passed: the argument was deprecated and then removed
        # from scikit-learn's KMeans.
        kmeans = KMeans(n_clusters=cluster, random_state=0, max_iter=100).fit(clean_data.reshape(-1, 1))
        # NOTE(review): cluster labels are arbitrary ids, not ordered by value — confirm
        # this is what the downstream model is meant to receive.
        x[mask] = kmeans.labels_
        df[:,feature] = x
    return df
    

In [17]:
def smooth(df):
    """Apply a Savitzky-Golay filter to the non-NaN values of every column, in place.

    For each column the non-NaN entries are filtered, in row order, with a
    window of 5 points and a cubic polynomial, then written back to their
    original positions. NaN entries are left untouched, and columns with fewer
    non-NaN values than the window length are left unchanged.

    Parameters
    ----------
    df : numpy.ndarray
        2-D numeric array (rows x features). It is modified in place.

    Returns
    -------
    numpy.ndarray
        The same array object that was passed in.
    """
    window_length = 5
    for feature in tqdm_notebook(range(df.shape[1])):
        x = df[:,feature].copy()
        mask = ~np.isnan(x)
        clean_data = x[mask]
        # savgol_filter (default mode 'interp') rejects inputs shorter than the window,
        # so such columns are skipped instead of raising.
        if clean_data.size < window_length:
            continue
        x[mask] = savgol_filter(clean_data,window_length=window_length,polyorder=3)
        df[:,feature] = x
    return df
    

In [None]:
# noise_merge = denoise(noise_merge)
# smooth_merge = smooth(noise_merge)

In [75]:
# train_X_noise = noise_merge[:200000]
# test_X_noise = noise_merge[200000:]

In [94]:
# train_X_noise = smooth_merge[:200000]
# test_X_noise = smooth_merge[200000:]

### Variable Level Features

In [18]:
# Precomputed NaN counts and variable-level statistics loaded from disk
# (they are not built in this notebook).
nan_cnt_train = np.load('nan_cnt_train.npy')
nan_cnt_test = np.load('nan_cnt_test.npy')
var_sta_train = np.load('var_sta_train.npy')
var_sta_test = np.load('var_sta_test.npy')
# Only referenced by the commented-out fake-data feature cell below.
var_sta_fake = np.load('var_sta_fake.npy')

### Combine Features

In [20]:
# Combine the feature groups column-wise into the final train / test matrices.
# Column order: raw features, noise features, variable-level statistics, then the
# NaN-count vector (expanded to a single column so it can be stacked).
train_X = np.hstack([train_X_raw,
                     train_X_noise,
#                      train_X_sta,
                     var_sta_train,
                     np.expand_dims(nan_cnt_train,1)])

test_X = np.hstack([test_X_raw,
                    test_X_noise,
#                     test_X_sta,
                    var_sta_test,
                    np.expand_dims(nan_cnt_test,1)])

In [205]:
# nan_cnt_fake = np.isnan(fake_X_noise).sum(axis=1)

In [206]:
# # Get features for fake data
# fake_X = np.hstack([fake_X_raw,
#                     fake_X_noise,
# #                     test_X_sta,
#                     var_sta_fake,
#                     np.expand_dims(nan_cnt_fake,1)])

In [145]:
# Coarse binning as a denoising attempt: round feature columns 200..399 to 3 decimals.
# NOTE(review): columns 200-399 are presumably the noise features stacked right after
# the raw columns — confirm against the column layout of train_X.
# The same rounding is applied to test_X so the model is trained and evaluated on
# identically binned features (previously only train_X was rounded).
train_X[:, 200:400] = np.round(train_X[:, 200:400], 3)
test_X[:, 200:400] = np.round(test_X[:, 200:400], 3)

## Split train and valid data

In [None]:
# 5-fold stratified split; the first fold is used for a quick check.
# The split is not shuffled, so no random_state is passed: with shuffle=False it has no
# effect, and recent scikit-learn versions raise a ValueError when it is supplied.
num_folds = 5
folds = StratifiedKFold(n_splits=num_folds, shuffle=False)

In [149]:
# Take the (train, validation) index arrays of the first fold only; pulling a single
# item from the generator avoids computing all folds twice.
train_idx, val_idx = next(folds.split(train_X, y))

In [150]:
# Materialise the first-fold train / validation features and labels.
# NOTE(review): `y` is a pandas Series indexed here with positional fold indices; this
# assumes df_train has a default 0..n-1 index — confirm.
X_trn, y_train = train_X[train_idx], y[train_idx]
X_valid, y_valid = train_X[val_idx], y[val_idx]

In [151]:
# Wrap the first-fold train / validation arrays as LightGBM datasets.
trn_data = lgb.Dataset(X_trn, label=y_train)
val_data = lgb.Dataset(X_valid, label=y_valid)

Resample if needed

In [152]:
# from imblearn.over_sampling import RandomOverSampler
# ros = RandomOverSampler(random_state=0)
# X_resampled, y_resampled = ros.fit_resample(X_trn, y_train)
# trn_data = lgb.Dataset(X_resampled, label=y_resampled)
# val_data = lgb.Dataset(X_valid, label=y_valid)

## Model Define

Here I use LightGBM because it is fast. I tried different data-preprocessing methods, different groups of features and different hyper-parameters.

In [157]:
# LightGBM training parameters used by the first-fold check and the 5-fold CV cells.
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.35,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.3,
    'learning_rate': 0.0083,
    'max_depth': 8,
    'metric': 'auc',
    'max_bin': 165,
    'min_data_in_leaf': 40,
    'min_child_weight': 10,
    # NOTE(review): the LightGBM parameter docs list `subsample` as an alias of
    # `bagging_fraction`, which is also set above with a different value — confirm
    # which of the two actually takes effect.
    'subsample':0.7,
#     'subsample':0.4,
    'num_leaves': 2,
    # NOTE(review): likewise `colsample_bytree` is documented as an alias of
    # `feature_fraction` (0.3 above) — confirm which value is used.
    'colsample_bytree': 0.03,
    'num_threads': 36,
    'tree_learner': 'serial',
#     'lambda_l1' : 1.7,
#     'lambda_l2' : 5,
    'objective': 'binary', 
    'verbosity': -1
}

In [156]:
# scale rd4 +pct975
# Quick check on the first fold: train for up to 100000 rounds, log every 2500 rounds
# and stop once the validation metric has not improved for 2000 rounds.
# NOTE(review): the recorded output below prints a different param dict (e.g.
# bagging_fraction 0.5, learning_rate 0.01, max_bin 180) than the one currently
# defined in the notebook, so it came from an earlier version of `param`.
print(param)
clf = lgb.train(param, trn_data, 100000, valid_sets = [trn_data, val_data], verbose_eval=2500, early_stopping_rounds = 2000)

{'bagging_freq': 5, 'bagging_fraction': 0.5, 'boost_from_average': 'false', 'boost': 'gbdt', 'feature_fraction': 0.3, 'learning_rate': 0.01, 'max_depth': 8, 'metric': 'auc', 'max_bin': 180, 'min_data_in_leaf': 40, 'min_child_weight': 10, 'subsample': 0.7, 'num_leaves': 2, 'colsample_bytree': 0.03, 'num_threads': 36, 'tree_learner': 'serial', 'objective': 'binary', 'verbosity': -1}
Training until validation scores don't improve for 2000 rounds.
[2500]	training's auc: 0.862672	valid_1's auc: 0.851922
[5000]	training's auc: 0.890861	valid_1's auc: 0.878785
[7500]	training's auc: 0.905086	valid_1's auc: 0.892467
[10000]	training's auc: 0.913406	valid_1's auc: 0.900568
[12500]	training's auc: 0.918728	valid_1's auc: 0.905626
[15000]	training's auc: 0.922527	valid_1's auc: 0.909293
[17500]	training's auc: 0.925369	valid_1's auc: 0.912002
[20000]	training's auc: 0.927587	valid_1's auc: 0.914064
[22500]	training's auc: 0.929353	valid_1's auc: 0.91575
[25000]	training's auc: 0.930646	valid_1's 

In [23]:
# scale rd4 +pct975
# Same first-fold check as the previous training cell, but with an early-stopping
# patience of 4000 rounds.
# NOTE(review): the recorded output below prints feature_fraction 0.4, which differs
# from the param dict currently defined in the notebook (0.3).
print(param)
clf = lgb.train(param, trn_data, 100000, valid_sets = [trn_data, val_data], verbose_eval=2500, early_stopping_rounds = 4000)

{'bagging_freq': 5, 'bagging_fraction': 0.35, 'boost_from_average': 'false', 'boost': 'gbdt', 'feature_fraction': 0.4, 'learning_rate': 0.0083, 'max_depth': 8, 'metric': 'auc', 'max_bin': 165, 'min_data_in_leaf': 40, 'min_child_weight': 10, 'subsample': 0.7, 'num_leaves': 2, 'colsample_bytree': 0.03, 'num_threads': 36, 'tree_learner': 'serial', 'objective': 'binary', 'verbosity': -1}
Training until validation scores don't improve for 4000 rounds.
[2500]	training's auc: 0.856842	valid_1's auc: 0.846069
[5000]	training's auc: 0.886017	valid_1's auc: 0.874494
[7500]	training's auc: 0.90104	valid_1's auc: 0.889096
[10000]	training's auc: 0.909948	valid_1's auc: 0.89805
[12500]	training's auc: 0.915796	valid_1's auc: 0.903417
[15000]	training's auc: 0.920141	valid_1's auc: 0.907553
[17500]	training's auc: 0.923099	valid_1's auc: 0.910466
[20000]	training's auc: 0.925569	valid_1's auc: 0.912835
[22500]	training's auc: 0.927541	valid_1's auc: 0.914735
[25000]	training's auc: 0.929016	valid_1'

In [47]:
# scale rd4 +pct975
# Another run of the first-fold check (early-stopping patience of 4000 rounds).
# NOTE(review): the recorded output below prints max_depth -1, max_bin 180,
# feature_fraction 0.4 and num_threads -1, which differ from the param dict currently
# defined in the notebook.
print(param)
clf = lgb.train(param, trn_data, 100000, valid_sets = [trn_data, val_data], verbose_eval=2500, early_stopping_rounds = 4000)

{'bagging_freq': 5, 'bagging_fraction': 0.35, 'boost_from_average': 'false', 'boost': 'gbdt', 'feature_fraction': 0.4, 'learning_rate': 0.0083, 'max_depth': -1, 'metric': 'auc', 'max_bin': 180, 'min_data_in_leaf': 40, 'min_child_weight': 10, 'subsample': 0.7, 'num_leaves': 2, 'colsample_bytree': 0.03, 'num_threads': -1, 'tree_learner': 'serial', 'objective': 'binary', 'verbosity': -1}
Training until validation scores don't improve for 4000 rounds.
[2500]	training's auc: 0.856812	valid_1's auc: 0.846718
[5000]	training's auc: 0.88632	valid_1's auc: 0.874853
[7500]	training's auc: 0.901324	valid_1's auc: 0.889252
[10000]	training's auc: 0.909998	valid_1's auc: 0.89799
[12500]	training's auc: 0.915968	valid_1's auc: 0.9035
[15000]	training's auc: 0.920292	valid_1's auc: 0.907717
[17500]	training's auc: 0.923285	valid_1's auc: 0.910555
[20000]	training's auc: 0.925723	valid_1's auc: 0.912847
[22500]	training's auc: 0.927763	valid_1's auc: 0.914834
[25000]	training's auc: 0.929208	valid_1's

### XGBoost for comparison

In [None]:
# Parameter dict for XGBoost.
# NOTE(review): this dict is not referenced anywhere else in the visible notebook; the
# XGBClassifier below is configured through its own keyword arguments.
xgb_param = {'tree_method': 'hist',
             'objective': 'binary:logistic',
             'eval_metric': 'auc',
             'learning_rate': 0.01,
#              'max_depth': 5,
             'colsample_bytree': 0.03,
             'subsample': 0.8,
             'min_child_weight': 20,
             'gamma': 1.2,
#              'silent': 1,
             'n_jobs':16,
             'n_estimators': 20,
            }

In [77]:
# Wrap the first-fold arrays as XGBoost DMatrix objects.
# NOTE(review): dtrain / dval are not used by any later visible cell.
dtrain = xgb.DMatrix(X_trn, label=y_train)
dval = xgb.DMatrix(X_valid, label=y_valid)

In [91]:
# Rebuilds the same DMatrix objects as the previous cell, passing the labels positionally.
# NOTE(review): duplicate of the cell above; dtrain / dval are not used afterwards.
dtrain = xgb.DMatrix(X_trn, y_train)
dval = xgb.DMatrix(X_valid, y_valid)

In [139]:
# XGBoost classifier for the first-fold comparison: depth-2 trees with the histogram
# tree method; n_estimators is only an upper bound because the fit cell below uses
# early stopping.
model = xgb.XGBClassifier(max_depth=2,
                          learning_rate=0.01,
                          n_estimators=100000,
                          min_child_weight=40,
                          subsample=0.5,
                          colsample_bytree=0.04,
                          tree_method='hist',
                          colsample_bylevel=0.5,
#                           min_child_weight=30,
                          scale_pos_weight=2,
                          gamma=9,
                          max_bin=150,
                          n_jobs=15)

In [140]:
# Fit on the first fold, evaluating AUC on both the training and validation sets and
# logging every 1000 rounds. The recorded output shows that early stopping (patience of
# 1000 rounds) tracks the last eval set, i.e. the validation fold.
xgb_clf=model.fit(X_trn, y_train,
            eval_set=[(X_trn, y_train), (X_valid, y_valid)],
            eval_metric='auc',
            early_stopping_rounds=1000,
            verbose=1000)

[0]	validation_0-auc:0.526323	validation_1-auc:0.528088
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 1000 rounds.
[1000]	validation_0-auc:0.87009	validation_1-auc:0.857147
[2000]	validation_0-auc:0.8873	validation_1-auc:0.872382
[3000]	validation_0-auc:0.900403	validation_1-auc:0.88449
[4000]	validation_0-auc:0.909287	validation_1-auc:0.892416
[5000]	validation_0-auc:0.915592	validation_1-auc:0.89847
[6000]	validation_0-auc:0.920553	validation_1-auc:0.903009
[7000]	validation_0-auc:0.924206	validation_1-auc:0.906186
[8000]	validation_0-auc:0.927245	validation_1-auc:0.90883
[9000]	validation_0-auc:0.929781	validation_1-auc:0.910995
[10000]	validation_0-auc:0.931731	validation_1-auc:0.912657
[11000]	validation_0-auc:0.933449	validation_1-auc:0.914049
[12000]	validation_0-auc:0.934908	validation_1-auc:0.915139
[13000]	validation_0-auc:0.936175	validation_1-auc:0.916092
[14000]	validation_0

In [141]:
xgb_clf

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
       colsample_bytree=0.04, gamma=9, learning_rate=0.01, max_bin=150,
       max_delta_step=0, max_depth=2, min_child_weight=40, missing=None,
       n_estimators=100000, n_jobs=15, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=2, seed=None, silent=True,
       subsample=0.5, tree_method='hist')

### Try blending to see if the performance improves

In [None]:
# Validation-fold scores from both models; for XGBoost, column 1 of predict_proba is used.
lgb_predict = clf.predict(X_valid)
xgb_predict = xgb_clf.predict_proba(X_valid)[:,1]

In [144]:
# Validation AUC of the LightGBM model alone.
roc_auc_score(y_true=y_valid, y_score=lgb_predict)

0.9201445940028325

In [148]:
# Validation AUC of the XGBoost model alone.
roc_auc_score(y_true=y_valid, y_score=xgb_predict)

0.9200115808771933

In [147]:
# Validation AUC of the equal-weight average of the two models' scores.
roc_auc_score(y_true=y_valid, y_score=(lgb_predict+ xgb_predict)/2)

0.9202838258611061

On the validation set there is a small boost after blending, so I tried blending the two models. For the final submission, however, my best result came from a single LightGBM model trained on the full dataset.

### Five Fold CV Prediction

In [158]:
# Five-fold CV with LightGBM. Each fold trains with early stopping on its validation
# part, records the best iteration and adds its test prediction (divided by the number
# of folds) to the running average.
# Note: the loop rebinds X_valid, y_valid, trn_data, val_data and clf, which were also
# used by the first-fold check cells above.
best_iteration=[]
predictions = np.zeros(test_X.shape[0])
# The split is not shuffled, so no random_state is passed: with shuffle=False it has no
# effect, and recent scikit-learn versions raise a ValueError when it is supplied.
folds = StratifiedKFold(n_splits=num_folds, shuffle=False)
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_X, y)):
    
    X_tr, y_tr = train_X[trn_idx], y[trn_idx]
    X_valid, y_valid = train_X[val_idx], y[val_idx]
    
    print("Fold idx:{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_valid, label=y_valid)
    
    clf = lgb.train(param, trn_data, 100000, valid_sets = [trn_data, val_data], verbose_eval=2500, early_stopping_rounds = 4000)
    best_iteration.append(clf.best_iteration)
    predictions += clf.predict(test_X, num_iteration=clf.best_iteration) / folds.n_splits
    

Fold idx:1
Training until validation scores don't improve for 4000 rounds.
[2500]	training's auc: 0.856295	valid_1's auc: 0.845521
[5000]	training's auc: 0.885992	valid_1's auc: 0.8746
[7500]	training's auc: 0.900693	valid_1's auc: 0.888716
[10000]	training's auc: 0.909713	valid_1's auc: 0.897667
[12500]	training's auc: 0.915754	valid_1's auc: 0.903398
[15000]	training's auc: 0.919988	valid_1's auc: 0.907517
[17500]	training's auc: 0.922953	valid_1's auc: 0.910469
[20000]	training's auc: 0.925452	valid_1's auc: 0.912786
[22500]	training's auc: 0.927405	valid_1's auc: 0.914635
[25000]	training's auc: 0.928878	valid_1's auc: 0.915967
[27500]	training's auc: 0.930134	valid_1's auc: 0.917156
[30000]	training's auc: 0.931159	valid_1's auc: 0.918049
[32500]	training's auc: 0.932104	valid_1's auc: 0.91892
[35000]	training's auc: 0.932868	valid_1's auc: 0.919327
[37500]	training's auc: 0.933486	valid_1's auc: 0.919787
[40000]	training's auc: 0.934074	valid_1's auc: 0.920062
[42500]	training's 

In [159]:
clf.params

{'bagging_freq': 5,
 'bagging_fraction': 0.35,
 'boost_from_average': 'false',
 'boost': 'gbdt',
 'feature_fraction': 0.3,
 'learning_rate': 0.0083,
 'max_depth': 8,
 'metric': 'auc',
 'max_bin': 165,
 'min_data_in_leaf': 40,
 'min_child_weight': 10,
 'subsample': 0.7,
 'num_leaves': 2,
 'colsample_bytree': 0.03,
 'num_threads': 36,
 'tree_learner': 'serial',
 'objective': 'binary',
 'verbosity': -1}

In [162]:
best_iteration

[49911, 53768, 56218, 49025, 45060]

Save the 5-fold CV LightGBM prediction.

In [160]:
# Persist the averaged CV prediction; it is read back later as 'lgb_prediction.npy'.
np.save('lgb_prediction', predictions)

In [163]:
# Retrain LightGBM on the full training set once per CV fold, using that fold's best
# iteration count and a different bagging / feature-sampling seed each time, and average
# the resulting test predictions.
full_prediction = np.zeros(test_X.shape[0])
train_data = lgb.Dataset(train_X, label=y)
# Hyper-parameters for the full-data models. They are kept under their own name and
# built once, so the global `param` dict used by the CV cells is no longer overwritten
# by running this cell.
full_param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.35,
    'boost_from_average': 'false',
    'boost': 'gbdt',
    'feature_fraction': 0.3,
    'learning_rate': 0.0083,
    'max_depth': 8,
    'metric': 'auc',
    'max_bin': 165,
    'min_data_in_leaf': 40,
    'min_child_weight': 10,
    'subsample': 0.7,
    'num_leaves': 2,
    'colsample_bytree': 0.03,
    'num_threads': 36,
    'tree_learner': 'serial',
    'objective': 'binary',
    'verbosity': -1
}
n_models = len(best_iteration)
# Passing `total` lets the progress bar show real progress (enumerate has no length).
for i,iteration in tqdm_notebook(enumerate(best_iteration), total=n_models):
    # Per-model seeds so the averaged models differ in their row / feature sampling.
    seeded_param = dict(full_param, feature_fraction_seed=i+828, bagging_seed=i+828)
    # No validation set is supplied, so no evaluation logging is requested.
    clf = lgb.train(seeded_param, train_data, iteration)
    # Average over the number of models actually trained.
    full_prediction += clf.predict(test_X) / n_models

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [1]:
# In case the fake rows are also scored, predict them with the LightGBM model as well.
# Build the fake-row feature matrix with the same column layout as train_X / test_X
# (raw features, noise features, variable-level statistics, NaN count); previously
# fake_X only existed in a commented-out cell, so this cell failed on a fresh kernel.
nan_cnt_fake = np.isnan(fake_X_noise).sum(axis=1)
fake_X = np.hstack([fake_X_raw,
                    fake_X_noise,
                    var_sta_fake,
                    np.expand_dims(nan_cnt_fake,1)])
# Take the prediction directly so its length always matches the number of fake rows
# (it used to be accumulated into a buffer sized from test_X).
# NOTE(review): `clf` is whichever LightGBM model was trained last — confirm that a
# single model (rather than the seed average) is intended here.
fake_prediction = clf.predict(fake_X)

NameError: name 'np' is not defined

In [211]:
# Persist the prediction for the fake test rows.
np.save('fake_prediction',fake_prediction)

In [165]:
# Persist the full-data LightGBM prediction; it is read back later as 'lgbm_full_prediction.npy'.
np.save('lgbm_full_prediction',full_prediction)

In [173]:
# Five-fold CV with XGBoost, mirroring the LightGBM CV loop: fit with early stopping on
# each fold, record the best iteration and average the test predictions.
# Note: `predictions` is reused here, replacing the LightGBM CV array of the same name.
best_iteration_xgb=[]
predictions = np.zeros(test_X.shape[0])
# The split is not shuffled, so no random_state is passed: with shuffle=False it has no
# effect, and recent scikit-learn versions raise a ValueError when it is supplied.
folds = StratifiedKFold(n_splits=num_folds, shuffle=False)
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_X, y)):
    
    X_tr, y_tr = train_X[trn_idx], y[trn_idx]
    X_valid, y_valid = train_X[val_idx], y[val_idx]
    
    # n_estimators is only an upper bound; early stopping decides the real tree count.
    model = xgb.XGBClassifier(max_depth=2,
                              learning_rate=0.0083,
                              n_estimators=100000,
                              min_child_weight=40,
                              subsample=0.5,
                              colsample_bytree=0.04,
                              tree_method='hist',
                              colsample_bylevel=0.5,
                              scale_pos_weight=2,
                              gamma=5,
                              max_bin=150,
                              n_jobs=36)
    
    xgb_clf=model.fit(X_tr, y_tr,
            eval_set=[(X_tr, y_tr), (X_valid, y_valid)],
            eval_metric='auc',
            early_stopping_rounds=1000,
            verbose=1000)
    
    # Printed after fitting, so each fold's training log appears above its "Fold idx" line.
    print("Fold idx:{}".format(fold_ + 1))
    best_iteration_xgb.append(xgb_clf.best_iteration)
    predictions += xgb_clf.predict_proba(test_X)[:,1] / folds.n_splits

[0]	validation_0-auc:0.551115	validation_1-auc:0.553355
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 1000 rounds.
[1000]	validation_0-auc:0.867385	validation_1-auc:0.855691
[2000]	validation_0-auc:0.882694	validation_1-auc:0.868559
[3000]	validation_0-auc:0.89422	validation_1-auc:0.879347
[4000]	validation_0-auc:0.903675	validation_1-auc:0.887903
[5000]	validation_0-auc:0.910523	validation_1-auc:0.894233
[6000]	validation_0-auc:0.915842	validation_1-auc:0.89912
[7000]	validation_0-auc:0.919933	validation_1-auc:0.902689
[8000]	validation_0-auc:0.923303	validation_1-auc:0.905637
[9000]	validation_0-auc:0.925967	validation_1-auc:0.908014
[10000]	validation_0-auc:0.928274	validation_1-auc:0.90994
[11000]	validation_0-auc:0.93024	validation_1-auc:0.911549
[12000]	validation_0-auc:0.93198	validation_1-auc:0.912863
[13000]	validation_0-auc:0.933431	validation_1-auc:0.913969
[14000]	validation_

[28000]	validation_0-auc:0.944482	validation_1-auc:0.922331
[29000]	validation_0-auc:0.944997	validation_1-auc:0.922376
[30000]	validation_0-auc:0.945459	validation_1-auc:0.922347
Stopping. Best iteration:
[29159]	validation_0-auc:0.945067	validation_1-auc:0.922393

Fold idx:4
[0]	validation_0-auc:0.543228	validation_1-auc:0.547292
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 1000 rounds.
[1000]	validation_0-auc:0.862746	validation_1-auc:0.862918
[2000]	validation_0-auc:0.879937	validation_1-auc:0.877538
[3000]	validation_0-auc:0.893121	validation_1-auc:0.889152
[4000]	validation_0-auc:0.902447	validation_1-auc:0.89667
[5000]	validation_0-auc:0.909566	validation_1-auc:0.902762
[6000]	validation_0-auc:0.914836	validation_1-auc:0.907155
[7000]	validation_0-auc:0.919031	validation_1-auc:0.910466
[8000]	validation_0-auc:0.922392	validation_1-auc:0.913155
[9000]	validation_0-auc:0.925183	val

In [174]:
# Persist the averaged XGBoost CV prediction; it is read back later as 'xgb_prediction.npy'.
np.save('xgb_prediction', predictions)

In [None]:
# Draft of the full-data XGBoost setup: allocates the prediction buffer and builds the
# classifier, but never fits it.
# NOTE(review): both names are re-assigned by the "XGBoost Full Prediction" cell further
# down (which uses n_estimators=30000), so this cell looks superseded.
xgb_full_prediction = np.zeros(test_X.shape[0])
# train_data = lgb.Dataset(train_X, label=y)
model = xgb.XGBClassifier(max_depth=2,
                      learning_rate=0.0083,
                      n_estimators=100000,
                      min_child_weight=40,
                      subsample=0.5,
                      colsample_bytree=0.04,
                      tree_method='hist',
                      colsample_bylevel=0.5,
#                           min_child_weight=30,
                      scale_pos_weight=3,
                      gamma=9,
                      max_bin=150,
                      n_jobs=15)

In [213]:
# Reload the saved XGBoost CV prediction.
xgb_cv = np.load('xgb_prediction.npy')

In [189]:
# Reload the saved full-data LightGBM prediction.
lgbm_full_prediction = np.load('lgbm_full_prediction.npy')

In [177]:
# Reload the saved LightGBM CV prediction.
lgbm_cv = np.load('lgb_prediction.npy')

## XGBoost Full Prediction

In [None]:
# Train a single XGBoost model on the full training set and predict the real test rows.
# No eval_set / early stopping is used, so exactly n_estimators=30000 trees are built.
# NOTE(review): scale_pos_weight=3 and gamma=9 here differ from the CV loop
# (scale_pos_weight=2, gamma=5) — confirm that the fixed tree count is still appropriate.
xgb_full_prediction = np.zeros(test_X.shape[0])
# train_data = lgb.Dataset(train_X, label=y)
model = xgb.XGBClassifier(max_depth=2,
                      learning_rate=0.0083,
                      n_estimators=30000,
                      min_child_weight=40,
                      subsample=0.5,
                      colsample_bytree=0.04,
                      tree_method='hist',
                      colsample_bylevel=0.5,
#                           min_child_weight=30,
                      scale_pos_weight=3,
                      gamma=9,
                      max_bin=150,
                      n_jobs=15)

xgb_clf=model.fit(train_X, y)
    
# Column 1 of predict_proba is accumulated into the zero-initialised buffer.
xgb_full_prediction += xgb_clf.predict_proba(test_X)[:,1]

In [186]:
# Persist the full-data XGBoost prediction.
np.save('xgb_full_prediction',xgb_full_prediction)

## Blending

Ensemble all predictions and check the correlation.

In [216]:
# One column per model / training scheme, so their pairwise correlation can be inspected.
result = pd.DataFrame({'lgbm_full': lgbm_full_prediction,
                       'lgbm_cv': lgbm_cv,
                       'xgb_cv': xgb_cv,
                       'xgb_full': xgb_full_prediction})

In [215]:
result.corr()

Unnamed: 0,lgbm_cv,lgbm_full,xgb_cv,xgb_full
lgbm_cv,1.0,0.999529,0.985214,0.964282
lgbm_full,0.999529,1.0,0.985024,0.964269
xgb_cv,0.985214,0.985024,1.0,0.994143
xgb_full,0.964282,0.964269,0.994143,1.0


In [166]:
# Reload the public / private leaderboard indices and rebuild real_idx / real_test.
# NOTE(review): this repeats the loading done in the earlier split cell.
public = np.load('public_LB.npy')
private = np.load('private_LB.npy')
real_idx = np.hstack([public,private])
real_test = df_test.iloc[real_idx,:]

In [218]:
# Final blend of the two full-data models: 55% LightGBM, 45% XGBoost.
# The CV predictions (lgbm_cv, xgb_cv) are not part of this blend.
final_prediction = lgbm_full_prediction*0.55 + xgb_full_prediction*0.45

## Submission

In [219]:
# Build the submission file: real test rows receive the blended prediction, fake rows
# receive the LightGBM prediction made for them.
submission = pd.DataFrame({"ID_code": df_test.ID_code.values})
# Start with a float column so assigning probabilities does not change its dtype.
submission["target"] = 0.0
# DataFrame.set_value was deprecated and later removed from pandas; .loc performs the
# same label-based assignment. The frame has a default 0..n-1 index, so the row
# positions stored in fake_idx / real_idx are valid labels.
submission.loc[fake_idx, 'target'] = fake_prediction
submission.loc[real_idx, 'target'] = final_prediction
submission.to_csv("submission_final_full.csv", index=False)

In [222]:
!kaggle competitions submit santander-customer-transaction-prediction -f submission_final_full.csv -m "My submission message"

100%|██████████████████████████████████████| 6.03M/6.03M [00:03<00:00, 1.69MB/s]
Successfully submitted to Santander Customer Transaction Prediction