In [2]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold, RepeatedKFold, GroupKFold, GridSearchCV, train_test_split, TimeSeriesSplit
from sklearn import metrics
from sklearn import linear_model
import gc
import warnings
warnings.filterwarnings("ignore")
import pickle
import sys
# Relative path two levels up from the working directory. It is appended to the
# import search path (presumably so the local BayDS package imported right after
# resolves — confirm) and is reused as the root of the pipeline working folder.
main_path = r'../..'
sys.path.append(main_path)
from BayDS import *
from BayDS.lib.pipeline import *
from BayDS.lib.io import *
import os

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Folder holding lb.yaml plus the per-experiment OOF / prediction CSV files.
main_folder = r'../../Data/sub'
# Folder that receives the stacked submissions, training params and pickled results.
model_folder = r'../../Snapshots/Stacking/3009'

In [4]:
# Load the label-encoded dataset through a single-node pipeline.
# NOTE(review): data_dir is an absolute path on a local drive.
data_dir = 'e:/kaggle/05-LabelEncoded-last'
loader_params = {
    'input_directory': data_dir,
    'file': 'label_encoded_data.pkl',
}

p = Pipeline(working_folder=f'{main_path}/Snapshots/1/catboost_last_dynamics')
p.add_node(LoaderNode, None, 'data', params=loader_params)

# Execute the node, then persist the pipeline.
p.run(verbose=True)
p.save()

---------------------------
0: LoaderNode [2019-10-01 22:23:02]
params:
 {'input_directory': 'e:/kaggle/05-LabelEncoded-last', 'file': 'label_encoded_data.pkl'}


In [5]:
# Keep the DT_M column aside and narrow the loaded frame to the two columns kept here.
loaded_data = p.data['data']
DT_M = loaded_data['DT_M']
df = loaded_data[['isFraud', 'new_card_id']]

In [None]:
import yaml

# Read the experiment list. Each entry is later accessed by the keys
# "oofFile", "predictionFile" and "score".
# The file is opened in a `with` block so the handle is closed deterministically
# (the original left it open).
# NOTE(review): FullLoader is kept as in the original; prefer yaml.safe_load if
# lb.yaml could ever come from an untrusted source.
with open(f'{main_folder}/lb.yaml', 'r') as lb_file:
    lb = yaml.load(lb_file, Loader=yaml.FullLoader)

In [None]:
# Gather the out-of-fold (OOF) and test predictions of every experiment in `lb`
# into two frames, `oofs` and `predictions`, with one column per experiment.

# An OOF file with any other number of rows is skipped.
EXPECTED_OOF_ROWS = 590540

oofs = None
predictions = None
for iexp, experiment in enumerate(lb):
    print(experiment)
    # DataFrame.set_index returns a new frame. The original discarded that
    # result, so the frames kept their positional index and columns from
    # different experiments were combined by row position rather than by
    # TransactionID.
    oof = pd.read_csv(f'{main_folder}/{experiment["oofFile"]}').set_index('TransactionID')
    pred = pd.read_csv(f'{main_folder}/{experiment["predictionFile"]}').set_index('TransactionID')
    if len(oof.index) != EXPECTED_OOF_ROWS:
        print(f"Skipping {experiment}")
        continue
    if oofs is None:
        # The first accepted experiment defines the row index of both frames.
        oofs = pd.DataFrame(index=oof.index)
        predictions = pd.DataFrame(index=pred.index)
    # Column name = position in lb + reported score, identical in both frames.
    column = f"s{iexp}_{experiment['score']}"
    oofs[column] = oof['isFraud']
    predictions[column] = pred['isFraud']

    

In [None]:
# Sanity check: print, per experiment, the mean OOF prediction next to the mean
# test prediction. (The bare `oofs.columns` expression that preceded the loop
# was not the last statement of the cell, so it displayed nothing; removed.)
for col in oofs.columns:
    print(col, oofs[col].mean(), predictions[col].mean())

In [None]:
# Display (rows, number of stacked experiments) of the OOF frame.
oofs.shape

In [None]:
# Display (rows, number of stacked experiments) of the test-prediction frame.
predictions.shape

In [None]:
# Target used to fit and score the stacking models below.
# NOTE(review): absolute path on a local Windows drive — not portable; also,
# only unpickle files from a trusted source.
y = pd.read_pickle(r'e:\Kaggle\data\y.pkl')

In [None]:
# Cross-validation splitter shared by the stacking models below.
# KFold is constructed with only n_splits, so its other settings stay at the
# scikit-learn defaults (no shuffling).
n_fold = 5
folds = KFold(n_splits=n_fold)

In [None]:
from BayDS.lib.training import *

In [None]:
# Score each experiment's OOF column against the target with fast_auc
# (presumably ROC AUC — confirm in BayDS.lib.training) and print it.
for name in oofs.columns:
    score = fast_auc(y, oofs[name])
    print(name, score)

## Logistic Regression Stacking

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

# Second-level model: a plain logistic regression fitted on the first-level
# predictions.
model = LogisticRegression()
train_options = {
    "model_type":'sklearn',
    'model':model,
    'folds': folds,
    "params": {},
    "eval_metric":'auc',
    'averaging': 'usual',
    'splits': n_fold,
    'n_jobs': -1,
    'groups': None
}

# Fit the scaler on the OOF (training) features only, then apply that same
# fitted transformation to the test predictions. The original scaled X but
# passed the raw `predictions` as X_test, so the model was trained and applied
# on differently scaled inputs.
standart_scaler = preprocessing.StandardScaler()
X = pd.DataFrame(standart_scaler.fit_transform(oofs), index=oofs.index, columns=oofs.columns)
X_test = pd.DataFrame(standart_scaler.transform(predictions),
                      index=predictions.index, columns=predictions.columns)
result_dict_logreg = train_model_classification_vb(X=X, X_test=X_test, y=y, **train_options)

In [None]:
# Fill the sample submission with the logistic-regression stack and save it.
sample_submission_path = '../../data/sample_submission.csv'
stacked_submission_path = f'{model_folder}/stacked_0110.csv'

sub = pd.read_csv(sample_submission_path)
sub['isFraud'] = result_dict_logreg['prediction']
sub.to_csv(stacked_submission_path, index=False)

## LightGBM Stacking (bad)

In [None]:


# LightGBM hyper-parameters for the second-level model.
# NOTE(review): with max_depth=1 every tree is a single split, so num_leaves=20
# is presumably never reached — confirm this is intended.
params = {
    'learning_rate': 0.001,
    'num_leaves': 20,
    'max_depth': 1,
    'min_child_weight': 10,
    'lambda_l1':2,
    'lambda_l2':3,
    'min_data_in_leaf' :10,
    'min_sum_hessian_in_leaf' : 0.0001,
    'bagging_fraction' : 0.8,
    'max_bin': 12,
    'feature_fraction' : 0.9,
    'bagging_freq' : 100,
    'min_gain_to_split': 0.1 }

# Options describing this run; only some of them are forwarded to the call below.
train_options = {
    "model_type":'lgb',
    "params": params,
    "eval_metric":'auc',
    'early_stopping_rounds': 500,
    'n_estimators': 5000,
    'averaging': 'usual',
    'use_groups': False,
    'fold_name': folds.__class__.__name__,
    'n_splits': n_fold
}

# Train on the stacked frames. The original passed `oof` (the raw CSV frame left
# over from the last iteration of the loading loop) and `prediction` (a name
# that is never defined); the stacking features are `oofs` / `predictions`.
result_dict_lgb = train_model_classification(X=oofs, X_test=predictions, y=y, params=params, folds=folds,
                                         model_type=train_options['model_type'], 
                                         eval_metric=train_options['eval_metric'],
                                         plot_feature_importance=True,
                                         verbose=500, early_stopping_rounds=train_options['early_stopping_rounds'],
                                         n_estimators=train_options['n_estimators'], 
                                         averaging=train_options['averaging'],
                                         n_jobs=-1, groups=None)

## Keras Stacking 

In [None]:
def StackModel_maker(n_features=None):
    """Build and compile a small dense network used as the stacking model.

    Architecture: input -> Dropout(0.3) -> Dense(50, relu) -> Dropout(0.3)
    -> Dense(10, relu) -> Dropout(0.3) -> BatchNormalization -> Dense(1, sigmoid),
    compiled with Adam (lr 0.0003) and binary cross-entropy.

    Parameters
    ----------
    n_features : int, optional
        Width of the input layer. Defaults to the number of columns of the
        stacked OOF frame `oofs`. The original read the width from `oof`, the
        raw frame left over from the last iteration of the loading loop, which
        does not describe the stacking features.

    Returns
    -------
    The compiled Keras model.

    NOTE(review): `k`, `Input`, `Dropout`, `Dense`, `BatchNormalization`,
    `Model` and `Adam` are not imported in the visible cells; they are
    presumably provided by the star imports — confirm.
    """
    # Drop any graph/session state left over from a previously built model.
    k.clear_session()

    if n_features is None:
        n_features = oofs.shape[1]

    numerical_inputs = Input(shape=[n_features], name = 'all')
    numerical_logits = Dropout(.3)(numerical_inputs)

    x = numerical_logits

    x = Dense(50, activation = 'relu')(x)
    x = Dropout(.3)(x)
    x = Dense(10, activation = 'relu')(x)
    x = Dropout(.3)(x)
    x = BatchNormalization()(x)

    out = Dense(1, activation = 'sigmoid')(x)

    model = Model(inputs= [numerical_inputs],outputs=out)
    loss = "binary_crossentropy"
    model.compile(optimizer=Adam(lr = 0.0003), loss = loss)
    return model


# Fit arguments for the Keras stacking model.
params = dict(batch_size=16384, epochs=200, verbose=True)

# Description of the Keras run (written to training_params.json further down).
train_options = dict(
    model_type='keras',
    params=params,
    eval_metric='auc',
    averaging='usual',
    use_groups=False,
    fold_name=type(folds).__name__,
    n_splits=n_fold,
)

In [None]:
# `json` is not imported anywhere in the visible cells, so import it here rather
# than rely on it leaking in through a star import.
import json

# Persist the options of the current run next to the model outputs.
with open(f'{model_folder}/training_params.json', 'w') as f:
    json.dump(train_options, f, indent=2)
# StackModel_maker().save(f'{model_folder}/keras.mdl')

In [None]:
import keras
import tensorflow as tf

# Create a TensorFlow session with explicit device counts and register it as the
# session used by the Keras backend.
device_count = {'GPU': 1 , 'CPU':4}
config = tf.ConfigProto(device_count=device_count, log_device_placement=False)
sess = tf.Session(config=config)
keras.backend.set_session(sess)

In [None]:
# Free memory before the training run.
gc.collect()

# Train the Keras stack on the stacked frames. The original passed `oof` (the raw
# CSV frame left over from the last iteration of the loading loop) and
# `prediction` (a name that is never defined); the stacking features are
# `oofs` / `predictions`. The network built by StackModel_maker must have an
# input width equal to oofs.shape[1].
result_dict_keras = train_model_classification(model=StackModel_maker, 
                                             X=oofs,
                                             X_test=predictions,
                                             y=y, params=params, folds=folds,
                                             model_type=train_options['model_type'], 
                                             eval_metric=train_options['eval_metric'],
                                             averaging=train_options['averaging'],
                                             groups=None)

In [None]:
# Fill the sample submission with the Keras stack and save it.
sample_submission_path = '../../data/sample_submission.csv'
keras_submission_path = f'{model_folder}/stacked_keras.csv'

sub = pd.read_csv(sample_submission_path)
sub['isFraud'] = result_dict_keras['prediction']
sub.to_csv(keras_submission_path, index=False)

In [None]:
import pickle

# Store the full result dictionary of the Keras run as a pickle.
results_path = f'{model_folder}/results_dict_stacked_keras.pkl'
with open(results_path, 'wb') as results_file:
    pickle.dump(result_dict_keras, results_file)