## Allstate week 3

This week we will learn how to:

* tune LightGBM
* create Neural Networks with Keras (Theano or Tensorflow backend)
* tune Neural Networks
* create a simple ensemble of XGBoost, LightGBM and Neural Networks


In [2]:
# import xgboost as xgb
import pandas as pd
from sklearn import preprocessing, pipeline, metrics, grid_search, cross_validation
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.metrics import mean_absolute_error
# from pylightgbm.models import GBMRegressor
import gc

from scipy import sparse
%matplotlib inline

# import winsound



In [3]:
def logregobj(labels, preds):
    con = 2
    x =preds-labels
    grad =con*x / (np.abs(x)+con)
    hess =con**2 / (np.abs(x)+con)**2
    return grad, hess 

def log_mae(labels,preds,lift=200):
    return mean_absolute_error(np.exp(labels)-lift, np.exp(preds)-lift)

log_mae_scorer = metrics.make_scorer(log_mae, greater_is_better = False)

def search_model(train_x, train_y, est, param_grid, n_jobs, cv, refit=False):
##Grid Search for the best model
    model = grid_search.GridSearchCV(estimator  = est,
                                     param_grid = param_grid,
                                     scoring    = log_mae_scorer,
                                     verbose    = 10,
                                     n_jobs  = n_jobs,
                                     iid        = True,
                                     refit    = refit,
                                     cv      = cv)
    # Fit Grid Search Model
    model.fit(train_x, train_y)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:", model.best_params_)
    print("Scores:", model.grid_scores_)
    return model



def xg_eval_mae(yhat, dtrain, lift=200):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-lift, np.exp(yhat)-lift)

def xgb_logregobj(preds, dtrain):
    con = 2
    labels = dtrain.get_label()
    x =preds-labels
    grad =con*x / (np.abs(x)+con)
    hess =con**2 / (np.abs(x)+con)**2
    return grad, hess


def search_model_mae (train_x, train_y, est, param_grid, n_jobs, cv, refit=False):
##Grid Search for the best model
    model = grid_search.GridSearchCV(estimator  = est,
                                     param_grid = param_grid,
                                     scoring    = 'neg_mean_absolute_error',
                                     verbose    = 10,
                                     n_jobs  = n_jobs,
                                     iid        = True,
                                     refit    = refit,
                                     cv      = cv)
    # Fit Grid Search Model
    model.fit(train_x, train_y)
    print("Best score: %0.3f" % model.best_score_)
    print("Best parameters set:", model.best_params_)
    print("Scores:", model.grid_scores_)
    return model

## Load Data

In [4]:
# Load data
start = time.time() 
train_data = pd.read_csv('../input/train.csv')
train_size=train_data.shape[0]
print ("Loading train data finished in %0.3fs" % (time.time() - start))        
del (train_data)
gc.collect()
test_data = pd.read_csv('../input/test.csv')
print ("Loading test data finished in %0.3fs" % (time.time() - start))        

# full_data = pd.read_csv('../input/FE_data_COMB.csv.tar.gz',nrows=2)

Loading train data finished in 2.765s
Loading test data finished in 4.567s


In [9]:
full_data.head()

Unnamed: 0,FE_data_COMB.csv,cat10,cat100,cat101,cat102,cat103,cat104,cat105,cat106,cat107,...,cat14_cat38,cat14_cat24,cat14_cat82,cat14_cat25,cat38_cat24,cat38_cat82,cat38_cat25,cat24_cat82,cat24_cat25,cat82_cat25
0,A,1,2,7,1,1,9,5,7,10,...,27,27,28,27,27,28,27,28,27,53
1,A,2,12,6,1,1,5,5,9,11,...,27,27,27,27,27,27,27,27,27,27


In [10]:
full_data1 = pd.read_csv('../input/FE_data_COMB.csv',nrows=2)

In [11]:
full_data1.head()

Unnamed: 0,cat1,cat10,cat100,cat101,cat102,cat103,cat104,cat105,cat106,cat107,...,cat14_cat38,cat14_cat24,cat14_cat82,cat14_cat25,cat38_cat24,cat38_cat82,cat38_cat25,cat24_cat82,cat24_cat25,cat82_cat25
0,A,1,2,7,1,1,9,5,7,10,...,27,27,28,27,27,28,27,28,27,53
1,A,2,12,6,1,1,5,5,9,11,...,27,27,27,27,27,27,27,27,27,27


## Merge train and test

This will save our time on duplicating logics for train and test and will also ensure the transformations applied on train and test are the same.

In [5]:
full_data=pd.concat([train_data
                       ,test_data])
del( train_data, test_data)
print ("Full Data set created.")

Full Data set created.


## Group features

In this step we will group the features into different groups so we can preprocess them seperately afterward.

In [6]:
data_types = full_data.dtypes  
cat_cols = list(data_types[data_types=='object'].index)
num_cols = list(data_types[data_types=='int64'].index) + list(data_types[data_types=='float64'].index)

id_col = 'id'
target_col = 'loss'
num_cols.remove('id')
num_cols.remove('loss')

print ("Categorical features:", cat_cols)
print ( "Numerica features:", num_cols)
print ( "ID: %s, target: %s" %( id_col, target_col))

('Categorical features:', ['cat1', 'cat10', 'cat100', 'cat101', 'cat102', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'cat109', 'cat11', 'cat110', 'cat111', 'cat112', 'cat113', 'cat114', 'cat115', 'cat116', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat2', 'cat20', 'cat21', 'cat22', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat3', 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat4', 'cat40', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat5', 'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57', 'cat58', 'cat59', 'cat6', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat65', 'cat66', 'cat67', 'cat68', 'cat69', 'cat7', 'cat70', 'cat71', 'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78', 'cat79', 'cat8', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85', 'cat86', 'cat87', 'cat88', 'cat89', 'cat9', 'cat90', 'c

## Categorical features 
### 1. Label Encoding (Factorizing)

In [7]:
LBL = preprocessing.LabelEncoder()
start=time.time()
for cat_col in cat_cols:
#     print ("Factorize feature %s" % (cat))
    full_data[cat_col] = LBL.fit_transform(full_data[cat_col])
print ('Label enconding finished in %f seconds' % (time.time()-start))


Label enconding finished in 35.289642 seconds


### 2. One Hot Encoding (get dummies)

OHE can be done by either Pandas' get_dummies() or SK Learn's OneHotEncoder. 

* get_dummies is easier to implement (can be used directly on raw categorical features, i.e. strings, but it takes longer time and is not memory efficient.

* OneHotEncoder requires the features being converted to numeric, which has already been done by LabelEncoder in previous step, and is much more efficient (7x faster).

* We will convert the OHE's results to a sparse matrix which uses way less memory as compared to dense matrix. However, not all algorithms and packagers support sparse matrix, e.g. Keras. In that case, we'll need to use other tricks to make it work.

In [8]:
OHE = preprocessing.OneHotEncoder(sparse=True)
start=time.time()
full_data_sparse=OHE.fit_transform(full_data[cat_cols])
print ('One-hot-encoding finished in %f seconds' % (time.time()-start))

print (full_data_sparse.shape)

## it should be (313864, 1176)

One-hot-encoding finished in 10.169861 seconds
(313864, 1176)


### 3. Leave-one-out Encoding

This is a very useful trick that has been used by many Kaggle winning solutions. It's particularly effective for high cardinality categorical features, postal code for instance. However, it doesn't seem to help a lot for this competition and the following code is just FYI. Feel free to skip it as it may take long time to run.

In [9]:
# start=time.time()
# loo_cols =[]
# for col in cat_cols:
#     print ("Leave-One-Out Encoding  %s" % (col))
#     print ("Leave-one-out encoding column %s for %s......" % (col, target_col))
#     aggr=full_data.groupby(col)[target_col].agg([np.mean]).join(full_data[:train_size].groupby(col)[target_col].agg([np.sum,np.size]),how='left')        
#     meanTagetAggr = np.mean(aggr['mean'].values)
#     aggr=full_data.join(aggr,how='left', on=col)[list(aggr.columns)+[target_col]]
#     loo_col = 'MEAN_BY_'+col+'_'+target_col
#     full_data[loo_col] = \
#     aggr.apply(lambda row: row['mean'] if math.isnan(row[target_col]) 
#                                                        else (row['sum']-row[target_col])/(row['size']-1)*random.uniform(0.95, 1.05) , axis=1)
#     loo_cols.append(loo_col)
#     print ("New feature %s created." % (loo_col))
# print ('Leave-one-out enconding finished in %f seconds' % (time.time()-start))

## Numeric features

We will apply two preprocessings on numeric features:

1. Apply box-cox transformations for skewed numeric features.

2. Scale numeric features so they will fall in the range between 0 and 1.

Please be advised that these preprocessings are not necessary for tree-based models, e.g. XGBoost. However, linear or linear-based models, which will be dicussed in following weeks, may benefit from them.

** Calculate skewness of each numeric features: **

In [10]:
from scipy.stats import skew, boxcox
skewed_cols = full_data[num_cols].apply(lambda x: skew(x.dropna()))
print (skewed_cols.sort_values())

cont2    -0.311146
cont3    -0.007023
cont14    0.250673
cont11    0.281139
cont12    0.291997
cont10    0.352116
cont13    0.376138
cont4     0.417559
cont6     0.458413
cont1     0.513205
cont8     0.673237
cont5     0.679610
cont7     0.825889
cont9     1.067247
dtype: float64


** Apply box-cox transformations: **

In [11]:
skewed_cols = skewed_cols[skewed_cols > 0.25].index.values
for skewed_col in skewed_cols:
    full_data[skewed_col], lam = boxcox(full_data[skewed_col] + 1)

** Apply Standard Scaling:**

In [12]:
SSL = preprocessing.StandardScaler()
for num_col in num_cols:
    full_data[num_col] = SSL.fit_transform(full_data[num_col].values.reshape(-1,1))

#### Note: LBL and OHE are likely exclusive so we will use one of them at a time combined with numeric features. In the following steps we will use OHE + Numeric to tune XGBoost models and you can apply the same process with OHE + Numeric features. Averaging results from two different models will likely generate better results.

In [13]:
lift = 200

full_data_sparse = sparse.hstack((full_data_sparse
                                  ,full_data[num_cols])
                                 , format='csr'
                                 )
print full_data_sparse.shape
train_x = full_data_sparse[:train_size]
test_x = full_data_sparse[train_size:]
train_y = np.log(full_data[:train_size].loss.values + lift)
ID = full_data.id[:train_size].values
print train_x.shape
# xgtrain = xgb.DMatrix(train_x, label=train_y,missing=np.nan) #used for Bayersian Optimization

from sklearn.cross_validation import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, train_size=.80, random_state=1234)

(313864, 1190)
(188318, 1190)


## Kearas

https://keras.io

Keras is a high-level neural networks library, written in Python and capable of running on top of either TensorFlow or Theano. It was developed with a focus on enabling fast experimentation. Being able to go from idea to result with the least possible delay is key to doing good research.


In [30]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,ParametricSoftplus,ThresholdedReLU,SReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.optimizers import SGD,Nadam
from keras.regularizers import WeightRegularizer, ActivityRegularizer,l2, activity_l2

## Comment out following lines if you are using Theano as backend
# import tensorflow as tf
# tf.python.control_flow_ops = tf

In [1]:
# custom metric function for Keras

def mae_log(y_true, y_pred): 
    return K.mean(K.abs((K.exp(y_pred)-200) - (K.exp(y_true)-200)))


# Keras deosn't support sparse matrix. 
# The following functions are useful to split a large sparse matrix into smaller batches so they can be loaded into mem.

def batch_generator(X, y, batch_size, shuffle):
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0

### Keras starter

Below is a quick starter example for creating a neural networks model using Keras. It covers the following aspects:
1. multiple layers: 1 input, 1 hidden and 1 output
2. normalization.
3. dropout regularization.
4. early stopping
5. activate function
6. optimizer
6. batch training

Advanced optimizers, activations and dropout regularization are the key characteristics that differentiate modern Neural Networks from conventional ones.

In [54]:
early_stop = EarlyStopping(monitor='val_mae_log', # custom metric
                           patience=5, #early stopping for epoch
                           verbose=0, mode='auto')
checkpointer = ModelCheckpoint(filepath="weights.hdf5", 
                               monitor='val_mae_log', 
                               verbose=1, save_best_only=True, mode='min')

def create_model(input_dim):
    model = Sequential()
    init = 'glorot_normal'
    
    
    model.add(Dense(400, # number of input units: needs to be tuned
                    input_dim = input_dim, # fixed length: number of columns of X
                    init=init
                   ))
    
    model.add(PReLU()) # activation function
    model.add(BatchNormalization()) # normalization
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
        
    model.add(Dense(200,init=init)) # number of hidden1 units. needs to be tuned.
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.5)) #dropout rate. needs to be tuned
    
    model.add(Dense(50,init=init)) # number of hidden2 units. needs to be tuned.
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.5)) #dropout rate. needs to be tuned
    
    model.add(Dense(1)) # 1 for regression 
    model.compile(loss = 'mae',
                  metrics=[mae_log],
                  optimizer = 'Adadelta' # optimizer. you may want to try different ones
                 )
    return(model)

model = create_model(X_train.shape[1])
fit= model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                         nb_epoch=1000,
                         samples_per_epoch=train_size,
                         validation_data=(X_val.todense(), y_val),
                         callbacks=[early_stop,checkpointer]
                         )

print min(fit.history['val_mae_log'])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
aa 1148.54210492


In [None]:
# Dropout 400 200 50
# 0.0 0.0 0.0 1173.22430653
# 0.1 0.1 0.1 1159.12022951
# 0.2 0.2 0.2 1155.23839803
# 0.3 0.3 0.3 1149.65308519
# 0.4 0.4 0.4 1149.41400722
# 0.5 0.5 0.5 1150.80265431
# 0.6 0.6 0.6 1162.70112915
# 0.4 0.5 0.5 1148.54210492
# 

In [None]:
# Dropout 200 100
# 0.0 0.0 1171.12563576
# 0.1 0.1  1160.17143049
# 0.2 0.2 1156.6883911
# 0.3 0.3 1153.0245463
# 0.4 0.4 1169.62663742
# 0.5 0.5 1149.56123014

In [None]:
# activation 0.4 0.2
# LeakyReLU 1156.6654871
# PReLU 1149.37891714
# ELU 1155.62879652
# ParametricSoftplus 1159.36182408
# ThresholdedReLU inf
# SReLU 1153.34983381

In [None]:
# initializations
# uniform  1155.03357597
# lecun_uniform 1152.83709172
# normal 1154.07652823
# zero 1818.27969088
# glorot_normal 1151.56500457
# glorot_uniform 1151.98137925
# he_normal 1153.37595458
# he_uniform 1164.39926316

In [None]:
# optimizer
# 'SGD' 1177.38471356
# 'SGD' lr = 0.1 1178.63960903
# 'RMSprop' 1161.1068612
# 'Adagrad' 1176.88879581
# 'Adadelta' 1150.60310845
# 'Adam' 1153.64722551
# 'Adamax' 1151.59510605
# 'Nadam' 1155.9498374

In [23]:
# [400 0.4 200 0.2] 1247.1040 1150.9042
# [600 0.4 200 0.2] 1311.5099 1155.55158192
# [800 0.4 200 0.2] 1280.1025 1151.91463396
# [1000 0.4 200 0.2] 1273.5918 1153.3740
# [800 0.4 400 0.2] 1379.2316 1164.4119
# [800 0.2 400 0.2] 1249.6743 1160.2573
# [900 0.2 300 0.2] 1281.4110 1158.5914
# [400 0.2 200 0.2] 1300.3075 1158.4239
# [400 0.6 200 0.6 100 0.6] 1259.5767 1156.3421
# [400 0.4 200 0.2 100 0.2] 1367.5554 1153.8893

1152.12092417909

### Cross Validation and ...

The following sample shows how to do cross validation for Keras with early stopping and much more. NN is time consuming, not to mention cross validation. In fact we can leverage every minutes we spent on training NN and make good use of them.

we'll first create the framework:

In [None]:
from sklearn.cross_validation import StratifiedKFold, KFold

early_stop = EarlyStopping(monitor='val_mae_log', patience=5, verbose=0, mode='auto')
checkpointer = ModelCheckpoint(filepath="weights.hdf5", monitor='val_mae_log', verbose=1, save_best_only=True, mode='min')

def nn_model(params):
    model = Sequential()
    model.add(Dense(params['input_size'], input_dim = params['input_dim']))

    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(params['input_drop_out']))
        
    model.add(Dense(params['hidden_size']))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(params['hidden_drop_out']))
    
    
#     nadam = Nadam(lr=1e-4)
    nadam = Nadam(lr=params['learning_rate'])
    
    model.add(Dense(1))
    model.compile(loss = 'mae', metrics=[mae_log], optimizer = 'adadelta')
    return(model)


def nn_blend_data(parameters, train_x, train_y, test_x, fold, early_stopping_rounds=0, batch_size=128):
    print ("Blend %d estimators for %d folds" % (len(parameters), fold))
    skf = list(KFold(len(train_y), fold))
    
    train_blend_x = np.zeros((train_x.shape[0], len(parameters)))
    test_blend_x = np.zeros((test_x.shape[0], len(parameters)))
    scores = np.zeros ((len(skf),len(parameters)))
    best_rounds = np.zeros ((len(skf),len(parameters)))
 
    for j, nn_params in enumerate(parameters):
        print ("Model %d: %s" %(j+1, nn_params))
        test_blend_x_j = np.zeros((test_x.shape[0], len(skf)))
        for i, (train, val) in enumerate(skf):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x[train]
            train_y_fold = train_y[train]
            val_x_fold = train_x[val]
            val_y_fold = train_y[val]

            # early stopping
            model = nn_model(nn_params)
            print (model)
            fit= model.fit_generator(generator=batch_generator(train_x_fold, train_y_fold, batch_size, True),
                                     nb_epoch=60,
                                     samples_per_epoch=train_x_fold.shape[0],
                                     validation_data=(val_x_fold.todense(), val_y_fold),
                                     callbacks=[
                                                EarlyStopping(monitor='val_mae_log'
                                                              , patience=early_stopping_rounds, verbose=0, mode='auto'),
                                                ModelCheckpoint(filepath="weights.hdf5"
                                                                , monitor='val_mae_log', 
                                                                verbose=1, save_best_only=True, mode='min')
                                                ]
                                     )

            best_round=len(fit.epoch)-early_stopping_rounds-1
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'mae', metrics=[mae_log], optimizer = 'adadelta')

         
            # print (mean_absolute_error(np.exp(y_val)-200, pred_y))
            val_y_predict_fold = model.predict_generator(generator=batch_generatorp(val_x_fold, batch_size, True),
                                        val_samples=val_x_fold.shape[0]
                                     )
            
            score = log_mae(val_y_fold, val_y_predict_fold,200)
            print ("Score: ", score, mean_absolute_error(val_y_fold, val_y_predict_fold))
            scores[i,j]=score
            train_blend_x[val, j] = val_y_predict_fold.reshape(val_y_predict_fold.shape[0])
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'mae', metrics=[mae_log], optimizer = 'adadelta')            
            test_blend_x_j[:,i] = model.predict_generator( which).reshape(test_x.shape[0])
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            
   
        test_blend_x[:,j] = test_blend_x_j.mean(1)
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x, scores,best_rounds )

Then let's create a list of parameters that we thought might be working for NN, and cross validate each of them

In [None]:
nn_parameters = [
    { 'input_size' :400 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 200 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :450 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 200 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :400 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 250 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1},
    { 'input_size' :400 ,
     'input_dim' : train_x.shape[1],
     'input_drop_out' : 0.5 ,
     'hidden_size' : 200 ,
     'hidden_drop_out' :0.2,
     'learning_rate': 0.1}

]

(train_blend_x, test_blend_x, blend_scores,best_round) = nn_blend_data(nn_parameters, train_x, train_y, test_x,
                                                         4,
                                                         5)


We can now create two submissions: 

* one is from the best CV score, the fourth in my case
* another is the average of all four

You can submit both and see if averaging helps.

In [None]:
pred_y = np.exp(test_blend_x[:,3:4]) - 200 # the forth column of test_blend_x
results = pd.DataFrame()
results['id'] = full_data[train_size:].id
results['loss'] = pred_y
results.to_csv("../output/sub_keras_starter.csv", index=False)
print ("Submission created.")

pred_y = np.exp(np.mean(test_blend_x,axis=1)) - 200

results = pd.DataFrame()
results['id'] = full_data[train_size:].id
results['loss'] = pred_y
results.to_csv("../output/sub_keras_mean.csv", index=False)
print ("Submission created.")

## Follow up questions
* So far we've already create five models/ submissions:
    * XGBoost with LE
    * XGBoost with OHE
    * LightGBM with LE
    * LightGBM with OHE
    * Keras
    
  Now let's create another submission, or more, by avaraging them or with whatever weights working for you. It should yield better results.
  
    
* Is there a way to ensemble the models even more effectively? 

In [23]:
from sklearn.cross_validation import KFold


allpredictions = pd.DataFrame()
kfolds = 10  # 10 folds is better!
kf = KFold(train_x.shape[0], n_folds=kfolds)
seed = 42

for i, (train_index, test_index) in enumerate(kf):
    print('Fold {0}'.format(i + 1))
    dtrain, dvalid = train_x[train_index], train_x[test_index]
    ytrain, yvalid = train_y[train_index], train_y[test_index]

    gbmr = GBMRegressor(
        num_threads=4,
        num_iterations=20000,
        learning_rate=0.005,
        num_leaves=68,
        max_bin = 526,
        min_data_in_leaf=127,
        metric='l1',
        feature_fraction=0.218683,
        bagging_fraction=0.961961,
        bagging_freq=1,
        early_stopping_round=600,
        verbose= False
    )
    
    
    gbmr.fit(dtrain, ytrain,test_data=[(dvalid, yvalid)])
    del dtrain
    del dvalid
    gc.collect()
    
    allpredictions['p' + str(i)] = gbmr.predict(test_x)
    del gbmr
    gc.collect()
    
print(allpredictions.head())
    
submission = pd.read_csv('../output/sample_submission.csv')
submission.iloc[:, 1] = np.exp(allpredictions.mean(axis=1).values) - lift
submission.to_csv('pyLightGBMmeansubmission_10Kfold.csv', index=None)
submission.iloc[:, 1] = np.exp(allpredictions.median(axis=1).values) - lift
submission.to_csv('pyLightGBMmediansubmission_10Kfold.csv', index=None)

Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10
        p0       p1       p2       p3       p4       p5       p6       p7  \
0  7.44263  7.45627  7.43563  7.42914  7.41162  7.45465  7.47006  7.44173   
1  7.68540  7.69088  7.66513  7.70157  7.66829  7.65112  7.69641  7.66304   
2  9.09098  9.09288  9.01976  9.06774  9.04892  9.08354  9.07739  9.05232   
3  8.73863  8.75128  8.71120  8.70794  8.74043  8.68432  8.73725  8.76205   
4  6.92609  6.91564  6.90176  6.92359  6.90748  6.92546  6.92156  6.88970   

        p8       p9  
0  7.44343  7.49335  
1  7.61425  7.68182  
2  9.06537  9.10046  
3  8.70831  8.76852  
4  6.94481  6.87771  
