In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import random
from sklearn import preprocessing

import gc
from scipy.stats import skew, boxcox

from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2017

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,ParametricSoftplus,ThresholdedReLU,SReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.optimizers import SGD,Nadam
from keras.regularizers import WeightRegularizer, ActivityRegularizer,l2, activity_l2
from keras.utils.np_utils import to_categorical

Using Theano backend.


# Load Data

In [52]:
train_y = np.ravel(pd.read_csv('../input/' + 'labels_BrandenMurray.csv'))
train_y = to_categorical(train_y)

names = ['low_0','medium_0','high_0',
        'low_1','medium_1','high_1',
        'low_2','medium_2','high_2',
        'low_3','medium_3','high_3',
        'low_4','medium_4','high_4',
        'low_5','medium_5','high_5',
        'low_6','medium_6','high_6',
        'low_7','medium_7','high_7',
        'low_8','medium_8','high_8',
        'low_9','medium_9','high_9']

data_path = "../2ndlast/"
total_col = 0

In [4]:
# RFC 1st level 
file_train      = 'train_blend_RFC_entropy_last_2017-04-21-11-06' + '.csv'
file_test_mean  = 'test_blend_RFC_entropy_mean_last_2017-04-21-11-06' + '.csv'

train_rfc = pd.read_csv(data_path + file_train,      header = None)
test_rfc  = pd.read_csv(data_path + file_test_mean,  header = None)


n_column = train_rfc.shape[1]
total_col += n_column

train_rfc.columns = ['rfc_' + x for x in names[:n_column]]
test_rfc.columns  = ['rfc_' + x for x in names[:n_column]]


print train_rfc.iloc[:5,:3]

print test_rfc.iloc[:5,:3]

   rfc_low_0  rfc_medium_0  rfc_high_0
0   0.355361      0.544070    0.100569
1   0.508303      0.446720    0.044978
2   0.603091      0.349880    0.047029
3   0.616221      0.328639    0.055140
4   0.947230      0.049863    0.002907
   rfc_low_0  rfc_medium_0  rfc_high_0
0   0.288217      0.532268    0.179515
1   0.970891      0.025801    0.003308
2   0.908912      0.078535    0.012553
3   0.400539      0.476918    0.122542
4   0.700470      0.269945    0.029586


In [5]:
# LR 1st level
file_train      = 'train_blend_LR_last_2017-04-21-11-16' + '.csv'
file_test_mean  = 'test_blend_LR_mean_last_2017-04-21-11-16' + '.csv'

train_LR = pd.read_csv(data_path + file_train, header = None)
test_LR  = pd.read_csv(data_path + file_test_mean, header = None)

n_column = train_LR.shape[1]
total_col += n_column

train_LR.columns = ['LR_' + x for x in names[:n_column]]
test_LR.columns  = ['LR_' + x for x in names[:n_column]]

print train_LR.iloc[:5,:3]
print test_LR.iloc[:5,:3]

   LR_low_0  LR_medium_0  LR_high_0
0  0.259359     0.652577   0.088065
1  0.738646     0.234238   0.027115
2  0.396965     0.512991   0.090045
3  0.647052     0.312537   0.040412
4  0.923203     0.072255   0.004543
   LR_low_0  LR_medium_0  LR_high_0
0  0.282172     0.549423   0.168405
1  0.955532     0.040091   0.004377
2  0.925692     0.065197   0.009112
3  0.631038     0.309690   0.059272
4  0.803117     0.187725   0.009158


In [6]:
# ET 1st level
file_train      = 'train_blend_ET_entropy_last_2017-04-21-11-48' + '.csv'
file_test_mean  = 'test_blend_ET_entropy_mean_last_2017-04-21-11-48' + '.csv'

train_ET = pd.read_csv(data_path + file_train,      header = None)
test_ET  = pd.read_csv(data_path + file_test_mean,  header = None)

n_column = train_ET.shape[1]
total_col += n_column

train_ET.columns = ['ET_' + x for x in names[:n_column]]
test_ET.columns  = ['ET_' + x for x in names[:n_column]]

print train_ET.iloc[:5,:3]
print test_ET.iloc[:5,:3]

   ET_low_0  ET_medium_0  ET_high_0
0  0.332903     0.538085   0.129012
1  0.471780     0.454812   0.073408
2  0.582223     0.383893   0.033884
3  0.622462     0.328557   0.048981
4  0.926402     0.064996   0.008602
   ET_low_0  ET_medium_0  ET_high_0
0  0.309822     0.527181   0.162997
1  0.984336     0.014059   0.001605
2  0.956579     0.038080   0.005341
3  0.518490     0.384524   0.096986
4  0.759357     0.210049   0.030594


In [7]:
# KNN 1st level
file_train      = 'train_blend_KNN_uniform_last_2017-04-21-13-53' + '.csv'
file_test_mean  = 'test_blend_KNN_uniform_mean_last_2017-04-21-13-53' + '.csv'


train_KNN = pd.read_csv(data_path + file_train,      header = None)
test_KNN  = pd.read_csv(data_path + file_test_mean,  header = None)


n_column = train_KNN.shape[1]
total_col += n_column

train_KNN.columns      = ['KNN_uniform_' + x for x in names[:n_column]]
test_KNN.columns  = ['KNN_uniform_' + x for x in names[:n_column]]

print train_KNN.iloc[:5,:3]
print test_KNN.iloc[:5,:3]

   KNN_uniform_low_0  KNN_uniform_medium_0  KNN_uniform_high_0
0           0.507812              0.390625            0.101562
1           0.531250              0.359375            0.109375
2           0.671875              0.273438            0.054688
3           0.609375              0.250000            0.140625
4           0.843750              0.140625            0.015625
   KNN_uniform_low_0  KNN_uniform_medium_0  KNN_uniform_high_0
0           0.381250              0.457813            0.160938
1           0.968750              0.031250            0.000000
2           0.970313              0.029687            0.000000
3           0.693750              0.259375            0.046875
4           0.612500              0.321875            0.065625


In [8]:
# TFFM 1st level 0322
file_train      = 'train_blend_FM_BM_0322_2017-03-27-04-35' + '.csv'
file_test_mean  = 'test_blend_FM_mean_BM_0322_2017-03-27-04-35' + '.csv'

train_FM_0322      = pd.read_csv(data_path + file_train,      header = None)
test_FM_mean_0322  = pd.read_csv(data_path + file_test_mean,  header = None)

n_column = train_FM_0322.shape[1]
total_col += n_column

train_FM_0322.columns      = ['FM_0322_' + x for x in names[:n_column]]
test_FM_mean_0322.columns  = ['FM_0322_' + x for x in names[:n_column]]

print train_FM_0322.iloc[:5,:3]
print test_FM_mean_0322.iloc[:5,:3]

   FM_0322_low_0  FM_0322_medium_0  FM_0322_high_0
0       0.460187          0.436036        0.103776
1       0.268598          0.571916        0.159486
2       0.724799          0.239351        0.035851
3       0.669683          0.286716        0.043600
4       0.917878          0.073469        0.008653
   FM_0322_low_0  FM_0322_medium_0  FM_0322_high_0
0       0.449136          0.411890        0.138974
1       0.971615          0.020319        0.008066
2       0.909864          0.074040        0.016096
3       0.661175          0.269026        0.069799
4       0.705851          0.263069        0.031080


In [9]:
# Multinomial Naive Bayes 1st level
file_train      = 'train_blend_MNB_BM_MB_last_2017-04-21-14-02' + '.csv'
file_test_mean  = 'test_blend_MNB_mean_BM_MB_last_2017-04-21-14-02' + '.csv'


train_MNB      = pd.read_csv(data_path + file_train,      header = None)
test_MNB_mean  = pd.read_csv(data_path + file_test_mean,  header = None)


n_column = train_MNB.shape[1]
total_col += n_column

train_MNB.columns      = ['MNB_' + x for x in names[:n_column]]
test_MNB_mean.columns  = ['MNB_' + x for x in names[:n_column]]

print train_MNB.iloc[:5,:3]
print test_MNB_mean.iloc[:5,:3]

   MNB_low_0  MNB_medium_0  MNB_high_0
0   0.216985      0.579031    0.203985
1   0.560375      0.382293    0.057331
2   0.621019      0.332482    0.046499
3   0.312441      0.345115    0.342444
4   0.901678      0.090035    0.008287
   MNB_low_0  MNB_medium_0  MNB_high_0
0   0.218456      0.591575    0.189969
1   0.992103      0.007102    0.000795
2   0.971220      0.025168    0.003612
3   0.501988      0.411198    0.086813
4   0.754496      0.213158    0.032347


In [10]:
# TSNE 1st level

file_train = 'X_train_tsne_BM_MB_add_desc_2017-03-18-17-14' + '.csv'
file_test  = 'X_test_tsne_BM_MB_add_desc_2017-03-18-17-14' + '.csv'

train_tsne = pd.read_csv(data_path + file_train, header = None)
test_tsne  = pd.read_csv(data_path + file_test, header = None)


n_column = train_tsne.shape[1]
total_col += n_column

train_tsne.columns = ['tsne_0', 'tsne_1', 'tsne_2']
test_tsne.columns  = ['tsne_0', 'tsne_1', 'tsne_2']


print train_tsne.iloc[:5,:3]
print test_tsne.iloc[:5,:3]

      tsne_0     tsne_1    tsne_2
0  -8.398991  -2.415894 -3.602143
1   0.698237   0.335786  8.884257
2  -5.811380 -16.669975  7.145837
3  -0.371861 -25.894747 -2.076309
4 -15.371799   9.656209  5.813590
      tsne_0     tsne_1     tsne_2
0  -5.176846  -0.768422  -2.339259
1   9.003089  13.250301  -0.707032
2   4.188036  14.397186   4.573307
3  10.890132 -12.660774 -13.414140
4   6.011381   5.177731  15.669250


In [11]:
# TSNE 1st level 0322

file_train = 'X_train_tsne_BM_0322_2017-03-26-16-33' + '.csv'
file_test  = 'X_test_tsne_BM_0322_2017-03-26-16-33' + '.csv'

train_tsne_0322 = pd.read_csv(data_path + file_train, header = None)
test_tsne_0322  = pd.read_csv(data_path + file_test, header = None)

n_column = train_tsne_0322.shape[1]
total_col += n_column

train_tsne_0322.columns = ['tsne_0_0322', 'tsne_1_0322', 'tsne_2_0322']
test_tsne_0322.columns  = ['tsne_0_0322', 'tsne_1_0322', 'tsne_2_0322']

print train_tsne_0322.iloc[:5,:3]
print test_tsne_0322.iloc[:5,:3]

   tsne_0_0322  tsne_1_0322  tsne_2_0322
0    -6.649132    13.028168     8.329733
1     7.615566     0.067456   -14.932181
2     8.333528     8.561174   -13.536297
3    12.819587   -20.027314     0.661660
4    -5.513088    -5.609218    17.130673
   tsne_0_0322  tsne_1_0322  tsne_2_0322
0    -5.721674     7.011411    -6.499047
1     8.238390    -8.589710    13.771045
2   -11.383577   -16.071395    15.083511
3    -6.111491     6.348311   -10.222012
4     4.426022    15.553415    11.315777


In [12]:
# XGB 1st level

file_train     = 'train_blend_XGB_BM_2bagging_CV_MS_52571_2017-04-20-23-06' + '.csv'
file_test_mean = 'test_blend_XGB_BM_2bagging_CV_MS_52571_2017-04-20-23-06' + '.csv'


train_xgb      = pd.read_csv(data_path + file_train, header = None)
test_xgb_mean  = pd.read_csv(data_path + file_test_mean, header = None)

tmp_train = train_xgb*2
tmp_test  = test_xgb_mean*2

file_train     = 'train_blend_XGB_BM_20bagging_last_2017-04-21-19-08' + '.csv'
file_test_mean = 'test_blend_XGB_BM_20bagging_last_2017-04-21-19-08' + '.csv'

train_xgb      = pd.read_csv(data_path + file_train, header = None)
test_xgb_mean  = pd.read_csv(data_path + file_test_mean, header = None)

train_xgb      = (tmp_train + train_xgb*20) / 22.0
test_xgb_mean  = (tmp_test + test_xgb_mean*20) / 22.0

n_column = train_xgb.shape[1]
total_col += n_column

train_xgb.columns = ['xgb_' + x for x in names[:n_column]]
test_xgb_mean.columns = ['xgb_' + x for x in names[:n_column]]

print train_xgb.iloc[:5,:3]
print test_xgb_mean.iloc[:5,:3]

   xgb_low_0  xgb_medium_0  xgb_high_0
0   0.338520      0.631375    0.030105
1   0.563156      0.390819    0.046025
2   0.462242      0.498429    0.039328
3   0.930174      0.067369    0.002458
4   0.892201      0.106567    0.001232
   xgb_low_0  xgb_medium_0  xgb_high_0
0   0.178073      0.621011    0.200916
1   0.978270      0.012231    0.009499
2   0.939539      0.055440    0.005021
3   0.151284      0.616193    0.232523
4   0.698756      0.292996    0.008247


In [13]:
# XGB 1st level 30fold

file_train      = 'train_blend_XGB_last_30fold_2017-04-21-12-57' + '.csv'
file_test_mean  = 'test_blend_XGB_last_30fold_2017-04-21-12-57' + '.csv'


train_xgb_30fold      = pd.read_csv(data_path + file_train, header = None)
test_xgb_mean_30fold  = pd.read_csv(data_path + file_test_mean, header = None)


n_column = train_xgb_30fold.shape[1]
total_col += n_column

train_xgb_30fold.columns      = ['xgb_30fold_' + x for x in names[:n_column]]
test_xgb_mean_30fold.columns  = ['xgb_30fold_' + x for x in names[:n_column]]

print train_xgb_30fold.iloc[:5,:3]
print test_xgb_mean_30fold.iloc[:5,:3]

   xgb_30fold_low_0  xgb_30fold_medium_0  xgb_30fold_high_0
0          0.309433             0.660416           0.030151
1          0.565861             0.382300           0.051838
2          0.401785             0.566253           0.031961
3          0.931191             0.066339           0.002470
4          0.881444             0.117002           0.001555
   xgb_30fold_low_0  xgb_30fold_medium_0  xgb_30fold_high_0
0          0.174706             0.640575           0.184719
1          0.977405             0.013037           0.009558
2          0.944000             0.051268           0.004732
3          0.145425             0.592400           0.262175
4          0.670348             0.322061           0.007591


In [14]:
# XGB one vs rest 1st level

file_train      = 'train_blend_xgb_ovr_last_2017-04-21-10-09' + '.csv'
file_test_mean  = 'test_blend_xgb_ovr_mean_last_2017-04-21-10-09' + '.csv'

train_xgb_ovr      = pd.read_csv(data_path + file_train, header = None)
test_xgb_mean_ovr  = pd.read_csv(data_path + file_test_mean, header = None)


n_column = train_xgb_ovr.shape[1]
total_col += n_column

train_xgb_ovr.columns      = ['xgb_ovr_' + x for x in names[:n_column]]
test_xgb_mean_ovr.columns  = ['xgb_ovr_' + x for x in names[:n_column]]

sum_train = np.sum(train_xgb_ovr,axis=1)
sum_test  = np.sum(test_xgb_mean_ovr,axis=1)

for col in train_xgb_ovr.columns.values:
    train_xgb_ovr[col] = train_xgb_ovr[col] / sum_train
    test_xgb_mean_ovr[col] = test_xgb_mean_ovr[col] / sum_test


print train_xgb_ovr.iloc[:5,:3]
print test_xgb_mean_ovr.iloc[:5,:3]

   xgb_ovr_low_0  xgb_ovr_medium_0  xgb_ovr_high_0
0       0.313141          0.658802        0.028057
1       0.491897          0.450891        0.057212
2       0.486618          0.486574        0.026809
3       0.925839          0.071295        0.002866
4       0.863436          0.134349        0.002214
   xgb_ovr_low_0  xgb_ovr_medium_0  xgb_ovr_high_0
0       0.167358          0.646104        0.186538
1       0.981203          0.010042        0.008756
2       0.906588          0.088855        0.004558
3       0.167107          0.602472        0.230421
4       0.716008          0.274903        0.009089


In [15]:
# LightGBM 1st level

file_train      = 'train_blend_LightGBM_last_10bagging_2017-04-21-21-54' + '.csv'
file_test_mean  = 'test_blend_LightGBM_mean_last_10bagging_2017-04-21-21-54' + '.csv'


train_lgb      = pd.read_csv(data_path + file_train, header = None)
test_lgb_mean  = pd.read_csv(data_path + file_test_mean, header = None)

n_column = train_lgb.shape[1]
total_col += n_column

train_lgb.columns      = ['lgb_10bag_' + x for x in names[:n_column]]
test_lgb_mean.columns  = ['lgb_10bag_' + x for x in names[:n_column]]

print train_lgb.iloc[:5,:3]
print test_lgb_mean.iloc[:5,:3]

   lgb_10bag_low_0  lgb_10bag_medium_0  lgb_10bag_high_0
0         0.317617            0.650112          0.032271
1         0.597390            0.386899          0.015711
2         0.450143            0.517132          0.032726
3         0.898037            0.100391          0.001572
4         0.880435            0.118283          0.001282
   lgb_10bag_low_0  lgb_10bag_medium_0  lgb_10bag_high_0
0         0.186506            0.567313          0.246180
1         0.964578            0.025110          0.010312
2         0.913688            0.080543          0.005769
3         0.114846            0.657136          0.228019
4         0.650263            0.344835          0.004902


In [16]:
# Keras 1st level No.1

file_train      = 'train_blend_Keras_last_2017-04-20-21-23' + '.csv'
file_test_mean  = 'test_blend_Keras_mean_last_2017-04-20-21-23' + '.csv'


tmp_train = pd.read_csv(data_path + file_train, header = None)
tmp_test  = pd.read_csv(data_path + file_test_mean, header = None)

train_nn     = tmp_train[[0, 1, 2]]+(tmp_train[[3, 4, 5]]).rename(columns = {3:0,4:1,5:2})
test_nn_mean = tmp_test[[0, 1, 2]]+(tmp_test[[3, 4, 5]]).rename(columns = {3:0,4:1,5:2})

file_train      = 'train_blend_Keras_last_2017-04-20-22-05' + '.csv'
file_test_mean  = 'test_blend_Keras_mean_last_2017-04-20-22-05' + '.csv'

tmp_train = pd.read_csv(data_path + file_train, header = None)
tmp_test  = pd.read_csv(data_path + file_test_mean, header = None)

train_nn     = (train_nn + tmp_train[[0, 1, 2]]+(tmp_train[[3, 4, 5]]).rename(columns = {3:0,4:1,5:2}))/4
test_nn_mean = (test_nn_mean + tmp_test[[0, 1, 2]]+(tmp_test[[3, 4, 5]]).rename(columns = {3:0,4:1,5:2}))/4


n_column = train_nn.shape[1]
total_col += n_column

train_nn.columns      = ['nn_' + x for x in names[:n_column]]
test_nn_mean.columns  = ['nn_' + x for x in names[:n_column]]

print train_nn.iloc[:5,:3]
print test_nn_mean.iloc[:5,:3]

   nn_low_0  nn_medium_0  nn_high_0
0  0.335217     0.604643   0.060140
1  0.772956     0.207974   0.019070
2  0.529658     0.446699   0.023643
3  0.956680     0.042453   0.000867
4  0.938238     0.060709   0.001053
   nn_low_0  nn_medium_0  nn_high_0
0  0.276664     0.582024   0.141312
1  0.995422     0.004223   0.000356
2  0.979827     0.019278   0.000895
3  0.362736     0.446379   0.190885
4  0.729782     0.259918   0.010300


In [17]:
# Keras 1st level 30fold

file_train      = 'train_blend_Keras_last_30fold_2017-04-21-12-03' + '.csv'
file_test_mean  = 'test_blend_Keras_mean_last_30fold_2017-04-21-12-03' + '.csv'

train_nn_30fold     = pd.read_csv(data_path + file_train, header = None)
test_nn_mean_30fold  = pd.read_csv(data_path + file_test_mean, header = None)

n_column = train_nn_30fold.shape[1]
total_col += n_column

train_nn_30fold.columns      = ['nn_30fold_' + x for x in names[:n_column]]
test_nn_mean_30fold.columns  = ['nn_30fold_' + x for x in names[:n_column]]

print train_nn_30fold.iloc[:5,:3]
print test_nn_mean_30fold.iloc[:5,:3]

   nn_30fold_low_0  nn_30fold_medium_0  nn_30fold_high_0
0         0.378595            0.564656          0.056749
1         0.763784            0.215613          0.020603
2         0.577165            0.398481          0.024354
3         0.949192            0.049937          0.000871
4         0.958236            0.040963          0.000801
   nn_30fold_low_0  nn_30fold_medium_0  nn_30fold_high_0
0         0.286080            0.567262          0.146658
1         0.996091            0.003613          0.000296
2         0.986770            0.012566          0.000664
3         0.370175            0.443003          0.186822
4         0.738394            0.250685          0.010921


In [18]:
# Keras one vs rest 1st level
file_train      = 'train_blend_Keras_ovr_last_2017-04-21-10-15' + '.csv'
file_test_mean  = 'test_blend_Keras_ovr_last_2017-04-21-10-15' + '.csv'

train_nn_ovr      = pd.read_csv(data_path + file_train, header = None)
test_nn_mean_ovr  = pd.read_csv(data_path + file_test_mean, header = None)

n_column = train_nn_ovr.shape[1]
total_col += n_column

train_nn_ovr.columns      = ['nn_ovr_' + x for x in names[:n_column]]
test_nn_mean_ovr.columns  = ['nn_ovr_' + x for x in names[:n_column]]

sum_train = np.sum(train_nn_ovr,axis=1)
sum_test  = np.sum(test_nn_mean_ovr,axis=1)

for col in train_nn_ovr.columns.values:
    train_nn_ovr[col] = train_nn_ovr[col] / sum_train
    test_nn_mean_ovr[col] = test_nn_mean_ovr[col] / sum_test 

print train_nn_ovr.iloc[:5,:3]
print test_nn_mean_ovr.iloc[:5,:3]

   nn_ovr_low_0  nn_ovr_medium_0  nn_ovr_high_0
0      0.414059         0.533141       0.052800
1      0.752638         0.210038       0.037324
2      0.515976         0.440243       0.043781
3      0.780622         0.219167       0.000211
4      0.965501         0.032244       0.002255
   nn_ovr_low_0  nn_ovr_medium_0  nn_ovr_high_0
0      0.227279         0.679449       0.093272
1      0.976299         0.014297       0.009403
2      0.919964         0.066669       0.013367
3      0.266264         0.389498       0.344238
4      0.874583         0.119637       0.005780


In [19]:
# Keras 1st level 3layer 20 bagging
file_train      = 'train_blend_Keras_last_3layer_20bagging_2017-04-21-20-01' + '.csv'
file_test_mean  = 'test_blend_Keras_mean_last_3layer_20bagging_2017-04-21-20-01' + '.csv'

train_nn_3layer      = pd.read_csv(data_path + file_train, header = None)
test_nn_mean_3layer  = pd.read_csv(data_path + file_test_mean, header = None)

n_column = train_nn_3layer.shape[1]
total_col += n_column

train_nn_3layer.columns      = ['nn_3layer_' + x for x in names[:n_column]]
test_nn_mean_3layer.columns  = ['nn_3layer_' + x for x in names[:n_column]]


print train_nn_3layer.iloc[:5,:3]
print test_nn_mean_3layer.iloc[:5,:3]

   nn_3layer_low_0  nn_3layer_medium_0  nn_3layer_high_0
0         0.445882            0.488624          0.065494
1         0.761473            0.222332          0.016195
2         0.542404            0.415798          0.041798
3         0.956819            0.041689          0.001492
4         0.950208            0.048009          0.001783
   nn_3layer_low_0  nn_3layer_medium_0  nn_3layer_high_0
0         0.300305            0.540791          0.158904
1         0.995423            0.004364          0.000213
2         0.984619            0.014709          0.000672
3         0.376123            0.446293          0.177584
4         0.713087            0.267086          0.019827


In [20]:
print total_col

48


In [21]:
train_2nd      = pd.concat([train_rfc, train_LR, train_ET, train_KNN, train_FM_0322,    train_MNB,     train_tsne,
                            train_tsne_0322, train_xgb,     train_xgb_30fold,     train_xgb_ovr, 
                            train_nn,     train_nn_30fold,     train_nn_ovr,     train_nn_3layer,
                            train_lgb
#                             train_gp
                           ], axis = 1)

test_2nd_mean  = pd.concat([test_rfc,  test_LR,  test_ET,  test_KNN, test_FM_mean_0322, test_MNB_mean, test_tsne, 
                            test_tsne_0322,  test_xgb_mean, test_xgb_mean_30fold, test_xgb_mean_ovr,
                            test_nn_mean, test_nn_mean_30fold, test_nn_mean_ovr, test_nn_mean_3layer,
                            test_lgb_mean
#                             test_gp
                           ], axis = 1)

print 'train_2nd: {}\t test_2nd_mean:{}'.\
            format(train_2nd.shape,test_2nd_mean.shape)

train_2nd: (49352, 48)	 test_2nd_mean:(74659, 48)


In [57]:
data_path = "../input/"

train_X_0322 = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X_0322 = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')


ntrain = train_X_0322.shape[0]
sub_id = test_X_0322.listing_id.astype('int32').values

train_X = pd.read_csv(data_path + 'train_CV_MS_52571.csv')
test_X = pd.read_csv(data_path + 'test_CV_MS_52571.csv')

train_X = train_X_0322[['listing_id']].merge(train_X,on='listing_id',how='left')
test_X = test_X_0322[['listing_id']].merge(test_X,on='listing_id',how='left')

print train_X.shape, test_X.shape, train_y.shape

(49352, 222) (74659, 222) (49352, 3)


In [58]:
time_feature = pd.read_csv(data_path + 'listing_image_time.csv')
time_feature.columns = ['listing_id','time_stamp']
train_X = train_X.merge(time_feature,on='listing_id',how='left')
test_X = test_X.merge(time_feature,on='listing_id',how='left')

print train_X.shape
print test_X.shape

(49352, 223)
(74659, 223)


In [59]:
train_X = pd.concat([train_X,train_2nd],axis=1)
test_X = pd.concat([test_X,test_2nd_mean],axis=1)

print train_X.shape
print test_X.shape

(49352, 271)
(74659, 271)


In [60]:
full_data = pd.concat([train_X,test_X])
print full_data.shape

(124011, 271)


In [61]:
full_data = full_data.fillna(0)

for col in full_data.columns.values:
    full_data.loc[:,col] = (full_data[col]-full_data[col].mean())/full_data[col].std()
train_df_nn = full_data[:ntrain]
test_df_nn = full_data[ntrain:]

train_df_nn = sparse.csr_matrix(train_df_nn)
test_df_nn = sparse.csr_matrix(test_df_nn)


print train_df_nn.shape
print test_df_nn.shape

(49352, 271)
(74659, 271)


In [62]:
full_data.isnull().values.any()

False

In [70]:
X_train, X_val, y_train, y_val = train_test_split(train_df_nn, train_y, train_size=.80, random_state=3)

In [71]:
def batch_generator(X, y, batch_size, shuffle):
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0

In [74]:
early_stop = EarlyStopping(monitor='val_loss', # custom metric
                           patience=5, #early stopping for epoch
                           verbose=0)
checkpointer = ModelCheckpoint(filepath="weights.hdf5", 
                               monitor='val_loss', 
                               verbose=0, save_best_only=True)

def create_model(input_dim):
    model = Sequential()
    init = 'glorot_uniform'
    
    
    model.add(Dense(70, # number of input units: needs to be tuned
                    input_dim = input_dim, # fixed length: number of columns of X
                    init=init,
                   ))
    model.add(Activation('sigmoid'))
    model.add(PReLU()) # activation function
    model.add(BatchNormalization()) # normalization
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
        
    model.add(Dense(20,init=init)) # number of hidden1 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
    
#     model.add(Dense(20,init=init)) # number of hidden2 units. needs to be tuned.
#     model.add(Activation('sigmoid'))
#     model.add(PReLU())
#     model.add(BatchNormalization())    
#     model.add(Dropout(0.4)) #dropout rate. needs to be tuned
    
    model.add(Dense(3,
                   init = init,
                   activation = 'softmax')) # 1 for regression 
    model.compile(loss = 'categorical_crossentropy',
#                   metrics=[mae_log],
                  optimizer = 'Adamax' # optimizer. you may want to try different ones
                 )
    return(model)



model = create_model(X_train.shape[1])
fit= model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                         nb_epoch=1000,
                         samples_per_epoch=ntrain,
                         validation_data=(X_val.todense(), y_val),
                         callbacks=[early_stop,checkpointer]
                         )

print min(fit.history['val_loss'])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
0.496989972549


In [81]:


def nn_model(params):
    model = Sequential()
    init = 'glorot_uniform'
    
    model.add(Dense(params['input_size'], # number of input units: needs to be tuned
                    input_dim = params['input_dim'], # fixed length: number of columns of X
                    init=init,
                   ))
    model.add(Activation('sigmoid'))
    model.add(PReLU()) # activation function
    model.add(BatchNormalization()) # normalization
    model.add(Dropout(params['input_drop_out'])) #dropout rate. needs to be tuned
        
    model.add(Dense(params['hidden_size'],
                    init=init)) # number of hidden1 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(params['hidden_drop_out'])) #dropout rate. needs to be tuned
    
#     model.add(Dense(20,init=init)) # number of hidden2 units. needs to be tuned.
#     model.add(Activation('sigmoid'))
#     model.add(PReLU())
#     model.add(BatchNormalization())    
#     model.add(Dropout(0.5)) #dropout rate. needs to be tuned
    
    model.add(Dense(3,
                    init = init,
                    activation = 'softmax')) # 1 for regression 
    model.compile(loss = 'categorical_crossentropy',
                  optimizer = 'Adamax' # optimizer. you may want to try different ones
                 )
    return(model)



def nn_blend_data(parameters, train_x, train_y, test_x, fold, early_stopping_rounds=10, batch_size=128,randomseed = 1234):
    
    
    early_stop = EarlyStopping(monitor='val_loss', # custom metric
                           patience=early_stopping_rounds, #early stopping for epoch
                           verbose=0)
    checkpointer = ModelCheckpoint(filepath="weights.hdf5", 
                               monitor='val_loss', 
                               verbose=0, save_best_only=True)


    N_params = len(parameters)
#     print ("Blend %d estimators for %d folds" % (len(parameters), fold))
    skf = KFold(n_splits=fold, shuffle=True, random_state=randomseed)
    N_class = train_y.shape[1]
    
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    fold_start = time.time() 

    
    for j, nn_params in enumerate(parameters):
#         print ("Model %d: %s" %(j+1, nn_params))
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
        
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
#             print ("Model %d fold %d" %(j+1,i+1))
            train_x_fold = train_x[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x[val_index]
            val_y_fold = train_y[val_index]
            

            model = nn_model(nn_params)
#             print (model)
            fit= model.fit_generator(generator=batch_generator(train_x_fold, train_y_fold, batch_size, True),
                                     nb_epoch=70,
                                     samples_per_epoch=train_x_fold.shape[0],
                                     validation_data=(val_x_fold.todense(), val_y_fold),
                                     verbose = 0,
                                     callbacks=[early_stop, checkpointer]
                                    )

            best_round=len(fit.epoch)-early_stopping_rounds-1
            best_rounds[i,j]=best_round
#             print ("best round %d" % (best_round))
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'categorical_crossentropy',optimizer = 'Adamax' )
            
            # print (mean_absolute_error(np.exp(y_val)-200, pred_y))
            val_y_predict_fold = model.predict_proba(x=val_x_fold.toarray(),verbose=0)
            score = log_loss(val_y_fold, val_y_predict_fold)
#             print ("Score: ", score)
            scores[i,j]=score   
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'categorical_crossentropy',optimizer = 'Adamax' )            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = model.predict_proba(x=test_x.toarray(),verbose=0)
#             print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            
            
        test_blend_x[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
            
#         print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print "Score for blended models is %f in %0.3fm" % (np.mean(scores), (time.time() - fold_start)/60)
    return (train_blend_x, test_blend_x, scores,best_rounds)

In [82]:
train_total = np.zeros((train_df_nn.shape[0],3))
test_total = np.zeros((test_df_nn.shape[0],3))
score_total = 0
count = 100
print 'Starting.........'
for n in range(count):
#     print n
    nn_parameters = [
        { 'input_size' :70 ,
         'input_dim' : train_X.shape[1],
         'input_drop_out' : 0.4 ,
         'hidden_size' : 20 ,
         'hidden_drop_out' :0.4},

    ]

    (train_blend_x, test_blend_x, blend_scores,best_round) = nn_blend_data(nn_parameters, train_df_nn, train_y, test_df_nn,
                                                             5,
                                                             10,128,n+500)
    train_total += train_blend_x
    test_total += test_blend_x
    score_total += np.mean(blend_scores)
    
    name_train_blend = '../tmp/train_2nd.csv'
    name_test_blend = '../tmp/test_2nd.csv'

    np.savetxt(name_train_blend,train_total, delimiter=",")
    np.savetxt(name_test_blend,test_total, delimiter=",")
    
train_total = train_total / count
test_total = test_total / count
score_total = score_total / count

Starting.........
Score for blended models is 0.489167 in 10.825m
Score for blended models is 0.490623 in 10.973m
Score for blended models is 0.488229 in 11.748m
Score for blended models is 0.490783 in 10.206m
Score for blended models is 0.493555 in 10.886m
Score for blended models is 0.493179 in 10.781m
Score for blended models is 0.480429 in 12.540m
Score for blended models is 0.490841 in 10.006m
Score for blended models is 0.485040 in 11.392m
Score for blended models is 0.488277 in 11.149m
Score for blended models is 0.487818 in 12.856m
Score for blended models is 0.480561 in 12.095m
Score for blended models is 0.497479 in 11.343m
Score for blended models is 0.492343 in 10.683m
Score for blended models is 0.489575 in 12.049m
Score for blended models is 0.484178 in 12.336m
Score for blended models is 0.496555 in 11.198m
Score for blended models is 0.486307 in 11.967m
Score for blended models is 0.492037 in 11.356m
Score for blended models is 0.489696 in 11.544m
Score for blended mode

In [83]:
print 'Done!'

Done!


In [87]:
test_total

array([[  1.62490442e-01,   5.84023116e-01,   2.53486442e-01],
       [  9.95380871e-01,   4.23049504e-03,   3.88634972e-04],
       [  9.72845538e-01,   2.58905193e-02,   1.26394330e-03],
       ..., 
       [  9.81898498e-01,   1.67049272e-02,   1.39657408e-03],
       [  9.58855166e-01,   3.91550019e-02,   1.98983122e-03],
       [  5.60936439e-01,   3.97504595e-01,   4.15589648e-02]])

In [86]:
now = datetime.now()
sub_name = '../output/sub_2ndKeras_last_100bagging_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_total)
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)


In [85]:

# now = datetime.now()

name_train_blend = '../output/train_blend_2ndKeras_100bagging_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend = '../output/test_blend_2ndKeras_100bagging_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'



print (np.mean(blend_scores,axis=0))
print (np.mean(best_round,axis=0))
np.savetxt(name_train_blend,train_total, delimiter=",")
np.savetxt(name_test_blend,test_total, delimiter=",")

[ 0.49272779]
[ 38.4]
