In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import random
from sklearn import preprocessing

import gc
from scipy.stats import skew, boxcox

from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2017

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,ParametricSoftplus,ThresholdedReLU,SReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.optimizers import SGD,Nadam
from keras.regularizers import WeightRegularizer, ActivityRegularizer,l2, activity_l2
from keras.utils.np_utils import to_categorical

Using Theano backend.


# Load Data

In [3]:
data_path = "../input/"

train_df=pd.read_json('../input/train.json').reset_index(drop = True)
target_num_map = {'high':2, 'medium':1, 'low':0}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
train_y = to_categorical(train_y)

train_X = pd.read_csv(data_path + 'train_CV_MS_52571.csv')
test_X = pd.read_csv(data_path + 'test_CV_MS_52571.csv')



train_X_0322 = pd.read_csv(data_path + 'train_BM_MB_add03052240.csv')
test_X_0322 = pd.read_csv(data_path + 'test_BM_MB_add03052240.csv')


ntrain = train_X.shape[0]
sub_id = test_X_0322.listing_id.astype('int32').values
# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 222) (74659, 222) (49352, 3)


In [4]:
time_feature = pd.read_csv(data_path + 'listing_image_time.csv')
time_feature.columns = ['listing_id','time_stamp']
train_X = train_X.merge(time_feature,on='listing_id',how='left')
test_X = test_X.merge(time_feature,on='listing_id',how='left')

print train_X.shape
print test_X.shape

(49352, 223)
(74659, 223)


In [5]:
full_data = pd.concat([train_X,test_X])
print full_data.shape

(124011, 223)


In [6]:
full_data.columns.values

array(['latitude', 'longitude', 'num_pricePerBed', 'num_bedBathSum',
       'num_pricePerBath', 'num_pricePerRoom', 'num_bedPerBath',
       'num_bedBathDiff', 'num_bedsPerc', 'num_photo_count',
       'num_features', 'num_desc_wordcount', 'num_desc_length_null',
       'listing_id', 'num_dist_from_center', 'num_OutlierAggregated',
       'num_pos_density', 'num_building_null', 'num_created_weekday',
       'num_created_weekofyear', 'num_created_day', 'num_created_month',
       'num_created_hour', 'num_bathrooms', 'num_bedrooms', 'num_price',
       'num_price_q', 'num_priceXroom', 'num_even_bathrooms',
       'display_address', 'manager_id', 'building_id', 'street_address',
       'position', 'num_location_6_3', 'num_location_6_1',
       'num_location_6_0', 'num_location_6_5', 'num_location_6_4',
       'num_location_6_2', 'num_room_type_0', 'num_room_type_1',
       'num_room_type_2', 'num_room_type_3', 'num_room_type_4',
       'num_room_type_5', 'num_room_type_6', 'num_room_type_

In [7]:
feat_to_use = ['latitude', 'longitude', 'num_pricePerBed', 'num_bedBathSum',
       'num_pricePerBath', 'num_pricePerRoom', 'num_bedPerBath',
       'num_bedBathDiff', 'num_bedsPerc', 'num_photo_count',
       'num_features', 'num_desc_wordcount', 
#                'num_desc_length_null',
       'listing_id', 'num_dist_from_center', 'num_OutlierAggregated',
       'num_pos_density', 
#                'num_building_null', 
               'num_created_weekday',
       'num_created_weekofyear', 'num_created_day', 'num_created_month',
       'num_created_hour', 'num_bathrooms', 'num_bedrooms', 'num_price',
       'num_price_q', 'num_priceXroom', 
#                'num_even_bathrooms',
       'display_address', 'manager_id', 'building_id', 'street_address',
       'position', 
#                'num_location_6_3', 'num_location_6_1',
#        'num_location_6_0', 'num_location_6_5', 'num_location_6_4',
#        'num_location_6_2', 'num_room_type_0', 'num_room_type_1',
#        'num_room_type_2', 'num_room_type_3', 'num_room_type_4',
#        'num_room_type_5', 'num_room_type_6', 'num_room_type_7',
#        'num_room_type_8', 'num_room_type_9', 'num_room_type_10',
#        'num_room_type_11', 'num_room_type_12', 'num_room_type_13',
#        'num_room_type_14', 'num_room_type_15', 'num_room_type_16',
#        'num_room_type_17', 'num_room_type_18', 'num_room_type_19',
       'num_6_median_price', 'num_6_price_ratio', 'num_6_price_diff',
       'num_6_median_price_bedroom', 'num_6_price_ratio_bedroom',
       'num_6_price_diff_bedroom', 
#                'feature_washer', 'feature_laundry',
#        'feature_prewar', 'feature_furnished', 'feature_parking',
#        'feature_utilities', 'feature_elevator', 'feature_marble',
#        'feature_concierge', 'feature_cats', 'feature_health',
#        'feature_pool', 'feature_onemounthfree', 'feature_parquet',
#        'feature_lowfee', 'feature_luxury', 'feature_nofee',
#        'feature_fireplace', 'feature_dogs', 'feature_transport',
#        'feature_loft', 
               'median_price_bed', 'ratio_bed', 'compound', 'neg',
       'neu', 'pos', 
#                'street', 'avenue', 'east', 'west', 'north', 'south',
#        'other_address', 'Zero_building_id', 'top_10_building',
#        'top_25_building', 'top_5_building', 'top_50_building',
#        'top_1_building', 'top_2_building', 'top_15_building',
#        'top_20_building', 'top_30_building', 
               'manager_level_low',
       'manager_level_medium', 'manager_level_high',
       'manager_id_price_low_median', 'manager_id_price_medium_median',
       'manager_id_price_high_median', 'manager_id_price_low_mean',
       'manager_id_price_medium_mean', 'manager_id_price_high_mean',
       'manager_id_price_low_max', 'manager_id_price_medium_max',
       'manager_id_price_high_max', 'manager_id_price_low_min',
       'manager_id_price_medium_min', 'manager_id_price_high_min',
       'manager_id_num_created_hour_low_median',
       'manager_id_num_created_hour_medium_median',
       'manager_id_num_created_hour_high_median',
       'manager_id_num_created_hour_low_mean',
       'manager_id_num_created_hour_medium_mean',
       'manager_id_num_created_hour_high_mean',
       'manager_id_num_created_hour_low_max',
       'manager_id_num_created_hour_medium_max',
       'manager_id_num_created_hour_high_max',
       'manager_id_num_created_hour_low_min',
       'manager_id_num_created_hour_medium_min',
       'manager_id_num_created_hour_high_min',
       'manager_id_num_6_price_diff_bedroom_low_median',
       'manager_id_num_6_price_diff_bedroom_medium_median',
       'manager_id_num_6_price_diff_bedroom_high_median',
       'manager_id_num_6_price_diff_bedroom_low_mean',
       'manager_id_num_6_price_diff_bedroom_medium_mean',
       'manager_id_num_6_price_diff_bedroom_high_mean',
       'manager_id_num_6_price_diff_bedroom_low_max',
       'manager_id_num_6_price_diff_bedroom_medium_max',
       'manager_id_num_6_price_diff_bedroom_high_max',
       'manager_id_num_6_price_diff_bedroom_low_min',
       'manager_id_num_6_price_diff_bedroom_medium_min',
       'manager_id_num_6_price_diff_bedroom_high_min',
       'manager_id_bedrooms_low_median',
       'manager_id_bedrooms_medium_median',
       'manager_id_bedrooms_high_median', 'manager_id_bedrooms_low_mean',
       'manager_id_bedrooms_medium_mean', 'manager_id_bedrooms_high_mean',
       'manager_id_bedrooms_low_max', 'manager_id_bedrooms_medium_max',
       'manager_id_bedrooms_high_max', 'manager_id_bedrooms_low_min',
       'manager_id_bedrooms_medium_min', 'manager_id_bedrooms_high_min',
       'manager_id_num_photo_count_low_median',
       'manager_id_num_photo_count_medium_median',
       'manager_id_num_photo_count_high_median',
       'manager_id_num_photo_count_low_mean',
       'manager_id_num_photo_count_medium_mean',
       'manager_id_num_photo_count_high_mean',
       'manager_id_num_photo_count_low_max',
       'manager_id_num_photo_count_medium_max',
       'manager_id_num_photo_count_high_max',
       'manager_id_num_photo_count_low_min',
       'manager_id_num_photo_count_medium_min',
       'manager_id_num_photo_count_high_min',
       'manager_id_Zero_building_id_low_median',
       'manager_id_Zero_building_id_medium_median',
       'manager_id_Zero_building_id_high_median',
       'manager_id_Zero_building_id_low_mean',
       'manager_id_Zero_building_id_medium_mean',
       'manager_id_Zero_building_id_high_mean',
       'manager_id_Zero_building_id_low_max',
       'manager_id_Zero_building_id_medium_max',
       'manager_id_Zero_building_id_high_max',
       'manager_id_Zero_building_id_low_min',
       'manager_id_Zero_building_id_medium_min',
       'manager_id_Zero_building_id_high_min',
       'manager_id_feature_nofee_low_median',
       'manager_id_feature_nofee_medium_median',
       'manager_id_feature_nofee_high_median',
       'manager_id_feature_nofee_low_mean',
       'manager_id_feature_nofee_medium_mean',
       'manager_id_feature_nofee_high_mean',
       'manager_id_feature_nofee_low_max',
       'manager_id_feature_nofee_medium_max',
       'manager_id_feature_nofee_high_max',
       'manager_id_feature_nofee_low_min',
       'manager_id_feature_nofee_medium_min',
       'manager_id_feature_nofee_high_min',
       'manager_id_longitude_low_median',
       'manager_id_longitude_medium_median',
       'manager_id_longitude_high_median', 'manager_id_longitude_low_mean',
       'manager_id_longitude_medium_mean',
       'manager_id_longitude_high_mean', 'manager_id_longitude_low_max',
       'manager_id_longitude_medium_max', 'manager_id_longitude_high_max',
       'manager_id_longitude_low_min', 'manager_id_longitude_medium_min',
       'manager_id_longitude_high_min', 'manager_id_latitude_low_median',
       'manager_id_latitude_medium_median',
       'manager_id_latitude_high_median', 'manager_id_latitude_low_mean',
       'manager_id_latitude_medium_mean', 'manager_id_latitude_high_mean',
       'manager_id_latitude_low_max', 'manager_id_latitude_medium_max',
       'manager_id_latitude_high_max', 'manager_id_latitude_low_min',
       'manager_id_latitude_medium_min', 'manager_id_latitude_high_min',
       'num_nan','time_stamp']

In [8]:
full_data = full_data.fillna(0)

for col in feat_to_use:
    full_data.loc[:,col] = (full_data[col]-full_data[col].mean())/full_data[col].std()
train_df_nn = full_data[:ntrain]
test_df_nn = full_data[ntrain:]

train_df_nn = sparse.csr_matrix(train_df_nn)
test_df_nn = sparse.csr_matrix(test_df_nn)


print train_df_nn.shape
print test_df_nn.shape

(49352, 223)
(74659, 223)


In [9]:
full_data.isnull().values.any()

False

In [10]:
def batch_generator(X, y, batch_size, shuffle):
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0

In [11]:
X_train, X_val, y_train, y_val = train_test_split(train_df_nn, train_y, train_size=.80, random_state=1234)

In [12]:
early_stop = EarlyStopping(monitor='val_loss', # custom metric
                           patience=5, #early stopping for epoch
                           verbose=0)
checkpointer = ModelCheckpoint(filepath="weights.hdf5", 
                               monitor='val_loss', 
                               verbose=0, save_best_only=True)

def create_model(input_dim):
    model = Sequential()
    init = 'glorot_uniform'
    
    
    model.add(Dense(100, # number of input units: needs to be tuned
                    input_dim = input_dim, # fixed length: number of columns of X
                    init=init,
                   ))
    model.add(Activation('sigmoid'))
    model.add(PReLU()) # activation function
    model.add(BatchNormalization()) # normalization
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
        
    model.add(Dense(40,init=init)) # number of hidden1 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
    
    model.add(Dense(15,init=init)) # number of hidden2 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
    
    model.add(Dense(3,
                   init = init,
                   activation = 'softmax')) # 1 for regression 
    model.compile(loss = 'categorical_crossentropy',
#                   metrics=[mae_log],
                  optimizer = 'Adamax' # optimizer. you may want to try different ones
                 )
    return(model)



model = create_model(X_train.shape[1])
fit= model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                         nb_epoch=1000,
                         samples_per_epoch=ntrain,
                         validation_data=(X_val.todense(), y_val),
                         callbacks=[early_stop,checkpointer]
                         )

print min(fit.history['val_loss'])

Epoch 1/1000



Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000


In [None]:
# 150 0.4 50 0.4 15 0.4 'glorot_uniform' 'adam' 0.543332518312
# 150 0.5 50 0.5 15 0.5 'glorot_uniform' 'adam' 0.544221234441
# 100 0.5 25 0.5 9 0.5 'glorot_uniform' 'adam' 0.54719870663
# 100 0.4 30 0.4 9 0.4 'glorot_uniform' 'adam' 0.544765821466

In [None]:
# 100 0.5 50 0.5 'glorot_uniform' 'adam' 0.54511465921
# 100 0.5 25 0.5 'glorot_uniform' 'adam' 0.543118116239
# 100 0.4 30 0.4 'glorot_uniform' 'adam' 0.541240284411
# 100 0.4 30 0.4 'glorot_uniform' 'Adamax' 0.542234674134

In [None]:
# 100 0.5 'glorot_uniform' 'adam' 0.543593994089
# 100 0.4 'glorot_uniform' 'adam' 0.542798319778
# 100 0.3 'glorot_uniform' 'adam' 0.54412619796
# 100 0.2 'glorot_uniform' 'adam' 0.547258452084

In [None]:
# 70 0.4 'glorot_uniform' 'adam' 0.544325359686
# 100 0.4 'glorot_uniform' 'adam' 0.542798319778
# 150 0.4 'glorot_uniform' 'adam' 0.544117797527
# 200 0.4 'glorot_uniform' 'adam' 0.545132525568
# 300 0.4 'glorot_uniform' 'adam' 0.545077440339

In [None]:
# 100 0.4 'glorot_uniform' 'adam' 0.542798319778
# 100 0.4 'glorot_uniform' 'RMSprop' 0.547544663425
# 100 0.4 'glorot_uniform' 'Adamax' 0.542127071416
# 100 0.4 'glorot_uniform' 'Nadam' 0.544927256042

In [None]:
# 100 0.4 'he_uniform' 'adam' 0.54419020321
# 100 0.4 'he_normal' 'adam' 0.543867479612
# 100 0.4 'glorot_uniform' 'adam' 0.542798319778
# 100 0.4 'glorot_normal' 'adam' 0.546524272962
# 100 0.4 'lecun_uniform' 'adam' 0.544478366113



In [None]:
# 150 0.4 50 0.5 20 0.5 'he_normal'  val 0.543127303723
# 150 0.4 50 0.5 20 0.5 'he_uniform'  val 0.542279646729


In [51]:
test_df_nn

<74659x412 sparse matrix of type '<type 'numpy.float64'>'
	with 6084908 stored elements in Compressed Sparse Row format>

In [46]:
model.load_weights("weights.hdf5")

model.compile(loss = 'categorical_crossentropy',optimizer = 'adam' )

In [55]:
pred_y = model.predict_proba(x=test_df_nn.toarray(),batch_size = 128,verbose=0)

In [56]:
pred_y

array([[  3.98445845e-01,   4.90590602e-01,   1.10963546e-01],
       [  9.91283834e-01,   8.24213494e-03,   4.74051310e-04],
       [  9.88962233e-01,   1.05425483e-02,   4.95240442e-04],
       ..., 
       [  9.92257774e-01,   7.49900285e-03,   2.43204151e-04],
       [  9.84745502e-01,   1.47970403e-02,   4.57483169e-04],
       [  6.17200851e-01,   3.49761784e-01,   3.30373496e-02]], dtype=float32)

In [57]:
now = datetime.now()
sub_name = '../output/sub_Keras_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(pred_y)
out_df.columns = ["low", "medium","high"]
out_df["listing_id"] = sub_id
out_df.to_csv(sub_name, index=False)

In [19]:


def nn_model(params):
    model = Sequential()
    init = 'glorot_uniform'
    
    model.add(Dense(params['input_size'], # number of input units: needs to be tuned
                    input_dim = params['input_dim'], # fixed length: number of columns of X
                    init=init,
                   ))
    model.add(Activation('sigmoid'))
    model.add(PReLU()) # activation function
    model.add(BatchNormalization()) # normalization
    model.add(Dropout(params['input_drop_out'])) #dropout rate. needs to be tuned
        
    model.add(Dense(params['hidden_size'],
                    init=init)) # number of hidden1 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(params['hidden_drop_out'])) #dropout rate. needs to be tuned
    
    model.add(Dense(params['hidden_size1'],init=init)) # number of hidden2 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(params['hidden_drop_out1'])) #dropout rate. needs to be tuned
    
    model.add(Dense(3,
                    init = init,
                    activation = 'softmax')) # 1 for regression 
    model.compile(loss = 'categorical_crossentropy',
                  optimizer = 'Adamax' # optimizer. you may want to try different ones
                 )
    return(model)



def nn_blend_data(parameters, train_x, train_y, test_x, fold, early_stopping_rounds=0, batch_size=128,randseed=0):
    N_params = len(parameters)
#     print ("Blend %d estimators for %d folds" % (len(parameters), fold))
    skf = KFold(n_splits=fold,shuffle=True,random_state=randseed)
    N_class = train_y.shape[1]
    
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    fold_start = time.time() 

    
    for j, nn_params in enumerate(parameters):
#         print ("Model %d: %s" %(j+1, nn_params))
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
        
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
#             print ("Model %d fold %d" %(j+1,i+1))
            
            train_x_fold = train_x[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x[val_index]
            val_y_fold = train_y[val_index]
            

            model = nn_model(nn_params)
#             print (model)
            fit= model.fit_generator(generator=batch_generator(train_x_fold, train_y_fold, 128, True),
                                     nb_epoch=70,
                                     samples_per_epoch=train_x_fold.shape[0],
                                     validation_data=(val_x_fold.todense(), val_y_fold),
                                     verbose = 0,
                                     callbacks=[ModelCheckpoint(filepath="weights.hdf5", 
                                                                monitor='val_loss', 
                                                                verbose=0, save_best_only=True)]
                                    )

            best_round=len(fit.epoch)-early_stopping_rounds-1
            best_rounds[i,j]=best_round
#             print ("best round %d" % (best_round))
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'categorical_crossentropy',optimizer = 'Adamax' )
            
            # print (mean_absolute_error(np.exp(y_val)-200, pred_y))
            val_y_predict_fold = model.predict_proba(x=val_x_fold.toarray(),verbose=0)
            score = log_loss(val_y_fold, val_y_predict_fold)
#             print ("Score: ", score)
            scores[i,j]=score   
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'categorical_crossentropy',optimizer = 'Adamax' )            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = model.predict_proba(x=test_x.toarray(),verbose=0)
#             print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, )            
            
        test_blend_x[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
            
#         print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print "Score for blended models is %f in %0.3fm" % (np.mean(scores), (time.time() - fold_start)/60)
    return (train_blend_x, test_blend_x, scores,best_rounds)

In [20]:
train_total = np.zeros((train_df_nn.shape[0],3))
test_total = np.zeros((test_df_nn.shape[0],3))
name_train_blend = '../tmp/train_kares.csv'
name_test_blend = '../tmp/test_kares.csv'
score_total = 0
count = 20

print 'Starting............'
for n in range(count):
    nn_parameters = [
        { 'input_size' :100 ,
         'input_dim' : train_X.shape[1],
         'input_drop_out' : 0.4 ,
         'hidden_size' : 40 ,
         'hidden_drop_out' :0.4,
        'hidden_size1':15,
        'hidden_drop_out1':0.4},

    ]

    (train_blend_x, test_blend_x, blend_scores,best_round) = nn_blend_data(nn_parameters, train_df_nn, train_y, test_df_nn,
                                                             5,
                                                             10,128,n)
    train_total += train_blend_x
    test_total += test_blend_x
    score_total += np.mean(blend_scores)
    


    np.savetxt(name_train_blend,train_total, delimiter=",")
    np.savetxt(name_test_blend,test_total, delimiter=",")
    
train_total = train_total / count
test_total = test_total / count
score_total = score_total / count

Starting............
Score for blended models is 0.533909 in 22.402m
Score for blended models is 0.532641 in 22.312m
Score for blended models is 0.532599 in 22.288m
Score for blended models is 0.532401 in 22.329m
Score for blended models is 0.533066 in 22.199m
Score for blended models is 0.533293 in 22.153m
Score for blended models is 0.532767 in 22.277m
Score for blended models is 0.532134 in 22.349m
Score for blended models is 0.532149 in 22.332m
Score for blended models is 0.532735 in 22.367m
Score for blended models is 0.532439 in 22.257m
Score for blended models is 0.533261 in 22.330m
Score for blended models is 0.532618 in 22.324m
Score for blended models is 0.533259 in 22.289m
Score for blended models is 0.532986 in 22.378m
Score for blended models is 0.533128 in 22.275m
Score for blended models is 0.532502 in 22.281m
Score for blended models is 0.532666 in 22.733m
Score for blended models is 0.533299 in 22.321m
Score for blended models is 0.532571 in 22.116m


In [21]:
train_blend_x = pd.DataFrame(train_total)
train_blend_x.columns = ["low", "medium", "hig"]
train_blend_x["listing_id"] = train_X.listing_id.values

test_blend_x = pd.DataFrame(test_total)
test_blend_x.columns = ["low", "medium", "hig"]
test_blend_x["listing_id"] = test_X.listing_id.values

In [22]:
tmp_train = train_X_0322[['listing_id']].merge(train_blend_x,on = 'listing_id', how = 'left')[["low", "medium", "hig"]].values
tmp_test_mean = test_X_0322[['listing_id']].merge(test_blend_x,on = 'listing_id', how = 'left')[["low", "medium", "hig"]].values

In [23]:
now = datetime.now()

name_train_blend = '../output/train_blend_Keras_last_3layer_20bagging_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_Keras_mean_last_3layer_20bagging_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'



print (np.mean(blend_scores,axis=0))
print (np.mean(best_round,axis=0))
np.savetxt(name_train_blend,tmp_train, delimiter=",")
np.savetxt(name_test_blend_mean,tmp_test_mean, delimiter=",")

[ 0.53257061]
[ 59.]


In [24]:
# now = datetime.now()
sub_name = '../output/sub_Keras_mean_last_3layer_20bagging_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(tmp_test_mean)
out_df.columns = ["low", "medium", "high"]
out_df["listing_id"] = test_X_0322.listing_id.values
out_df.to_csv(sub_name, index=False)