In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import random
from sklearn import preprocessing

import gc
from scipy.stats import skew, boxcox

from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2017

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,ParametricSoftplus,ThresholdedReLU,SReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.optimizers import SGD,Nadam
from keras.regularizers import WeightRegularizer, ActivityRegularizer,l2, activity_l2
from keras.utils.np_utils import to_categorical

Using Theano backend.


# Load Data

In [3]:
data_path = "../input/"
train_X = pd.read_csv(data_path + 'train_BM_0331.csv')
test_X = pd.read_csv(data_path + 'test_BM_0331.csv')

ntrain = train_X.shape[0]
sub_id = test_X.listing_id.astype('int32').values
# all_features = features_to_use + desc_sparse_cols + feat_sparse_cols
print train_X.shape, test_X.shape, train_y.shape

(49352, 412) (74659, 412) (49352, 3)


In [4]:
null_ind = test_X.num_loc_price_diff.isnull()
test_X['num_loc_price_diff'] = test_X['num_price'] - test_X['num_loc_median_price']
# test_X[null_ind][['num_loc_price_diff','num_price','num_loc_median_price']]

In [5]:
full_data = pd.concat([train_X,test_X])
print full_data.shape

(124011, 412)


In [6]:
feat_to_use = ['building_id_mean_med', 'building_id_mean_high',
       'manager_id_mean_med', 'manager_id_mean_high', 'median_price_bed',
       'ratio_bed', 'compound', 'neg', 'neu', 'pos', 
#                'street', 'avenue',
#        'east', 'west', 'north', 'south', 'other_address', 'top_10_manager',
#        'top_25_manager', 'top_5_manager', 'top_50_manager',
#        'top_1_manager', 'top_2_manager', 'top_15_manager',
#        'top_20_manager', 'top_30_manager', 'Zero_building_id',
#        'top_10_building', 'top_25_building', 'top_5_building',
#        'top_50_building', 'top_1_building', 'top_2_building',
#        'top_15_building', 'top_20_building', 'top_30_building',
       'listing_id', 'num_latitude', 'num_longitude',
       'num_dist_from_center', 'num_OutlierAggregated', 'num_pos_density',
       'num_building_null', 'num_fbuilding', 'num_fmanager',
       'num_created_weekday', 'num_created_weekofyear', 'num_created_day',
       'num_created_month', 'num_created_hour', 'num_bathrooms',
       'num_bedrooms', 'num_price', 'num_price_q', 'num_priceXroom',
       'num_even_bathrooms', 'num_features', 'num_photos',
       'num_desc_length', 'num_desc_length_null',
#                'num_location_6_3',
#        'num_location_6_1', 'num_location_6_0', 'num_location_6_5',
#        'num_location_6_4', 'num_location_6_2', 'num_location_40_18',
#        'num_location_40_31', 'num_location_40_11', 'num_location_40_24',
#        'num_location_40_14', 'num_location_40_36', 'num_location_40_3',
#        'num_location_40_7', 'num_location_40_33', 'num_location_40_5',
#        'num_location_40_37', 'num_location_40_12', 'num_location_40_16',
#        'num_location_40_2', 'num_location_40_20', 'num_location_40_34',
#        'num_location_40_9', 'num_location_40_0', 'num_location_40_21',
#        'num_location_40_26', 'num_location_40_13', 'num_location_40_25',
#        'num_location_40_32', 'num_location_40_19', 'num_location_40_17',
#        'num_location_40_4', 'num_location_40_15', 'num_location_40_35',
#        'num_location_40_22', 'num_location_40_30', 'num_location_40_1',
#        'num_location_40_23', 'num_location_40_10', 'num_location_40_38',
#        'num_location_40_28', 'num_location_40_6', 'num_location_40_29',
#        'num_location_40_27', 'num_location_40_39', 'num_location_40_8',
#        'num_room_type_0', 'num_room_type_1', 'num_room_type_2',
#        'num_room_type_3', 'num_room_type_4', 'num_room_type_5',
#        'num_room_type_6', 'num_room_type_7', 'num_room_type_8',
#        'num_room_type_9', 'num_room_type_10', 'num_room_type_11',
#        'num_room_type_12', 'num_room_type_13', 'num_room_type_14',
#        'num_room_type_15', 'num_room_type_16', 'num_room_type_17',
#        'num_room_type_18', 'num_room_type_19', 
               'num_6_median_price',
       'num_6_price_ratio', 'num_6_price_diff', 'num_loc_median_price',
       'num_loc_price_ratio', 'num_loc_price_diff', 'num_loc_ratio',
       'num_loc_diff', 'hcc_pos_pred_1', 'hcc_pos_pred_2', 'building_id',
       'display_address', 'manager_id', 'street_address',
       'num_pricePerBed', 'num_pricePerBath', 'num_pricePerRoom',
       'num_bedPerBath', 'num_bedBathDiff', 'num_bedBathSum',
       'num_bedsPerc', 
#                'hcc_building_id_pred_1', 'hcc_building_id_pred_2',
#        'hcc_manager_id_pred_1', 'hcc_manager_id_pred_2',
               
#        'feature_1_month_free', 'feature_24/7_concierge',
#        'feature_24/7_doorman', 'feature_24/7_doorman_concierge',
#        'feature_actual_apt._photos', 'feature_air_conditioning',
#        'feature_all_pets_ok', 'feature_all_utilities_included',
#        'feature_assigned-parking-space', 'feature_attended_lobby',
#        'feature_backyard', 'feature_balcony', 'feature_basement_storage',
#        'feature_basketball_court', 'feature_bike_room',
#        'feature_bike_storage', 'feature_billiards_room',
#        'feature_billiards_table_and_wet_bar', 'feature_brand_new',
#        'feature_breakfast_bar', 'feature_bright', 'feature_brownstone',
#        'feature_building-common-outdoor-space', 'feature_business_center',
#        'feature_cable/satellite_tv', 'feature_cable_ready',
#        'feature_call/text_abraham_caro_@_917-373-0862',
#        'feature_cats_allowed', 'feature_central_a/c', 'feature_central_ac',
#        'feature_central_air', 'feature_chefs_kitchen',
#        "feature_children's_playroom", 'feature_childrens_playroom',
#        'feature_cinema_room', 'feature_city_view',
#        'feature_close_to_subway', 'feature_closets_galore!',
#        'feature_club_sun_deck_has_spectacular_city_and_river_views',
#        'feature_cold_storage', 'feature_common_backyard',
#        'feature_common_garden', 'feature_common_outdoor_space',
#        'feature_common_parking/garage', 'feature_common_roof_deck',
#        'feature_common_storage', 'feature_common_terrace',
#        'feature_community_recreation_facilities',
#        'feature_complimentary_sunday_brunch', 'feature_concierge',
#        'feature_concierge_service', 'feature_condo_finishes',
#        'feature_courtyard', 'feature_crown_moldings', 'feature_deck',
#        'feature_deco_brick_wall', 'feature_decorative_fireplace',
#        'feature_dining_room', 'feature_dishwasher', 'feature_dogs_allowed',
#        'feature_doorman', 'feature_dry_cleaning_service',
#        'feature_dryer_in_unit', 'feature_duplex', 'feature_duplex_lounge',
#        'feature_eat-in_kitchen', 'feature_eat_in_kitchen',
#        'feature_elegant_glass-enclosed_private_lounge_with_magnificent_river_views',
#        'feature_elevator', 'feature_exclusive',
#        'feature_exercise/yoga_studio', 'feature_exposed_brick',
#        'feature_extra_room', 'feature_fireplace', 'feature_fireplaces',
#        'feature_fitness_center', 'feature_fitness_room', 'feature_flex-2',
#        'feature_flex-3', 'feature_free_wifi_in_club_lounge',
#        'feature_ft_doorman', 'feature_full-time_doorman',
#        'feature_full_service_garage',
#        'feature_fully-equipped_club_fitness_center',
#        'feature_fully__equipped', 'feature_furnished', 'feature_game_room',
#        'feature_garage', 'feature_garbage_disposal', 'feature_garden',
#        'feature_garden/patio', 'feature_granite_countertops',
#        'feature_granite_kitchen', 'feature_green_building',
#        'feature_guarantors_accepted', 'feature_gut_renovated',
#        'feature_gym', 'feature_gym/fitness', 'feature_gym_in_building',
#        'feature_hardwood', 'feature_hardwood_floors',
#        'feature_health_club', 'feature_hi_rise',
#        'feature_high-speed_internet', 'feature_high_ceiling',
#        'feature_high_ceilings', 'feature_high_speed_internet',
#        'feature_highrise', 'feature_housekeeping_service',
#        'feature_in-unit_washer/dryer', 'feature_indoor_pool',
#        'feature_intercom', 'feature_jacuzzi', 'feature_large_living_room',
#        'feature_laundry', 'feature_laundry_&_housekeeping',
#        'feature_laundry_in_building', 'feature_laundry_in_unit',
#        'feature_laundry_on_every_floor', 'feature_laundry_on_floor',
#        'feature_laundry_room', 'feature_light', 'feature_live-in_super',
#        'feature_live-in_superintendent', 'feature_live/work',
#        'feature_live_in_super', 'feature_loft', 'feature_lounge',
#        'feature_lounge_room', 'feature_lowrise', 'feature_luxury_building',
#        'feature_magnificent_venetian-style', 'feature_mail_room',
#        'feature_marble_bath', 'feature_marble_bathroom',
#        'feature_media_room', 'feature_media_screening_room',
#        'feature_microwave', 'feature_midrise', 'feature_multi-level',
#        'feature_new_construction', 'feature_newly_renovated',
#        'feature_no_fee', 'feature_no_pets', 'feature_on-site_atm_machine',
#        'feature_on-site_attended_garage', 'feature_on-site_garage',
#        'feature_on-site_laundry', 'feature_on-site_parking',
#        'feature_on-site_parking_available', 'feature_on-site_parking_lot',
#        'feature_on-site_super', 'feature_one_month_free',
#        'feature_outdoor_areas', 'feature_outdoor_entertainment_space',
#        'feature_outdoor_pool',
#        'feature_outdoor_roof_deck_overlooking_new_york_harbor_and_battery_park',
#        'feature_outdoor_space', 'feature_package_room', 'feature_parking',
#        'feature_parking_available', 'feature_parking_space',
#        'feature_part-time_doorman', 'feature_party_room', 'feature_patio',
#        'feature_penthouse', 'feature_pet_friendly', 'feature_pets',
#        'feature_pets_allowed', 'feature_pets_on_approval',
#        'feature_playroom', 'feature_playroom/nursery', 'feature_pool',
#        'feature_post-war', 'feature_post_war', 'feature_pre-war',
#        'feature_pre_war', 'feature_prewar', 'feature_private-balcony',
#        'feature_private-outdoor-space', 'feature_private_backyard',
#        'feature_private_balcony', 'feature_private_deck',
#        'feature_private_garden',
#        'feature_private_laundry_room_on_every_floor',
#        'feature_private_outdoor_space', 'feature_private_parking',
#        'feature_private_roof_deck', 'feature_private_roofdeck',
#        'feature_private_terrace', 'feature_publicoutdoor',
#        'feature_queen_size_bedrooms', 'feature_queen_sized_rooms',
#        'feature_reduced_fee', 'feature_renovated',
#        'feature_renovated_kitchen', 'feature_residents_garden',
#        'feature_residents_lounge', 'feature_roof-deck',
#        'feature_roof_access', 'feature_roof_deck',
#        'feature_roof_deck_with_grills', 'feature_roofdeck',
#        'feature_rooftop_deck', 'feature_rooftop_terrace',
#        'feature_s/s_appliances', 'feature_sauna', 'feature_screening_room',
#        'feature_separate_kitchen', 'feature_shared_backyard',
#        'feature_shared_garden', 'feature_shares_ok',
#        'feature_short_term_allowed', 'feature_simplex', 'feature_skylight',
#        'feature_skylight_atrium', 'feature_southern_exposure',
#        'feature_spa_services', 'feature_ss_appliances',
#        'feature_stainless_steel', 'feature_stainless_steel_appliances',
#        'feature_state-of-the-art_fitness_center', 'feature_storage',
#        'feature_storage_available', 'feature_storage_facilities_available',
#        'feature_storage_room', 'feature_sublet', 'feature_subway',
#        'feature_sundeck', 'feature_swimming_pool', 'feature_tenant_lounge',
#        'feature_terrace', 'feature_terraces_/_balconies',
#        'feature_tons_of_natural_light', 'feature_valet',
#        'feature_valet_parking', 'feature_valet_service',
#        'feature_valet_services',
#        'feature_valet_services_including_dry_cleaning',
#        'feature_video_intercom', 'feature_view', 'feature_virtual_doorman',
#        'feature_virtual_tour', 'feature_walk-in_closet', 'feature_walk-up',
#        'feature_walk_in_closet', 'feature_walk_in_closet(s)',
#        'feature_washer/dryer', 'feature_washer/dryer_hookup',
#        'feature_washer/dryer_in-unit', 'feature_washer/dryer_in_building',
#        'feature_washer/dryer_in_unit', 'feature_washer_&_dryer',
#        'feature_washer_in_unit', 'feature_wheelchair_access',
#        'feature_wheelchair_ramp', 'feature_wifi', 'feature_wifi_access',
#        'feature_wood-burning_fireplace', 'feature_yard',
#        'feature_yoga_classes'
               ]

In [7]:
for col in feat_to_use:
    full_data.loc[:,col] = preprocessing.StandardScaler().fit_transform(full_data[col].values.reshape(-1,1))
train_df_nn = full_data[:ntrain]
test_df_nn = full_data[ntrain:]

train_df_nn = sparse.csr_matrix(train_df_nn)
test_df_nn = sparse.csr_matrix(test_df_nn)


print train_df_nn.shape
print test_df_nn.shape



(49352, 412)
(74659, 412)


In [8]:
full_data.isnull().values.any()

False

In [10]:
train_y = np.ravel(pd.read_csv(data_path + 'labels_BrandenMurray.csv'))
# train_y = to_categorical(train_y)

In [16]:
y_low =[]
for i in range(train_X.shape[0]):
    y_low.append(1 if train_y[i] == 0 else 0)
    
y_low = np.array(y_low)  
print np.sum(y_low)
y_low = to_categorical(y_low)

34284


In [17]:
X_train, X_val, y_train, y_val = train_test_split(train_df_nn, y_low, train_size=.80, random_state=1234)

In [18]:
def batch_generator(X, y, batch_size, shuffle):
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0

In [22]:
early_stop = EarlyStopping(monitor='val_loss', # custom metric
                           patience=5, #early stopping for epoch
                           verbose=0)
checkpointer = ModelCheckpoint(filepath="weights.hdf5", 
                               monitor='val_loss', 
                               verbose=0, save_best_only=True)

def create_model(input_dim):
    model = Sequential()
    init = 'glorot_uniform'
    
    
    model.add(Dense(200, # number of input units: needs to be tuned
                    input_dim = input_dim, # fixed length: number of columns of X
                    init=init,
                   ))
    model.add(Activation('sigmoid'))
    model.add(PReLU()) # activation function
    model.add(BatchNormalization()) # normalization
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
        
    model.add(Dense(80,init=init)) # number of hidden1 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
    
#     model.add(Dense(9,init=init)) # number of hidden2 units. needs to be tuned.
#     model.add(Activation('sigmoid'))
#     model.add(PReLU())
#     model.add(BatchNormalization())    
#     model.add(Dropout(0.4)) #dropout rate. needs to be tuned
    
    model.add(Dense(2,
                   init = init,
                   activation = 'softmax')) # 1 for regression 
    model.compile(loss = 'categorical_crossentropy',
#                   metrics=[mae_log],
                  optimizer = 'adam' # optimizer. you may want to try different ones
                 )
    return(model)



model = create_model(X_train.shape[1])
fit= model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                         nb_epoch=1000,
                         samples_per_epoch=ntrain,
                         validation_data=(X_val.todense(), y_val),
                         callbacks=[early_stop,checkpointer]
                         )

print min(fit.history['val_loss'])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
0.3983338265


In [29]:


def nn_model(params):
    model = Sequential()
    init = 'glorot_normal'
    
    model.add(Dense(params['input_size'], # number of input units: needs to be tuned
                    input_dim = params['input_dim'], # fixed length: number of columns of X
                    init=init,
                   ))
    model.add(Activation('sigmoid'))
    model.add(PReLU()) # activation function
    model.add(BatchNormalization()) # normalization
    model.add(Dropout(params['input_drop_out'])) #dropout rate. needs to be tuned
        
    model.add(Dense(params['hidden_size'],
                    init=init)) # number of hidden1 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(params['hidden_drop_out'])) #dropout rate. needs to be tuned
    
#     model.add(Dense(20,init=init)) # number of hidden2 units. needs to be tuned.
#     model.add(Activation('sigmoid'))
#     model.add(PReLU())
#     model.add(BatchNormalization())    
#     model.add(Dropout(0.5)) #dropout rate. needs to be tuned
    
    model.add(Dense(2,
                    init = init,
                    activation = 'softmax')) # 1 for regression 
    model.compile(loss = 'categorical_crossentropy',
                  optimizer = 'adam' # optimizer. you may want to try different ones
                 )
    return(model)



def nn_blend_data(parameters, train_x, train_y, test_x, fold, early_stopping_rounds=0, batch_size=128):
    N_params = len(parameters)
    print ("Blend %d estimators for %d folds" % (len(parameters), fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = train_y.shape[1]
    
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    

    
    for j, nn_params in enumerate(parameters):
        print ("Model %d: %s" %(j+1, nn_params))
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
        
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x[val_index]
            val_y_fold = train_y[val_index]
            

            model = nn_model(nn_params)
#             print (model)
            fit= model.fit_generator(generator=batch_generator(train_x_fold, train_y_fold, 128, True),
                                     nb_epoch=70,
                                     samples_per_epoch=train_x_fold.shape[0],
                                     validation_data=(val_x_fold.todense(), val_y_fold),
                                     verbose = 0,
                                     callbacks=[ModelCheckpoint(filepath="weights.hdf5", 
                                                                monitor='val_loss', 
                                                                verbose=0, save_best_only=True)]
                                    )

            best_round=len(fit.epoch)-early_stopping_rounds-1
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'categorical_crossentropy',optimizer = 'adam' )
            
            # print (mean_absolute_error(np.exp(y_val)-200, pred_y))
            val_y_predict_fold = model.predict_proba(x=val_x_fold.toarray(),verbose=0)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score   
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'categorical_crossentropy',optimizer = 'adam' )            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = model.predict_proba(x=test_x.toarray(),verbose=0)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            
            
        test_blend_x[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x, scores,best_rounds)

In [30]:
# 200 0.4 80 0.4 'glorot_uniform' 'adam' 0.3983338265
# 100 0.5 30 0.5 'glorot_uniform' 'adam' 0.400189068724
# 100 0.4 30 0.4 'glorot_uniform' 'adam' 0.399582461623
# 100 0.4 30 0.4 'glorot_uniform' 'Adamax' 0.397478998776

In [31]:
nn_parameters = [
    { 'input_size' :200 ,
     'input_dim' : train_X.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 80 ,
     'hidden_drop_out' :0.4},
    { 'input_size' :100 ,
     'input_dim' : train_X.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 30 ,
     'hidden_drop_out' :0.4}

]

(train_blend_x_low, test_blend_x_low, blend_scores,best_round) = nn_blend_data(nn_parameters, train_df_nn, y_low, test_df_nn,
                                                         10,
                                                         5)

Blend 2 estimators for 10 folds
Model 1: {'input_size': 200, 'input_drop_out': 0.4, 'hidden_drop_out': 0.4, 'hidden_size': 80, 'input_dim': 412}
Model 1 fold 1
best round 64
('Score: ', 0.38914003093726507)
Model 1 fold 1 fitting finished in 434.233s
Model 1 fold 2
best round 64
('Score: ', 0.38246933278471729)
Model 1 fold 2 fitting finished in 433.652s
Model 1 fold 3
best round 64
('Score: ', 0.38937678736809528)
Model 1 fold 3 fitting finished in 434.996s
Model 1 fold 4
best round 64
('Score: ', 0.38047057304827325)
Model 1 fold 4 fitting finished in 440.853s
Model 1 fold 5
best round 64
('Score: ', 0.39706287425003256)
Model 1 fold 5 fitting finished in 433.695s
Model 1 fold 6
best round 64
('Score: ', 0.39289705687625626)
Model 1 fold 6 fitting finished in 436.556s
Model 1 fold 7
best round 64
('Score: ', 0.39864745915227312)
Model 1 fold 7 fitting finished in 435.180s
Model 1 fold 8
best round 64
('Score: ', 0.40288639700001533)
Model 1 fold 8 fitting finished in 435.340s
Model 1

In [33]:
train_blend_x_low

array([[ 0.46456146,  0.53543854,  0.55918074,  0.44081929],
       [ 0.65307534,  0.34692469,  0.64788789,  0.35211208],
       [ 0.298702  ,  0.701298  ,  0.31918669,  0.68081331],
       ..., 
       [ 0.75917244,  0.24082758,  0.59385377,  0.40614626],
       [ 0.38399851,  0.61600149,  0.33006418,  0.66993582],
       [ 0.82922602,  0.170774  ,  0.86132485,  0.13867518]])

In [34]:

now = datetime.now()

name_train_blend = '../output/train_blend_Keras_0331_ovr_low_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend = '../output/test_blend_Keras_0331_ovr_low_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'



print (np.mean(blend_scores,axis=0))
print (np.mean(best_round,axis=0))
np.savetxt(name_train_blend,train_blend_x_low, delimiter=",")
np.savetxt(name_test_blend,test_blend_x_low, delimiter=",")

[ 0.39398288  0.39350244]
[ 64.  64.]


# y_medium

In [35]:
y_medium =[]
for i in range(train_X.shape[0]):
    y_medium.append(1 if train_y[i] == 1 else 0)
    
y_medium = np.array(y_medium)  
print np.sum(y_medium)
y_medium = to_categorical(y_medium)

11229


In [36]:
X_train, X_val, y_train, y_val = train_test_split(train_df_nn, y_medium, train_size=.80, random_state=1234)

In [37]:
early_stop = EarlyStopping(monitor='val_loss', # custom metric
                           patience=5, #early stopping for epoch
                           verbose=0)
checkpointer = ModelCheckpoint(filepath="weights.hdf5", 
                               monitor='val_loss', 
                               verbose=0, save_best_only=True)

def create_model(input_dim):
    model = Sequential()
    init = 'glorot_uniform'
    
    
    model.add(Dense(200, # number of input units: needs to be tuned
                    input_dim = input_dim, # fixed length: number of columns of X
                    init=init,
                   ))
    model.add(Activation('sigmoid'))
    model.add(PReLU()) # activation function
    model.add(BatchNormalization()) # normalization
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
        
    model.add(Dense(80,init=init)) # number of hidden1 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
    
#     model.add(Dense(9,init=init)) # number of hidden2 units. needs to be tuned.
#     model.add(Activation('sigmoid'))
#     model.add(PReLU())
#     model.add(BatchNormalization())    
#     model.add(Dropout(0.4)) #dropout rate. needs to be tuned
    
    model.add(Dense(2,
                   init = init,
                   activation = 'softmax')) # 1 for regression 
    model.compile(loss = 'categorical_crossentropy',
#                   metrics=[mae_log],
                  optimizer = 'adam' # optimizer. you may want to try different ones
                 )
    return(model)



model = create_model(X_train.shape[1])
fit= model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                         nb_epoch=1000,
                         samples_per_epoch=ntrain,
                         validation_data=(X_val.todense(), y_val),
                         callbacks=[early_stop,checkpointer]
                         )

print min(fit.history['val_loss'])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
0.428629088134


In [38]:
nn_parameters = [
    { 'input_size' :200 ,
     'input_dim' : train_X.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 80 ,
     'hidden_drop_out' :0.4},
    { 'input_size' :100 ,
     'input_dim' : train_X.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 30 ,
     'hidden_drop_out' :0.4}

]

(train_blend_x_medium , test_blend_x_medium , blend_scores,best_round) = nn_blend_data(nn_parameters,train_df_nn, 
                                                                                       y_medium , 
                                                                                       test_df_nn,
                                                         10,
                                                         5)

Blend 2 estimators for 10 folds
Model 1: {'input_size': 200, 'input_drop_out': 0.4, 'hidden_drop_out': 0.4, 'hidden_size': 80, 'input_dim': 412}
Model 1 fold 1
best round 64
('Score: ', 0.42495074071473649)
Model 1 fold 1 fitting finished in 532.408s
Model 1 fold 2
best round 64
('Score: ', 0.4100714709242152)
Model 1 fold 2 fitting finished in 442.508s
Model 1 fold 3
best round 64
('Score: ', 0.42379029264446894)
Model 1 fold 3 fitting finished in 433.691s
Model 1 fold 4
best round 64
('Score: ', 0.41671803610460934)
Model 1 fold 4 fitting finished in 434.629s
Model 1 fold 5
best round 64
('Score: ', 0.42797873356288629)
Model 1 fold 5 fitting finished in 435.883s
Model 1 fold 6
best round 64
('Score: ', 0.42772061567152725)
Model 1 fold 6 fitting finished in 434.727s
Model 1 fold 7
best round 64
('Score: ', 0.4321141383811945)
Model 1 fold 7 fitting finished in 434.209s
Model 1 fold 8
best round 64
('Score: ', 0.43773896025676917)
Model 1 fold 8 fitting finished in 436.058s
Model 1 f

In [39]:
now = datetime.now()

name_train_blend = '../output/train_blend_Keras_0331_ovr_medium_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend = '../output/test_blend_Keras_0331_ovr_medium_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'



print (np.mean(blend_scores,axis=0))
print (np.mean(best_round,axis=0))
np.savetxt(name_train_blend,train_blend_x_medium, delimiter=",")
np.savetxt(name_test_blend,test_blend_x_medium, delimiter=",")

[ 0.42636395  0.42553463]
[ 64.  64.]


# y_high

In [40]:
y_high =[]
for i in range(train_X.shape[0]):
    y_high.append(1 if train_y[i] == 2 else 0)
    
y_high = np.array(y_high)  
print np.sum(y_high)
y_high = to_categorical(y_high)

3839


In [42]:
X_train, X_val, y_train, y_val = train_test_split(train_df_nn, y_high, train_size=.80, random_state=1234)

In [43]:
nn_parameters = [
    { 'input_size' :200 ,
     'input_dim' : train_X.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 80 ,
     'hidden_drop_out' :0.4},
    { 'input_size' :100 ,
     'input_dim' : train_X.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 30 ,
     'hidden_drop_out' :0.4}

]

(train_blend_x_hihg, test_blend_x_hihg, blend_scores,best_round) = nn_blend_data(nn_parameters,train_df_nn, 
                                                                                       y_high , 
                                                                                       test_df_nn,
                                                         10,
                                                         5)

Blend 2 estimators for 10 folds
Model 1: {'input_size': 200, 'input_drop_out': 0.4, 'hidden_drop_out': 0.4, 'hidden_size': 80, 'input_dim': 412}
Model 1 fold 1
best round 64
('Score: ', 0.17049442023116387)
Model 1 fold 1 fitting finished in 434.581s
Model 1 fold 2
best round 64
('Score: ', 0.17302170162217112)
Model 1 fold 2 fitting finished in 433.833s
Model 1 fold 3
best round 64
('Score: ', 0.1820879041929036)
Model 1 fold 3 fitting finished in 433.848s
Model 1 fold 4
best round 64
('Score: ', 0.16819531986422559)
Model 1 fold 4 fitting finished in 435.834s
Model 1 fold 5
best round 64
('Score: ', 0.19059734687505989)
Model 1 fold 5 fitting finished in 432.931s
Model 1 fold 6
best round 64
('Score: ', 0.17243870005728451)
Model 1 fold 6 fitting finished in 435.523s
Model 1 fold 7
best round 64
('Score: ', 0.17667642113197501)
Model 1 fold 7 fitting finished in 433.020s
Model 1 fold 8
best round 64
('Score: ', 0.19013913734430607)
Model 1 fold 8 fitting finished in 434.752s
Model 1 

In [44]:
now = datetime.now()

name_train_blend = '../output/train_blend_Keras_0331_ovr_high_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend = '../output/test_blend_Keras_0331_ovr_high_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'



print (np.mean(blend_scores,axis=0))
print (np.mean(best_round,axis=0))
np.savetxt(name_train_blend,train_blend_x_hihg, delimiter=",")
np.savetxt(name_test_blend,test_blend_x_hihg, delimiter=",")

[ 0.17706992  0.17661029]
[ 64.  64.]


In [57]:
train_blend_x_low[:10]

array([[ 0.46456146,  0.53543854,  0.55918074,  0.44081929],
       [ 0.65307534,  0.34692469,  0.64788789,  0.35211208],
       [ 0.298702  ,  0.701298  ,  0.31918669,  0.68081331],
       [ 0.0652181 ,  0.93478191,  0.11783206,  0.88216794],
       [ 0.01999347,  0.98000652,  0.02754674,  0.97245324],
       [ 0.286787  ,  0.71321297,  0.37462229,  0.62537771],
       [ 0.30486089,  0.69513911,  0.39302596,  0.60697407],
       [ 0.77407664,  0.22592336,  0.76265699,  0.23734303],
       [ 0.04599821,  0.95400178,  0.04560963,  0.95439035],
       [ 0.04567733,  0.95432264,  0.08780251,  0.91219747]])

In [46]:
train_blend_x_medium[:10]

array([[ 0.44033614,  0.55966389,  0.48364738,  0.51635259],
       [ 0.56981212,  0.43018788,  0.5069834 ,  0.4930166 ],
       [ 0.71032268,  0.28967732,  0.71398932,  0.28601068],
       [ 0.79661608,  0.20338391,  0.82728297,  0.17271705],
       [ 0.97628778,  0.02371221,  0.97221869,  0.02778132],
       [ 0.72416741,  0.27583256,  0.77770293,  0.22229706],
       [ 0.68811274,  0.31188726,  0.72412932,  0.27587068],
       [ 0.56539565,  0.43460438,  0.57953453,  0.42046544],
       [ 0.95258194,  0.04741804,  0.97551912,  0.02448091],
       [ 0.93746662,  0.06253336,  0.91640061,  0.0835994 ]])

In [48]:
train_blend_x_hihg[:10]

array([[  9.48133171e-01,   5.18668257e-02,   9.56971407e-01,
          4.30286191e-02],
       [  9.07860279e-01,   9.21396986e-02,   9.03508782e-01,
          9.64911953e-02],
       [  9.92276788e-01,   7.72319501e-03,   9.86826658e-01,
          1.31733138e-02],
       [  9.99772906e-01,   2.27101613e-04,   9.99815583e-01,
          1.84422839e-04],
       [  9.99597371e-01,   4.02599951e-04,   9.98769820e-01,
          1.23019773e-03],
       [  9.92041230e-01,   7.95879401e-03,   9.85059083e-01,
          1.49408951e-02],
       [  9.93075550e-01,   6.92442246e-03,   9.90770400e-01,
          9.22962371e-03],
       [  5.62102914e-01,   4.37897086e-01,   5.53607643e-01,
          4.46392328e-01],
       [  9.97734606e-01,   2.26539746e-03,   9.96763468e-01,
          3.23651708e-03],
       [  9.98655975e-01,   1.34401838e-03,   9.98934984e-01,
          1.06500229e-03]])

In [55]:
train_blend_x = np.hstack([train_blend_x_low[:,[1,3]],train_blend_x_medium[:,[1,3]],train_blend_x_hihg[:,[1,3]]])
train_blend_x.shape

(49352, 6)

In [56]:
train_blend_x

array([[ 0.53543854,  0.44081929,  0.55966389,  0.51635259,  0.05186683,
         0.04302862],
       [ 0.34692469,  0.35211208,  0.43018788,  0.4930166 ,  0.0921397 ,
         0.0964912 ],
       [ 0.701298  ,  0.68081331,  0.28967732,  0.28601068,  0.0077232 ,
         0.01317331],
       ..., 
       [ 0.24082758,  0.40614626,  0.61263579,  0.53841865,  0.17041728,
         0.15469342],
       [ 0.61600149,  0.66993582,  0.37558073,  0.34411123,  0.02602372,
         0.03474894],
       [ 0.170774  ,  0.13867518,  0.5747925 ,  0.52559209,  0.37286255,
         0.20092012]])

In [58]:
test_blend_x_mean = np.hstack([test_blend_x_low[:,[1,3]],test_blend_x_medium[:,[1,3]],test_blend_x_hihg[:,[1,3]]])
test_blend_x_mean.shape

(74659, 6)

In [60]:
test_blend_x_mean

array([[  4.45016491e-01,   4.42961633e-01,   4.96813539e-01,
          5.02928707e-01,   4.73185966e-02,   4.74120317e-02],
       [  9.89467651e-01,   9.91195834e-01,   1.93006209e-02,
          1.22408562e-02,   9.66879284e-03,   7.65174278e-03],
       [  9.72961313e-01,   9.74756777e-01,   6.58714678e-02,
          6.76788870e-02,   8.12055035e-03,   1.01542623e-02],
       ..., 
       [  9.92661548e-01,   9.90405649e-01,   1.40206488e-02,
          1.15989062e-02,   1.32738879e-03,   1.26324503e-03],
       [  9.83876938e-01,   9.89677304e-01,   1.01579200e-02,
          8.76117819e-03,   2.72480752e-04,   3.60779543e-04],
       [  5.93080962e-01,   6.36519444e-01,   3.77584869e-01,
          3.54368082e-01,   1.71170815e-02,   2.06032272e-02]])

In [59]:
name_train_blend = '../output/train_blend_Keras_ovr_BM_0331_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend_mean = '../output/test_blend_Keras_ovr_BM_0331_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'



# print (np.mean(blend_scores_xgb,axis=0))
# print (np.mean(best_rounds_xgb,axis=0))
np.savetxt(name_train_blend,train_blend_x, delimiter=",")
np.savetxt(name_test_blend_mean,test_blend_x_mean, delimiter=",")
