In [1]:
import pandas as pd
import time
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import random
from sklearn import preprocessing

import gc
from scipy.stats import skew, boxcox

from scipy import sparse
from sklearn.metrics import log_loss
from datetime import datetime

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

seed = 2017

In [10]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU,LeakyReLU,ELU,ParametricSoftplus,ThresholdedReLU,SReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.optimizers import SGD,Nadam
from keras.regularizers import WeightRegularizer, ActivityRegularizer,l2, activity_l2
from keras.utils.np_utils import to_categorical

Using Theano backend.


# Load Data

In [2]:
data_path = "../input/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file).reset_index()
test_df = pd.read_json(test_file).reset_index()
ntrain = train_df.shape[0]
print train_df.shape
print test_df.shape
print ntrain

(49352, 16)
(74659, 15)
49352


In [3]:
# sc_price
tmp = pd.concat([train_df['price'],test_df['price']])
ulimit = np.percentile(tmp.values, 99)
llimit = np.percentile(tmp.values, 1)

train_df.loc[:,'sc_price'] = train_df['price'].values.reshape(-1, 1)
test_df.loc[:,'sc_price'] = test_df['price'].values.reshape(-1, 1)

train_df.loc[train_df['sc_price']>ulimit, ['sc_price']] = ulimit
test_df.loc[test_df['sc_price']>ulimit, ['sc_price']] = ulimit
train_df.loc[train_df['sc_price']<llimit, ['sc_price']] = llimit
test_df.loc[test_df['sc_price']<llimit, ['sc_price']] = llimit



# sc_ba_price
inx_train = train_df['bathrooms'] == 0
inx_test = test_df['bathrooms'] == 0

non0_inx_train = ~inx_train
non0_inx_test = ~inx_test

train_df.loc[non0_inx_train,'sc_ba_price'] = train_df.loc[non0_inx_train,'sc_price']\
                                                /train_df.loc[non0_inx_train,'bathrooms']
test_df.loc[non0_inx_test,'sc_ba_price'] = test_df.loc[non0_inx_test,'sc_price']\
                                                /test_df.loc[non0_inx_test,'bathrooms']

train_df.loc[inx_train,'sc_ba_price'] = 0
test_df.loc[inx_test,'sc_ba_price'] = 0

train_df.loc[non0_inx_train,'bathrooms0'] = 1
test_df.loc[non0_inx_test,'bathrooms0'] = 1

train_df.loc[inx_train,'bathrooms0'] = 0
test_df.loc[inx_test,'bathrooms0'] = 0

# price per bedrooms

inx_train = train_df['bedrooms'] == 0
inx_test = test_df['bedrooms'] == 0

non0_inx_train = ~inx_train
non0_inx_test = ~inx_test

train_df.loc[non0_inx_train,'sc_be_price'] = train_df.loc[non0_inx_train,'sc_price'] \
                                                /train_df.loc[non0_inx_train,'bedrooms']
test_df.loc[non0_inx_test,'sc_be_price'] = test_df.loc[non0_inx_test,'sc_price']\
                                                /test_df.loc[non0_inx_test,'bedrooms']

train_df.loc[inx_train,'sc_be_price'] = 0
test_df.loc[inx_test,'sc_be_price'] = 0

train_df.loc[non0_inx_train,'bedrooms0'] = 1
test_df.loc[non0_inx_test,'bedrooms0'] = 1

train_df.loc[inx_train,'bedrooms0'] = 0
test_df.loc[inx_test,'bedrooms0'] = 0
# bathrooms

ulimit = 5

train_df['sc_bathrooms']=train_df['bathrooms']
test_df['sc_bathrooms']=test_df['bathrooms']

train_df.loc[train_df['sc_bathrooms']>ulimit,['sc_bathrooms']] = ulimit
test_df.loc[test_df['sc_bathrooms']>ulimit,['sc_bathrooms']] = ulimit

# bedrooms

ulimit = 8

train_df['sc_bedrooms']=train_df['bedrooms']
test_df['sc_bedrooms']=test_df['bedrooms']

train_df.loc[train_df['sc_bedrooms']>ulimit, ['sc_bedrooms']] = ulimit
test_df.loc[test_df['sc_bedrooms']>ulimit,['sc_bedrooms']] = ulimit

# longitude

tmp = pd.concat([train_df['longitude'],test_df['longitude']])
llimit = np.percentile(tmp.values, 0.1)
ulimit = np.percentile(tmp.values, 99.9)

train_df['sc_longitude']=train_df['longitude']
test_df['sc_longitude']=test_df['longitude']

train_df.loc[train_df['sc_longitude']>ulimit, ['sc_longitude']] = ulimit
test_df.loc[test_df['sc_longitude']>ulimit, ['sc_longitude']] = ulimit
train_df.loc[train_df['sc_longitude']<llimit, ['sc_longitude']] = llimit
test_df.loc[test_df['sc_longitude']<llimit, ['sc_longitude']] = llimit

# latitude

tmp = pd.concat([train_df['latitude'],test_df['latitude']])
llimit = np.percentile(tmp.values, 0.1)
ulimit = np.percentile(tmp.values, 99.9)

train_df['sc_latitude']=train_df['latitude']
test_df['sc_latitude']=test_df['latitude']

train_df.loc[train_df['sc_latitude']>ulimit, ['sc_latitude']] = ulimit
test_df.loc[test_df['sc_latitude']>ulimit, ['sc_latitude']] = ulimit
train_df.loc[train_df['sc_latitude']<llimit, ['sc_latitude']] = llimit
test_df.loc[test_df['sc_latitude']<llimit, ['sc_latitude']] = llimit


features_to_use  = ["sc_bathrooms", "sc_bedrooms", "sc_latitude", "sc_longitude",
                    "sc_price", "sc_ba_price", "sc_be_price"]

In [4]:
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour

# adding all these new features to use list #
features_to_use.extend(["num_photos", "num_features", "num_description_words", "created_month", 
                        "created_day", "created_hour"])


In [5]:
# full_data=pd.concat([train_df,test_df])

# # SSL = preprocessing.StandardScaler()
# # for col in features_to_use:
# #     full_data[col], lam = boxcox(full_data[col] - full_data[col].min() + 1)
# #     full_data[col] = SSL.fit_transform(full_data[col].values.reshape(-1,1)) 
# skewed_cols = full_data[features_to_use].apply(lambda x: skew(x.dropna()))

# SSL = preprocessing.StandardScaler()
# skewed_cols = skewed_cols[skewed_cols > 0.25].index.values
# for skewed_col in skewed_cols:
#     full_data[skewed_col], lam = boxcox(full_data[skewed_col] - full_data[skewed_col].min() + 1)
#     print skewed_col, '\t', lam
# for col in features_to_use:
#     full_data[col] = SSL.fit_transform(full_data[col].values.reshape(-1,1))
#     train_df[col] = full_data.iloc[:ntrain][col]
#     test_df[col] = full_data.iloc[ntrain:][col]

    
# del full_data

In [6]:
features_to_use.extend(["listing_id","bedrooms0",'bathrooms0'])
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [7]:
train_df['features'] = train_df["features"]\
                        .apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))\
                        .apply(lambda x: x.lower())
test_df['features'] = test_df["features"]\
                        .apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))\
                        .apply(lambda x: x.lower())

print(train_df["features"].head())
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

sparse_features = tfidf.get_feature_names()

0                                                     
1    doorman elevator fitness_center cats_allowed d...
2    laundry_in_building dishwasher hardwood_floors...
3                               hardwood_floors no_fee
4                                              pre-war
Name: features, dtype: object


In [8]:
full_data = pd.concat([train_df[features_to_use],test_df[features_to_use]])
full_data = preprocessing.StandardScaler().fit_transform(full_data)
train_df_nn = full_data[:ntrain]
test_df_nn = full_data[ntrain:]

print train_df_nn.shape
print test_df_nn.shape

(49352, 20)
(74659, 20)


In [11]:
train_X = sparse.hstack([train_df_nn, tr_sparse]).tocsr()
test_X = sparse.hstack([test_df_nn, te_sparse]).tocsr()

target_num_map = {'high':0, 'medium':1, 'low':2}
weight_num_map = {'high':1, 'medium':1, 'low':1}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
train_y = to_categorical(train_y)
W_train = np.array(train_df['interest_level'].apply(lambda x: weight_num_map[x]))

all_features = features_to_use + sparse_features
print train_X.shape, test_X.shape

(49352, 220) (74659, 220)


In [12]:
X_train, X_val, y_train, y_val = train_test_split(train_X, train_y, train_size=.80, random_state=1234)

In [13]:
def batch_generator(X, y, batch_size, shuffle):
    number_of_batches = np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    if shuffle:
        np.random.shuffle(sample_index)
    while True:
        batch_index = sample_index[batch_size*counter:batch_size*(counter+1)]
        X_batch = X[batch_index,:].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        if (counter == number_of_batches):
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    number_of_batches = X.shape[0] / np.ceil(X.shape[0]/batch_size)
    counter = 0
    sample_index = np.arange(X.shape[0])
    while True:
        batch_index = sample_index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if (counter == number_of_batches):
            counter = 0

In [35]:


early_stop = EarlyStopping(monitor='val_loss', # custom metric
                           patience=5, #early stopping for epoch
                           verbose=0)
checkpointer = ModelCheckpoint(filepath="weights.hdf5", 
                               monitor='val_loss', 
                               verbose=1, save_best_only=True)

def create_model(input_dim):
    model = Sequential()
    init = 'he_normal'
    
    
    model.add(Dense(150, # number of input units: needs to be tuned
                    input_dim = input_dim, # fixed length: number of columns of X
                    init=init,
                   ))
    model.add(Activation('sigmoid'))
    model.add(PReLU()) # activation function
    model.add(BatchNormalization()) # normalization
    model.add(Dropout(0.4)) #dropout rate. needs to be tuned
        
    model.add(Dense(50,init=init)) # number of hidden1 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.5)) #dropout rate. needs to be tuned
    
    model.add(Dense(20,init=init)) # number of hidden2 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(0.5)) #dropout rate. needs to be tuned
    
    model.add(Dense(3,
                   init = init,
                   activation = 'softmax')) # 1 for regression 
    model.compile(loss = 'categorical_crossentropy',
#                   metrics=[mae_log],
                  optimizer = 'adam' # optimizer. you may want to try different ones
                 )
    return(model)



model = create_model(X_train.shape[1])
fit= model.fit_generator(generator=batch_generator(X_train, y_train, 128, True),
                         nb_epoch=1000,
                         samples_per_epoch=ntrain,
                         validation_data=(X_val.todense(), y_val),
                         callbacks=[early_stop,checkpointer]
                         )

print min(fit.history['val_loss'])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
0.5832976706


In [36]:
model.load_weights("weights.hdf5")

model.compile(loss = 'categorical_crossentropy',optimizer = 'adam' )

In [37]:
pred_y = model.predict_proba(x=test_X.toarray(),verbose=0)

In [38]:
now = datetime.now()
sub_name = '../output/sub_Keras_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(pred_y)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv(sub_name, index=False)

In [52]:


def nn_model(params):
    model = Sequential()
    init = 'he_normal'
    
    model.add(Dense(params['input_size'], # number of input units: needs to be tuned
                    input_dim = params['input_dim'], # fixed length: number of columns of X
                    init=init,
                   ))
    model.add(Activation('sigmoid'))
    model.add(PReLU()) # activation function
    model.add(BatchNormalization()) # normalization
    model.add(Dropout(params['input_drop_out'])) #dropout rate. needs to be tuned
        
    model.add(Dense(params['hidden_size'],
                    init=init)) # number of hidden1 units. needs to be tuned.
    model.add(Activation('sigmoid'))
    model.add(PReLU())
    model.add(BatchNormalization())    
    model.add(Dropout(params['hidden_drop_out'])) #dropout rate. needs to be tuned
    
#     model.add(Dense(20,init=init)) # number of hidden2 units. needs to be tuned.
#     model.add(Activation('sigmoid'))
#     model.add(PReLU())
#     model.add(BatchNormalization())    
#     model.add(Dropout(0.5)) #dropout rate. needs to be tuned
    
    model.add(Dense(3,
                    init = init,
                    activation = 'softmax')) # 1 for regression 
    model.compile(loss = 'categorical_crossentropy',
                  optimizer = 'adam' # optimizer. you may want to try different ones
                 )
    return(model)



def nn_blend_data(parameters, train_x, train_y, test_x, fold, early_stopping_rounds=0, batch_size=128):
    N_params = len(parameters)
    print ("Blend %d estimators for %d folds" % (len(parameters), fold))
    skf = KFold(n_splits=fold,random_state=seed)
    N_class = train_y.shape[1]
    
    train_blend_x = np.zeros((train_x.shape[0], N_class*N_params))
    test_blend_x = np.zeros((test_x.shape[0], N_class*N_params))
    scores = np.zeros ((fold,N_params))
    best_rounds = np.zeros ((fold, N_params))
    

    
    for j, nn_params in enumerate(parameters):
        print ("Model %d: %s" %(j+1, nn_params))
        test_blend_x_j = np.zeros((test_x.shape[0], N_class*fold))
        
        for i, (train_index, val_index) in enumerate(skf.split(train_x)):
            print ("Model %d fold %d" %(j+1,i+1))
            fold_start = time.time() 
            train_x_fold = train_x[train_index]
            train_y_fold = train_y[train_index]
            val_x_fold = train_x[val_index]
            val_y_fold = train_y[val_index]
            

            model = nn_model(nn_params)
#             print (model)
            fit= model.fit_generator(generator=batch_generator(train_x_fold, train_y_fold, 128, True),
                                     nb_epoch=60,
                                     samples_per_epoch=train_x_fold.shape[0],
                                     validation_data=(val_x_fold.todense(), val_y_fold),
                                     verbose = 0,
                                     callbacks=[ModelCheckpoint(filepath="weights.hdf5", 
                                                                monitor='val_loss', 
                                                                verbose=0, save_best_only=True)]
                                    )

            best_round=len(fit.epoch)-early_stopping_rounds-1
            best_rounds[i,j]=best_round
            print ("best round %d" % (best_round))
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'categorical_crossentropy',optimizer = 'adam' )
            
            # print (mean_absolute_error(np.exp(y_val)-200, pred_y))
            val_y_predict_fold = model.predict_proba(x=val_x_fold.toarray(),verbose=0)
            score = log_loss(val_y_fold, val_y_predict_fold)
            print ("Score: ", score)
            scores[i,j]=score   
            train_blend_x[val_index, (j*N_class):(j+1)*N_class] = val_y_predict_fold
            
            model.load_weights("weights.hdf5")
            # Compile model (required to make predictions)
            model.compile(loss = 'categorical_crossentropy',optimizer = 'adam' )            
            test_blend_x_j[:,(i*N_class):(i+1)*N_class] = model.predict_proba(x=test_X.toarray(),verbose=0)
            print ("Model %d fold %d fitting finished in %0.3fs" % (j+1,i+1, time.time() - fold_start))            
            
        test_blend_x[:,(j*N_class):(j+1)*N_class] = \
                np.stack([test_blend_x_j[:,range(0,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(1,N_class*fold,N_class)].mean(1),
                          test_blend_x_j[:,range(2,N_class*fold,N_class)].mean(1)]).T
            
        print ("Score for model %d is %f" % (j+1,np.mean(scores[:,j])))
    print ("Score for blended models is %f" % (np.mean(scores)))
    return (train_blend_x, test_blend_x, scores,best_rounds)

In [53]:
nn_parameters = [
    { 'input_size' :300 ,
     'input_dim' : train_X.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 100 ,
     'hidden_drop_out' :0.2},
    { 'input_size' :200 ,
     'input_dim' : train_X.shape[1],
     'input_drop_out' : 0.4 ,
     'hidden_size' : 50 ,
     'hidden_drop_out' :0.2}

]

(train_blend_x, test_blend_x, blend_scores,best_round) = nn_blend_data(nn_parameters, train_X, train_y, test_X,
                                                         4,
                                                         5)

Blend 2 estimators for 4 folds
Model 1: {'input_size': 300, 'input_drop_out': 0.4, 'hidden_drop_out': 0.2, 'hidden_size': 100, 'input_dim': 220}
Model 1 fold 1
best round 54
('Score: ', 0.58930159188570641)
Model 1 fold 1 fitting finished in 457.649s
Model 1 fold 2
best round 54
('Score: ', 0.58212658337893708)
Model 1 fold 2 fitting finished in 458.268s
Model 1 fold 3
best round 54
('Score: ', 0.58625029173067966)
Model 1 fold 3 fitting finished in 458.267s
Model 1 fold 4
best round 54
('Score: ', 0.58740975337936085)
Model 1 fold 4 fitting finished in 462.457s
Score for model 1 is 0.586272
Model 2: {'input_size': 200, 'input_drop_out': 0.4, 'hidden_drop_out': 0.2, 'hidden_size': 50, 'input_dim': 220}
Model 2 fold 1
best round 54
('Score: ', 0.5915979231118349)
Model 2 fold 1 fitting finished in 313.168s
Model 2 fold 2
best round 54
('Score: ', 0.58284341458860967)
Model 2 fold 2 fitting finished in 312.474s
Model 2 fold 3
best round 54
('Score: ', 0.58856996830310526)
Model 2 fold 3 

In [45]:
train_y.shape

(49352, 3)

In [55]:
test_blend_x[:,3:6]

array([[ 0.03418285,  0.29842315,  0.66739401],
       [ 0.08886517,  0.2184982 ,  0.69263662],
       [ 0.01659429,  0.15124959,  0.83215612],
       ..., 
       [ 0.02705079,  0.2883161 ,  0.68463309],
       [ 0.58800882,  0.35606831,  0.05592286],
       [ 0.00831845,  0.11601497,  0.87566657]])

In [13]:
train_y[:20]

array([1, 2, 0, 2, 2, 1, 2, 2, 1, 2, 2, 2, 0, 2, 2, 1, 2, 2, 2, 2])

In [56]:
now = datetime.now()
sub_name = '../output/sub_LightGBM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

out_df = pd.DataFrame(test_blend_x[:,3:6])
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv(sub_name, index=False)


In [73]:

now = datetime.now()

name_train_blend = '../output/train_blend_LightGBM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
name_test_blend = '../output/test_blend_LightGBM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'



print (np.mean(blend_scores_gbm,axis=0))
print (np.mean(best_rounds_gbm,axis=0))
np.savetxt(name_train_blend,train_blend_x_gbm, delimiter=",")
np.savetxt(name_test_blend,test_blend_x_gbm, delimiter=",")

[ 0.54336879  0.54314207  0.54391966  0.54451063  0.54393703]
4863.6


In [45]:
now = datetime.now()
df = pd.read_json(open("../input/test.json", "r"))
labels2idx ={'high': 0, 'low': 2, 'medium': 1}
sub_name = '../output/sub_LightGBM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

sub = pd.DataFrame()
sub["listing_id"] = df["listing_id"]

y_test = np.zeros((df.shape[0], 3))

for N in range(3):
    y_test[:,N] = pd.DataFrame(test_blend_x_gbm).iloc[:,[x for x in range(test_blend_x_gbm.shape[1]) if x%3 == N]].mean(axis=1)
    
for label in ["high", "medium", "low"]:
    sub[label] = y_test[:, labels2idx[label]]
sub.to_csv(sub_name, index=False)

In [42]:
now = datetime.now()
df = pd.read_json(open("../input/test.json", "r"))
sub_name = '../output/sub_LightGBM_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

# sub = pd.DataFrame()
tmp2["listing_id"] = df["listing_id"].values
# tmp1.columns = ['0','1','2']
# for label in ["high", "medium", "low"]:
#     sub[label] = tmp2.iloc[:,label].values
tmp2.to_csv(sub_name, index=False)