### Load data from S3

In [1]:
import boto3
import pandas as pd
from sagemaker import get_execution_role

train_data_location = ''
test_data_location = ''
submission_location = ''

In [2]:
import joblib

In [3]:
df_train = pd.read_csv(train_data_location)
df_test = pd.read_csv(test_data_location)
df_submission = pd.read_csv(submission_location)

In [4]:
df_test.shape, df_submission.shape

((41697600, 19), (24936697, 2))

In [5]:
# from pandas.tseries.holiday import USFederalHolidayCalendar
# cal = USFederalHolidayCalendar()
# holidays = cal.holidays(start='2016-01-01', end='2018-12-31').to_pydatetime()
# print(holidays)

In [6]:
df_train['timestamp']=pd.to_datetime(df_train['timestamp'])
df_test['timestamp']=pd.to_datetime(df_test['timestamp'])
# df_submission=pd.to_datetime(df_submission['timestamp'])

In [7]:
def preprocess(df):
    df["hour"] = df["timestamp"].dt.hour
#     df["day"] = df["timestamp"].dt.day
    df["weekend"] = df["timestamp"].dt.weekday
    df["month"] = df["timestamp"].dt.month
    df["dayofweek"] = df["timestamp"].dt.dayofweek

In [8]:
preprocess(df_train)
preprocess(df_test)

In [9]:
# df_train['is_holiday'] = df_train.apply(lambda row: row.timestamp in holidays, axis = 1)
# df_test['is_holiday'] = df_test.apply(lambda row: row.timestamp in holidays, axis = 1)

In [10]:
df_train = df_train.drop(['Unnamed: 0'], axis=1)
df_test = df_test.drop(['Unnamed: 0'], axis=1)
# df_submission = df_submission.drop(['Unnamed: 0'], axis=1)

In [11]:
# x_submit = df_test[['building_id', 'meter', 'site_id', 'primary_use', 'square_feet', 'month', 'day', 'hour']]

In [12]:
# submission = pd.read_csv('data/test_submission_full.csv')

In [13]:
# submission.head()

In [14]:
# df_test.timestamp

### Filter missing value entries

In [15]:
filter_variables = [
    'air_temperature', 'dew_temperature', 'precip_depth_1_hr', 
    'sea_level_pressure', 'wind_direction', 'wind_speed'
]

In [16]:
# Filter out zeros
def filter_by_columns(df, variables):
    for var in variables:
        df = df[df[var].notnull()]
    return df

In [17]:
df_train = filter_by_columns(df_train, df_train.columns)

In [18]:
import numpy as np
def describe_stats_by_uses(train):
    usages = df_train['primary_use'].unique()
    df = {'primary_use': [], 'meter_reading_mean': [], 'meter_reading_std': []}
    df = pd.DataFrame(data=df)
    mean = []
    std = []
#     print(usages)
    for use in usages:
#         print(use)
        df_filter = train[train.primary_use == use]
#         print(df_filter.head())
        mean.append(np.mean(df_filter['meter_reading']))
        std.append(np.std(df_filter['meter_reading']))
    df['primary_use'] = usages
    df['meter_reading_mean'] = mean
    df['meter_reading_std'] = std
    return df

In [19]:
df_meter_reading_by_usage = describe_stats_by_uses(df_train)

In [20]:
df_meter_reading_by_usage

Unnamed: 0,primary_use,meter_reading_mean,meter_reading_std
0,Education,6410.346163,288503.022712
1,Lodging/residential,327.948961,1133.527547
2,Office,606.529602,3688.81331
3,Entertainment/public assembly,540.230224,11645.436617
4,Other,143.073003,458.542699
5,Retail,170.388693,367.559422
6,Parking,172.075026,693.211104
7,Public services,305.621619,1284.002356
8,Warehouse/storage,54.393144,66.802817
9,Food sales and service,305.3417,570.281145


### Train / Val / Test Split

In [21]:
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [22]:
df_train.shape

(14963318, 20)

In [23]:
# train, test 80/20 split with weather information
# x_train, x_test, y_train, y_test = train_test_split(df_train[['meter', 'site_id', 'primary_use', 
#                                                               'air_temperature', 'dew_temperature', 
#                                                               'precip_depth_1_hr', 'sea_level_pressure',
#                                                               'wind_direction', 'wind_speed']], 
#                                                     df_train['meter_reading'],
#                                                     test_size = 0.2,
#                                                     shuffle = True)

In [24]:
df_train.columns

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'air_temperature', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'beaufort_scale', 'year', 'month', 'day', 'hour',
       'weekend', 'dayofweek'],
      dtype='object')

In [25]:
print(len(df_train['building_id'].unique()))
print(len(df_train['site_id'].unique()))
print(len(df_train['primary_use'].unique()))
print(len(df_train['meter'].unique()))
print(len(df_train['wind_direction'].unique()))

1273
13
16
4
37


In [26]:
def preprocess_datetime(df):
    df["hour"] = df["timestamp"].dt.hour
#     df["day"] = df["timestamp"].dt.day
    df["weekend"] = df["timestamp"].dt.weekday
    df["month"] = df["timestamp"].dt.month
    df["dayofweek"] = df["timestamp"].dt.dayofweek
    hour_rad = df["hour"].values / 24. * 2 * np.pi
    df["hour_sin"] = np.sin(hour_rad)
    df["hour_cos"] = np.cos(hour_rad)

In [27]:
preprocess_datetime(df_train)
preprocess_datetime(df_test)

In [28]:
df_train.columns

Index(['building_id', 'meter', 'timestamp', 'meter_reading', 'site_id',
       'primary_use', 'square_feet', 'air_temperature', 'dew_temperature',
       'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction',
       'wind_speed', 'beaufort_scale', 'year', 'month', 'day', 'hour',
       'weekend', 'dayofweek', 'hour_sin', 'hour_cos'],
      dtype='object')

In [29]:
# df_train['dayofweek']

In [30]:
numericals = [
    'square_feet', 
    'beaufort_scale',
    'precip_depth_1_hr',
    'dew_temperature',
    'air_temperature',
    'hour_sin', 
    'hour_cos',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
#     'building_id',
#     'site_id'
]

In [31]:
categoricals = [
#     'building_id',
    'site_id', 
    'primary_use', 
#     'month',
#     'hour', 
#     'day', 
    'meter',  
    'month',
#     'weekend', 
    'dayofweek'
#     'wind_direction'
]

In [32]:
feat_cols = numericals + categoricals

In [33]:
# df_train.head()

In [34]:
# print(len(df_train['building_id'].unique()))
# print(len(df_train['site_id'].unique()))
# print(len(df_train['primary_use'].unique()))
# print(len(df_train['meter'].unique()))
# print(len(df_train['wind_direction'].unique()))

In [35]:
# df_train['wind_direction']

In [36]:
# feat_cols = categoricals + numericals

In [37]:
for category in categoricals:
    df_train[category] = df_train[category].astype('category')
    df_test[category] = df_test[category].astype('category')
#     df_submission[category] = df_submission[category].astype('category')

In [38]:
# df_train['meter'] = to_categorical(df_train['meter'])

In [39]:
# df_train_categorical = df_train[categoricals]

In [40]:
# df_train_categorical.columns

In [41]:
df_train = pd.get_dummies(df_train)
df_test = pd.get_dummies(df_test)
# df_submission = pd.get_dummies(df_submission)

In [42]:
# feat_cols_dummies = df_train.columns

In [43]:
# feat_cols += [
#     'meter_0', 'meter_1', 'meter_2',
#     'meter_3', 'site_id_0', 'site_id_2', 'site_id_3', 'site_id_4',
#     'site_id_6', 'site_id_7', 'site_id_8', 'site_id_9', 'site_id_10',
#     'site_id_11', 'site_id_13', 'site_id_14', 'site_id_15',
#     'primary_use_Education', 'primary_use_Entertainment/public assembly',
#     'primary_use_Food sales and service', 'primary_use_Healthcare',
#     'primary_use_Lodging/residential',
#     'primary_use_Manufacturing/industrial', 'primary_use_Office',
#     'primary_use_Other', 'primary_use_Parking',
#     'primary_use_Public services', 'primary_use_Religious worship',
#     'primary_use_Retail', 'primary_use_Services',
#     'primary_use_Technology/science', 'primary_use_Utility',
#     'primary_use_Warehouse/storage'
# ]

In [44]:
df_train.columns

Index(['building_id', 'timestamp', 'meter_reading', 'square_feet',
       'air_temperature', 'dew_temperature', 'precip_depth_1_hr',
       'sea_level_pressure', 'wind_direction', 'wind_speed', 'beaufort_scale',
       'year', 'day', 'hour', 'weekend', 'hour_sin', 'hour_cos', 'meter_0',
       'meter_1', 'meter_2', 'meter_3', 'site_id_0', 'site_id_2', 'site_id_3',
       'site_id_4', 'site_id_6', 'site_id_7', 'site_id_8', 'site_id_9',
       'site_id_10', 'site_id_11', 'site_id_13', 'site_id_14', 'site_id_15',
       'primary_use_Education', 'primary_use_Entertainment/public assembly',
       'primary_use_Food sales and service', 'primary_use_Healthcare',
       'primary_use_Lodging/residential',
       'primary_use_Manufacturing/industrial', 'primary_use_Office',
       'primary_use_Other', 'primary_use_Parking',
       'primary_use_Public services', 'primary_use_Religious worship',
       'primary_use_Retail', 'primary_use_Services',
       'primary_use_Technology/science', 'primary_

In [45]:
feat_cols = numericals + [
    'meter_0', 'meter_1', 'meter_2', 'meter_3', 
    'site_id_0', 'site_id_2', 'site_id_3', 'site_id_4', 'site_id_6', 'site_id_7', 'site_id_8', 'site_id_9',
    'site_id_10', 'site_id_11', 'site_id_13', 'site_id_14', 'site_id_15',
    'primary_use_Education', 'primary_use_Entertainment/public assembly',
    'primary_use_Food sales and service', 'primary_use_Healthcare',
    'primary_use_Lodging/residential',
    'primary_use_Manufacturing/industrial', 'primary_use_Office',
    'primary_use_Other', 'primary_use_Parking',
    'primary_use_Public services', 'primary_use_Religious worship',
    'primary_use_Retail', 'primary_use_Services',
    'primary_use_Technology/science', 'primary_use_Utility',
    'primary_use_Warehouse/storage', 'month_1', 'month_2', 'month_3',
    'month_4', 'month_5', 'month_6', 'month_7', 'month_8', 'month_9',
    'month_10', 'month_11', 'month_12', 'dayofweek_0', 'dayofweek_1',
    'dayofweek_2', 'dayofweek_3', 'dayofweek_4', 'dayofweek_5',
    'dayofweek_6'
]

In [46]:
feat_cols

['square_feet',
 'beaufort_scale',
 'precip_depth_1_hr',
 'dew_temperature',
 'air_temperature',
 'hour_sin',
 'hour_cos',
 'sea_level_pressure',
 'wind_direction',
 'wind_speed',
 'meter_0',
 'meter_1',
 'meter_2',
 'meter_3',
 'site_id_0',
 'site_id_2',
 'site_id_3',
 'site_id_4',
 'site_id_6',
 'site_id_7',
 'site_id_8',
 'site_id_9',
 'site_id_10',
 'site_id_11',
 'site_id_13',
 'site_id_14',
 'site_id_15',
 'primary_use_Education',
 'primary_use_Entertainment/public assembly',
 'primary_use_Food sales and service',
 'primary_use_Healthcare',
 'primary_use_Lodging/residential',
 'primary_use_Manufacturing/industrial',
 'primary_use_Office',
 'primary_use_Other',
 'primary_use_Parking',
 'primary_use_Public services',
 'primary_use_Religious worship',
 'primary_use_Retail',
 'primary_use_Services',
 'primary_use_Technology/science',
 'primary_use_Utility',
 'primary_use_Warehouse/storage',
 'month_1',
 'month_2',
 'month_3',
 'month_4',
 'month_5',
 'month_6',
 'month_7',
 'month_8'

In [47]:
# # # train, test 80/20 split without weather information
x_train, x_test, y_train, y_test = train_test_split(df_train[feat_cols], 
                                                    df_train['meter_reading'],
                                                    test_size = 0.2,
                                                    shuffle = True)

In [48]:
x_submit = df_test[feat_cols]

In [49]:
# x_train['hour'] = np.cos(x_train['hour'])
# x_test['hour'] = np.cos(x_test['hour'])
# x_submit['hour'] = np.cos(x_submit['hour'])

In [50]:
# one-hot encoding for categorical variables
# label_encoder = preprocessing.LabelEncoder()
# x_train['primary_use'] = label_encoder.fit_transform(x_train['primary_use'])
# x_test['primary_use'] = label_encoder.fit_transform(x_test['primary_use'])
# x_submit['primary_use'] = label_encoder.fit_transform(x_submit['primary_use'])

In [51]:
x_train.columns

Index(['square_feet', 'beaufort_scale', 'precip_depth_1_hr', 'dew_temperature',
       'air_temperature', 'hour_sin', 'hour_cos', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'meter_0', 'meter_1', 'meter_2',
       'meter_3', 'site_id_0', 'site_id_2', 'site_id_3', 'site_id_4',
       'site_id_6', 'site_id_7', 'site_id_8', 'site_id_9', 'site_id_10',
       'site_id_11', 'site_id_13', 'site_id_14', 'site_id_15',
       'primary_use_Education', 'primary_use_Entertainment/public assembly',
       'primary_use_Food sales and service', 'primary_use_Healthcare',
       'primary_use_Lodging/residential',
       'primary_use_Manufacturing/industrial', 'primary_use_Office',
       'primary_use_Other', 'primary_use_Parking',
       'primary_use_Public services', 'primary_use_Religious worship',
       'primary_use_Retail', 'primary_use_Services',
       'primary_use_Technology/science', 'primary_use_Utility',
       'primary_use_Warehouse/storage', 'month_1', 'month_2', 'month_3',

In [52]:
# x_train['primary_use']

In [53]:
# standardization
#     'square_feet', 
#     'beaufort_scale',
#     'precip_depth_1_hr',
#     'dew_temperature',
#     'air_temperature',
#     'month',
#     'hour',
#     'day', 
#     'sea_level_pressure',
#     'wind_direction',
#     'wind_speed'
    
numerical_standardize = ['square_feet', 'precip_depth_1_hr', 'dew_temperature', 
                         'air_temperature', 
                         'sea_level_pressure', 'wind_direction', 'wind_speed']
standard_scaler = preprocessing.StandardScaler().fit(x_train[numerical_standardize])
x_train[numerical_standardize] = standard_scaler.transform(x_train[numerical_standardize])
x_test[numerical_standardize] = standard_scaler.transform(x_test[numerical_standardize])
x_submit[numerical_standardize] = standard_scaler.transform(x_submit[numerical_standardize])
# standard_scaler = preprocessing.StandardScaler().fit(x_submit)
# x_submit = standard_scaler.transform(x_submit)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [54]:
# train, validation 80/20 split
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size = 0.2, shuffle = True)

In [55]:
x_train.shape, x_val.shape, x_test.shape
# , x_submit.shape

((9576523, 62), (2394131, 62), (2992664, 62))

In [56]:
# x_train[numericals]

### Evaluation metric (RMSLE)

In [57]:
import math

# Function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i, pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

### Baseline (Linear Regression)

In [44]:
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

In [45]:
clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10, 100]).fit(x_val, y_val.values)
y_pred_lr = clf.predict(x_test)

In [46]:
y_pred_lr[y_pred_lr < 0] = 0

In [47]:
baseline_rmsle = rmsle(list(y_test), list(y_pred_lr))

In [48]:
print('Ridge Regression RMSLE Score is {}'.format(baseline_rmsle))

Ridge Regression RMSLE Score is 4.583467525689335


In [49]:
lasso = LassoCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10, 100]).fit(x_val, y_val.values)
y_pred_lr = lasso.predict(x_test)

  positive)


In [50]:
y_pred_lr[y_pred_lr < 0] = 0

In [51]:
Lasso_rmsle = rmsle(list(y_test), list(y_pred_lr))

In [52]:
print('Lasso Regression RMSLE Score is {}'.format(Lasso_rmsle))

Lasso Regression RMSLE Score is 4.582095287863697


### Support Vector Regressor

In [53]:
from sklearn import svm
from sklearn.svm import LinearSVC, SVC
from sklearn import linear_model

In [54]:
# Linear Regressor
svr = linear_model.SGDRegressor(early_stopping=True, validation_fraction=0.2)
svr.fit(x_train, y_train.values)

SGDRegressor(alpha=0.0001, average=False, early_stopping=True, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=1000,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.2, verbose=0,
             warm_start=False)

In [55]:
y_pred_svr = svr.predict(x_test)
y_pred_svr[y_pred_svr < 0] = 0

In [56]:
svr_rmsle = rmsle(list(y_test), list(y_pred_svr))

In [57]:
print('SGDRegressor RMSLE Score is {}'.format(svr_rmsle))

SGDRegressor RMSLE Score is 4.692523335534162


### xgboost

In [58]:
!pip install xgboost

Collecting xgboost
[?25l  Downloading https://files.pythonhosted.org/packages/c1/24/5fe7237b2eca13ee0cfb100bec8c23f4e69ce9df852a64b0493d49dae4e0/xgboost-0.90-py2.py3-none-manylinux1_x86_64.whl (142.8MB)
[K    100% |████████████████████████████████| 142.8MB 340kB/s eta 0:00:01
Installing collected packages: xgboost
Successfully installed xgboost-0.90
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [59]:
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [60]:
import xgboost as xgb

In [61]:
from sklearn.model_selection import GridSearchCV

In [None]:
# xgb_model = XGBClassifier(
#     objective='reg:squarederror', 
#     silent=True, 
#     eval_metric='merror'
# )
# test_params = {
#     'learning_rate': [0.01, 0.1],
#     'max_depth': [2, 3, 4], 
#     'min_child_weight': [1, 2, 3]
# }
# model = GridSearchCV(estimator = xgb_model, param_grid = test_params, cv=5)
# # x_cv = get_feature_layer(cnn_model, x_comb_train)
# # y_cv = y_comb_train.argmax(axis=1)
# # model.fit(x_cv, y_cv)
# model.fit(x_val, y_val)
# print(model.best_params_)

In [62]:
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)
dtest = xgb.DMatrix(x_test, label=y_test)

  if getattr(data, 'base', None) is not None and \


In [71]:
pars = {
    'colsample_bytree': 0.8,                 
    'learning_rate': 0.35,
    'max_depth': 5,
    'subsample': 0.8,
    'objective': 'reg:squarederror',
}

In [72]:
model = xgb.train(
    pars,
    dtrain,
    num_boost_round=200,
    evals=[(dtrain, 'train'), (dval, 'val')],
    verbose_eval=5,
    early_stopping_rounds=40,
)

[0]	train-rmse:173139	val-rmse:173545
Multiple eval metrics have been passed: 'val-rmse' will be used for early stopping.

Will train until val-rmse hasn't improved in 40 rounds.
[5]	train-rmse:139100	val-rmse:138558
[10]	train-rmse:116364	val-rmse:115192
[15]	train-rmse:110288	val-rmse:109069
[20]	train-rmse:107724	val-rmse:106402
[25]	train-rmse:104427	val-rmse:103268
[30]	train-rmse:101386	val-rmse:101109
[35]	train-rmse:98495	val-rmse:98232.7
[40]	train-rmse:97237.3	val-rmse:97242.7
[45]	train-rmse:95827.8	val-rmse:95719.1
[50]	train-rmse:94696.9	val-rmse:94589.7
[55]	train-rmse:93187.1	val-rmse:93323.5
[60]	train-rmse:92896.7	val-rmse:93107.2
[65]	train-rmse:92096	val-rmse:92426
[70]	train-rmse:91900.7	val-rmse:92306.9
[75]	train-rmse:91451.8	val-rmse:91928.4
[80]	train-rmse:90650	val-rmse:91359.3
[85]	train-rmse:89088.6	val-rmse:89808
[90]	train-rmse:88483.7	val-rmse:89294.6
[95]	train-rmse:88296.3	val-rmse:89186.7
[100]	train-rmse:87923.9	val-rmse:88853.3
[105]	train-rmse:87665.

In [73]:
dpredict = model.predict(dtest)

In [74]:
dpredict[dpredict < 0] = 0

In [75]:
xgb_rmsle = rmsle(list(y_test), list(dpredict))

In [76]:
print('XGBoost RMSLE Score is {}'.format(xgb_rmsle))

XGBoost RMSLE Score is 3.630609326250741


In [78]:
joblib.dump(model, 'xgboost_3.h5') 

['xgboost_3.h5']

### lightgbm

In [58]:
# x_test.head()

In [58]:
!pip install lightgbm

Collecting lightgbm
[?25l  Downloading https://files.pythonhosted.org/packages/0b/9d/ddcb2f43aca194987f1a99e27edf41cf9bc39ea750c3371c2a62698c509a/lightgbm-2.3.1-py2.py3-none-manylinux1_x86_64.whl (1.2MB)
[K    100% |████████████████████████████████| 1.2MB 22.7MB/s ta 0:00:01
Installing collected packages: lightgbm
Successfully installed lightgbm-2.3.1
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [59]:
import lightgbm as lgb

In [60]:
# Use small max_bin
# Use small num_leaves
# Use min_data_in_leaf and min_sum_hessian_in_leaf
# Use bagging by set bagging_fraction and bagging_freq
# Use feature sub-sampling by set feature_fraction
# Use bigger training data
# Try lambda_l1, lambda_l2 and min_gain_to_split for regularization
# Try max_depth to avoid growing deep tree

### parameter default setting

boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
importance_type='split', learning_rate=0.1, max_depth=-1,
min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
subsample=1.0, subsample_for_bin=200000, subsample_freq=0

### using a small learning_rate for better accuracy

In [52]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'rmse'},
    'learning_rate': 0.001,
}

In [53]:
dftrainLGB = lgb.Dataset(data = x_train, label = y_train.values)

In [54]:
# training with more rounds
cv_results = lgb.cv(
    params,
    dftrainLGB,
    num_boost_round=1000,
    nfold=10,
    early_stopping_rounds=100,
    stratified=False,
    verbose_eval=True
)

[1]	cv_agg's rmse: 177071 + 6944.17
[2]	cv_agg's rmse: 176989 + 6943.27
[3]	cv_agg's rmse: 176907 + 6942.37
[4]	cv_agg's rmse: 176825 + 6941.38
[5]	cv_agg's rmse: 176744 + 6940.48
[6]	cv_agg's rmse: 176662 + 6939.49
[7]	cv_agg's rmse: 176581 + 6938.49
[8]	cv_agg's rmse: 176499 + 6937.51
[9]	cv_agg's rmse: 176419 + 6936.6
[10]	cv_agg's rmse: 176338 + 6935.61
[11]	cv_agg's rmse: 176257 + 6934.57
[12]	cv_agg's rmse: 176176 + 6933.57
[13]	cv_agg's rmse: 176096 + 6932.63
[14]	cv_agg's rmse: 176015 + 6931.62
[15]	cv_agg's rmse: 175934 + 6930.77
[16]	cv_agg's rmse: 175854 + 6929.79
[17]	cv_agg's rmse: 175774 + 6929.1
[18]	cv_agg's rmse: 175694 + 6928.3
[19]	cv_agg's rmse: 175614 + 6927.52
[20]	cv_agg's rmse: 175534 + 6926.75
[21]	cv_agg's rmse: 175454 + 6925.45
[22]	cv_agg's rmse: 175374 + 6924.59
[23]	cv_agg's rmse: 175294 + 6923.64
[24]	cv_agg's rmse: 175215 + 6923.03
[25]	cv_agg's rmse: 175134 + 6921.98
[26]	cv_agg's rmse: 175056 + 6920.97
[27]	cv_agg's rmse: 174976 + 6920.16
[28]	cv_agg's

[221]	cv_agg's rmse: 161785 + 6782.77
[222]	cv_agg's rmse: 161728 + 6780.86
[223]	cv_agg's rmse: 161669 + 6778.98
[224]	cv_agg's rmse: 161611 + 6776.99
[225]	cv_agg's rmse: 161554 + 6774.98
[226]	cv_agg's rmse: 161495 + 6774.16
[227]	cv_agg's rmse: 161437 + 6771.99
[228]	cv_agg's rmse: 161380 + 6770.94
[229]	cv_agg's rmse: 161322 + 6769.18
[230]	cv_agg's rmse: 161264 + 6768.53
[231]	cv_agg's rmse: 161206 + 6766.59
[232]	cv_agg's rmse: 161149 + 6764.84
[233]	cv_agg's rmse: 161092 + 6763.56
[234]	cv_agg's rmse: 161035 + 6761.81
[235]	cv_agg's rmse: 160976 + 6761.01
[236]	cv_agg's rmse: 160920 + 6759.67
[237]	cv_agg's rmse: 160862 + 6757.75
[238]	cv_agg's rmse: 160806 + 6756
[239]	cv_agg's rmse: 160749 + 6754.5
[240]	cv_agg's rmse: 160693 + 6753.18
[241]	cv_agg's rmse: 160636 + 6752.15
[242]	cv_agg's rmse: 160579 + 6750.18
[243]	cv_agg's rmse: 160523 + 6749.65
[244]	cv_agg's rmse: 160467 + 6747.4
[245]	cv_agg's rmse: 160410 + 6746.19
[246]	cv_agg's rmse: 160354 + 6744.44
[247]	cv_agg's rm

[438]	cv_agg's rmse: 150871 + 6479.82
[439]	cv_agg's rmse: 150828 + 6478.29
[440]	cv_agg's rmse: 150785 + 6476.5
[441]	cv_agg's rmse: 150742 + 6474.53
[442]	cv_agg's rmse: 150699 + 6472.91
[443]	cv_agg's rmse: 150656 + 6471.21
[444]	cv_agg's rmse: 150615 + 6469.47
[445]	cv_agg's rmse: 150572 + 6468.21
[446]	cv_agg's rmse: 150530 + 6466.91
[447]	cv_agg's rmse: 150488 + 6464.61
[448]	cv_agg's rmse: 150447 + 6463.61
[449]	cv_agg's rmse: 150405 + 6462.95
[450]	cv_agg's rmse: 150363 + 6460.95
[451]	cv_agg's rmse: 150321 + 6459.07
[452]	cv_agg's rmse: 150280 + 6457.47
[453]	cv_agg's rmse: 150238 + 6455.25
[454]	cv_agg's rmse: 150197 + 6454.84
[455]	cv_agg's rmse: 150155 + 6453.09
[456]	cv_agg's rmse: 150114 + 6451.41
[457]	cv_agg's rmse: 150073 + 6450.43
[458]	cv_agg's rmse: 150032 + 6448.74
[459]	cv_agg's rmse: 149991 + 6447.5
[460]	cv_agg's rmse: 149949 + 6446.45
[461]	cv_agg's rmse: 149908 + 6444.95
[462]	cv_agg's rmse: 149867 + 6443.28
[463]	cv_agg's rmse: 149826 + 6442.08
[464]	cv_agg's

[655]	cv_agg's rmse: 142784 + 6120.48
[656]	cv_agg's rmse: 142752 + 6117.84
[657]	cv_agg's rmse: 142718 + 6116.33
[658]	cv_agg's rmse: 142686 + 6113.82
[659]	cv_agg's rmse: 142655 + 6110.71
[660]	cv_agg's rmse: 142624 + 6108.37
[661]	cv_agg's rmse: 142592 + 6105.84
[662]	cv_agg's rmse: 142559 + 6102.12
[663]	cv_agg's rmse: 142527 + 6098.84
[664]	cv_agg's rmse: 142495 + 6095.61
[665]	cv_agg's rmse: 142464 + 6092.44
[666]	cv_agg's rmse: 142432 + 6089.24
[667]	cv_agg's rmse: 142400 + 6087.67
[668]	cv_agg's rmse: 142369 + 6085.2
[669]	cv_agg's rmse: 142337 + 6083.03
[670]	cv_agg's rmse: 142306 + 6081.97
[671]	cv_agg's rmse: 142275 + 6080.32
[672]	cv_agg's rmse: 142245 + 6077.84
[673]	cv_agg's rmse: 142213 + 6074.12
[674]	cv_agg's rmse: 142182 + 6070.93
[675]	cv_agg's rmse: 142150 + 6069.67
[676]	cv_agg's rmse: 142118 + 6066.77
[677]	cv_agg's rmse: 142087 + 6065.32
[678]	cv_agg's rmse: 142055 + 6063.44
[679]	cv_agg's rmse: 142024 + 6060.68
[680]	cv_agg's rmse: 141993 + 6058.75
[681]	cv_agg'

[872]	cv_agg's rmse: 136822 + 5758.06
[873]	cv_agg's rmse: 136798 + 5756.97
[874]	cv_agg's rmse: 136775 + 5754.94
[875]	cv_agg's rmse: 136751 + 5753.26
[876]	cv_agg's rmse: 136726 + 5751.46
[877]	cv_agg's rmse: 136703 + 5749.99
[878]	cv_agg's rmse: 136679 + 5747.79
[879]	cv_agg's rmse: 136655 + 5745.59
[880]	cv_agg's rmse: 136632 + 5744.23
[881]	cv_agg's rmse: 136608 + 5742.51
[882]	cv_agg's rmse: 136584 + 5739.93
[883]	cv_agg's rmse: 136560 + 5737.8
[884]	cv_agg's rmse: 136537 + 5737.17
[885]	cv_agg's rmse: 136513 + 5735.86
[886]	cv_agg's rmse: 136489 + 5733.84
[887]	cv_agg's rmse: 136466 + 5731.97
[888]	cv_agg's rmse: 136443 + 5729.82
[889]	cv_agg's rmse: 136419 + 5728.56
[890]	cv_agg's rmse: 136395 + 5728.03
[891]	cv_agg's rmse: 136373 + 5726.19
[892]	cv_agg's rmse: 136350 + 5724.77
[893]	cv_agg's rmse: 136326 + 5722.63
[894]	cv_agg's rmse: 136302 + 5721.45
[895]	cv_agg's rmse: 136280 + 5720.38
[896]	cv_agg's rmse: 136256 + 5718.97
[897]	cv_agg's rmse: 136234 + 5717.36
[898]	cv_agg'

In [55]:
# params
print('Current parameters:\n', params)
print('\nBest num_boost_round:', len(cv_results['rmse-mean']))
print('Best CV score:', cv_results['rmse-mean'][-1])

Current parameters:
 {'boosting_type': 'gbdt', 'objective': 'regression', 'metric': {'rmse'}, 'learning_rate': 0.001}

Best num_boost_round: 1000
Best CV score: 134023.82354100602


In [56]:
num_boost_rounds_lgb=len(cv_results['rmse-mean'])
model_lgb = lgb.train(params, dftrainLGB, num_boost_round=num_boost_rounds_lgb)

In [57]:
y_pred_lgb = model_lgb.predict(x_test)
y_pred_lgb[y_pred_lgb < 0] = 0

In [58]:
lgb_rmsle = rmsle(list(y_test), list(y_pred_lgb))

In [59]:
print('lightgbm RMSLE Score [learning rate, num of iteration] is {}'.format(lgb_rmsle))

lightgbm RMSLE Score [learning rate, num of iteration] is 3.695083389567998


### using larger num_of_leaves for better accuracy

In [60]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'rmse'},
    'num_leave': 100
}

In [61]:
# training with more rounds
cv_results = lgb.cv(
    params,
    dftrainLGB,
    num_boost_round=500,
    nfold=10,
    early_stopping_rounds=100,
    stratified=False,
    verbose_eval=True
)

[1]	cv_agg's rmse: 169217 + 6853.87
[2]	cv_agg's rmse: 162469 + 6922.88
[3]	cv_agg's rmse: 156712 + 6828.7
[4]	cv_agg's rmse: 151651 + 6627.8
[5]	cv_agg's rmse: 147296 + 6559.29
[6]	cv_agg's rmse: 143636 + 6405.42
[7]	cv_agg's rmse: 140350 + 6249.69
[8]	cv_agg's rmse: 137676 + 6148.8
[9]	cv_agg's rmse: 135387 + 6055.95
[10]	cv_agg's rmse: 133132 + 5877.36
[11]	cv_agg's rmse: 131383 + 5906.72
[12]	cv_agg's rmse: 129652 + 5709.31
[13]	cv_agg's rmse: 128072 + 5646.13
[14]	cv_agg's rmse: 126503 + 5558.58
[15]	cv_agg's rmse: 125195 + 5334.46
[16]	cv_agg's rmse: 124021 + 5156.91
[17]	cv_agg's rmse: 122870 + 4972.13
[18]	cv_agg's rmse: 121873 + 4914.21
[19]	cv_agg's rmse: 120958 + 4751.24
[20]	cv_agg's rmse: 120216 + 4555.82
[21]	cv_agg's rmse: 119492 + 4539.3
[22]	cv_agg's rmse: 118735 + 4468.24
[23]	cv_agg's rmse: 118028 + 4465
[24]	cv_agg's rmse: 117405 + 4344.15
[25]	cv_agg's rmse: 116903 + 4336.18
[26]	cv_agg's rmse: 116431 + 4350.64
[27]	cv_agg's rmse: 115974 + 4330.3
[28]	cv_agg's rmse

[221]	cv_agg's rmse: 102621 + 3332.14
[222]	cv_agg's rmse: 102599 + 3343.09
[223]	cv_agg's rmse: 102573 + 3334.6
[224]	cv_agg's rmse: 102525 + 3303.09
[225]	cv_agg's rmse: 102505 + 3298.84
[226]	cv_agg's rmse: 102495 + 3297.07
[227]	cv_agg's rmse: 102467 + 3316.04
[228]	cv_agg's rmse: 102435 + 3310.8
[229]	cv_agg's rmse: 102415 + 3308.99
[230]	cv_agg's rmse: 102396 + 3285.82
[231]	cv_agg's rmse: 102391 + 3283.5
[232]	cv_agg's rmse: 102339 + 3309.12
[233]	cv_agg's rmse: 102295 + 3312.79
[234]	cv_agg's rmse: 102272 + 3320.81
[235]	cv_agg's rmse: 102248 + 3350.76
[236]	cv_agg's rmse: 102164 + 3355.85
[237]	cv_agg's rmse: 102148 + 3368.91
[238]	cv_agg's rmse: 102127 + 3348.54
[239]	cv_agg's rmse: 102105 + 3351.65
[240]	cv_agg's rmse: 102076 + 3351.57
[241]	cv_agg's rmse: 102010 + 3328.41
[242]	cv_agg's rmse: 101966 + 3343.61
[243]	cv_agg's rmse: 101893 + 3347.81
[244]	cv_agg's rmse: 101828 + 3365.35
[245]	cv_agg's rmse: 101773 + 3341.43
[246]	cv_agg's rmse: 101760 + 3338.13
[247]	cv_agg's 

[435]	cv_agg's rmse: 96968.7 + 3499.89
[436]	cv_agg's rmse: 96956.6 + 3492.77
[437]	cv_agg's rmse: 96946.1 + 3475.52
[438]	cv_agg's rmse: 96943.8 + 3474.01
[439]	cv_agg's rmse: 96921.6 + 3480.86
[440]	cv_agg's rmse: 96885.9 + 3474.73
[441]	cv_agg's rmse: 96853 + 3467.16
[442]	cv_agg's rmse: 96831.1 + 3465.15
[443]	cv_agg's rmse: 96812 + 3464.82
[444]	cv_agg's rmse: 96798.1 + 3442.1
[445]	cv_agg's rmse: 96771.2 + 3418.38
[446]	cv_agg's rmse: 96772.4 + 3426.5
[447]	cv_agg's rmse: 96709.1 + 3470
[448]	cv_agg's rmse: 96695.2 + 3481.41
[449]	cv_agg's rmse: 96673.2 + 3459.07
[450]	cv_agg's rmse: 96655.8 + 3438.24
[451]	cv_agg's rmse: 96649.4 + 3436.39
[452]	cv_agg's rmse: 96653.3 + 3439.8
[453]	cv_agg's rmse: 96643 + 3450.74
[454]	cv_agg's rmse: 96621.3 + 3451.22
[455]	cv_agg's rmse: 96613.8 + 3453.05
[456]	cv_agg's rmse: 96591.1 + 3448.67
[457]	cv_agg's rmse: 96576.7 + 3439.44
[458]	cv_agg's rmse: 96561.9 + 3446.55
[459]	cv_agg's rmse: 96546 + 3430.31
[460]	cv_agg's rmse: 96535.5 + 3423.9
[

In [62]:
# params
print('Current parameters:\n', params)
print('\nBest num_boost_round:', len(cv_results['rmse-mean']))
print('Best CV score:', cv_results['rmse-mean'][-1])

Current parameters:
 {'boosting_type': 'gbdt', 'objective': 'regression', 'metric': {'rmse'}, 'num_leave': 100}

Best num_boost_round: 500
Best CV score: 95725.27285569778


In [63]:
num_boost_rounds_lgb=len(cv_results['rmse-mean'])
model_lgb = lgb.train(params, dftrainLGB, num_boost_round=num_boost_rounds_lgb)

In [64]:
y_pred_lgb = model_lgb.predict(x_test)
y_pred_lgb[y_pred_lgb < 0] = 0

In [65]:
lgb_rmsle = rmsle(list(y_test), list(y_pred_lgb))

In [66]:
print('lightgbm RMSLE Score [num_of_leaves] is {}'.format(lgb_rmsle))

lightgbm RMSLE Score [num_of_leaves] is 2.515375561461128


### Parameter Tuning [grid search]

In [97]:
import gc
gc.collect()

7

In [98]:
# x_train

In [61]:
from sklearn.model_selection import GridSearchCV
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 0.25],
    'num_leaves': [10, 30, 50],
    'max_depth': [-1, 5, 10]
}

gbm = GridSearchCV(estimator, param_grid)
gbm.fit(x_train, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1, num_leaves=31,
                                     objective=None, random_state=None,
                                     reg_alpha=0.0, reg_lambda=0.0, silent=True,
                                     subsample=1.0, subsample_for_bin=200000,
                                     subsample_freq=0),
             iid='warn', n_jobs=None,
             param_grid={'learning_rate': [0.01, 0.1, 0.25],
                         'max_depth': [-1, 5, 10], 'num_leaves': [10, 30, 50]},
             pre_dispatc

In [62]:
gbm.best_params_

{'learning_rate': 0.25, 'max_depth': -1, 'num_leaves': 50}

In [63]:
print('Best parameters found by grid search are: ', gbm.best_params_)

Best parameters found by grid search are:  {'learning_rate': 0.25, 'max_depth': -1, 'num_leaves': 50}


### Default setting LightGBM model

In [64]:
lgb_model = lgb.LGBMRegressor()

In [65]:
lgb_model

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [66]:
# x_train.head()

In [67]:
lgb_model.set_params(
    n_estimators = 10000,
    learning_rate = 0.25, 
    max_depth = -1, 
    num_leaves = 50
)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.25, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=10000, n_jobs=-1, num_leaves=50, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [68]:
x_train.columns

Index(['square_feet', 'beaufort_scale', 'precip_depth_1_hr', 'dew_temperature',
       'air_temperature', 'hour_sin', 'hour_cos', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'meter_0', 'meter_1', 'meter_2',
       'meter_3', 'site_id_0', 'site_id_2', 'site_id_3', 'site_id_4',
       'site_id_6', 'site_id_7', 'site_id_8', 'site_id_9', 'site_id_10',
       'site_id_11', 'site_id_13', 'site_id_14', 'site_id_15',
       'primary_use_Education', 'primary_use_Entertainment/public assembly',
       'primary_use_Food sales and service', 'primary_use_Healthcare',
       'primary_use_Lodging/residential',
       'primary_use_Manufacturing/industrial', 'primary_use_Office',
       'primary_use_Other', 'primary_use_Parking',
       'primary_use_Public services', 'primary_use_Religious worship',
       'primary_use_Retail', 'primary_use_Services',
       'primary_use_Technology/science', 'primary_use_Utility',
       'primary_use_Warehouse/storage', 'month_1', 'month_2', 'month_3',

In [69]:
lgb_model.fit(
    x_train, 
    y_train.values, 
    eval_metric='rmlse',
    eval_set=[(x_val, y_val)],
    verbose = True,
    early_stopping_rounds=500
)

[1]	valid_0's l2: 1.71127e+10
Training until validation scores don't improve for 500 rounds
[2]	valid_0's l2: 1.17248e+10
[3]	valid_0's l2: 8.39639e+09
[4]	valid_0's l2: 6.58314e+09
[5]	valid_0's l2: 5.51823e+09
[6]	valid_0's l2: 4.87777e+09
[7]	valid_0's l2: 4.43911e+09
[8]	valid_0's l2: 4.06313e+09
[9]	valid_0's l2: 3.8036e+09
[10]	valid_0's l2: 3.66955e+09
[11]	valid_0's l2: 3.48395e+09
[12]	valid_0's l2: 3.32999e+09
[13]	valid_0's l2: 3.24615e+09
[14]	valid_0's l2: 3.20275e+09
[15]	valid_0's l2: 3.16612e+09
[16]	valid_0's l2: 3.1326e+09
[17]	valid_0's l2: 3.08947e+09
[18]	valid_0's l2: 3.07858e+09
[19]	valid_0's l2: 3.05755e+09
[20]	valid_0's l2: 3.03611e+09
[21]	valid_0's l2: 3.03354e+09
[22]	valid_0's l2: 3.03036e+09
[23]	valid_0's l2: 2.99506e+09
[24]	valid_0's l2: 2.96223e+09
[25]	valid_0's l2: 2.959e+09
[26]	valid_0's l2: 2.95607e+09
[27]	valid_0's l2: 2.94593e+09
[28]	valid_0's l2: 2.92837e+09
[29]	valid_0's l2: 2.92916e+09
[30]	valid_0's l2: 2.92847e+09
[31]	valid_0's l2: 2.

[261]	valid_0's l2: 2.22593e+09
[262]	valid_0's l2: 2.21776e+09
[263]	valid_0's l2: 2.21736e+09
[264]	valid_0's l2: 2.21869e+09
[265]	valid_0's l2: 2.21807e+09
[266]	valid_0's l2: 2.21793e+09
[267]	valid_0's l2: 2.21921e+09
[268]	valid_0's l2: 2.21841e+09
[269]	valid_0's l2: 2.21841e+09
[270]	valid_0's l2: 2.21679e+09
[271]	valid_0's l2: 2.21676e+09
[272]	valid_0's l2: 2.21683e+09
[273]	valid_0's l2: 2.21683e+09
[274]	valid_0's l2: 2.21671e+09
[275]	valid_0's l2: 2.21652e+09
[276]	valid_0's l2: 2.21652e+09
[277]	valid_0's l2: 2.21675e+09
[278]	valid_0's l2: 2.21658e+09
[279]	valid_0's l2: 2.21654e+09
[280]	valid_0's l2: 2.21604e+09
[281]	valid_0's l2: 2.21461e+09
[282]	valid_0's l2: 2.21389e+09
[283]	valid_0's l2: 2.21423e+09
[284]	valid_0's l2: 2.21283e+09
[285]	valid_0's l2: 2.2132e+09
[286]	valid_0's l2: 2.21349e+09
[287]	valid_0's l2: 2.21629e+09
[288]	valid_0's l2: 2.21142e+09
[289]	valid_0's l2: 2.21191e+09
[290]	valid_0's l2: 2.21193e+09
[291]	valid_0's l2: 2.21532e+09
[292]	val

[520]	valid_0's l2: 2.05451e+09
[521]	valid_0's l2: 2.05568e+09
[522]	valid_0's l2: 2.05566e+09
[523]	valid_0's l2: 2.05568e+09
[524]	valid_0's l2: 2.05568e+09
[525]	valid_0's l2: 2.05566e+09
[526]	valid_0's l2: 2.05551e+09
[527]	valid_0's l2: 2.05549e+09
[528]	valid_0's l2: 2.05281e+09
[529]	valid_0's l2: 2.05397e+09
[530]	valid_0's l2: 2.05396e+09
[531]	valid_0's l2: 2.05396e+09
[532]	valid_0's l2: 2.05201e+09
[533]	valid_0's l2: 2.05178e+09
[534]	valid_0's l2: 2.05171e+09
[535]	valid_0's l2: 2.05219e+09
[536]	valid_0's l2: 2.05218e+09
[537]	valid_0's l2: 2.04982e+09
[538]	valid_0's l2: 2.04937e+09
[539]	valid_0's l2: 2.0492e+09
[540]	valid_0's l2: 2.04857e+09
[541]	valid_0's l2: 2.04846e+09
[542]	valid_0's l2: 2.04839e+09
[543]	valid_0's l2: 2.0482e+09
[544]	valid_0's l2: 2.04811e+09
[545]	valid_0's l2: 2.04807e+09
[546]	valid_0's l2: 2.04746e+09
[547]	valid_0's l2: 2.04722e+09
[548]	valid_0's l2: 2.04695e+09
[549]	valid_0's l2: 2.04614e+09
[550]	valid_0's l2: 2.04597e+09
[551]	vali

[778]	valid_0's l2: 2.01862e+09
[779]	valid_0's l2: 2.0184e+09
[780]	valid_0's l2: 2.01847e+09
[781]	valid_0's l2: 2.01845e+09
[782]	valid_0's l2: 2.01847e+09
[783]	valid_0's l2: 2.01831e+09
[784]	valid_0's l2: 2.01808e+09
[785]	valid_0's l2: 2.01812e+09
[786]	valid_0's l2: 2.01812e+09
[787]	valid_0's l2: 2.01802e+09
[788]	valid_0's l2: 2.01802e+09
[789]	valid_0's l2: 2.01805e+09
[790]	valid_0's l2: 2.01808e+09
[791]	valid_0's l2: 2.01716e+09
[792]	valid_0's l2: 2.017e+09
[793]	valid_0's l2: 2.01681e+09
[794]	valid_0's l2: 2.01681e+09
[795]	valid_0's l2: 2.01687e+09
[796]	valid_0's l2: 2.01686e+09
[797]	valid_0's l2: 2.01745e+09
[798]	valid_0's l2: 2.01739e+09
[799]	valid_0's l2: 2.01738e+09
[800]	valid_0's l2: 2.01739e+09
[801]	valid_0's l2: 2.01724e+09
[802]	valid_0's l2: 2.01687e+09
[803]	valid_0's l2: 2.01687e+09
[804]	valid_0's l2: 2.01675e+09
[805]	valid_0's l2: 2.01673e+09
[806]	valid_0's l2: 2.01673e+09
[807]	valid_0's l2: 2.01672e+09
[808]	valid_0's l2: 2.01672e+09
[809]	valid

[1034]	valid_0's l2: 1.99462e+09
[1035]	valid_0's l2: 1.9946e+09
[1036]	valid_0's l2: 1.99461e+09
[1037]	valid_0's l2: 1.99462e+09
[1038]	valid_0's l2: 1.99464e+09
[1039]	valid_0's l2: 1.99577e+09
[1040]	valid_0's l2: 1.9934e+09
[1041]	valid_0's l2: 1.99354e+09
[1042]	valid_0's l2: 1.99366e+09
[1043]	valid_0's l2: 1.99366e+09
[1044]	valid_0's l2: 1.99298e+09
[1045]	valid_0's l2: 1.99322e+09
[1046]	valid_0's l2: 1.99314e+09
[1047]	valid_0's l2: 1.99323e+09
[1048]	valid_0's l2: 1.99323e+09
[1049]	valid_0's l2: 1.99323e+09
[1050]	valid_0's l2: 1.99323e+09
[1051]	valid_0's l2: 1.99319e+09
[1052]	valid_0's l2: 1.99319e+09
[1053]	valid_0's l2: 1.99316e+09
[1054]	valid_0's l2: 1.99193e+09
[1055]	valid_0's l2: 1.99185e+09
[1056]	valid_0's l2: 1.99134e+09
[1057]	valid_0's l2: 1.99117e+09
[1058]	valid_0's l2: 1.99117e+09
[1059]	valid_0's l2: 1.99083e+09
[1060]	valid_0's l2: 1.99083e+09
[1061]	valid_0's l2: 1.99079e+09
[1062]	valid_0's l2: 1.98994e+09
[1063]	valid_0's l2: 1.99043e+09
[1064]	valid

[1283]	valid_0's l2: 1.97922e+09
[1284]	valid_0's l2: 1.97923e+09
[1285]	valid_0's l2: 1.97928e+09
[1286]	valid_0's l2: 1.9793e+09
[1287]	valid_0's l2: 1.97941e+09
[1288]	valid_0's l2: 1.9794e+09
[1289]	valid_0's l2: 1.97942e+09
[1290]	valid_0's l2: 1.97951e+09
[1291]	valid_0's l2: 1.97955e+09
[1292]	valid_0's l2: 1.97994e+09
[1293]	valid_0's l2: 1.9799e+09
[1294]	valid_0's l2: 1.9799e+09
[1295]	valid_0's l2: 1.98004e+09
[1296]	valid_0's l2: 1.97994e+09
[1297]	valid_0's l2: 1.97984e+09
[1298]	valid_0's l2: 1.98009e+09
[1299]	valid_0's l2: 1.98008e+09
[1300]	valid_0's l2: 1.98007e+09
[1301]	valid_0's l2: 1.98007e+09
[1302]	valid_0's l2: 1.98019e+09
[1303]	valid_0's l2: 1.97996e+09
[1304]	valid_0's l2: 1.97996e+09
[1305]	valid_0's l2: 1.98004e+09
[1306]	valid_0's l2: 1.98029e+09
[1307]	valid_0's l2: 1.98031e+09
[1308]	valid_0's l2: 1.9798e+09
[1309]	valid_0's l2: 1.97974e+09
[1310]	valid_0's l2: 1.9798e+09
[1311]	valid_0's l2: 1.9797e+09
[1312]	valid_0's l2: 1.97928e+09
[1313]	valid_0's 

[1534]	valid_0's l2: 1.97314e+09
[1535]	valid_0's l2: 1.97253e+09
[1536]	valid_0's l2: 1.97233e+09
[1537]	valid_0's l2: 1.97234e+09
[1538]	valid_0's l2: 1.97256e+09
[1539]	valid_0's l2: 1.97277e+09
[1540]	valid_0's l2: 1.97278e+09
[1541]	valid_0's l2: 1.97246e+09
[1542]	valid_0's l2: 1.97292e+09
[1543]	valid_0's l2: 1.97323e+09
[1544]	valid_0's l2: 1.97323e+09
[1545]	valid_0's l2: 1.97321e+09
[1546]	valid_0's l2: 1.97322e+09
[1547]	valid_0's l2: 1.97322e+09
[1548]	valid_0's l2: 1.97322e+09
[1549]	valid_0's l2: 1.97322e+09
[1550]	valid_0's l2: 1.97346e+09
[1551]	valid_0's l2: 1.9731e+09
[1552]	valid_0's l2: 1.97306e+09
[1553]	valid_0's l2: 1.97303e+09
[1554]	valid_0's l2: 1.97341e+09
[1555]	valid_0's l2: 1.97352e+09
[1556]	valid_0's l2: 1.97349e+09
[1557]	valid_0's l2: 1.97349e+09
[1558]	valid_0's l2: 1.97349e+09
[1559]	valid_0's l2: 1.9725e+09
[1560]	valid_0's l2: 1.9725e+09
[1561]	valid_0's l2: 1.97279e+09
[1562]	valid_0's l2: 1.97279e+09
[1563]	valid_0's l2: 1.97278e+09
[1564]	valid_

[1785]	valid_0's l2: 1.96963e+09
[1786]	valid_0's l2: 1.96894e+09
[1787]	valid_0's l2: 1.96903e+09
[1788]	valid_0's l2: 1.96913e+09
[1789]	valid_0's l2: 1.96917e+09
[1790]	valid_0's l2: 1.96916e+09
[1791]	valid_0's l2: 1.969e+09
[1792]	valid_0's l2: 1.96811e+09
[1793]	valid_0's l2: 1.96813e+09
[1794]	valid_0's l2: 1.96621e+09
[1795]	valid_0's l2: 1.96628e+09
[1796]	valid_0's l2: 1.96617e+09
[1797]	valid_0's l2: 1.96615e+09
[1798]	valid_0's l2: 1.96611e+09
[1799]	valid_0's l2: 1.9661e+09
[1800]	valid_0's l2: 1.96596e+09
[1801]	valid_0's l2: 1.96596e+09
[1802]	valid_0's l2: 1.96576e+09
[1803]	valid_0's l2: 1.96576e+09
[1804]	valid_0's l2: 1.96551e+09
[1805]	valid_0's l2: 1.9655e+09
[1806]	valid_0's l2: 1.96553e+09
[1807]	valid_0's l2: 1.96554e+09
[1808]	valid_0's l2: 1.96555e+09
[1809]	valid_0's l2: 1.96555e+09
[1810]	valid_0's l2: 1.96562e+09
[1811]	valid_0's l2: 1.96527e+09
[1812]	valid_0's l2: 1.96516e+09
[1813]	valid_0's l2: 1.96416e+09
[1814]	valid_0's l2: 1.96416e+09
[1815]	valid_0

[2034]	valid_0's l2: 1.94716e+09
[2035]	valid_0's l2: 1.94707e+09
[2036]	valid_0's l2: 1.94606e+09
[2037]	valid_0's l2: 1.94604e+09
[2038]	valid_0's l2: 1.94603e+09
[2039]	valid_0's l2: 1.94592e+09
[2040]	valid_0's l2: 1.94603e+09
[2041]	valid_0's l2: 1.94603e+09
[2042]	valid_0's l2: 1.94603e+09
[2043]	valid_0's l2: 1.94667e+09
[2044]	valid_0's l2: 1.94634e+09
[2045]	valid_0's l2: 1.94634e+09
[2046]	valid_0's l2: 1.94649e+09
[2047]	valid_0's l2: 1.9465e+09
[2048]	valid_0's l2: 1.9465e+09
[2049]	valid_0's l2: 1.94652e+09
[2050]	valid_0's l2: 1.94652e+09
[2051]	valid_0's l2: 1.94652e+09
[2052]	valid_0's l2: 1.94651e+09
[2053]	valid_0's l2: 1.94651e+09
[2054]	valid_0's l2: 1.94658e+09
[2055]	valid_0's l2: 1.9466e+09
[2056]	valid_0's l2: 1.9466e+09
[2057]	valid_0's l2: 1.94662e+09
[2058]	valid_0's l2: 1.94661e+09
[2059]	valid_0's l2: 1.94661e+09
[2060]	valid_0's l2: 1.94649e+09
[2061]	valid_0's l2: 1.9465e+09
[2062]	valid_0's l2: 1.9465e+09
[2063]	valid_0's l2: 1.94649e+09
[2064]	valid_0's

[2284]	valid_0's l2: 1.94193e+09
[2285]	valid_0's l2: 1.94249e+09
[2286]	valid_0's l2: 1.94232e+09
[2287]	valid_0's l2: 1.94222e+09
[2288]	valid_0's l2: 1.94218e+09
[2289]	valid_0's l2: 1.94206e+09
[2290]	valid_0's l2: 1.94171e+09
[2291]	valid_0's l2: 1.94154e+09
[2292]	valid_0's l2: 1.94151e+09
[2293]	valid_0's l2: 1.94135e+09
[2294]	valid_0's l2: 1.94135e+09
[2295]	valid_0's l2: 1.94135e+09
[2296]	valid_0's l2: 1.9413e+09
[2297]	valid_0's l2: 1.94145e+09
[2298]	valid_0's l2: 1.94145e+09
[2299]	valid_0's l2: 1.94103e+09
[2300]	valid_0's l2: 1.94035e+09
[2301]	valid_0's l2: 1.94035e+09
[2302]	valid_0's l2: 1.94016e+09
[2303]	valid_0's l2: 1.94026e+09
[2304]	valid_0's l2: 1.94026e+09
[2305]	valid_0's l2: 1.94026e+09
[2306]	valid_0's l2: 1.94025e+09
[2307]	valid_0's l2: 1.94028e+09
[2308]	valid_0's l2: 1.94027e+09
[2309]	valid_0's l2: 1.94027e+09
[2310]	valid_0's l2: 1.94026e+09
[2311]	valid_0's l2: 1.94026e+09
[2312]	valid_0's l2: 1.94035e+09
[2313]	valid_0's l2: 1.94033e+09
[2314]	vali

[2535]	valid_0's l2: 1.9429e+09
[2536]	valid_0's l2: 1.94247e+09
[2537]	valid_0's l2: 1.94248e+09
[2538]	valid_0's l2: 1.94246e+09
[2539]	valid_0's l2: 1.94246e+09
[2540]	valid_0's l2: 1.94246e+09
[2541]	valid_0's l2: 1.94246e+09
[2542]	valid_0's l2: 1.94229e+09
[2543]	valid_0's l2: 1.94229e+09
[2544]	valid_0's l2: 1.94229e+09
[2545]	valid_0's l2: 1.94229e+09
[2546]	valid_0's l2: 1.9423e+09
[2547]	valid_0's l2: 1.9423e+09
[2548]	valid_0's l2: 1.9423e+09
[2549]	valid_0's l2: 1.9423e+09
[2550]	valid_0's l2: 1.94214e+09
[2551]	valid_0's l2: 1.94199e+09
[2552]	valid_0's l2: 1.94227e+09
[2553]	valid_0's l2: 1.94227e+09
[2554]	valid_0's l2: 1.94227e+09
[2555]	valid_0's l2: 1.94239e+09
[2556]	valid_0's l2: 1.94237e+09
[2557]	valid_0's l2: 1.94254e+09
[2558]	valid_0's l2: 1.94254e+09
[2559]	valid_0's l2: 1.94254e+09
[2560]	valid_0's l2: 1.94254e+09
[2561]	valid_0's l2: 1.94254e+09
[2562]	valid_0's l2: 1.94254e+09
[2563]	valid_0's l2: 1.94254e+09
[2564]	valid_0's l2: 1.94249e+09
[2565]	valid_0'

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.25, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=10000, n_jobs=-1, num_leaves=50, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [70]:
y_pred_lgb = lgb_model.predict(x_test)
y_pred_lgb[y_pred_lgb < 0] = 0

In [71]:
lgb_rmsle = rmsle(list(y_test), list(y_pred_lgb))

In [72]:
print('lightgbm RMSLE Score is {}'.format(lgb_rmsle))

lightgbm RMSLE Score is 1.9625795828689219


In [73]:
joblib.dump(lgb_model, 'model/lgb_model.pkl') 

['model/lgb_model.pkl']

### Predicting RMSLE on submission file

In [74]:
from keras.models import load_model

Using TensorFlow backend.


In [75]:
# lgb_model = load_model('model/lgb_model.pkl')

In [76]:
x_train.shape, x_submit.shape, x_test.shape

((9576523, 62), (41697600, 62), (2992664, 62))

In [77]:
x_train.columns, x_submit.columns

(Index(['square_feet', 'beaufort_scale', 'precip_depth_1_hr', 'dew_temperature',
        'air_temperature', 'hour_sin', 'hour_cos', 'sea_level_pressure',
        'wind_direction', 'wind_speed', 'meter_0', 'meter_1', 'meter_2',
        'meter_3', 'site_id_0', 'site_id_2', 'site_id_3', 'site_id_4',
        'site_id_6', 'site_id_7', 'site_id_8', 'site_id_9', 'site_id_10',
        'site_id_11', 'site_id_13', 'site_id_14', 'site_id_15',
        'primary_use_Education', 'primary_use_Entertainment/public assembly',
        'primary_use_Food sales and service', 'primary_use_Healthcare',
        'primary_use_Lodging/residential',
        'primary_use_Manufacturing/industrial', 'primary_use_Office',
        'primary_use_Other', 'primary_use_Parking',
        'primary_use_Public services', 'primary_use_Religious worship',
        'primary_use_Retail', 'primary_use_Services',
        'primary_use_Technology/science', 'primary_use_Utility',
        'primary_use_Warehouse/storage', 'month_1', 'month

In [78]:
x_submit.columns

Index(['square_feet', 'beaufort_scale', 'precip_depth_1_hr', 'dew_temperature',
       'air_temperature', 'hour_sin', 'hour_cos', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'meter_0', 'meter_1', 'meter_2',
       'meter_3', 'site_id_0', 'site_id_2', 'site_id_3', 'site_id_4',
       'site_id_6', 'site_id_7', 'site_id_8', 'site_id_9', 'site_id_10',
       'site_id_11', 'site_id_13', 'site_id_14', 'site_id_15',
       'primary_use_Education', 'primary_use_Entertainment/public assembly',
       'primary_use_Food sales and service', 'primary_use_Healthcare',
       'primary_use_Lodging/residential',
       'primary_use_Manufacturing/industrial', 'primary_use_Office',
       'primary_use_Other', 'primary_use_Parking',
       'primary_use_Public services', 'primary_use_Religious worship',
       'primary_use_Retail', 'primary_use_Services',
       'primary_use_Technology/science', 'primary_use_Utility',
       'primary_use_Warehouse/storage', 'month_1', 'month_2', 'month_3',

In [79]:
# lgb_model = load_model('model/lgb_model.h5')

In [80]:
# 41697600/4, 41697600/4*2, 41697600/4*3, 41697600

In [81]:
# x_submit[:10424400].shape

In [82]:
y_pred_lgb = lgb_model.predict(x_submit)
y_pred_lgb[y_pred_lgb < 0] = 0

In [83]:
# y_pred_lgb_2 = lgb_model.predict(x_submit[10424400:20848800])
# y_pred_lgb_2[y_pred_lgb_2 < 0] = 0

In [84]:
# y_pred_lgb_3 = lgb_model.predict(x_submit[20848800:31273200])
# y_pred_lgb_3[y_pred_lgb_3 < 0] = 0

In [85]:
# y_pred_lgb_4 = lgb_model.predict(x_submit[31273200:])
# y_pred_lgb_4[y_pred_lgb_4 < 0] = 0

In [86]:
y_pred_lgb.shape

(41697600,)

In [87]:
df_test.shape

(41697600, 72)

In [88]:
# df_test.head()

In [89]:
df_test['meter_reading'] = y_pred_lgb

In [90]:
df_test.head()

Unnamed: 0,row_id,building_id,timestamp,square_feet,air_temperature,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,...,month_11,month_12,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,dayofweek_5,dayofweek_6,meter_reading
0,0,0,2017-01-01,8.91355,17.8,11.7,,1021.5,100.0,3.6,...,0,0,0,0,0,0,0,0,1,0.0
1,1,1,2017-01-01,7.908387,17.8,11.7,,1021.5,100.0,3.6,...,0,0,0,0,0,0,0,0,1,0.0
2,2,2,2017-01-01,8.5897,17.8,11.7,,1021.5,100.0,3.6,...,0,0,0,0,0,0,0,0,1,0.0
3,3,3,2017-01-01,10.072597,17.8,11.7,,1021.5,100.0,3.6,...,0,0,0,0,0,0,0,0,1,0.0
4,4,4,2017-01-01,11.666565,17.8,11.7,,1021.5,100.0,3.6,...,0,0,0,0,0,0,0,0,1,237.484234


In [91]:
x_submit.shape

(41697600, 62)

In [92]:
df_submission.shape

(24936697, 2)

In [93]:
test_submission = pd.read_csv('data/test_submission_full.csv')
print(test_submission.shape)

(41697600, 2)


In [94]:
test_submission.head()

Unnamed: 0,row_id,meter_reading
0,0,2.331686
1,1,0.0
2,2,2.331686
3,3,40.16047
4,4,359.384284


In [95]:
test_submission['meter_reading'] = df_test['meter_reading']

In [96]:
test_submission.shape

(41697600, 2)

In [97]:
test_submission.head()

Unnamed: 0,row_id,meter_reading
0,0,0.0
1,1,0.0
2,2,0.0
3,3,0.0
4,4,237.484234


In [98]:
# df_submission_test = pd.DataFrame([], columns = ['row_id', 'meter_reading'])
# df_submission_test['row_id'] = x_submit['row_id']
# df_submission_test['meter_reading'] = x_submit['meter_reading']
test_submission.to_csv('data/test_submission_full.csv', index=False)

### Small LSTM

In [None]:
import numpy as np
import pandas as pd"tpw3487
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, GlobalMaxPooling1D, SpatialDropout1D
from keras.optimizers import RMSprop,Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from keras import losses

In [None]:
x_train.shape, x_val.shape, x_test.shape

In [None]:
x_train = x_train.reshape(9576523, 1, 9)
x_val = x_val.reshape(2394131, 1, 9)
x_test = x_test.reshape(2992664, 1, 9)

# x_submit = x_submit.values
# x_submit = x_submit.reshape(24936697, 1, 9)

In [None]:
x_train.shape, x_val.shape, x_submit.shape

In [None]:
input_dim = (x_train.shape[1], x_train.shape[2])

In [None]:
callbacks = [EarlyStopping(monitor='val_loss', patience=15),
             ModelCheckpoint(filepath='simple_lstm_2.h5', monitor='val_loss', save_best_only=True)]

In [None]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.LSTM(units=30, return_sequences=True, input_shape=input_dim))
model.add(tf.keras.layers.LSTM(units=16, return_sequences=True))
model.add(tf.keras.layers.Dropout(0.8))
model.add(tf.keras.layers.Dense(1))
model.compile(optimizer=tf.keras.optimizers.Adam(clipvalue=1.0), 
              loss=losses.mean_squared_logarithmic_error)

In [None]:
model.summary()

In [None]:
model.fit(x_train, y_train, epochs=50, batch_size=1024, validation_data=(x_val, y_val), callbacks=callbacks)

In [35]:
y_pred_nn = model.predict(x_test)
y_pred_nn[y_pred_nn < 0] = 0

In [38]:
nn_rmsle = rmsle(list(y_test), list(y_pred_nn))

In [39]:
print('LSTM RMSLE Score is {}'.format(nn_rmsle))

LSTM RMSLE Score is 2.1848922221754057


In [119]:
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x7f912c1b86a0>

In [121]:
x_test.shape, x_submit.shape

((2992664, 1, 8), (24936697, 1, 8))

In [None]:
y_pred_nn_submit = model.predict(x_submit)
y_pred_nn_submit[y_pred_nn_submit < 0] = 0

In [None]:
df_submission.columns

In [None]:
df_submission['meter_reading'] = y_pred_nn_submit

In [None]:
df_submission.to_csv('data/test_submission.csv')

### LSTM with weather variables

In [224]:
print(x_train.shape, x_val.shape, x_test.shape)

(9576523, 41) (2394131, 41) (2992664, 41)


In [226]:
x_train = x_train.reshape(9576523, 1, 41)
# x_val = x_val.reshape(2394131, 1, 41)
# x_test = x_test.reshape(2992664, 1, 41)

AttributeError: 'DataFrame' object has no attribute 'reshape'

In [205]:
callbacks = [EarlyStopping(monitor='val_loss', patience=15),
             ModelCheckpoint(filepath='simple_lstm_improved.h5', monitor='val_loss', save_best_only=True)]

In [206]:
model_2 = tf.keras.models.Sequential()
model_2.add(tf.keras.layers.LSTM(units=30, return_sequences=True, input_shape=(1, 9)))
model_2.add(tf.keras.layers.LSTM(units=32, return_sequences=True))
model_2.add(tf.keras.layers.Dropout(0.8))
model_2.add(tf.keras.layers.LSTM(units=16, return_sequences=True))
model_2.add(tf.keras.layers.Dropout(0.8))
model_2.add(tf.keras.layers.Dense(1))
model_2.compile(optimizer=tf.keras.optimizers.Adam(clipvalue=1.0), 
                loss=losses.mean_squared_logarithmic_error)

W1110 21:13:27.702513 140638866609984 nn_ops.py:4224] Large dropout rate: 0.8 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.
W1110 21:13:27.924329 140638866609984 nn_ops.py:4224] Large dropout rate: 0.8 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


In [207]:
model_2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 1, 30)             4800      
_________________________________________________________________
lstm_5 (LSTM)                (None, 1, 32)             8064      
_________________________________________________________________
dropout_2 (Dropout)          (None, 1, 32)             0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 1, 16)             3136      
_________________________________________________________________
dropout_3 (Dropout)          (None, 1, 16)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1, 1)              17        
Total params: 16,017
Trainable params: 16,017
Non-trainable params: 0
__________________________________________________

In [159]:
model_2.fit(x_train, y_train, epochs=50, batch_size=1024, validation_data=(x_val, y_val), callbacks=callbacks)

Train on 9576523 samples, validate on 2394131 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f4c1d82b630>

In [160]:
y_pred_nn = model_2.predict(x_test)
y_pred_nn[y_pred_nn < 0] = 0

In [161]:
nn_rmsle = rmsle(list(y_test), list(y_pred_nn))

In [162]:
print('LSTM RMSLE Score is {}'.format(nn_rmsle))

LSTM RMSLE Score is 2.1839909176624137


In [165]:
# 2.18489