# Great Energy Predictor - Modeling
#### Hosted by: ASHRAE
##### Source: https://www.kaggle.com/c/ashrae-energy-prediction

### Dependencies

In [1]:
%matplotlib inline

import src.utils as udf

import gc
import joblib
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
import xgboost as xgb
import optuna

# import os

# Pandas on dask
# os.environ['MODIN_ENGINE'] = 'dask'
# import modin.pandas as dd 

# Pandas on ray
# os.environ['MODIN_ENGINE'] = 'ray'
# import ray
# ray.init()
# import modin.pandas as ray

In [2]:
# Plot settings
sns.set(rc={'figure.figsize': (16, 4),
            'font.size': 16})

### Load data

In [3]:
path = '../data/from_mod/'

In [4]:
train1 = pd.read_pickle(f'{path}train_meter1.pkl')
train1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3518870 entries, 71 to 18264845
Data columns (total 17 columns):
building_id           uint16
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(4)
memory usage: 221.5+ MB


In [5]:
train1 = udf.reduce_mem_usage(train1)
train1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3518870 entries, 71 to 18264845
Data columns (total 17 columns):
building_id           uint16
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(4)
memory usage: 221.5+ MB


In [6]:
train1.head()

Unnamed: 0,building_id,meter_reading,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
71,163,4.5719,-5.6,1017.299988,0.0,Education,72102,1970,0,1,0,0.0,0.0,39.616142,0,US,1
76,166,209.886002,-5.6,1017.299988,0.0,Lodging/residential,553210,2009,0,1,0,0.0,0.0,39.616142,0,US,1
81,168,51.556999,-5.6,1017.299988,0.0,Education,183460,2005,0,1,0,0.0,0.0,39.616142,0,US,1
84,169,176.686005,-5.6,1017.299988,0.0,Education,179559,2006,0,1,0,0.0,0.0,39.616142,0,US,1
87,170,11.2891,-5.6,1017.299988,0.0,Retail,45224,1982,0,1,0,0.0,0.0,39.616142,0,US,1


In [7]:
train2 = pd.read_pickle(f'{path}train_meter2.pkl')
train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2296049 entries, 765 to 18264889
Data columns (total 17 columns):
building_id           uint16
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(4)
memory usage: 144.5+ MB


In [8]:
train2 = udf.reduce_mem_usage(train2)
train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2296049 entries, 765 to 18264889
Data columns (total 17 columns):
building_id           uint16
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(4)
memory usage: 144.5+ MB


In [9]:
train2.head()

Unnamed: 0,building_id,meter_reading,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
765,747,120.786003,2.2,1020.0,1.5,Education,30531,1964,1,1,0,0.5,0.866025,70.474915,0,US,1
769,750,4678.140137,2.2,1020.0,1.5,Education,35990,1964,1,1,0,0.5,0.866025,70.474915,0,US,1
771,753,324.226013,2.2,1020.0,1.5,Education,39558,1964,1,1,0,0.5,0.866025,70.474915,0,US,1
774,757,918.984009,2.2,1020.0,1.5,Education,46813,1964,1,1,0,0.5,0.866025,70.474915,0,US,1
776,758,256.165985,2.2,1020.0,1.5,Lodging/residential,47007,1968,1,1,0,0.5,0.866025,70.474915,0,US,1


In [10]:
train3 = pd.read_pickle(f'{path}train_meter3.pkl')
train3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 919708 entries, 11 to 18264677
Data columns (total 17 columns):
building_id           919708 non-null uint16
meter_reading         919708 non-null float32
dew_temperature       919708 non-null float32
sea_level_pressure    919708 non-null float32
wind_speed            919708 non-null float32
primary_use           919708 non-null object
square_feet           919708 non-null uint32
year_built            919708 non-null uint16
missing_year          919708 non-null uint8
dayofyear             919708 non-null uint16
hour                  919708 non-null uint8
wind_direction_x      919708 non-null float32
wind_direction_y      919708 non-null float32
rel_humidity          919708 non-null float32
is_weekend            919708 non-null uint8
country               919708 non-null object
is_holiday            919708 non-null uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(4)
memory usage: 57.9+ MB


In [11]:
train3 = udf.reduce_mem_usage(train3)
train3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 919708 entries, 11 to 18264677
Data columns (total 17 columns):
building_id           919708 non-null uint16
meter_reading         919708 non-null float32
dew_temperature       919708 non-null float32
sea_level_pressure    919708 non-null float32
wind_speed            919708 non-null float32
primary_use           919708 non-null object
square_feet           919708 non-null uint32
year_built            919708 non-null uint16
missing_year          919708 non-null uint8
dayofyear             919708 non-null uint16
hour                  919708 non-null uint8
wind_direction_x      919708 non-null float32
wind_direction_y      919708 non-null float32
rel_humidity          919708 non-null float32
is_weekend            919708 non-null uint8
country               919708 non-null object
is_holiday            919708 non-null uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(4)
memory usage: 57.9+ MB


In [12]:
train3.head()

Unnamed: 0,building_id,meter_reading,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
11,112,96.977997,2.4,1020.900024,3.1,Education,32206,1960,1,1,0,-0.5,-0.866025,90.549408,0,UK,1
13,113,19.597,2.4,1020.900024,3.1,Education,100481,1958,0,1,0,-0.5,-0.866025,90.549408,0,UK,1
15,114,100.0,2.4,1020.900024,3.1,Education,139683,1958,0,1,0,-0.5,-0.866025,90.549408,0,UK,1
19,117,19.680901,2.4,1020.900024,3.1,Education,15489,2004,0,1,0,-0.5,-0.866025,90.549408,0,UK,1
22,119,200.0,2.4,1020.900024,3.1,Education,91149,2007,0,1,0,-0.5,-0.866025,90.549408,0,UK,1


In [13]:
del path
gc.collect()

22

### Functions

In [14]:
# Train/validation/test split
def tvt_split(df):
    X = df.drop('meter_reading', axis=1)
    y = df[['meter_reading']]
    y = np.log1p(y)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)
    return X_train, y_train, X_val, y_val, X_test, y_test

In [15]:
def transform_feats(X_train, y_train, X_val, X_test):
    X_train, X_val, X_test, rare_dict = udf.rare_encoder(X_train, X_test, ['primary_use'], val=X_val)
    X_train, X_val, X_test, mean_dict = udf.mean_encoder(X_train, y_train, X_test, ['primary_use', 'country'], X_val=X_val)
    X_train_scaled, X_val_scaled, X_test_scaled = udf.scale_feats(X_train, X_test, val=X_val)
    
    return X_train_scaled, X_val_scaled, X_test_scaled

In [16]:
def lgbm_rmsle(X_train, y_train, X_val, y_val, X_test, y_test, params_dict, save_path):
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val)
    
    lgbm = lgb.train(params_dict, dtrain, valid_sets=[dtrain, dval], valid_names=['train', 'valid'], verbose_eval=False)
    lgbm.save_model(save_path)
    
    pred_train = lgbm.predict(X_train)
    pred_train[pred_train < 0] = 0
    rmsle_train = np.sqrt(mean_squared_log_error(y_train, pred_train))
    
    pred_val = lgbm.predict(X_val)
    pred_val[pred_val < 0] = 0
    rmsle_val = np.sqrt(mean_squared_log_error(y_val, pred_val))
    
    pred_test = lgbm.predict(X_test)
    pred_test[pred_test < 0] = 0
    rmsle_test = np.sqrt(mean_squared_log_error(y_test, pred_test))
    
    return rmsle_train, rmsle_val, rmsle_test

In [17]:
def xgb_rmsle(X_train, y_train, X_val, y_val, X_test, y_test, params_dict, save_path):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    xg = xgb.train(params_dict, dtrain,
                   evals=[(dtrain, 'train'), (dval, 'valid')],
                   num_boost_round=1000,
                   early_stopping_rounds=10,
                   verbose_eval=False)
    xg.save_model(save_path)
    
    pred_train = xg.predict(dtrain)
    pred_train[pred_train < 0] = 0
    rmsle_train = np.sqrt(mean_squared_log_error(y_train, pred_train))

    pred_val = xg.predict(dval)
    pred_val[pred_val < 0] = 0
    rmsle_val = np.sqrt(mean_squared_log_error(y_val, pred_val))

    pred_test = xg.predict(dtest)
    pred_test[pred_test < 0] = 0
    rmsle_test = np.sqrt(mean_squared_log_error(y_test, pred_test))
    
    return rmsle_train, rmsle_val, rmsle_test

### Train/test split - chilled water

In [18]:
Xc_train, yc_train, Xc_val, yc_val, Xc_test, yc_test = tvt_split(train1)
print('Train set:', Xc_train.shape, yc_train.shape)
print('Validation set:', Xc_val.shape, yc_val.shape)
print('Test set:', Xc_test.shape, yc_test.shape)

Train set: (2111322, 16) (2111322, 1)
Validation set: (703774, 16) (703774, 1)
Test set: (703774, 16) (703774, 1)


In [19]:
del train1
gc.collect()

0

### Feature transformation - chilled water

In [20]:
Xc_train_scaled, Xc_val_scaled, Xc_test_scaled = transform_feats(Xc_train, yc_train, Xc_val, Xc_test)
Xc_train_scaled.head()



Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,-1.562845,0.660151,-0.930315,-0.715817,0.75697,-0.142244,1.132501,-1.370127,0.439333,-0.813992,-0.657531,1.325628,-0.537515,1.598019,-0.173728,-0.170191
1,1.190213,0.920849,0.240857,2.155033,0.611782,-0.951649,1.855347,-1.370127,0.399516,-1.104583,0.814772,-1.381928,1.535949,-0.625775,-0.173728,-0.170191
2,-1.348291,-1.637593,-1.308255,0.519071,-1.579297,-0.837591,0.242844,0.729859,0.867364,1.074848,-0.413719,-1.456952,-1.9839,-0.625775,-0.173728,-0.170191
3,-1.743521,1.53783,1.01986,0.519071,-0.341213,0.616311,2.077762,-1.370127,0.090935,0.203075,1.450669,0.761589,0.445447,1.598019,-0.173728,-0.170191
4,0.591722,0.167073,-0.295374,0.994028,-1.579297,0.040093,0.076033,0.729859,-0.585952,0.05778,-0.153869,1.50867,1.114079,-0.625775,-0.173728,-0.170191


In [21]:
del Xc_train, Xc_val, Xc_test
gc.collect()

20

### LightGBM - chilled water

In [22]:
# Objective function for parameter optimization
def objective_lgb1(trial):
    
    joblib.dump(study_lgb1, '../objects/chilledwater/study_lgb1.pkl')
    
    dtrain = lgb.Dataset(Xc_train_scaled, label=yc_train)
    dval = lgb.Dataset(Xc_val_scaled, label=yc_val)

    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-4, 1e1),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-4, 1e1),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'num_leaves': trial.suggest_int('num_leaves', 2, 2048),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 5000),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
        'num_iterations': 5000,
        'early_stopping_round': 20,
        'metric': 'rmse',
        'num_threads': -1,
        'seed': 42
    }
    
    lgbm = lgb.train(params, dtrain, valid_sets=[dtrain, dval], valid_names=['train', 'valid'], verbose_eval=False)
    pred = lgbm.predict(Xc_val_scaled)
    pred[pred < 0] = 0
    loss = mean_squared_log_error(yc_val, pred)

#     cv = lgb.cv(params, dtrain, folds=KFold(10, shuffle=True, random_state=42), verbose_eval=False)
#     loss = cv['rmse-mean'][-1]
    
    return loss

In [23]:
# print(datetime.datetime.now())

# # Enable logging
# optuna.logging.enable_default_handler()

# # Optimize parameters
# study_lgb1 = optuna.create_study(direction='minimize')
# study_lgb1.optimize(objective_lgb1, n_trials=50)

# print(datetime.datetime.now())

# print(f'Finished trials: {len(study_lgb1.trials)}')
# print(f'Best trial: {study_lgb1.best_trial.value}')
# study_lgb1.best_trial.params

In [24]:
# Study results
study_lgb1 = joblib.load('../objects/chilledwater/study_lgb1.pkl')
print(f'Finished trials: {len(study_lgb1.trials)}')
print(f'Best trial: {study_lgb1.best_trial.value}')
study_lgb1.best_trial.params

Finished trials: 50
Best trial: 0.009449663067638923


{'learning_rate': 0.020424864262841822,
 'lambda_l1': 0.27986455703148144,
 'lambda_l2': 7.08804479307531,
 'max_depth': 32,
 'num_leaves': 1293,
 'min_data_in_leaf': 23,
 'bagging_fraction': 0.9143657877950819,
 'feature_fraction': 0.701556769695671}

In [68]:
# Parameters from Optuna
params_lgb1 = dict(study_lgb1.best_trial.params)
params_lgb1['num_iterations'] = 10000
params_lgb1['early_stopping_round'] = 20
params_lgb1['metric'] = 'rmse'
params_lgb1['num_threads'] = -1
params_lgb1['seed'] = 42

# Root mean squared log error
rmsle_train, rmsle_val, rmsle_test = lgbm_rmsle(Xc_train_scaled, yc_train, Xc_val_scaled, yc_val, Xc_test_scaled, yc_test, params_lgb1, '../objects/chilledwater/lgbm1.txt')
print('Train RMSLE:', rmsle_train)
print('Validation RMSLE:', rmsle_val)
print('Test RMSLE:', rmsle_test)



Train RMSLE: 0.05925545877877479
Validation RMSLE: 0.09362403376475784
Test RMSLE: 0.0943962574506281


In [None]:
del rmsle_train, rmsle_val, rmsle_test
gc.collect()

### XGBoost - chilled water

In [25]:
# Objective function for parameter optimization
def objective_xgb1(trial):
    
    joblib.dump(study_xgb1, '../objects/chilledwater/study_xgb1.pkl')
    
    dtrain = xgb.DMatrix(Xc_val_scaled, label=yc_val)
    dval = xgb.DMatrix(Xc_test_scaled, label=yc_test)

    params = {
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'alpha': trial.suggest_loguniform('alpha', 1e-4, 1e1),
        'lambda': trial.suggest_loguniform('lambda', 1e-4, 1e1),
        'gamma': trial.suggest_loguniform('gamma', 1e-4, 1e1),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'max_leaves': trial.suggest_int('max_leaves', 2, 2024),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'eval_metric': 'rmse',
        'seed': 42
    }
    
    xg = xgb.train(params, dtrain, 
                   evals=[(dtrain, 'train'), (dval, 'valid')],
                   num_boost_round=1000, 
                   early_stopping_rounds=10,
                   verbose_eval=False)
    pred = xg.predict(dval)
    pred[pred < 0] = 0
    loss = mean_squared_log_error(yc_test, pred)

#     cv = xgb.cv(params, dtrain, folds=KFold(10, shuffle=True, random_state=42), verbose_eval=False)
#     loss = cv['rmse-mean'][-1]
    
    return loss

In [26]:
# print(f'Start: {datetime.datetime.now()}')

# # Optimize parameters
# study_xgb1 = optuna.create_study(direction='minimize')
# study_xgb1.optimize(objective_xgb1, n_trials=50)

# print(f'End: {datetime.datetime.now()}')

# print(f'Finished trials: {len(study_xgb1.trials)}')
# print(f'Best trial: {study_xgb1.best_trial.value}')
# study_xgb1.best_trial.params

In [27]:
# Study results
study_xgb1 = joblib.load('../objects/chilledwater/study_xgb1.pkl')
print(f'Finished trials: {len(study_xgb1.trials)}')
print(f'Best trial: {study_xgb1.best_trial.value}')
study_xgb1.best_trial.params

Finished trials: 50
Best trial: 0.01182694174349308


{'grow_policy': 'depthwise',
 'learning_rate': 0.03801178677708901,
 'alpha': 0.051064916441743415,
 'lambda': 0.890189928673286,
 'gamma': 0.053083931710043705,
 'max_depth': 16,
 'max_leaves': 1131,
 'subsample': 0.7245353487626778,
 'colsample_bytree': 0.777530288110325}

In [70]:
# Parameters from Optuna
params_xgb1 = dict(study_xgb1.best_trial.params)
params_xgb1['eval_metric'] = 'rmse'
params_xgb1['seed'] = 42

rmsle_train, rmsle_val, rmsle_test = xgb_rmsle(Xc_train_scaled, yc_train, Xc_val_scaled, yc_val, Xc_test_scaled, yc_test, params_xgb1, '../objects/chilledwater/xg1.txt')
print('Train RMSLE:', rmsle_train)
print('Validation RMSLE:', rmsle_val)
print('Test RMSLE:', rmsle_test)

Train RMSLE: 0.037819352
Validation RMSLE: 0.08988674
Test RMSLE: 0.09077369


In [None]:
del rmsle_train, rmsle_val, rmsle_test
gc.collect()

### Train/test split - steam

In [28]:
Xs_train, ys_train, Xs_val, ys_val, Xs_test, ys_test = tvt_split(train2)
print('Train set:', Xs_train.shape, ys_train.shape)
print('Validation set:', Xs_val.shape, ys_val.shape)
print('Test set:', Xs_test.shape, ys_test.shape)

Train set: (1377629, 16) (1377629, 1)
Validation set: (459210, 16) (459210, 1)
Test set: (459210, 16) (459210, 1)


In [29]:
del train2
gc.collect()

20

### Feature transformation - steam

In [30]:
Xs_train_scaled, Xs_val_scaled, Xs_test_scaled = transform_feats(Xs_train, ys_train, Xs_val, Xs_test)
Xs_train_scaled.head()



Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,-1.162719,-0.286393,0.453574,-0.740216,-1.852592,-0.560811,0.116663,0.535564,-0.857836,-1.086741,-0.157262,1.652178,-0.792023,-0.631759,-0.181878,-0.177634
1,0.511915,-1.472452,-0.549881,0.198366,-0.474143,1.124846,-0.221431,0.535564,-1.473199,-1.086741,-0.8411,1.2932,0.844909,-0.631759,-0.181878,-0.177634
2,-1.881141,1.247458,0.105153,-0.47205,-0.188589,-1.037873,-0.153812,0.535564,0.251682,-0.362719,1.531355,0.321111,-0.365016,-0.631759,-0.181878,-0.177634
3,-0.130618,0.311095,0.760187,-0.248578,-1.852592,0.826786,0.116663,0.535564,0.671248,-1.231545,-0.8411,1.2932,0.516242,-0.631759,-0.181878,-0.177634
4,0.355076,-0.634185,0.133028,0.198366,0.68993,-0.12007,0.2519,0.535564,-1.240107,-0.073111,-1.339644,0.321111,0.903768,-0.631759,-0.181878,-0.177634


In [31]:
del Xs_train, Xs_val, Xs_test
gc.collect()

20

### LightGBM - steam

In [32]:
# Objective function for parameter optimization
def objective_lgb2(trial):
    
    joblib.dump(study_lgb2, '../objects/steam/study_lgb2.pkl')
    
    dtrain = lgb.Dataset(Xs_train_scaled, label=ys_train)
    dval = lgb.Dataset(Xs_val_scaled, label=ys_val)

    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-4, 1e1),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-4, 1e1),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'num_leaves': trial.suggest_int('num_leaves', 2, 2048),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 5000),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
        'num_iterations': 5000,
        'early_stopping_round': 20,
        'metric': 'rmse',
        'num_threads': -1,
        'seed': 42
    }
    
    lgbm = lgb.train(params, dtrain, valid_sets=[dtrain, dval], valid_names=['train', 'valid'], verbose_eval=False)
    pred = lgbm.predict(Xs_val_scaled)
    pred[pred < 0] = 0
    loss = mean_squared_log_error(ys_val, pred)

#     cv = lgb.cv(params, dtrain, folds=KFold(10, shuffle=True, random_state=42), verbose_eval=False)
#     loss = cv['rmse-mean'][-1]
    
    return loss

In [33]:
# print(datetime.datetime.now())

# # Enable logging
# optuna.logging.enable_default_handler()

# # Optimize parameters
# study_lgb2 = optuna.create_study(direction='minimize')
# study_lgb2.optimize(objective_lgb2, n_trials=50)

# print(datetime.datetime.now())

# print(f'Finished trials: {len(study_lgb2.trials)}')
# print(f'Best trial: {study_lgb2.best_trial.value}')
# study_lgb2.best_trial.params

In [34]:
# Study results
study_lgb2 = joblib.load('../objects/steam/study_lgb2.pkl')
print(f'Finished trials: {len(study_lgb2.trials)}')
print(f'Best trial: {study_lgb2.best_trial.value}')
study_lgb2.best_trial.params

Finished trials: 50
Best trial: 0.008178677525527289


{'learning_rate': 0.04854268157540762,
 'lambda_l1': 0.0028266466239996284,
 'lambda_l2': 0.00010211303918218761,
 'max_depth': 56,
 'num_leaves': 409,
 'min_data_in_leaf': 8,
 'bagging_fraction': 0.5012612103606688,
 'feature_fraction': 0.8537936823847498}

In [35]:
# Parameters from Optuna
params_lgb2 = dict(study_lgb2.best_trial.params)
params_lgb2['num_iterations'] = 10000
params_lgb2['early_stopping_round'] = 20
params_lgb2['metric'] = 'rmse'
params_lgb2['num_threads'] = -1
params_lgb2['seed'] = 42

# Root mean squared log error
rmsle_train, rmsle_val, rmsle_test = lgbm_rmsle(Xs_train_scaled, ys_train, Xs_val_scaled, ys_val, Xs_test_scaled, ys_test, params_lgb2, '../objects/steam/lgbm2.txt')
print('Train RMSLE:', rmsle_train)
print('Validation RMSLE:', rmsle_val)
print('Test RMSLE:', rmsle_test)



Train RMSLE: 0.05803045259033331
Validation RMSLE: 0.09043599169296251
Test RMSLE: 0.09134125124560871


In [36]:
del rmsle_train, rmsle_val, rmsle_test
gc.collect()

46

### XGBoost - steam

In [35]:
# Objective function for parameter optimization
def objective_xgb2(trial):
    
    joblib.dump(study_xgb2, '../objects/steam/study_xgb2.pkl')
    
    dtrain = xgb.DMatrix(Xs_val_scaled, label=ys_val)
    dval = xgb.DMatrix(Xs_test_scaled, label=ys_test)

    params = {
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'alpha': trial.suggest_loguniform('alpha', 1e-4, 1e1),
        'lambda': trial.suggest_loguniform('lambda', 1e-4, 1e1),
        'gamma': trial.suggest_loguniform('gamma', 1e-4, 1e1),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'max_leaves': trial.suggest_int('max_leaves', 2, 2024),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'eval_metric': 'rmse',
        'seed': 42
    }
    
    xg = xgb.train(params, dtrain, 
                   evals=[(dtrain, 'train'), (dval, 'valid')],
                   num_boost_round=1000, 
                   early_stopping_rounds=10,
                   verbose_eval=False)
    pred = xg.predict(dval)
    pred[pred < 0] = 0
    loss = mean_squared_log_error(ys_test, pred)

#     cv = xgb.cv(params, dtrain, folds=KFold(10, shuffle=True, random_state=42), verbose_eval=False)
#     loss = cv['rmse-mean'][-1]
    
    return loss

In [36]:
# print(f'Start: {datetime.datetime.now()}')

# # Optimize parameters
# study_xgb2 = optuna.create_study(direction='minimize')
# study_xgb2.optimize(objective_xgb2, n_trials=50)

# print(f'End: {datetime.datetime.now()}')

# print(f'Finished trials: {len(study_xgb2.trials)}')
# print(f'Best trial: {study_xgb2.best_trial.value}')
# study_xgb2.best_trial.params

In [37]:
# Study results
study_xgb2 = joblib.load('../objects/steam/study_xgb2.pkl')
print(f'Finished trials: {len(study_xgb2.trials)}')
print(f'Best trial: {study_xgb2.best_trial.value}')
study_xgb2.best_trial.params

Finished trials: 50
Best trial: 0.009739160537719727


{'grow_policy': 'depthwise',
 'learning_rate': 0.045664576584020004,
 'alpha': 1.9071087639650297,
 'lambda': 0.00021496551872384197,
 'gamma': 0.15536767802205387,
 'max_depth': 59,
 'max_leaves': 1626,
 'subsample': 0.926581683911481,
 'colsample_bytree': 0.8620824662132568}

In [40]:
# Parameters from Optuna
params_xgb2 = dict(study_xgb2.best_trial.params)
params_xgb2['eval_metric'] = 'rmse'
params_xgb2['seed'] = 42

rmsle_train, rmsle_val, rmsle_test = xgb_rmsle(Xs_train_scaled, ys_train, Xs_val_scaled, ys_val, Xs_test_scaled, ys_test, params_xgb2, '../objects/steam/xg2.txt')
print('Train RMSLE:', rmsle_train)
print('Validation RMSLE:', rmsle_val)
print('Test RMSLE:', rmsle_test)

Train RMSLE: 0.038833246
Validation RMSLE: 0.0891143
Test RMSLE: 0.08991938


In [41]:
del rmsle_train, rmsle_val, rmsle_test
gc.collect()

116

### Train/test split - hot water

In [38]:
Xh_train, yh_train, Xh_val, yh_val, Xh_test, yh_test = tvt_split(train3)
print('Train set:', Xh_train.shape, yh_train.shape)
print('Validation set:', Xh_val.shape, yh_val.shape)
print('Test set:', Xh_test.shape, yh_test.shape)

Train set: (551824, 16) (551824, 1)
Validation set: (183942, 16) (183942, 1)
Test set: (183942, 16) (183942, 1)


In [39]:
del train3
gc.collect()

20

### Feature transformation - hot water

In [40]:
Xh_train_scaled, Xh_val_scaled, Xh_test_scaled = transform_feats(Xh_train, yh_train, Xh_val, Xh_test)
Xh_train_scaled.head()



Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,-1.378836,0.250694,-4.35399,0.70818,-0.294643,-1.085843,-0.554607,0.821996,-1.502306,-1.231708,-1.20803,1.046932,1.730162,-0.631296,-2.28666,-0.182268
1,0.914092,-0.606152,1.717379,-1.499822,2.024872,-1.007072,-0.499347,0.821996,-0.626856,0.219156,0.023539,0.124424,-1.207742,1.584043,-0.041646,-0.182268
2,-1.301746,-0.017071,-1.007465,0.94818,0.361148,-0.999774,-0.554607,0.821996,-0.554654,0.944588,1.415847,0.842008,0.933248,-0.631296,-2.28666,-0.182268
3,0.3883,-2.03066,3.264976,0.46818,-0.294643,0.040716,-0.33357,0.821996,1.593357,-1.666968,-1.559735,0.373638,0.617534,-0.631296,-0.041646,-0.182268
4,-1.129776,0.464906,-0.412233,-0.779821,1.460194,-0.901239,0.605837,-1.216551,1.133069,-1.521881,-1.20803,1.046932,-0.735872,1.584043,-0.041646,-0.182268


In [41]:
del Xh_train, Xh_val, Xh_test
gc.collect()

20

### LightGBM - hot water

In [42]:
# Objective function for parameter optimization
def objective_lgb3(trial):
    
    joblib.dump(study_lgb3, '../objects/hotwater/study_lgb3.pkl')
    
    dtrain = lgb.Dataset(Xh_train_scaled, label=yh_train)
    dval = lgb.Dataset(Xh_val_scaled, label=yh_val)

    params = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-4, 1e1),
        'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-4, 1e1),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'num_leaves': trial.suggest_int('num_leaves', 2, 2048),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 5, 5000),
        'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0),
        'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0),
        'num_iterations': 5000,
        'early_stopping_round': 20,
        'metric': 'rmse',
        'num_threads': -1,
        'seed': 42
    }
    
    lgbm = lgb.train(params, dtrain, valid_sets=[dtrain, dval], valid_names=['train', 'valid'], verbose_eval=False)
    pred = lgbm.predict(Xh_val_scaled)
    pred[pred < 0] = 0
    loss = mean_squared_log_error(yh_val, pred)

#     cv = lgb.cv(params, dtrain, folds=KFold(10, shuffle=True, random_state=42), verbose_eval=False)
#     loss = cv['rmse-mean'][-1]
    
    return loss

In [43]:
# print(datetime.datetime.now())

# # Enable logging
# optuna.logging.enable_default_handler()

# # Optimize parameters
# study_lgb3 = optuna.create_study(direction='minimize')
# study_lgb3.optimize(objective_lgb3, n_trials=50)

# print(datetime.datetime.now())

# print(f'Finished trials: {len(study_lgb3.trials)}')
# print(f'Best trial: {study_lgb3.best_trial.value}')
# study_lgb3.best_trial.params

2020-03-03 17:35:42.721353


[I 2020-03-03 17:38:07,612] Finished trial#0 resulted in value: 0.01859088810949926. Current best value is 0.01859088810949926 with parameters: {'learning_rate': 0.08024133085620601, 'lambda_l1': 1.7487893771198335, 'lambda_l2': 0.4677303361207078, 'max_depth': 25, 'num_leaves': 2036, 'min_data_in_leaf': 2184, 'bagging_fraction': 0.8464554792955383, 'feature_fraction': 0.7398623976708341}.
[I 2020-03-03 17:41:09,620] Finished trial#1 resulted in value: 0.04059647958394352. Current best value is 0.01859088810949926 with parameters: {'learning_rate': 0.08024133085620601, 'lambda_l1': 1.7487893771198335, 'lambda_l2': 0.4677303361207078, 'max_depth': 25, 'num_leaves': 2036, 'min_data_in_leaf': 2184, 'bagging_fraction': 0.8464554792955383, 'feature_fraction': 0.7398623976708341}.
[I 2020-03-03 17:43:12,612] Finished trial#2 resulted in value: 0.022993544829461404. Current best value is 0.01859088810949926 with parameters: {'learning_rate': 0.08024133085620601, 'lambda_l1': 1.748789377119833

2020-03-03 19:54:55.003814
Finished trials: 50
Best trial: 0.014979724161891843


{'learning_rate': 0.023942434668217872,
 'lambda_l1': 0.23828824891615835,
 'lambda_l2': 0.00028375734074312625,
 'max_depth': 58,
 'num_leaves': 1581,
 'min_data_in_leaf': 42,
 'bagging_fraction': 0.500825513633077,
 'feature_fraction': 0.9607308095583501}

In [44]:
# Study results
study_lgb3 = joblib.load('../objects/hotwater/study_lgb3.pkl')
print(f'Finished trials: {len(study_lgb3.trials)}')
print(f'Best trial: {study_lgb3.best_trial.value}')
study_lgb3.best_trial.params

Finished trials: 50
Best trial: 0.014979724161891843


{'learning_rate': 0.023942434668217872,
 'lambda_l1': 0.23828824891615835,
 'lambda_l2': 0.00028375734074312625,
 'max_depth': 58,
 'num_leaves': 1581,
 'min_data_in_leaf': 42,
 'bagging_fraction': 0.500825513633077,
 'feature_fraction': 0.9607308095583501}

In [45]:
# Parameters from Optuna
params_lgb3 = dict(study_lgb3.best_trial.params)
params_lgb3['num_iterations'] = 10000
params_lgb3['early_stopping_round'] = 20
params_lgb3['metric'] = 'rmse'
params_lgb3['num_threads'] = -1
params_lgb3['seed'] = 42

# Root mean squared log error
rmsle_train, rmsle_val, rmsle_test = lgbm_rmsle(Xh_train_scaled, yh_train, Xh_val_scaled, yh_val, Xh_test_scaled, yh_test, params_lgb3, '../objects/hotwater/lgbm3.txt')
print('Train RMSLE:', rmsle_train)
print('Validation RMSLE:', rmsle_val)
print('Test RMSLE:', rmsle_test)



Train RMSLE: 0.08628104651926864
Validation RMSLE: 0.12239168167271001
Test RMSLE: 0.12281269522818986


In [36]:
del rmsle_train, rmsle_val, rmsle_test
gc.collect()

46

### XGBoost - hot water

In [46]:
# Objective function for parameter optimization
def objective_xgb3(trial):
    
    joblib.dump(study_xgb3, '../objects/hotwater/study_xgb3.pkl')
    
    dtrain = xgb.DMatrix(Xh_val_scaled, label=yh_val)
    dval = xgb.DMatrix(Xh_test_scaled, label=yh_test)

    params = {
        'grow_policy': trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide']),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-4, 1e-1),
        'alpha': trial.suggest_loguniform('alpha', 1e-4, 1e1),
        'lambda': trial.suggest_loguniform('lambda', 1e-4, 1e1),
        'gamma': trial.suggest_loguniform('gamma', 1e-4, 1e1),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'max_leaves': trial.suggest_int('max_leaves', 2, 2024),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'eval_metric': 'rmse',
        'seed': 42
    }
    
    xg = xgb.train(params, dtrain, 
                   evals=[(dtrain, 'train'), (dval, 'valid')],
                   num_boost_round=1000, 
                   early_stopping_rounds=10,
                   verbose_eval=False)
    pred = xg.predict(dval)
    pred[pred < 0] = 0
    loss = mean_squared_log_error(yh_test, pred)

#     cv = xgb.cv(params, dtrain, folds=KFold(10, shuffle=True, random_state=42), verbose_eval=False)
#     loss = cv['rmse-mean'][-1]
    
    return loss

In [47]:
# print(f'Start: {datetime.datetime.now()}')

# # Optimize parameters
# study_xgb3 = optuna.create_study(direction='minimize')
# study_xgb3.optimize(objective_xgb3, n_trials=50)

# print(f'End: {datetime.datetime.now()}')

# print(f'Finished trials: {len(study_xgb3.trials)}')
# print(f'Best trial: {study_xgb3.best_trial.value}')
# study_xgb3.best_trial.params

Start: 2020-03-03 19:56:42.344642


[I 2020-03-03 19:57:19,880] Finished trial#0 resulted in value: 0.06371662020683289. Current best value is 0.06371662020683289 with parameters: {'grow_policy': 'depthwise', 'learning_rate': 0.0017858014851832146, 'alpha': 0.20784827498656494, 'lambda': 0.00025615294943987296, 'gamma': 6.311264465983163, 'max_depth': 7, 'max_leaves': 118, 'subsample': 0.9577550578274965, 'colsample_bytree': 0.6628215901071686}.
[I 2020-03-03 19:59:49,202] Finished trial#1 resulted in value: 0.9317212700843811. Current best value is 0.06371662020683289 with parameters: {'grow_policy': 'depthwise', 'learning_rate': 0.0017858014851832146, 'alpha': 0.20784827498656494, 'lambda': 0.00025615294943987296, 'gamma': 6.311264465983163, 'max_depth': 7, 'max_leaves': 118, 'subsample': 0.9577550578274965, 'colsample_bytree': 0.6628215901071686}.
[I 2020-03-03 20:00:05,250] Finished trial#2 resulted in value: 0.10064920783042908. Current best value is 0.06371662020683289 with parameters: {'grow_policy': 'depthwise', 

End: 2020-03-03 22:39:38.598011
Finished trials: 50
Best trial: 0.01756460592150688


{'grow_policy': 'lossguide',
 'learning_rate': 0.009806612868641755,
 'alpha': 0.009923410545051567,
 'lambda': 2.4314911765779557,
 'gamma': 0.00182351994095809,
 'max_depth': 51,
 'max_leaves': 1225,
 'subsample': 0.8446420358715253,
 'colsample_bytree': 0.8798084029338699}

In [48]:
# Study results
study_xgb3 = joblib.load('../objects/hotwater/study_xgb3.pkl')
print(f'Finished trials: {len(study_xgb3.trials)}')
print(f'Best trial: {study_xgb3.best_trial.value}')
study_xgb3.best_trial.params

Finished trials: 50
Best trial: 0.01756460592150688


{'grow_policy': 'lossguide',
 'learning_rate': 0.009806612868641755,
 'alpha': 0.009923410545051567,
 'lambda': 2.4314911765779557,
 'gamma': 0.00182351994095809,
 'max_depth': 51,
 'max_leaves': 1225,
 'subsample': 0.8446420358715253,
 'colsample_bytree': 0.8798084029338699}

In [49]:
# Parameters from Optuna
params_xgb3 = dict(study_xgb3.best_trial.params)
params_xgb3['eval_metric'] = 'rmse'
params_xgb3['seed'] = 42

rmsle_train, rmsle_val, rmsle_test = xgb_rmsle(Xh_train_scaled, yh_train, Xh_val_scaled, yh_val, Xh_test_scaled, yh_test, params_xgb3, '../objects/hotwater/xg3.txt')
print('Train RMSLE:', rmsle_train)
print('Validation RMSLE:', rmsle_val)
print('Test RMSLE:', rmsle_test)

Train RMSLE: 0.010732781
Validation RMSLE: 0.11628261
Test RMSLE: 0.1168223


In [41]:
del rmsle_train, rmsle_val, rmsle_test
gc.collect()

116