# Imports

In [1]:
%matplotlib inline

import src.utils as udf

import gc
import joblib
import datetime
import holidays
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from feature_engine.categorical_encoders import RareLabelCategoricalEncoder as RareEncoder, \
                                                MeanCategoricalEncoder as MeanEncoder, \
                                                OrdinalCategoricalEncoder as OrdinalEncoder

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
import xgboost as xgb
import optuna

In [2]:
# Plot settings
sns.set(rc={'figure.figsize': (16, 4),
            'font.size': 16})

# Data

In [3]:
path = '../data/'

In [4]:
train = pd.read_pickle(f'{path}from_mod/train.pkl')
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18264895 entries, 0 to 18264894
Data columns (total 18 columns):
building_id           uint16
meter                 uint8
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 1.1+ GB


In [5]:
train = udf.reduce_mem_usage(train)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18264895 entries, 0 to 18264894
Data columns (total 18 columns):
building_id           uint16
meter                 uint8
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 1.1+ GB


In [6]:
test = pd.read_pickle(f'{path}from_sub/test.pkl')
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 17 columns):
building_id           uint16
meter                 uint8
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             int64
hour                  int64
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            int64
country               object
is_holiday            int64
dtypes: float32(6), int64(4), object(2), uint16(2), uint32(1), uint8(2)
memory usage: 3.5+ GB


In [7]:
test = udf.reduce_mem_usage(test)
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 17 columns):
building_id           uint16
meter                 uint8
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(6), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 2.4+ GB


In [8]:
submission = pd.read_csv(f'{path}raw/test/sample_submission.csv')
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 2 columns):
row_id           int64
meter_reading    int64
dtypes: int64(2)
memory usage: 636.3 MB


In [9]:
submission = udf.reduce_mem_usage(submission)
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 2 columns):
row_id           uint32
meter_reading    uint8
dtypes: uint32(1), uint8(1)
memory usage: 198.8 MB


In [10]:
mb = pd.read_pickle(f'{path}from_sub/mb.pkl')
mb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 9 columns):
row_id          uint32
building_id     uint16
meter           uint8
timestamp       datetime64[ns]
site_id         uint8
primary_use     object
square_feet     uint32
year_built      uint16
missing_year    uint8
dtypes: datetime64[ns](1), object(1), uint16(2), uint32(2), uint8(3)
memory usage: 1.5+ GB


In [11]:
mb = udf.reduce_mem_usage(mb)
mb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 9 columns):
row_id          uint32
building_id     uint16
meter           uint8
timestamp       datetime64[ns]
site_id         uint8
primary_use     object
square_feet     uint32
year_built      uint16
missing_year    uint8
dtypes: datetime64[ns](1), object(1), uint16(2), uint32(2), uint8(3)
memory usage: 1.5+ GB


In [12]:
del path
gc.collect()

22

# Features

In [13]:
train.head()

Unnamed: 0,building_id,meter,meter_reading,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,46,0,15.604556,19.4,1019.400024,0.0,Retail,9045,2016,0,1,0,0.0,0.0,100.0,0,US,1
1,74,0,12.603682,19.4,1019.400024,0.0,Parking,387638,1997,0,1,0,0.0,0.0,100.0,0,US,1
2,93,0,15.364478,19.4,1019.400024,0.0,Office,33370,1982,0,1,0,0.0,0.0,100.0,0,US,1
3,105,0,23.3036,2.4,1020.900024,3.1,Education,50623,1960,1,1,0,-0.5,-0.866025,90.549408,0,UK,1
4,106,0,0.3746,2.4,1020.900024,3.1,Education,5374,1960,1,1,0,-0.5,-0.866025,90.549408,0,UK,1


In [14]:
test.head()

Unnamed: 0,building_id,meter,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,0,0,12.8,1022.099976,2.1,Education,7432,2008,0,1,0,-0.642788,0.766044,83.409012,1,US,1
1,1,0,12.8,1022.099976,2.1,Education,2720,2004,0,1,0,-0.642788,0.766044,83.409012,1,US,1
2,2,0,12.8,1022.099976,2.1,Education,5376,1991,0,1,0,-0.642788,0.766044,83.409012,1,US,1
3,3,0,12.8,1022.099976,2.1,Education,23685,2002,0,1,0,-0.642788,0.766044,83.409012,1,US,1
4,4,0,12.8,1022.099976,2.1,Education,116607,1975,0,1,0,-0.642788,0.766044,83.409012,1,US,1


In [15]:
train.drop(['missing_year', 'wind_direction_x'], axis=1, inplace=True)
feats = train.drop('meter_reading', axis=1).columns
test = test[feats]
test.head()

Unnamed: 0,building_id,meter,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,0,0,12.8,1022.099976,2.1,Education,7432,2008,0,1,0,-0.642788,0.766044,83.409012,1,US,1
1,1,0,12.8,1022.099976,2.1,Education,2720,2004,0,1,0,-0.642788,0.766044,83.409012,1,US,1
2,2,0,12.8,1022.099976,2.1,Education,5376,1991,0,1,0,-0.642788,0.766044,83.409012,1,US,1
3,3,0,12.8,1022.099976,2.1,Education,23685,2002,0,1,0,-0.642788,0.766044,83.409012,1,US,1
4,4,0,12.8,1022.099976,2.1,Education,116607,1975,0,1,0,-0.642788,0.766044,83.409012,1,US,1


In [16]:
del feats
gc.collect()

66

# Split data

In [17]:
train_df = []
test_df = []

for m in range(4):
    df_train = train[train.meter == m].drop('meter', axis=1)
    df_test = test[test.meter == m].drop('meter', axis=1)
    train_df.append(df_train)
    test_df.append(df_test)
    print(f'Meter {m}:', df_train.shape, df_test.shape)

Meter 0: (11530268, 17) (24755760, 16)
Meter 1: (3518870, 17) (8724960, 16)
Meter 2: (2296049, 17) (5676480, 16)
Meter 3: (919708, 17) (2540400, 16)


In [18]:
del m, df_train, df_test
gc.collect()

22

# Functions

In [19]:
def transform_data(df_train, df_test):
    X_train = df_train.drop('meter_reading', axis=1)
    y_train = df_train[['meter_reading']]
    y_train = np.log1p(y_train)
    
    X_train, X_val, X_test, rare_dict = udf.rare_encoder(X_train, df_test, ['primary_use'])
    X_train, X_val, X_test, mean_dict = udf.mean_encoder(X_train, y_train, X_test, ['primary_use', 'country'])
    X_train_scaled, X_val, X_test_scaled = udf.scale_feats(X_train, X_test)
    
    X_train_scaled, X_val_scaled, y_train, y_val = train_test_split(X_train_scaled, y_train, test_size=0.25, random_state=42)
    
    print('Train: ', X_train_scaled.shape, y_train.shape)
    print('Validation: ', X_val_scaled.shape, y_val.shape)
    print('Train: ', X_test_scaled.shape)
    
    return X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val

In [20]:
def predict_lgb(X_train, y_train, X_val, y_val, X_test, params_dict, save_path):
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val)
    lgbm = lgb.train(params_dict, dtrain, valid_sets=[dtrain, dval], valid_names=['train', 'val'], verbose_eval=False)
    lgbm.save_model(save_path)
    
    pred = lgbm.predict(X_test)
    pred[pred < 0] = 0
    return pred

In [21]:
def predict_xgb(X_train, y_train, X_val, y_val, X_test, params_dict, save_path):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test)
    xg = xgb.train(params_dict, dtrain, evals=[(dtrain, 'train'), (dval, 'val')], num_boost_round=1000, early_stopping_rounds=10, verbose_eval=False)
    xg.save_model(save_path)
    
    pred = xg.predict(dtest)
    pred[pred < 0] = 0
    return pred

In [22]:
gc.collect()

66

# Electricity meter

### Transform data

In [24]:
Xe_train_scaled, Xe_val_scaled, Xe_test_scaled, ye_train, ye_val = transform_data(train_df[0], test_df[0])
Xe_train_scaled.head()



Train:  (8647701, 16) (8647701, 1)
Validation:  (2882567, 16) (2882567, 1)
Train:  (24755760, 16)


Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
3820751,1.660735,-0.714804,0.377953,0.663585,-0.364128,0.032242,-2.496025,-1.103144,-0.553622,0.794318,1.461126,-0.425474,-0.929986,-0.633757,0.18362,-0.180316
5192151,-1.138221,0.185423,-0.702006,-0.197577,1.047964,0.306161,0.58615,-1.103144,-0.144126,-0.650406,-0.19238,1.588679,-1.827506,-0.633757,0.18362,-0.180316
10784538,-0.288186,0.519215,0.782937,-0.886506,1.047964,-0.372759,-1.982329,-1.103144,1.484338,0.505373,-0.19238,-1.401239,0.421137,-0.633757,0.18362,-0.180316
3949783,1.724795,-0.097795,-0.715502,-1.532377,-0.364128,-0.651102,-1.608732,-1.103144,-0.51553,1.661153,0.065518,0.09372,0.921203,-0.633757,0.18362,-0.180316
2900643,-1.071696,-0.209059,-0.971996,-0.197577,0.25291,-0.527437,-0.487941,-1.103144,-0.839318,0.505373,-1.02067,-0.941568,-1.764215,1.577891,0.18362,-0.180316


### Lasso Regression

In [25]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xe_train_scaled, ye_train)

val0 = lasso.predict(Xe_val_scaled)
print(lasso.score(Xe_val_scaled, ye_val))
print(np.sqrt(mean_squared_log_error(ye_val, val0)))

lasso0 = lasso.predict(Xe_test_scaled)
lasso0[lasso0 < 0] = 0
lasso0

0.43644685533660776
0.28288227713627717


array([3.56470725, 3.50417155, 3.43739948, ..., 3.41324582, 3.94986875,
       4.58220954])

### LightGBM

In [26]:
study_lgb0 = joblib.load('../objects/electricity/study_lgb.pkl')
params_lgb0 = study_lgb0.best_trial.params
params_lgb0

{'learning_rate': 0.07916336777546343,
 'lambda_l1': 0.008557356431137609,
 'lambda_l2': 0.0006037228650908533,
 'max_depth': 51,
 'num_leaves': 923,
 'min_child_samples': 7,
 'subsample': 0.7399597912518232,
 'feature_fraction': 0.7310599981838332}

In [27]:
params_lgb0['num_iterations'] = 10000
params_lgb0['early_stopping_round'] = 10
params_lgb0['metric'] = 'rmse'
params_lgb0['num_threads'] = -1
params_lgb0['seed'] = 42
params_lgb0

{'learning_rate': 0.07916336777546343,
 'lambda_l1': 0.008557356431137609,
 'lambda_l2': 0.0006037228650908533,
 'max_depth': 51,
 'num_leaves': 923,
 'min_child_samples': 7,
 'subsample': 0.7399597912518232,
 'feature_fraction': 0.7310599981838332,
 'num_iterations': 10000,
 'early_stopping_round': 10,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [29]:
lgbm0 = predict_lgb(Xe_train_scaled, ye_train, Xe_val_scaled, ye_val, Xe_test_scaled, params_lgb0, '../objects/submission3/lgb0.pkl') # .txt')
lgbm0

array([3.1540757 , 2.03864938, 0.44416536, ..., 2.12634682, 5.172471  ,
       1.61522709])

### XGBoost

In [30]:
study_xgb0 = joblib.load('../objects/electricity/study_xgb.pkl')
params_xgb0 = study_xgb0.best_trial.params
params_xgb0['eval_metric'] = 'rmse'
params_xgb0['seed'] = 42
params_xgb0

{'grow_policy': 'lossguide',
 'learning_rate': 0.014754037383886122,
 'alpha': 0.005261717289274988,
 'lambda': 0.10981652452119427,
 'gamma': 0.00015007710756172543,
 'max_depth': 23,
 'max_leaves': 1598,
 'subsample': 0.8406224713599797,
 'colsample_bytree': 0.9175035245877109,
 'eval_metric': 'rmse',
 'seed': 42}

In [31]:
xg0 = predict_xgb(Xe_train_scaled, ye_train, Xe_val_scaled, ye_val, Xe_test_scaled, params_xgb0, '../objects/submission3/xgb0.pkl') # .txt')
xg0



array([2.6616104 , 1.6929845 , 0.77909154, ..., 2.3185928 , 5.1443424 ,
       1.5823331 ], dtype=float32)

In [32]:
del lasso, val0, study_lgb0
gc.collect()

108

# Chilled water meter

### Transform data

In [33]:
Xc_train_scaled, Xc_val_scaled, Xc_test_scaled, yc_train, yc_val = transform_data(train_df[1], test_df[1])
Xc_train_scaled.head()



Train:  (2639152, 16) (2639152, 1)
Validation:  (879718, 16) (879718, 1)
Train:  (8724960, 16)


Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
3054842,0.29511,-0.652177,1.457773,-0.716347,0.758045,0.383649,-0.257998,0.729506,1.245823,-0.088512,-0.489204,1.409013,-0.682122,-0.625362,-0.173847,-0.170259
1722336,0.817001,1.320539,-1.035706,0.043689,-1.578794,-0.856238,0.075476,0.729506,-0.00872,-0.379102,-0.878127,-1.189395,0.178401,-0.625362,-0.173847,-0.170259
511899,-1.397081,-0.760676,-1.549518,1.278746,0.611098,0.434406,0.408949,-1.370791,-1.21348,-1.250872,-0.154245,-1.526436,-0.639142,-0.625362,-0.173847,-0.170259
1159847,-1.891861,1.261357,0.475493,0.2812,0.758045,-0.292965,1.687265,-1.370791,-0.526468,1.073849,0.381633,1.508082,0.701131,-0.625362,-0.173847,-0.170259
688211,0.277036,1.152858,-0.92993,0.518711,-1.578794,-0.386131,0.075476,0.729506,-0.994432,-1.541463,-1.222583,0.761156,1.517252,-0.625362,-0.173847,-0.170259


### Lasso Regression

In [34]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xc_train_scaled, yc_train)

val1 = lasso.predict(Xc_val_scaled)
print(lasso.score(Xc_val_scaled, yc_val))
print(np.sqrt(mean_squared_log_error(yc_val, val1)))

lasso1 = lasso.predict(Xc_test_scaled)
lasso1[lasso1 < 0] = 0
lasso1

0.3636831099624357
0.34343200031760657


array([4.01890656, 3.12153255, 4.00542906, ..., 4.04388686, 3.88629343,
       4.23110277])

### LightGBM

In [35]:
study_lgb1 = joblib.load('../objects/chilledwater/study_lgb1.pkl')
params_lgb1 = study_lgb1.best_trial.params
params_lgb1

{'learning_rate': 0.020424864262841822,
 'lambda_l1': 0.27986455703148144,
 'lambda_l2': 7.08804479307531,
 'max_depth': 32,
 'num_leaves': 1293,
 'min_data_in_leaf': 23,
 'bagging_fraction': 0.9143657877950819,
 'feature_fraction': 0.701556769695671}

In [36]:
params_lgb1['num_iterations'] = 10000
params_lgb1['early_stopping_round'] = 20
params_lgb1['metric'] = 'rmse'
params_lgb1['num_threads'] = -1
params_lgb1['seed'] = 42
params_lgb1

{'learning_rate': 0.020424864262841822,
 'lambda_l1': 0.27986455703148144,
 'lambda_l2': 7.08804479307531,
 'max_depth': 32,
 'num_leaves': 1293,
 'min_data_in_leaf': 23,
 'bagging_fraction': 0.9143657877950819,
 'feature_fraction': 0.701556769695671,
 'num_iterations': 10000,
 'early_stopping_round': 20,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [37]:
lgbm1 = predict_lgb(Xc_train_scaled, yc_train, Xc_val_scaled, yc_val, Xc_test_scaled, params_lgb1, '../objects/submission3/lgb1.pkl') # .txt')
lgbm1



array([3.97479671, 1.16085364, 2.52688976, ..., 0.989952  , 0.21865837,
       1.92261079])

### XGBoost

In [38]:
study_xgb1 = joblib.load('../objects/chilledwater/study_xgb1.pkl')
params_xgb1 = study_xgb1.best_trial.params
params_xgb1['eval_metric'] = 'rmse'
params_xgb1['seed'] = 42
params_xgb1

{'grow_policy': 'depthwise',
 'learning_rate': 0.03801178677708901,
 'alpha': 0.051064916441743415,
 'lambda': 0.890189928673286,
 'gamma': 0.053083931710043705,
 'max_depth': 16,
 'max_leaves': 1131,
 'subsample': 0.7245353487626778,
 'colsample_bytree': 0.777530288110325,
 'eval_metric': 'rmse',
 'seed': 42}

In [39]:
xg1 = predict_xgb(Xc_train_scaled, yc_train, Xc_val_scaled, yc_val, Xc_test_scaled, params_xgb1, '../objects/submission3/xgb1.pkl') # .txt')
xg1

array([3.7609074 , 1.4283535 , 2.5475748 , ..., 0.8232962 , 0.48295254,
       1.2624555 ], dtype=float32)

In [40]:
del lasso, val1, study_lgb1
gc.collect()

100

# Steam meter

### Transform data

In [41]:
Xs_train_scaled, Xs_val_scaled, Xs_test_scaled, ys_train, ys_val = transform_data(train_df[2], test_df[2])
Xs_train_scaled.head()



Train:  (1722036, 16) (1722036, 1)
Validation:  (574013, 16) (574013, 1)
Train:  (5676480, 16)


Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
1227863,-0.055769,1.149265,0.300442,0.198853,0.693481,-0.370458,0.253185,0.536456,0.14002,-1.520262,-1.33981,0.321329,0.846145,-0.631491,-0.181765,-0.177586
793790,-1.072452,1.05116,-1.038513,-0.024691,-1.856454,-0.815237,0.118055,0.536456,-0.531292,-0.072679,-1.166664,-0.784003,-0.080153,-0.631491,-0.181765,-0.177586
2268147,-0.794255,0.649821,0.41202,-1.410663,-0.188679,-0.675412,-0.152204,0.536456,1.641147,-1.375504,0.09572,0.036405,0.650101,-0.631491,-0.181765,-0.177586
970560,-0.065885,-0.188533,-0.592195,0.869485,1.900406,0.542175,-0.354898,0.536456,-0.270226,1.66442,1.531249,0.321329,-0.984523,-0.631491,-0.181765,-0.177586
1921836,-1.224196,1.05116,0.41202,-1.410663,-1.856454,0.355498,0.118055,0.536456,1.174958,-1.66502,0.09572,0.036405,1.324216,-0.631491,-0.181765,-0.177586


### Lasso Regression

In [42]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xs_train_scaled, ys_train)

val2 = lasso.predict(Xs_val_scaled)
print(lasso.score(Xs_val_scaled, ys_val))
print(np.sqrt(mean_squared_log_error(ys_val, val2)))

lasso2 = lasso.predict(Xs_test_scaled)
lasso2[lasso2 < 0] = 0
lasso2

0.3695794860436076
0.26672297836694026


array([4.94876532, 5.03333772, 5.06622019, ..., 5.60341521, 5.54688705,
       5.73923076])

### LightGBM

In [43]:
study_lgb2 = joblib.load('../objects/steam/study_lgb2.pkl')
params_lgb2 = study_lgb2.best_trial.params
params_lgb2

{'learning_rate': 0.04854268157540762,
 'lambda_l1': 0.0028266466239996284,
 'lambda_l2': 0.00010211303918218761,
 'max_depth': 56,
 'num_leaves': 409,
 'min_data_in_leaf': 8,
 'bagging_fraction': 0.5012612103606688,
 'feature_fraction': 0.8537936823847498}

In [44]:
params_lgb2['num_iterations'] = 10000
params_lgb2['early_stopping_round'] = 20
params_lgb2['metric'] = 'rmse'
params_lgb2['num_threads'] = -1
params_lgb2['seed'] = 42
params_lgb2

{'learning_rate': 0.04854268157540762,
 'lambda_l1': 0.0028266466239996284,
 'lambda_l2': 0.00010211303918218761,
 'max_depth': 56,
 'num_leaves': 409,
 'min_data_in_leaf': 8,
 'bagging_fraction': 0.5012612103606688,
 'feature_fraction': 0.8537936823847498,
 'num_iterations': 10000,
 'early_stopping_round': 20,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [45]:
lgbm2 = predict_lgb(Xs_train_scaled, ys_train, Xs_val_scaled, ys_val, Xs_test_scaled, params_lgb2, '../objects/submission3/lgb2.pkl') # .txt')
lgbm2



array([3.46181614, 5.15397299, 8.43467644, ..., 6.3003671 , 6.18580643,
       3.03739893])

### XGBoost

In [46]:
study_xgb2 = joblib.load('../objects/steam/study_xgb2.pkl')
params_xgb2 = study_xgb2.best_trial.params
params_xgb2['eval_metric'] = 'rmse'
params_xgb2['seed'] = 42
params_xgb2

{'grow_policy': 'depthwise',
 'learning_rate': 0.045664576584020004,
 'alpha': 1.9071087639650297,
 'lambda': 0.00021496551872384197,
 'gamma': 0.15536767802205387,
 'max_depth': 59,
 'max_leaves': 1626,
 'subsample': 0.926581683911481,
 'colsample_bytree': 0.8620824662132568,
 'eval_metric': 'rmse',
 'seed': 42}

In [47]:
xg2 = predict_xgb(Xs_train_scaled, ys_train, Xs_val_scaled, ys_val, Xs_test_scaled, params_xgb2, '../objects/submission3/xgb2.pkl') # .txt')
xg2

array([3.4970672, 5.082091 , 8.4367695, ..., 6.4669933, 6.1868773,
       3.0965648], dtype=float32)

In [48]:
del lasso, val2, study_lgb2
gc.collect()

100

# Hot water meter

### Transform data

In [49]:
Xh_train_scaled, Xh_val_scaled, Xh_test_scaled, yh_train, yh_val = transform_data(train_df[3], test_df[3])
Xh_train_scaled.head()



Train:  (689781, 16) (689781, 1)
Validation:  (229927, 16) (229927, 1)
Train:  (2540400, 16)


Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
617696,0.902166,1.246184,-0.292844,0.467919,0.356687,-0.296929,-0.002246,0.821397,0.601279,0.219848,0.022716,-1.311659,-0.601921,-0.630965,-0.044778,-0.182612
559728,-1.234496,1.653144,-0.821912,0.707877,-1.973171,-0.896851,2.317122,-1.217438,0.348563,0.654835,1.056198,1.222356,-0.514823,-0.630965,-0.044778,-0.182612
900844,0.914025,-0.488753,2.207,0.227961,2.023683,-1.007877,-0.499254,0.821397,1.648246,0.219848,1.606102,0.372354,-0.191426,1.584873,-0.044778,5.476081
230300,-1.372855,0.089559,1.426621,0.467919,-0.291586,-0.583896,-0.941038,-1.217438,-0.896966,0.799831,1.533566,0.613943,1.249094,-0.630965,-2.259887,-0.182612
268936,-1.129738,-0.842166,-0.134121,-0.011996,1.456112,-0.901935,0.605207,-1.217438,-0.770608,0.654835,1.533566,-0.367554,-1.585444,-0.630965,-0.044778,-0.182612


### Lasso Regression

In [50]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xh_train_scaled, yh_train)

val3 = lasso.predict(Xh_val_scaled)
print(lasso.score(Xh_val_scaled, yh_val))
print(np.sqrt(mean_squared_log_error(yh_val, val3)))

lasso3 = lasso.predict(Xh_test_scaled)
lasso3[lasso3 < 0] = 0
lasso3

0.37663201313372874
0.3827641301037221


array([3.31196728, 4.49854374, 3.4868851 , ..., 6.30618509, 6.33471414,
       6.08575683])

### LightGBM

In [51]:
study_lgb3 = joblib.load('../objects/hotwater/study_lgb3.pkl')
params_lgb3 = study_lgb3.best_trial.params
params_lgb3

{'learning_rate': 0.023942434668217872,
 'lambda_l1': 0.23828824891615835,
 'lambda_l2': 0.00028375734074312625,
 'max_depth': 58,
 'num_leaves': 1581,
 'min_data_in_leaf': 42,
 'bagging_fraction': 0.500825513633077,
 'feature_fraction': 0.9607308095583501}

In [52]:
params_lgb3['num_iterations'] = 10000
params_lgb3['early_stopping_round'] = 20
params_lgb3['metric'] = 'rmse'
params_lgb3['num_threads'] = -1
params_lgb3['seed'] = 42

In [53]:
lgbm3 = predict_lgb(Xh_train_scaled, yh_train, Xh_val_scaled, yh_val, Xh_test_scaled, params_lgb3, '../objects/submission3/lgb3.pkl') # .txt')
lgbm3



array([2.21216226, 4.39984903, 4.54932769, ..., 8.19525535, 5.7771369 ,
       8.16497967])

### XGBoost

In [54]:
study_xgb3 = joblib.load('../objects/hotwater/study_xgb3.pkl')
params_xgb3 = study_xgb3.best_trial.params
params_xgb3['eval_metric'] = 'rmse'
params_xgb3['seed'] = 42
params_xgb3

{'grow_policy': 'lossguide',
 'learning_rate': 0.009806612868641755,
 'alpha': 0.009923410545051567,
 'lambda': 2.4314911765779557,
 'gamma': 0.00182351994095809,
 'max_depth': 51,
 'max_leaves': 1225,
 'subsample': 0.8446420358715253,
 'colsample_bytree': 0.8798084029338699,
 'eval_metric': 'rmse',
 'seed': 42}

In [55]:
xg3 = predict_xgb(Xh_train_scaled, yh_train, Xh_val_scaled, yh_val, Xh_test_scaled, params_xgb3, '../objects/submission3/xgb3.pkl') # .txt')
xg3

array([2.443047 , 4.3914104, 4.520183 , ..., 8.184082 , 5.7314363,
       8.119482 ], dtype=float32)

In [56]:
del lasso, val3, study_lgb3
gc.collect()

100

# Combine predictions

In [57]:
print('Test observations: ', [df.shape[0] for df in test_df])
print('Lasso predictions: ', list(map(len, [lasso0, lasso1, lasso2, lasso3])))
print('LightGBM predictions: ', list(map(len, [lgbm0, lgbm1, lgbm2, lgbm3])))
print('XGBoost predictions: ', list(map(len, [xg0, xg1, xg2, xg3])))

Test observations:  [24755760, 8724960, 5676480, 2540400]
Lasso predictions:  [24755760, 8724960, 5676480, 2540400]
LightGBM predictions:  [24755760, 8724960, 5676480, 2540400]
XGBoost predictions:  [24755760, 8724960, 5676480, 2540400]


In [58]:
# Electricity meter
pred0 = test_df[0][['building_id']].copy()
pred0['lasso'] = lasso0
pred0['lgb'] = lgbm0
pred0['xgb'] = xg0
pred0.drop('building_id', axis=1, inplace=True)
pred0

Unnamed: 0,lasso,lgb,xgb
0,3.564707,3.154076,2.661610
1,3.504172,2.038649,1.692984
2,3.437399,0.444165,0.779092
3,3.643407,2.770399,4.758497
4,4.138754,3.788354,3.479053
...,...,...,...
41697595,2.922922,1.765164,1.698674
41697596,3.929604,1.815196,1.820905
41697597,3.413246,2.126347,2.318593
41697598,3.949869,5.172471,5.144342


In [59]:
# Chilled water meter
pred1 = test_df[1][['building_id']].copy()
pred1['lasso'] = lasso1
pred1['lgb'] = lgbm1
pred1['xgb'] = xg1
pred1.drop('building_id', axis=1, inplace=True)
pred1

Unnamed: 0,lasso,lgb,xgb
8,4.018907,3.974797,3.760907
11,3.121533,1.160854,1.428354
16,4.005429,2.526890,2.547575
18,4.054622,3.302680,3.671750
20,3.099171,2.471126,3.197178
...,...,...,...
41697538,3.466860,3.795705,3.700027
41697541,3.802188,3.403475,3.388334
41697543,4.043887,0.989952,0.823296
41697545,3.886293,0.218658,0.482953


In [60]:
# Steam meter
pred2 = test_df[2][['building_id']].copy()
pred2['lasso'] = lasso2
pred2['lgb'] = lgbm2
pred2['xgb'] = xg2
pred2.drop('building_id', axis=1, inplace=True)
pred2

Unnamed: 0,lasso,lgb,xgb
16340505,4.948765,3.461816,3.497067
16340509,5.033338,5.153973,5.082091
16340515,5.066220,8.434676,8.436769
16340517,5.161285,5.190465,5.131716
16340521,5.090248,5.672593,5.646076
...,...,...,...
41697581,5.231263,5.564695,5.773028
41697584,5.431151,6.819534,6.993342
41697586,5.603415,6.300367,6.466993
41697588,5.546887,6.185806,6.186877


In [61]:
# Hot water meter
pred3 = test_df[3][['building_id']].copy()
pred3['lasso'] = lasso3
pred3['lgb'] = lgbm3
pred3['xgb'] = xg3
pred3.drop('building_id', axis=1, inplace=True)
pred3

Unnamed: 0,lasso,lgb,xgb
2260082,3.311967,2.212162,2.443047
2260086,4.498544,4.399849,4.391410
2260090,3.486885,4.549328,4.520183
2260092,4.791413,3.375947,3.227913
2260094,5.030432,5.404420,5.252806
...,...,...,...
41696866,6.571619,8.308251,8.258118
41697101,6.555142,5.626354,5.636195
41697116,6.306185,8.195255,8.184082
41697351,6.334714,5.777137,5.731436


In [62]:
# Log-transformed predictions
pred_transformed = pd.concat([pred0, pred1, pred2, pred3]).sort_index()
pred_transformed

Unnamed: 0,lasso,lgb,xgb
0,3.564707,3.154076,2.661610
1,3.504172,2.038649,1.692984
2,3.437399,0.444165,0.779092
3,3.643407,2.770399,4.758497
4,4.138754,3.788354,3.479053
...,...,...,...
41697595,2.922922,1.765164,1.698674
41697596,3.929604,1.815196,1.820905
41697597,3.413246,2.126347,2.318593
41697598,3.949869,5.172471,5.144342


In [63]:
del pred0, pred1, pred2, pred3
gc.collect()

110

# Transform predictions back to normal scale (and units for site 0)

In [64]:
# Transform back to normal scale
pred = pred_transformed.copy()
pred['lasso'] = np.expm1(pred.lasso)
pred['lgb'] = np.expm1(pred.lgb)
pred['xgb'] = np.expm1(pred.xgb)
pred.reset_index(inplace=True)
pred.columns = ['row_id', 'lasso', 'lgb', 'xgb']
pred

Unnamed: 0,row_id,lasso,lgb,xgb
0,0,34.329109,22.431369,13.319330
1,1,32.253883,6.680229,4.435679
2,2,30.105961,0.559188,1.179491
3,3,37.221856,14.964998,115.570618
4,4,61.724605,43.183597,31.428997
...,...,...,...,...
41697595,41697595,17.595547,4.842528,4.466696
41697596,41697596,49.886810,5.142279,5.177444
41697597,41697597,29.363640,7.384182,9.161366
41697598,41697598,50.928551,175.350060,170.458694


In [65]:
pred = pd.merge(mb[['row_id', 'site_id', 'meter']], pred, on='row_id', how='left')
pred

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,34.329109,22.431369,13.319330
1,1,0,0,32.253883,6.680229,4.435679
2,2,0,0,30.105961,0.559188,1.179491
3,3,0,0,37.221856,14.964998,115.570618
4,4,0,0,61.724605,43.183597,31.428997
...,...,...,...,...,...,...
41697595,41697595,15,0,17.595547,4.842528,4.466696
41697596,41697596,15,0,49.886810,5.142279,5.177444
41697597,41697597,15,0,29.363640,7.384182,9.161366
41697598,41697598,15,0,50.928551,175.350060,170.458694


In [66]:
# Site 0 meter 0 predictions in kWh
pred[(pred.site_id == 0) & (pred.meter == 0)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,34.329109,22.431369,13.31933
1,1,0,0,32.253883,6.680229,4.435679
2,2,0,0,30.105961,0.559188,1.179491
3,3,0,0,37.221856,14.964998,115.570618
4,4,0,0,61.724605,43.183597,31.428997


In [67]:
# Convert from kWh back to kBTU
pred = udf.convert_readings(pred, site_num=0, meter_type=0, conversion='kwh_to_kbtu', reading_col='lasso')
pred = udf.convert_readings(pred, site_num=0, meter_type=0, conversion='kwh_to_kbtu', reading_col='lgb')
pred = udf.convert_readings(pred, site_num=0, meter_type=0, conversion='kwh_to_kbtu', reading_col='xgb')
pred[(pred.site_id == 0) & (pred.meter == 0)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,117.124055,76.531346,45.44289
1,1,0,0,110.043799,22.791606,15.133649
2,2,0,0,102.715518,1.907839,4.024189
3,3,0,0,126.993527,51.057581,394.303833
4,4,0,0,210.592006,147.333795,107.229446


In [68]:
# Site 0 meter 1 predictions in tons
pred[(pred.site_id == 0) & (pred.meter == 1)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
8,8,0,1,54.640233,52.239293,41.987415
11,11,0,1,21.681113,2.192658,3.171825
16,16,0,1,53.895373,11.514522,11.776081
18,18,0,1,56.663337,26.185388,38.320641
20,20,0,1,21.179562,10.835769,23.4634


In [69]:
# Convert from kWh back to kBTU
pred = udf.convert_readings(pred, site_num=0, meter_type=1, conversion='ton_to_kbtu', reading_col='lasso')
pred = udf.convert_readings(pred, site_num=0, meter_type=1, conversion='ton_to_kbtu', reading_col='lgb')
pred = udf.convert_readings(pred, site_num=0, meter_type=1, conversion='ton_to_kbtu', reading_col='xgb')
pred[(pred.site_id == 0) & (pred.meter == 1)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
8,8,0,1,655.6828,626.871512,503.848999
11,11,0,1,260.173356,26.31189,38.061897
16,16,0,1,646.744472,138.174269,141.312973
18,18,0,1,679.960043,314.224659,459.847687
20,20,0,1,254.154739,130.029227,281.560791


In [70]:
del mb
gc.collect()

110

# Save Predictions

In [71]:
pred.describe()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
count,41697600.0,41697600.0,41697600.0,41697600.0,41697600.0,41697600.0
mean,20848800.0,8.086134,0.6642857,306.608,368.1448,356.3271
std,12037060.0,5.134712,0.9278067,1505.808,979.4739,910.8969
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,10424400.0,3.0,0.0,38.02365,26.40117,25.86097
50%,20848800.0,9.0,0.0,75.3129,87.86519,86.68192
75%,31273200.0,13.0,1.0,187.1852,280.9086,277.6914
max,41697600.0,15.0,3.0,176639.0,24603.96,22915.87


In [72]:
path = '../submissions/sub3/'
pred.to_pickle(f'{path}preds.pkl')
pred = pd.read_pickle(f'{path}preds.pkl')
pred

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,117.124055,76.531346,45.442890
1,1,0,0,110.043799,22.791606,15.133649
2,2,0,0,102.715518,1.907839,4.024189
3,3,0,0,126.993527,51.057581,394.303833
4,4,0,0,210.592006,147.333795,107.229446
...,...,...,...,...,...,...
41697595,41697595,15,0,17.595547,4.842528,4.466696
41697596,41697596,15,0,49.886810,5.142279,5.177444
41697597,41697597,15,0,29.363640,7.384182,9.161366
41697598,41697598,15,0,50.928551,175.350060,170.458694


In [73]:
submission

Unnamed: 0,row_id,meter_reading
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
41697595,41697595,0
41697596,41697596,0
41697597,41697597,0
41697598,41697598,0


In [74]:
# Lasso predictions
lasso_pred = submission.copy()
lasso_pred['meter_reading'] = pred['lasso']
lasso_pred

Unnamed: 0,row_id,meter_reading
0,0,117.124055
1,1,110.043799
2,2,102.715518
3,3,126.993527
4,4,210.592006
...,...,...
41697595,41697595,17.595547
41697596,41697596,49.886810
41697597,41697597,29.363640
41697598,41697598,50.928551


In [75]:
# Lasso predictions
lgb_pred = submission.copy()
lgb_pred['meter_reading'] = pred['lgb']
lgb_pred

Unnamed: 0,row_id,meter_reading
0,0,76.531346
1,1,22.791606
2,2,1.907839
3,3,51.057581
4,4,147.333795
...,...,...
41697595,41697595,4.842528
41697596,41697596,5.142279
41697597,41697597,7.384182
41697598,41697598,175.350060


In [76]:
# Lasso predictions
xgb_pred = submission.copy()
xgb_pred['meter_reading'] = pred['xgb']
xgb_pred

Unnamed: 0,row_id,meter_reading
0,0,45.442890
1,1,15.133649
2,2,4.024189
3,3,394.303833
4,4,107.229446
...,...,...
41697595,41697595,4.466696
41697596,41697596,5.177444
41697597,41697597,9.161366
41697598,41697598,170.458694


In [77]:
lasso_pred.to_csv(f'{path}l1r.csv', index=False)
lasso_pred = pd.read_csv(f'{path}l1r.csv')
lasso_pred

Unnamed: 0,row_id,meter_reading
0,0,117.124055
1,1,110.043799
2,2,102.715518
3,3,126.993527
4,4,210.592006
...,...,...
41697595,41697595,17.595547
41697596,41697596,49.886810
41697597,41697597,29.363640
41697598,41697598,50.928551


In [78]:
lgb_pred.to_csv(f'{path}lgb.csv', index=False)
lgb_pred = pd.read_csv(f'{path}lgb.csv')
lgb_pred

Unnamed: 0,row_id,meter_reading
0,0,76.531346
1,1,22.791606
2,2,1.907839
3,3,51.057581
4,4,147.333795
...,...,...
41697595,41697595,4.842528
41697596,41697596,5.142279
41697597,41697597,7.384182
41697598,41697598,175.350060


In [79]:
xgb_pred.to_csv(f'{path}xgb.csv', index=False)
xgb_pred = pd.read_csv(f'{path}xgb.csv')
xgb_pred

Unnamed: 0,row_id,meter_reading
0,0,45.442890
1,1,15.133649
2,2,4.024189
3,3,394.303830
4,4,107.229450
...,...,...
41697595,41697595,4.466696
41697596,41697596,5.177444
41697597,41697597,9.161366
41697598,41697598,170.458700


### Test RMSLE:
##### Lasso regression:  (Public),  (Private)
##### LightGBM:  (Public),  (Private)
##### XGBoost:  (Public),  (Private)

# Combine LightGBM and XGBoost predictions

In [8]:
xl = pd.merge(lgb_pred, xgb_pred, on='row_id', how='left')
xl.columns = ['row_id', 'lgb', 'xgb']
xl.head()

Unnamed: 0,row_id,lgb,xgb
0,0,54.809699,51.882465
1,1,17.976733,16.754257
2,2,1.477563,3.712897
3,3,71.624491,373.01392
4,4,142.16725,117.07958


In [11]:
xl['xl55'] = xl[['lgb', 'xgb']].mean(axis=1)
xl['xl46'] = (xl.lgb * 0.4) + (xl.xgb * 0.6)
xl['xl2575'] = (xl.lgb * 0.25) + (xl.xgb * 0.75)
xl.head()

Unnamed: 0,row_id,lgb,xgb,xl55
0,0,54.809699,51.882465,53.346082
1,1,17.976733,16.754257,17.365495
2,2,1.477563,3.712897,2.59523
3,3,71.624491,373.01392,222.319205
4,4,142.16725,117.07958,129.623415


In [16]:
xl1 = xl[['row_id', 'xl55']]
xl1.columns = ['row_id', 'meter_reading']

xl1.to_csv(f'{path}xl1.csv', index=False)
xl1 = pd.read_csv(f'{path}xl1.csv')
xl1

Unnamed: 0,row_id,meter_reading
0,0,53.346082
1,1,17.365495
2,2,2.595230
3,3,222.319205
4,4,129.623415
...,...,...
41697595,41697595,4.897745
41697596,41697596,5.232897
41697597,41697597,7.730367
41697598,41697598,172.912913


In [17]:
xl2 = xl[['row_id', 'xl46']]
xl2.columns = ['row_id', 'meter_reading']

xl2.to_csv(f'{path}xl2.csv', index=False)
xl2 = pd.read_csv(f'{path}xl2.csv')
xl2

Unnamed: 0,row_id,meter_reading
0,0,53.053358
1,1,17.243247
2,2,2.818763
3,3,252.458148
4,4,127.114648
...,...,...
41697595,41697595,4.863569
41697596,41697596,5.207736
41697597,41697597,7.822719
41697598,41697598,172.888550


In [18]:
xl3 = xl[['row_id', 'xl2575']]
xl3.columns = ['row_id', 'meter_reading']

xl3.to_csv(f'{path}xl3.csv', index=False)
xl3 = pd.read_csv(f'{path}xl3.csv')
xl3

Unnamed: 0,row_id,meter_reading
0,0,52.614273
1,1,17.059876
2,2,3.154063
3,3,297.666563
4,4,123.351498
...,...,...
41697595,41697595,4.812304
41697596,41697596,5.169994
41697597,41697597,7.961248
41697598,41697598,172.852006
