# Imports

In [1]:
%matplotlib inline

import src.utils as udf

import gc
import joblib
import datetime
import holidays
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from feature_engine.categorical_encoders import RareLabelCategoricalEncoder as RareEncoder, \
                                                MeanCategoricalEncoder as MeanEncoder, \
                                                OrdinalCategoricalEncoder as OrdinalEncoder

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
import xgboost as xgb
import optuna

In [2]:
# Plot settings
sns.set(rc={'figure.figsize': (16, 4),
            'font.size': 16})

# Data

In [3]:
path = '../data/'

In [4]:
train = pd.read_pickle(f'{path}from_mod/train.pkl')
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18264895 entries, 0 to 18264894
Data columns (total 18 columns):
building_id           uint16
meter                 uint8
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 1.1+ GB


In [5]:
train = udf.reduce_mem_usage(train)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18264895 entries, 0 to 18264894
Data columns (total 18 columns):
building_id           uint16
meter                 uint8
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 1.1+ GB


In [6]:
test = pd.read_pickle(f'{path}from_sub/test.pkl')
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 17 columns):
building_id           uint16
meter                 uint8
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             int64
hour                  int64
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            int64
country               object
is_holiday            int64
dtypes: float32(6), int64(4), object(2), uint16(2), uint32(1), uint8(2)
memory usage: 3.5+ GB


In [7]:
test = udf.reduce_mem_usage(test)
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 17 columns):
building_id           uint16
meter                 uint8
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(6), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 2.4+ GB


In [8]:
submission = pd.read_csv(f'{path}raw/test/sample_submission.csv')
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 2 columns):
row_id           int64
meter_reading    int64
dtypes: int64(2)
memory usage: 636.3 MB


In [9]:
submission = udf.reduce_mem_usage(submission)
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 2 columns):
row_id           uint32
meter_reading    uint8
dtypes: uint32(1), uint8(1)
memory usage: 198.8 MB


In [10]:
mb = pd.read_pickle(f'{path}from_sub/mb.pkl')
mb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 9 columns):
row_id          uint32
building_id     uint16
meter           uint8
timestamp       datetime64[ns]
site_id         uint8
primary_use     object
square_feet     uint32
year_built      uint16
missing_year    uint8
dtypes: datetime64[ns](1), object(1), uint16(2), uint32(2), uint8(3)
memory usage: 1.5+ GB


In [11]:
mb = udf.reduce_mem_usage(mb)
mb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 9 columns):
row_id          uint32
building_id     uint16
meter           uint8
timestamp       datetime64[ns]
site_id         uint8
primary_use     object
square_feet     uint32
year_built      uint16
missing_year    uint8
dtypes: datetime64[ns](1), object(1), uint16(2), uint32(2), uint8(3)
memory usage: 1.5+ GB


In [12]:
del path
gc.collect()

0

# Features

In [13]:
train.head()

Unnamed: 0,building_id,meter,meter_reading,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,46,0,15.604556,19.4,1019.400024,0.0,Retail,9045,2016,0,1,0,0.0,0.0,100.0,0,US,1
1,74,0,12.603682,19.4,1019.400024,0.0,Parking,387638,1997,0,1,0,0.0,0.0,100.0,0,US,1
2,93,0,15.364478,19.4,1019.400024,0.0,Office,33370,1982,0,1,0,0.0,0.0,100.0,0,US,1
3,105,0,23.3036,2.4,1020.900024,3.1,Education,50623,1960,1,1,0,-0.5,-0.866025,90.549408,0,UK,1
4,106,0,0.3746,2.4,1020.900024,3.1,Education,5374,1960,1,1,0,-0.5,-0.866025,90.549408,0,UK,1


In [14]:
test.head()

Unnamed: 0,building_id,meter,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,0,0,12.8,1022.099976,2.1,Education,7432,2008,0,1,0,-0.642788,0.766044,83.409012,1,US,1
1,1,0,12.8,1022.099976,2.1,Education,2720,2004,0,1,0,-0.642788,0.766044,83.409012,1,US,1
2,2,0,12.8,1022.099976,2.1,Education,5376,1991,0,1,0,-0.642788,0.766044,83.409012,1,US,1
3,3,0,12.8,1022.099976,2.1,Education,23685,2002,0,1,0,-0.642788,0.766044,83.409012,1,US,1
4,4,0,12.8,1022.099976,2.1,Education,116607,1975,0,1,0,-0.642788,0.766044,83.409012,1,US,1


In [15]:
# Remove the two columns excluded from modelling. Rebinding instead of `inplace=True`
# and passing errors='ignore' keeps the cell re-runnable (a second run used to raise KeyError).
train = train.drop(columns=['missing_year', 'wind_direction_x'], errors='ignore')
# Restrict the test frame to exactly the training feature columns, in the same order.
feats = train.drop(columns='meter_reading').columns
test = test[feats]
test.head()

Unnamed: 0,building_id,meter,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,0,0,12.8,1022.099976,2.1,Education,7432,2008,1,0,0.766044,83.409012,1,US,1
1,1,0,12.8,1022.099976,2.1,Education,2720,2004,1,0,0.766044,83.409012,1,US,1
2,2,0,12.8,1022.099976,2.1,Education,5376,1991,1,0,0.766044,83.409012,1,US,1
3,3,0,12.8,1022.099976,2.1,Education,23685,2002,1,0,0.766044,83.409012,1,US,1
4,4,0,12.8,1022.099976,2.1,Education,116607,1975,1,0,0.766044,83.409012,1,US,1


In [16]:
del feats
gc.collect()

66

# Split data

In [17]:
# One train/test frame per meter code 0-3; the `meter` column is constant inside each
# subset, so it is dropped. List position == meter code.
train_df, test_df = [], []

for m in range(4):
    df_train = train.loc[train['meter'].eq(m)].drop(columns='meter')
    df_test = test.loc[test['meter'].eq(m)].drop(columns='meter')
    train_df.append(df_train)
    test_df.append(df_test)
    print(f'Meter {m}:', df_train.shape, df_test.shape)

Meter 0: (11530268, 15) (24755760, 14)
Meter 1: (3518870, 15) (8724960, 14)
Meter 2: (2296049, 15) (5676480, 14)
Meter 3: (919708, 15) (2540400, 14)


In [18]:
del m, df_train, df_test
gc.collect()

22

# Functions

In [19]:
def transform_data(df_train, df_test):
    """Prepare one meter's train/test frames for modelling.

    The target is split off ``df_train`` and log1p-transformed. The features
    then go through the project helpers in ``udf``: rare-label encoding of
    ``primary_use``, mean encoding of ``primary_use`` and ``country`` against
    the log-scale target, and feature scaling.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows, including the ``meter_reading`` column.
    df_test : pandas.DataFrame
        Test rows with the same feature columns.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, y_train)``; ``y_train`` is a
        single-column DataFrame holding ``log1p(meter_reading)``.
    """
    target = np.log1p(df_train[['meter_reading']])
    features = df_train.drop(columns='meter_reading')

    # The helpers return additional values that this notebook never uses; they are discarded.
    features, _, X_test, _ = udf.rare_encoder(features, df_test, ['primary_use'])
    features, _, X_test, _ = udf.mean_encoder(features, target, X_test, ['primary_use', 'country'])
    X_train_scaled, _, X_test_scaled = udf.scale_feats(features, X_test)

    return X_train_scaled, X_test_scaled, target

In [20]:
def predict_lgb(X_train, y_train, X_test, params_dict, save_path):
    """Train a LightGBM booster, persist it with joblib and predict on ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and target passed to ``lgb.Dataset``.
    X_test
        Features to predict on.
    params_dict : dict
        LightGBM training parameters, passed to ``lgb.train`` unchanged.
    save_path : str
        Destination of the joblib dump of the fitted booster.

    Returns
    -------
    numpy.ndarray
        Predictions for ``X_test`` with negative values replaced by 0.
    """
    train_set = lgb.Dataset(X_train, label=y_train)
    # NOTE(review): the only evaluation set is the training data itself, so any
    # early-stopping setting in params_dict cannot detect overfitting — confirm intended.
    booster = lgb.train(
        params_dict,
        train_set,
        valid_sets=[train_set],
        valid_names=['train'],
        verbose_eval=False,
    )
    joblib.dump(booster, save_path)

    # Floor negative outputs at zero.
    return np.clip(booster.predict(X_test), 0, None)

In [21]:
def predict_xgb(X_train, y_train, X_test, params_dict, save_path,
                num_boost_round=1000, early_stopping_rounds=10):
    """Train an XGBoost booster, persist it with joblib and predict on ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and target wrapped in an ``xgb.DMatrix``.
    X_test
        Features to predict on.
    params_dict : dict
        XGBoost training parameters, passed to ``xgb.train`` unchanged.
    save_path : str
        Destination of the joblib dump of the fitted booster.
    num_boost_round : int, default 1000
        Maximum number of boosting rounds (previously hard-coded).
    early_stopping_rounds : int or None, default 10
        Early-stopping patience forwarded to ``xgb.train`` (previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Predictions for ``X_test`` with negative values replaced by 0.
    """
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test)
    # NOTE(review): the only evaluation set is the training data itself, so early
    # stopping cannot detect overfitting here — confirm intended.
    booster = xgb.train(
        params_dict,
        dtrain,
        evals=[(dtrain, 'train')],
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=False,
    )
    joblib.dump(booster, save_path)

    pred = booster.predict(dtest)
    # Floor negative outputs at zero, in place so the dtype is untouched.
    pred[pred < 0] = 0
    return pred

In [22]:
gc.collect()

88

# Electricity meter

### Transform data

In [23]:
Xe_train_scaled, Xe_test_scaled, ye_train = transform_data(train_df[0], test_df[0])
Xe_train_scaled.head()



Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,-1.665489,1.136225,0.418458,-1.532377,-1.035516,-0.747902,2.314036,-1.782112,-1.661713,0.09372,1.542446,-0.633757,0.18362,5.545828
1,-1.5965,1.136225,0.418458,-1.532377,-1.035516,2.635689,1.426743,-1.782112,-1.661713,0.09372,1.542446,-0.633757,0.18362,5.545828
2,-1.549687,1.136225,0.418458,-1.532377,0.25291,-0.530502,0.726249,-1.782112,-1.661713,0.09372,1.542446,-0.633757,0.18362,5.545828
3,-1.52012,-0.58331,0.62095,-0.197577,1.047964,-0.376308,-0.301143,-1.782112,-1.661713,-1.220925,1.11956,-0.633757,-2.124149,5.545828
4,-1.517656,-0.58331,0.62095,-0.197577,1.047964,-0.78071,-0.301143,-1.782112,-1.661713,-1.220925,1.11956,-0.633757,-2.124149,5.545828


### Lasso Regression

In [24]:
# Linear baseline for the electricity meter, fitted on the log1p-scale target.
lasso = Lasso(alpha=1e-6, random_state=42).fit(Xe_train_scaled, ye_train)

# Floor negative log-scale predictions at zero.
lasso0 = np.clip(lasso.predict(Xe_test_scaled), 0, None)
lasso0

array([3.54956087, 3.48910698, 3.42263687, ..., 3.40900343, 3.94560265,
       4.57810574])

### LightGBM

In [25]:
study_lgb0 = joblib.load('../objects/electricity/study_lgb.pkl')
params_lgb0 = study_lgb0.best_trial.params
params_lgb0

{'learning_rate': 0.07916336777546343,
 'lambda_l1': 0.008557356431137609,
 'lambda_l2': 0.0006037228650908533,
 'max_depth': 51,
 'num_leaves': 923,
 'min_child_samples': 7,
 'subsample': 0.7399597912518232,
 'feature_fraction': 0.7310599981838332}

In [26]:
# Fixed training settings added on top of the tuned hyper-parameters.
# NOTE(review): LightGBM applies `subsample`/`bagging_fraction` only when `bagging_freq` > 0,
# which is not set in this dict — confirm whether that is intended.
params_lgb0.update({
    'num_iterations': 10000,
    'early_stopping_round': 10,
    'metric': 'rmse',
    'num_threads': -1,
    'seed': 42,
})
params_lgb0

{'learning_rate': 0.07916336777546343,
 'lambda_l1': 0.008557356431137609,
 'lambda_l2': 0.0006037228650908533,
 'max_depth': 51,
 'num_leaves': 923,
 'min_child_samples': 7,
 'subsample': 0.7399597912518232,
 'feature_fraction': 0.7310599981838332,
 'num_iterations': 10000,
 'early_stopping_round': 10,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [27]:
lgbm0 = predict_lgb(Xe_train_scaled, ye_train, Xe_test_scaled, params_lgb0, '../objects/submission3/lgb0.pkl') # .txt')
lgbm0



array([3.09722584, 1.88164839, 0.51222062, ..., 2.12182217, 5.16017581,
       1.57023243])

### XGBoost

In [28]:
# Tuned XGBoost hyper-parameters for the electricity meter (joblib.load unpickles: trusted files only).
# NOTE(review): per the XGBoost docs `grow_policy`/`max_leaves` are honoured only by the hist
# tree method, and `tree_method` is not set here — confirm.
study_xgb0 = joblib.load('../objects/electricity/study_xgb.pkl')
params_xgb0 = study_xgb0.best_trial.params
params_xgb0.update(eval_metric='rmse', seed=42)
params_xgb0

{'grow_policy': 'lossguide',
 'learning_rate': 0.014754037383886122,
 'alpha': 0.005261717289274988,
 'lambda': 0.10981652452119427,
 'gamma': 0.00015007710756172543,
 'max_depth': 23,
 'max_leaves': 1598,
 'subsample': 0.8406224713599797,
 'colsample_bytree': 0.9175035245877109,
 'eval_metric': 'rmse',
 'seed': 42}

In [29]:
xg0 = predict_xgb(Xe_train_scaled, ye_train, Xe_test_scaled, params_xgb0, '../objects/submission3/xgb0.pkl') # .txt')
xg0



array([2.7455297 , 1.7999588 , 0.70870906, ..., 2.1669888 , 5.15829   ,
       1.6024051 ], dtype=float32)

In [30]:
del lasso, study_lgb0
gc.collect()

97

# Chilled water meter

### Transform data

In [31]:
Xc_train_scaled, Xc_test_scaled, yc_train = transform_data(train_df[1], test_df[1])
Xc_train_scaled.head()



Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,-1.591378,-1.372218,0.309256,-1.42888,0.758045,-0.478269,0.075476,-1.880578,-1.686758,-0.009177,-0.782728,-0.625362,-0.173847,5.873399
1,-1.5846,-1.372218,0.309256,-1.42888,-0.342804,3.61216,2.243055,-1.880578,-1.686758,-0.009177,-0.782728,-0.625362,-0.173847,5.873399
2,-1.580082,-1.372218,0.309256,-1.42888,0.758045,0.468508,2.020739,-1.880578,-1.686758,-0.009177,-0.782728,-0.625362,-0.173847,5.873399
3,-1.577823,-1.372218,0.309256,-1.42888,0.758045,0.435342,2.076318,-1.880578,-1.686758,-0.009177,-0.782728,-0.625362,-0.173847,5.873399
4,-1.575563,-1.372218,0.309256,-1.42888,0.694563,-0.706788,0.742423,-1.880578,-1.686758,-0.009177,-0.782728,-0.625362,-0.173847,5.873399


### Lasso Regression

In [32]:
# Linear baseline for the chilled water meter, fitted on the log1p-scale target.
lasso = Lasso(alpha=1e-6, random_state=42).fit(Xc_train_scaled, yc_train)

# Floor negative log-scale predictions at zero.
lasso1 = np.clip(lasso.predict(Xc_test_scaled), 0, None)
lasso1

array([4.24615904, 3.37920026, 4.24073629, ..., 4.60337428, 4.45655105,
       4.82164732])

### LightGBM

In [33]:
study_lgb1 = joblib.load('../objects/chilledwater/study_lgb1.pkl')
params_lgb1 = study_lgb1.best_trial.params
params_lgb1

{'learning_rate': 0.020424864262841822,
 'lambda_l1': 0.27986455703148144,
 'lambda_l2': 7.08804479307531,
 'max_depth': 32,
 'num_leaves': 1293,
 'min_data_in_leaf': 23,
 'bagging_fraction': 0.9143657877950819,
 'feature_fraction': 0.701556769695671}

In [34]:
# Fixed training settings added on top of the tuned hyper-parameters.
# NOTE(review): LightGBM applies `bagging_fraction` only when `bagging_freq` > 0,
# which is not set in this dict — confirm whether that is intended.
params_lgb1.update({
    'num_iterations': 10000,
    'early_stopping_round': 20,
    'metric': 'rmse',
    'num_threads': -1,
    'seed': 42,
})
params_lgb1

{'learning_rate': 0.020424864262841822,
 'lambda_l1': 0.27986455703148144,
 'lambda_l2': 7.08804479307531,
 'max_depth': 32,
 'num_leaves': 1293,
 'min_data_in_leaf': 23,
 'bagging_fraction': 0.9143657877950819,
 'feature_fraction': 0.701556769695671,
 'num_iterations': 10000,
 'early_stopping_round': 20,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [35]:
lgbm1 = predict_lgb(Xc_train_scaled, yc_train, Xc_test_scaled, params_lgb1, '../objects/submission3/lgb1.pkl') # .txt')
lgbm1



array([4.11257445, 1.19383163, 2.81599296, ..., 1.01078737, 0.47256432,
       1.48833957])

### XGBoost

In [36]:
# Tuned XGBoost hyper-parameters for the chilled water meter (joblib.load unpickles: trusted files only).
# NOTE(review): per the XGBoost docs `grow_policy`/`max_leaves` are honoured only by the hist
# tree method, and `tree_method` is not set here — confirm.
study_xgb1 = joblib.load('../objects/chilledwater/study_xgb1.pkl')
params_xgb1 = study_xgb1.best_trial.params
params_xgb1.update(eval_metric='rmse', seed=42)
params_xgb1

{'grow_policy': 'depthwise',
 'learning_rate': 0.03801178677708901,
 'alpha': 0.051064916441743415,
 'lambda': 0.890189928673286,
 'gamma': 0.053083931710043705,
 'max_depth': 16,
 'max_leaves': 1131,
 'subsample': 0.7245353487626778,
 'colsample_bytree': 0.777530288110325,
 'eval_metric': 'rmse',
 'seed': 42}

In [37]:
xg1 = predict_xgb(Xc_train_scaled, yc_train, Xc_test_scaled, params_xgb1, '../objects/submission3/xgb1.pkl') # .txt')
xg1

array([3.5334344 , 1.3368204 , 2.512142  , ..., 0.9364805 , 0.48840135,
       1.6987219 ], dtype=float32)

In [38]:
del lasso, study_lgb1
gc.collect()

97

# Steam meter

### Transform data

In [39]:
Xs_train_scaled, Xs_test_scaled, ys_train = transform_data(train_df[2], test_df[2])
Xs_train_scaled.head()



Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,-1.871636,-0.438255,0.481757,-0.740031,-0.188679,-0.892549,-0.152204,-1.734059,-1.66502,1.457392,0.126477,-0.631491,-0.181765,5.631085
1,-1.856461,-0.438255,0.481757,-0.740031,-0.188679,-0.846257,-0.152204,-1.734059,-1.66502,1.457392,0.126477,-0.631491,-0.181765,5.631085
2,-1.841287,-0.438255,0.481757,-0.740031,-0.188679,-0.816001,-0.152204,-1.734059,-1.66502,1.457392,0.126477,-0.631491,-0.181765,5.631085
3,-1.821054,-0.438255,0.481757,-0.740031,-0.188679,-0.754479,-0.152204,-1.734059,-1.66502,1.457392,0.126477,-0.631491,-0.181765,5.631085
4,-1.815996,-0.438255,0.481757,-0.740031,-1.856454,-0.752834,0.118055,-1.734059,-1.66502,1.457392,0.126477,-0.631491,-0.181765,5.631085


### Lasso Regression

In [40]:
# Linear baseline for the steam meter, fitted on the log1p-scale target.
lasso = Lasso(alpha=1e-6, random_state=42).fit(Xs_train_scaled, ys_train)

# Floor negative log-scale predictions at zero.
lasso2 = np.clip(lasso.predict(Xs_test_scaled), 0, None)
lasso2

array([5.13067267, 5.21498871, 5.2458769 , ..., 5.77792329, 5.75762818,
       6.01564546])

### LightGBM

In [41]:
study_lgb2 = joblib.load('../objects/steam/study_lgb2.pkl')
params_lgb2 = study_lgb2.best_trial.params
params_lgb2

{'learning_rate': 0.04854268157540762,
 'lambda_l1': 0.0028266466239996284,
 'lambda_l2': 0.00010211303918218761,
 'max_depth': 56,
 'num_leaves': 409,
 'min_data_in_leaf': 8,
 'bagging_fraction': 0.5012612103606688,
 'feature_fraction': 0.8537936823847498}

In [42]:
# Fixed training settings added on top of the tuned hyper-parameters.
# NOTE(review): LightGBM applies `bagging_fraction` only when `bagging_freq` > 0,
# which is not set in this dict — confirm whether that is intended.
params_lgb2.update({
    'num_iterations': 10000,
    'early_stopping_round': 20,
    'metric': 'rmse',
    'num_threads': -1,
    'seed': 42,
})
params_lgb2

{'learning_rate': 0.04854268157540762,
 'lambda_l1': 0.0028266466239996284,
 'lambda_l2': 0.00010211303918218761,
 'max_depth': 56,
 'num_leaves': 409,
 'min_data_in_leaf': 8,
 'bagging_fraction': 0.5012612103606688,
 'feature_fraction': 0.8537936823847498,
 'num_iterations': 10000,
 'early_stopping_round': 20,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [43]:
lgbm2 = predict_lgb(Xs_train_scaled, ys_train, Xs_test_scaled, params_lgb2, '../objects/submission3/lgb2.pkl') # .txt')
lgbm2



array([3.43506245, 5.14996941, 8.45918489, ..., 6.24906511, 6.08563622,
       2.93509097])

### XGBoost

In [44]:
# Tuned XGBoost hyper-parameters for the steam meter (joblib.load unpickles: trusted files only).
# NOTE(review): per the XGBoost docs `grow_policy`/`max_leaves` are honoured only by the hist
# tree method, and `tree_method` is not set here — confirm.
study_xgb2 = joblib.load('../objects/steam/study_xgb2.pkl')
params_xgb2 = study_xgb2.best_trial.params
params_xgb2.update(eval_metric='rmse', seed=42)
params_xgb2

{'grow_policy': 'depthwise',
 'learning_rate': 0.045664576584020004,
 'alpha': 1.9071087639650297,
 'lambda': 0.00021496551872384197,
 'gamma': 0.15536767802205387,
 'max_depth': 59,
 'max_leaves': 1626,
 'subsample': 0.926581683911481,
 'colsample_bytree': 0.8620824662132568,
 'eval_metric': 'rmse',
 'seed': 42}

In [45]:
xg2 = predict_xgb(Xs_train_scaled, ys_train, Xs_test_scaled, params_xgb2, '../objects/submission3/xgb2.pkl') # .txt')
xg2

array([3.5249217, 5.05113  , 8.48744  , ..., 6.357294 , 6.172634 ,
       2.9086502], dtype=float32)

In [46]:
del lasso, study_lgb2
gc.collect()

97

# Hot water meter

### Transform data

In [47]:
Xh_train_scaled, Xh_test_scaled, yh_train = transform_data(train_df[3], test_df[3])
Xh_train_scaled.head()



Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,-1.366926,-0.167468,0.791748,-0.011996,-0.291586,-0.825362,-0.554477,-1.591936,-1.665095,-1.119425,1.435164,-0.630965,-2.259887,5.476081
1,-1.364949,-0.167468,0.791748,-0.011996,-0.291586,-0.160304,-0.664923,-1.591936,-1.665095,-1.119425,1.435164,-0.630965,-2.259887,5.476081
2,-1.362972,-0.167468,0.791748,-0.011996,-0.291586,0.221559,-0.664923,-1.591936,-1.665095,-1.119425,1.435164,-0.630965,-2.259887,5.476081
3,-1.357043,-0.167468,0.791748,-0.011996,-0.291586,-0.988201,1.875338,-1.591936,-1.665095,-1.119425,1.435164,-0.630965,-2.259887,5.476081
4,-1.35309,-0.167468,0.791748,-0.011996,-0.291586,-0.251205,2.041007,-1.591936,-1.665095,-1.119425,1.435164,-0.630965,-2.259887,5.476081


### Lasso Regression

In [48]:
# Linear baseline for the hot water meter, fitted on the log1p-scale target.
lasso = Lasso(alpha=1e-6, random_state=42).fit(Xh_train_scaled, yh_train)

# Floor negative log-scale predictions at zero.
lasso3 = np.clip(lasso.predict(Xh_test_scaled), 0, None)
lasso3

array([3.96449514, 4.29579591, 4.15239367, ..., 5.48995443, 5.52113135,
       5.28519403])

### LightGBM

In [49]:
study_lgb3 = joblib.load('../objects/hotwater/study_lgb3.pkl')
params_lgb3 = study_lgb3.best_trial.params
params_lgb3

{'learning_rate': 0.023942434668217872,
 'lambda_l1': 0.23828824891615835,
 'lambda_l2': 0.00028375734074312625,
 'max_depth': 58,
 'num_leaves': 1581,
 'min_data_in_leaf': 42,
 'bagging_fraction': 0.500825513633077,
 'feature_fraction': 0.9607308095583501}

In [50]:
# Fixed training settings added on top of the tuned hyper-parameters.
# NOTE(review): LightGBM applies `bagging_fraction` only when `bagging_freq` > 0,
# which is not set in this dict — confirm whether that is intended.
params_lgb3.update({
    'num_iterations': 10000,
    'early_stopping_round': 20,
    'metric': 'rmse',
    'num_threads': -1,
    'seed': 42,
})

In [51]:
lgbm3 = predict_lgb(Xh_train_scaled, yh_train, Xh_test_scaled, params_lgb3, '../objects/submission3/lgb3.pkl') # .txt')
lgbm3



array([2.29696414, 4.27519541, 4.62297078, ..., 8.07765069, 5.82531429,
       8.03580037])

### XGBoost

In [52]:
# Tuned XGBoost hyper-parameters for the hot water meter (joblib.load unpickles: trusted files only).
# NOTE(review): per the XGBoost docs `grow_policy`/`max_leaves` are honoured only by the hist
# tree method, and `tree_method` is not set here — confirm.
study_xgb3 = joblib.load('../objects/hotwater/study_xgb3.pkl')
params_xgb3 = study_xgb3.best_trial.params
params_xgb3.update(eval_metric='rmse', seed=42)
params_xgb3

{'grow_policy': 'lossguide',
 'learning_rate': 0.009806612868641755,
 'alpha': 0.009923410545051567,
 'lambda': 2.4314911765779557,
 'gamma': 0.00182351994095809,
 'max_depth': 51,
 'max_leaves': 1225,
 'subsample': 0.8446420358715253,
 'colsample_bytree': 0.8798084029338699,
 'eval_metric': 'rmse',
 'seed': 42}

In [53]:
xg3 = predict_xgb(Xh_train_scaled, yh_train, Xh_test_scaled, params_xgb3, '../objects/submission3/xgb3.pkl') # .txt')
xg3

array([2.4874685, 4.3753457, 4.48964  , ..., 8.212854 , 5.722743 ,
       8.135026 ], dtype=float32)

In [54]:
del lasso, study_lgb3
gc.collect()

97

# Combine predictions

In [55]:
# Every model must yield exactly one prediction per test row of its meter.
n_test = [df.shape[0] for df in test_df]
n_lasso = [len(p) for p in (lasso0, lasso1, lasso2, lasso3)]
n_lgb = [len(p) for p in (lgbm0, lgbm1, lgbm2, lgbm3)]
n_xgb = [len(p) for p in (xg0, xg1, xg2, xg3)]

print('Test observations: ', n_test)
print('Lasso predictions: ', n_lasso)
print('LightGBM predictions: ', n_lgb)
print('XGBoost predictions: ', n_xgb)

# Enforce what the printout is meant to show, so a mismatch stops the run
# instead of relying on someone reading the numbers.
assert n_lasso == n_test and n_lgb == n_test and n_xgb == n_test, \
    'prediction count does not match test row count'

Test observations:  [24755760, 8724960, 5676480, 2540400]
Lasso predictions:  [24755760, 8724960, 5676480, 2540400]
LightGBM predictions:  [24755760, 8724960, 5676480, 2540400]
XGBoost predictions:  [24755760, 8724960, 5676480, 2540400]


In [56]:
# Electricity meter: one column per model, indexed like the meter's test rows
pred0 = pd.DataFrame(
    {'lasso': lasso0, 'lgb': lgbm0, 'xgb': xg0},
    index=test_df[0].index,
)
pred0

Unnamed: 0,lasso,lgb,xgb
0,3.549561,3.097226,2.745530
1,3.489107,1.881648,1.799959
2,3.422637,0.512221,0.708709
3,3.628445,3.238785,4.894021
4,4.124625,4.389382,3.388686
...,...,...,...
41697595,2.920560,1.676943,1.766006
41697596,3.924602,1.727303,1.827526
41697597,3.409003,2.121822,2.166989
41697598,3.945603,5.160176,5.158290


In [57]:
# Chilled water meter: one column per model, indexed like the meter's test rows
pred1 = pd.DataFrame(
    {'lasso': lasso1, 'lgb': lgbm1, 'xgb': xg1},
    index=test_df[1].index,
)
pred1

Unnamed: 0,lasso,lgb,xgb
8,4.246159,4.112574,3.533434
11,3.379200,1.193832,1.336820
16,4.240736,2.815993,2.512142
18,4.298950,3.181133,3.443561
20,3.336720,2.550655,3.060190
...,...,...,...
41697538,3.997476,3.860388,3.837852
41697541,4.413236,3.374878,3.418785
41697543,4.603374,1.010787,0.936481
41697545,4.456551,0.472564,0.488401


In [58]:
# Steam meter: one column per model, indexed like the meter's test rows
pred2 = pd.DataFrame(
    {'lasso': lasso2, 'lgb': lgbm2, 'xgb': xg2},
    index=test_df[2].index,
)
pred2

Unnamed: 0,lasso,lgb,xgb
16340505,5.130673,3.435062,3.524922
16340509,5.214989,5.149969,5.051130
16340515,5.245877,8.459185,8.487440
16340517,5.377686,5.268222,4.999544
16340521,5.267759,5.704744,5.714539
...,...,...,...
41697581,5.400224,5.540325,5.695114
41697584,5.610539,6.925007,6.870532
41697586,5.777923,6.249065,6.357294
41697588,5.757628,6.085636,6.172634


In [59]:
# Hot water meter: one column per model, indexed like the meter's test rows
pred3 = pd.DataFrame(
    {'lasso': lasso3, 'lgb': lgbm3, 'xgb': xg3},
    index=test_df[3].index,
)
pred3

Unnamed: 0,lasso,lgb,xgb
2260082,3.964495,2.296964,2.487468
2260086,4.295796,4.275195,4.375346
2260090,4.152394,4.622971,4.489640
2260092,4.605092,3.165257,3.227464
2260094,4.868590,5.235664,5.347295
...,...,...,...
41696866,5.731409,8.280734,8.307076
41697101,5.725892,5.730129,5.635671
41697116,5.489954,8.077651,8.212854
41697351,5.521131,5.825314,5.722743


In [60]:
# Log-transformed predictions
pred_transformed = pd.concat([pred0, pred1, pred2, pred3]).sort_index()
pred_transformed

Unnamed: 0,lasso,lgb,xgb
0,3.549561,3.097226,2.745530
1,3.489107,1.881648,1.799959
2,3.422637,0.512221,0.708709
3,3.628445,3.238785,4.894021
4,4.124625,4.389382,3.388686
...,...,...,...
41697595,2.920560,1.676943,1.766006
41697596,3.924602,1.727303,1.827526
41697597,3.409003,2.121822,2.166989
41697598,3.945603,5.160176,5.158290


In [61]:
del pred0, pred1, pred2, pred3
gc.collect()

110

# Transform predictions back to normal scale (and units for site 0)

In [62]:
# Undo the log1p applied to the target. expm1 is applied per column so each
# model's dtype is kept; the index then becomes the leading `row_id` column.
pred = (
    pred_transformed
    .assign(**{name: np.expm1(pred_transformed[name]) for name in ('lasso', 'lgb', 'xgb')})
    .rename_axis('row_id')
    .reset_index()
)
pred

Unnamed: 0,row_id,lasso,lgb,xgb
0,0,33.798033,21.136456,14.572860
1,1,31.756682,5.564316,5.049398
2,2,29.650129,0.668993,1.031367
3,3,36.654206,24.502722,132.489258
4,4,60.844608,79.590563,28.627003
...,...,...,...,...
41697595,41697595,17.551681,4.349178,4.847453
41697596,41697596,49.632905,4.625460,5.218484
41697597,41697597,29.235098,7.346332,7.731951
41697598,41697598,50.707490,173.195078,172.866882


In [63]:
# Attach site and meter to every prediction row. `validate` makes pandas raise if row_id
# is duplicated on either side, which would otherwise silently multiply rows.
pred = pd.merge(mb[['row_id', 'site_id', 'meter']], pred, on='row_id', how='left',
                validate='one_to_one')
# A left join leaves NaN wherever mb has a row_id without predictions; fail here rather
# than writing gaps into the submission files.
assert pred[['lasso', 'lgb', 'xgb']].notna().all().all(), 'missing predictions after merge'
pred

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,33.798033,21.136456,14.572860
1,1,0,0,31.756682,5.564316,5.049398
2,2,0,0,29.650129,0.668993,1.031367
3,3,0,0,36.654206,24.502722,132.489258
4,4,0,0,60.844608,79.590563,28.627003
...,...,...,...,...,...,...
41697595,41697595,15,0,17.551681,4.349178,4.847453
41697596,41697596,15,0,49.632905,4.625460,5.218484
41697597,41697597,15,0,29.235098,7.346332,7.731951
41697598,41697598,15,0,50.707490,173.195078,172.866882


In [64]:
# Site 0 meter 0 predictions in kWh
pred[(pred.site_id == 0) & (pred.meter == 0)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,33.798033,21.136456,14.57286
1,1,0,0,31.756682,5.564316,5.049398
2,2,0,0,29.650129,0.668993,1.031367
3,3,0,0,36.654206,24.502722,132.489258
4,4,0,0,60.844608,79.590563,28.627003


In [65]:
# Convert site 0 electricity predictions from kWh back to kBTU, one model column at a time.
# NOTE(review): presumably re-running this cell applies the conversion a second time —
# confirm against udf.convert_readings.
for model_col in ('lasso', 'lgb', 'xgb'):
    pred = udf.convert_readings(pred, site_num=0, meter_type=0,
                                conversion='kwh_to_kbtu', reading_col=model_col)
pred.loc[pred['site_id'].eq(0) & pred['meter'].eq(0)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,115.31213,72.11336,49.719681
1,1,0,0,108.347448,18.984335,17.227537
2,2,0,0,101.16031,2.282471,3.518818
3,3,0,0,125.056819,83.598386,452.026825
4,4,0,0,207.589633,271.547083,97.669601


In [66]:
# Site 0 meter 1 predictions in tons
pred[(pred.site_id == 0) & (pred.meter == 1)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
8,8,0,1,68.836657,60.103824,33.241364
11,11,0,1,28.347292,2.2997,2.80692
16,16,0,1,68.458975,15.70976,11.331315
18,18,0,1,72.622465,23.074013,30.298208
20,20,0,1,27.126728,11.815499,20.331614


In [67]:
# Convert site 0 chilled water predictions from tons back to kBTU, one model column at a time.
# NOTE(review): presumably re-running this cell applies the conversion a second time —
# confirm against udf.convert_readings.
for model_col in ('lasso', 'lgb', 'xgb'):
    pred = udf.convert_readings(pred, site_num=0, meter_type=1,
                                conversion='ton_to_kbtu', reading_col=model_col)
pred.loc[pred['site_id'].eq(0) & pred['meter'].eq(1)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
8,8,0,1,826.039882,721.245886,398.896362
11,11,0,1,340.1675,27.596403,33.683037
16,16,0,1,821.507703,188.517115,135.975784
18,18,0,1,871.469577,276.888152,363.578491
20,20,0,1,325.520734,141.785992,243.97937


In [68]:
del mb
gc.collect()

0

# Save Predictions

In [69]:
pred.describe()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
count,41697600.0,41697600.0,41697600.0,41697600.0,41697600.0,41697600.0
mean,20848800.0,8.086134,0.6642857,307.8682,369.6531,361.249
std,12037060.0,5.134712,0.9278067,1500.955,988.392,923.7194
min,0.0,0.0,0.0,0.2362829,0.0,0.0
25%,10424400.0,3.0,0.0,38.13405,26.26906,26.17671
50%,20848800.0,9.0,0.0,75.23381,87.90955,87.96815
75%,31273200.0,13.0,1.0,185.6528,280.4062,282.1136
max,41697600.0,15.0,3.0,170879.6,30103.13,19452.36


In [70]:
path = '../submissions/sub3/'
pred.to_pickle(f'{path}preds.pkl')
pred = pd.read_pickle(f'{path}preds.pkl')
pred

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,115.312130,72.113360,49.719681
1,1,0,0,108.347448,18.984335,17.227537
2,2,0,0,101.160310,2.282471,3.518818
3,3,0,0,125.056819,83.598386,452.026825
4,4,0,0,207.589633,271.547083,97.669601
...,...,...,...,...,...,...
41697595,41697595,15,0,17.551681,4.349178,4.847453
41697596,41697596,15,0,49.632905,4.625460,5.218484
41697597,41697597,15,0,29.235098,7.346332,7.731951
41697598,41697598,15,0,50.707490,173.195078,172.866882


In [71]:
submission

Unnamed: 0,row_id,meter_reading
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
41697595,41697595,0
41697596,41697596,0
41697597,41697597,0
41697598,41697598,0


In [72]:
# Lasso predictions
# Column assignment aligns on the index, not on row_id, so first verify that both
# frames list the same row_ids in the same order.
assert np.array_equal(submission['row_id'].values, pred['row_id'].values), \
    'row_id order differs between submission and pred'
lasso_pred = submission.copy()
lasso_pred['meter_reading'] = pred['lasso']
lasso_pred

Unnamed: 0,row_id,meter_reading
0,0,115.312130
1,1,108.347448
2,2,101.160310
3,3,125.056819
4,4,207.589633
...,...,...
41697595,41697595,17.551681
41697596,41697596,49.632905
41697597,41697597,29.235098
41697598,41697598,50.707490


In [73]:
# LightGBM predictions
# Column assignment aligns on the index, not on row_id, so first verify that both
# frames list the same row_ids in the same order.
assert np.array_equal(submission['row_id'].values, pred['row_id'].values), \
    'row_id order differs between submission and pred'
lgb_pred = submission.copy()
lgb_pred['meter_reading'] = pred['lgb']
lgb_pred

Unnamed: 0,row_id,meter_reading
0,0,72.113360
1,1,18.984335
2,2,2.282471
3,3,83.598386
4,4,271.547083
...,...,...
41697595,41697595,4.349178
41697596,41697596,4.625460
41697597,41697597,7.346332
41697598,41697598,173.195078


In [74]:
# XGBoost predictions
# Column assignment aligns on the index, not on row_id, so first verify that both
# frames list the same row_ids in the same order.
assert np.array_equal(submission['row_id'].values, pred['row_id'].values), \
    'row_id order differs between submission and pred'
xgb_pred = submission.copy()
xgb_pred['meter_reading'] = pred['xgb']
xgb_pred

Unnamed: 0,row_id,meter_reading
0,0,49.719681
1,1,17.227537
2,2,3.518818
3,3,452.026825
4,4,97.669601
...,...,...
41697595,41697595,4.847453
41697596,41697596,5.218484
41697597,41697597,7.731951
41697598,41697598,172.866882


In [75]:
lasso_pred.to_csv(f'{path}l1r.csv', index=False)
lasso_pred = pd.read_csv(f'{path}l1r.csv')
lasso_pred

Unnamed: 0,row_id,meter_reading
0,0,115.312130
1,1,108.347448
2,2,101.160310
3,3,125.056819
4,4,207.589633
...,...,...
41697595,41697595,17.551681
41697596,41697596,49.632905
41697597,41697597,29.235098
41697598,41697598,50.707490


In [76]:
lgb_pred.to_csv(f'{path}lgb.csv', index=False)
lgb_pred = pd.read_csv(f'{path}lgb.csv')
lgb_pred

Unnamed: 0,row_id,meter_reading
0,0,72.113360
1,1,18.984335
2,2,2.282471
3,3,83.598386
4,4,271.547083
...,...,...
41697595,41697595,4.349178
41697596,41697596,4.625460
41697597,41697597,7.346332
41697598,41697598,173.195078


In [77]:
xgb_pred.to_csv(f'{path}xgb.csv', index=False)
xgb_pred = pd.read_csv(f'{path}xgb.csv')
xgb_pred

Unnamed: 0,row_id,meter_reading
0,0,49.719680
1,1,17.227537
2,2,3.518818
3,3,452.026820
4,4,97.669600
...,...,...
41697595,41697595,4.847453
41697596,41697596,5.218484
41697597,41697597,7.731951
41697598,41697598,172.866880


### Test RMSLE:
##### Lasso regression:  (Public),  (Private)
##### LightGBM:  (Public),  (Private)
##### XGBoost:  (Public),  (Private)

# Combine LightGBM and XGBoost predictions

In [8]:
# NOTE(review): the execution counts in this section (In [8], [11], [16], ...) restart below
# those of the cells above, and the values displayed here differ from the lgb.csv / xgb.csv
# previews shown earlier, so these outputs appear to come from a different kernel session.
# Re-run the notebook top to bottom before trusting them.
# Join both submissions on row_id; the merge suffixes the two `meter_reading` columns,
# which are then renamed positionally to the model names.
xl = pd.merge(lgb_pred, xgb_pred, on='row_id', how='left')
xl.columns = ['row_id', 'lgb', 'xgb']
xl.head()

Unnamed: 0,row_id,lgb,xgb
0,0,54.809699,51.882465
1,1,17.976733,16.754257
2,2,1.477563,3.712897
3,3,71.624491,373.01392
4,4,142.16725,117.07958


In [11]:
# Weighted blends of the two boosters (lgb weight / xgb weight):
# xl55 = plain row-wise mean, xl46 = 0.4 / 0.6, xl2575 = 0.25 / 0.75.
xl = xl.assign(
    xl55=xl[['lgb', 'xgb']].mean(axis=1),
    xl46=0.4 * xl['lgb'] + 0.6 * xl['xgb'],
    xl2575=0.25 * xl['lgb'] + 0.75 * xl['xgb'],
)
xl.head()

Unnamed: 0,row_id,lgb,xgb,xl55
0,0,54.809699,51.882465,53.346082
1,1,17.976733,16.754257,17.365495
2,2,1.477563,3.712897,2.59523
3,3,71.624491,373.01392,222.319205
4,4,142.16725,117.07958,129.623415


In [16]:
# Export the equal-weight blend in submission format, then read it back to inspect the written file.
xl1 = xl[['row_id', 'xl55']].rename(columns={'xl55': 'meter_reading'})

xl1.to_csv(f'{path}xl1.csv', index=False)
xl1 = pd.read_csv(f'{path}xl1.csv')
xl1

Unnamed: 0,row_id,meter_reading
0,0,53.346082
1,1,17.365495
2,2,2.595230
3,3,222.319205
4,4,129.623415
...,...,...
41697595,41697595,4.897745
41697596,41697596,5.232897
41697597,41697597,7.730367
41697598,41697598,172.912913


In [17]:
# Export the 0.4 lgb / 0.6 xgb blend in submission format, then read it back to inspect the written file.
xl2 = xl[['row_id', 'xl46']].rename(columns={'xl46': 'meter_reading'})

xl2.to_csv(f'{path}xl2.csv', index=False)
xl2 = pd.read_csv(f'{path}xl2.csv')
xl2

Unnamed: 0,row_id,meter_reading
0,0,53.053358
1,1,17.243247
2,2,2.818763
3,3,252.458148
4,4,127.114648
...,...,...
41697595,41697595,4.863569
41697596,41697596,5.207736
41697597,41697597,7.822719
41697598,41697598,172.888550


In [18]:
# Export the 0.25 lgb / 0.75 xgb blend in submission format, then read it back to inspect the written file.
xl3 = xl[['row_id', 'xl2575']].rename(columns={'xl2575': 'meter_reading'})

xl3.to_csv(f'{path}xl3.csv', index=False)
xl3 = pd.read_csv(f'{path}xl3.csv')
xl3

Unnamed: 0,row_id,meter_reading
0,0,52.614273
1,1,17.059876
2,2,3.154063
3,3,297.666563
4,4,123.351498
...,...,...
41697595,41697595,4.812304
41697596,41697596,5.169994
41697597,41697597,7.961248
41697598,41697598,172.852006
