# Imports

In [1]:
%matplotlib inline

import src.utils as udf

import gc
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import Lasso

import lightgbm as lgb
import xgboost as xgb

# Data

In [2]:
path = '../data/'

In [3]:
train = pd.read_pickle(f'{path}from_mod/train.pkl')
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18264895 entries, 0 to 18264894
Data columns (total 18 columns):
building_id           uint16
meter                 uint8
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 1.1+ GB


In [4]:
train = udf.reduce_mem_usage(train)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18264895 entries, 0 to 18264894
Data columns (total 18 columns):
building_id           uint16
meter                 uint8
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 1.1+ GB


In [5]:
test = pd.read_pickle(f'{path}from_sub/test.pkl')
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 17 columns):
building_id           uint16
meter                 uint8
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             int64
hour                  int64
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            int64
country               object
is_holiday            int64
dtypes: float32(6), int64(4), object(2), uint16(2), uint32(1), uint8(2)
memory usage: 3.5+ GB


In [6]:
test = udf.reduce_mem_usage(test)
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 17 columns):
building_id           uint16
meter                 uint8
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(6), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 2.4+ GB


In [7]:
submission = pd.read_csv(f'{path}raw/test/sample_submission.csv')
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 2 columns):
row_id           int64
meter_reading    int64
dtypes: int64(2)
memory usage: 636.3 MB


In [8]:
submission = udf.reduce_mem_usage(submission)
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 2 columns):
row_id           uint32
meter_reading    uint8
dtypes: uint32(1), uint8(1)
memory usage: 198.8 MB


In [9]:
mb = pd.read_pickle(f'{path}from_sub/mb.pkl')
mb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 9 columns):
row_id          uint32
building_id     uint16
meter           uint8
timestamp       datetime64[ns]
site_id         uint8
primary_use     object
square_feet     uint32
year_built      uint16
missing_year    uint8
dtypes: datetime64[ns](1), object(1), uint16(2), uint32(2), uint8(3)
memory usage: 1.5+ GB


In [10]:
mb = udf.reduce_mem_usage(mb)
mb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 9 columns):
row_id          uint32
building_id     uint16
meter           uint8
timestamp       datetime64[ns]
site_id         uint8
primary_use     object
square_feet     uint32
year_built      uint16
missing_year    uint8
dtypes: datetime64[ns](1), object(1), uint16(2), uint32(2), uint8(3)
memory usage: 1.5+ GB


In [11]:
del path
gc.collect()

44

# Features

In [12]:
train.head()

Unnamed: 0,building_id,meter,meter_reading,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,46,0,15.604556,19.4,1019.400024,0.0,Retail,9045,2016,0,1,0,0.0,0.0,100.0,0,US,1
1,74,0,12.603682,19.4,1019.400024,0.0,Parking,387638,1997,0,1,0,0.0,0.0,100.0,0,US,1
2,93,0,15.364478,19.4,1019.400024,0.0,Office,33370,1982,0,1,0,0.0,0.0,100.0,0,US,1
3,105,0,23.3036,2.4,1020.900024,3.1,Education,50623,1960,1,1,0,-0.5,-0.866025,90.549408,0,UK,1
4,106,0,0.3746,2.4,1020.900024,3.1,Education,5374,1960,1,1,0,-0.5,-0.866025,90.549408,0,UK,1


In [13]:
test.head()

Unnamed: 0,building_id,meter,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,0,0,12.8,1022.099976,2.1,Education,7432,2008,0,1,0,-0.642788,0.766044,83.409012,1,US,1
1,1,0,12.8,1022.099976,2.1,Education,2720,2004,0,1,0,-0.642788,0.766044,83.409012,1,US,1
2,2,0,12.8,1022.099976,2.1,Education,5376,1991,0,1,0,-0.642788,0.766044,83.409012,1,US,1
3,3,0,12.8,1022.099976,2.1,Education,23685,2002,0,1,0,-0.642788,0.766044,83.409012,1,US,1
4,4,0,12.8,1022.099976,2.1,Education,116607,1975,0,1,0,-0.642788,0.766044,83.409012,1,US,1


In [14]:
train.drop(['missing_year', 'wind_direction_x', 'sea_level_pressure', 'rel_humidity', 'building_id', 'wind_direction_y'], axis=1, inplace=True)
feats = train.drop('meter_reading', axis=1).columns
test = test[feats]
test.head()

Unnamed: 0,meter,dew_temperature,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,is_weekend,country,is_holiday
0,0,12.8,2.1,Education,7432,2008,1,0,1,US,1
1,0,12.8,2.1,Education,2720,2004,1,0,1,US,1
2,0,12.8,2.1,Education,5376,1991,1,0,1,US,1
3,0,12.8,2.1,Education,23685,2002,1,0,1,US,1
4,0,12.8,2.1,Education,116607,1975,1,0,1,US,1


In [15]:
del feats
gc.collect()

66

# Split data

In [16]:
train_df = []
test_df = []

for m in range(4):
    df_train = train[train.meter == m].drop('meter', axis=1)
    df_test = test[test.meter == m].drop('meter', axis=1)
    train_df.append(df_train)
    test_df.append(df_test)
    print(f'Meter {m}:', df_train.shape, df_test.shape)

Meter 0: (11530268, 11) (24755760, 10)
Meter 1: (3518870, 11) (8724960, 10)
Meter 2: (2296049, 11) (5676480, 10)
Meter 3: (919708, 11) (2540400, 10)


In [17]:
del m, df_train, df_test
gc.collect()

22

# Functions

In [18]:
def transform_data(df_train, df_test):
    X_train = df_train.drop('meter_reading', axis=1)
    y_train = df_train[['meter_reading']]
    y_train = np.log1p(y_train)
    
    X_train, X_val, X_test, rare_dict = udf.rare_encoder(X_train, df_test, ['primary_use'])
    X_train, X_val, X_test, mean_dict = udf.mean_encoder(X_train, y_train, X_test, ['primary_use', 'country'])
    X_train_scaled, X_val, X_test_scaled = udf.scale_feats(X_train, X_test)
    
    X_train_scaled, X_val_scaled, y_train, y_val = train_test_split(X_train_scaled, y_train, test_size=0.25, random_state=42)
    
    print('Train: ', X_train_scaled.shape, y_train.shape)
    print('Validation: ', X_val_scaled.shape, y_val.shape)
    print('Train: ', X_test_scaled.shape)
    
    return X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val

In [19]:
def predict_lgb(X_train, y_train, X_val, y_val, X_test, params_dict, save_path):
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val)
    lgbm = lgb.train(params_dict, dtrain, valid_sets=[dtrain, dval], valid_names=['train', 'val'], verbose_eval=False)
    lgbm.save_model(save_path)
    
    pred = lgbm.predict(X_test)
    pred[pred < 0] = 0
    return pred

In [20]:
def predict_xgb(X_train, y_train, X_val, y_val, X_test, params_dict, save_path):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test)
    xg = xgb.train(params_dict, dtrain, evals=[(dtrain, 'train'), (dval, 'val')], num_boost_round=1000, early_stopping_rounds=10, verbose_eval=False)
    xg.save_model(save_path)
    
    pred = xg.predict(dtest)
    pred[pred < 0] = 0
    return pred

In [21]:
gc.collect()

44

# Electricity meter

### Transform data

In [22]:
Xe_train_scaled, Xe_val_scaled, Xe_test_scaled, ye_train, ye_val = transform_data(train_df[0], test_df[0])
Xe_train_scaled.head()



Train:  (8647701, 10) (8647701, 1)
Validation:  (2882567, 10) (2882567, 1)
Train:  (24755760, 10)


Unnamed: 0,dew_temperature,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,is_weekend,country,is_holiday
3820751,-0.714804,0.663585,-0.364128,0.032242,-2.496025,-0.553622,0.794318,-0.633757,0.18362,-0.180316
5192151,0.185423,-0.197577,1.047964,0.306161,0.58615,-0.144126,-0.650406,-0.633757,0.18362,-0.180316
10784538,0.519215,-0.886506,1.047964,-0.372759,-1.982329,1.484338,0.505373,-0.633757,0.18362,-0.180316
3949783,-0.097795,-1.532377,-0.364128,-0.651102,-1.608732,-0.51553,1.661153,-0.633757,0.18362,-0.180316
2900643,-0.209059,-0.197577,0.25291,-0.527437,-0.487941,-0.839318,0.505373,1.577891,0.18362,-0.180316


### Lasso Regression

In [23]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xe_train_scaled, ye_train)

val0 = lasso.predict(Xe_val_scaled)
print(lasso.score(Xe_val_scaled, ye_val))
print(np.sqrt(mean_squared_log_error(ye_val, val0)))

lasso0 = lasso.predict(Xe_test_scaled)
lasso0[lasso0 < 0] = 0
lasso0

0.4319418889324335
0.283669118540562


array([3.70715764, 3.64631742, 3.57966761, ..., 3.28231633, 3.81638814,
       4.44950177])

### LightGBM

In [24]:
study_lgb0 = joblib.load('../objects/electricity/study_lgb.pkl')
params_lgb0 = study_lgb0.best_trial.params
params_lgb0

{'learning_rate': 0.07916336777546343,
 'lambda_l1': 0.008557356431137609,
 'lambda_l2': 0.0006037228650908533,
 'max_depth': 51,
 'num_leaves': 923,
 'min_child_samples': 7,
 'subsample': 0.7399597912518232,
 'feature_fraction': 0.7310599981838332}

In [25]:
params_lgb0['num_iterations'] = 10000
params_lgb0['early_stopping_round'] = 10
params_lgb0['metric'] = 'rmse'
params_lgb0['num_threads'] = -1
params_lgb0['seed'] = 42
params_lgb0

{'learning_rate': 0.07916336777546343,
 'lambda_l1': 0.008557356431137609,
 'lambda_l2': 0.0006037228650908533,
 'max_depth': 51,
 'num_leaves': 923,
 'min_child_samples': 7,
 'subsample': 0.7399597912518232,
 'feature_fraction': 0.7310599981838332,
 'num_iterations': 10000,
 'early_stopping_round': 10,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [26]:
lgbm0 = predict_lgb(Xe_train_scaled, ye_train, Xe_val_scaled, ye_val, Xe_test_scaled, params_lgb0, '../objects/submission5/lgb0.pkl') # .txt')
lgbm0



array([2.99282268, 1.47011324, 0.61531442, ..., 2.18399088, 5.138987  ,
       1.56788163])

### XGBoost

In [27]:
study_xgb0 = joblib.load('../objects/electricity/study_xgb.pkl')
params_xgb0 = study_xgb0.best_trial.params
params_xgb0['eval_metric'] = 'rmse'
params_xgb0['seed'] = 42
params_xgb0

{'grow_policy': 'lossguide',
 'learning_rate': 0.014754037383886122,
 'alpha': 0.005261717289274988,
 'lambda': 0.10981652452119427,
 'gamma': 0.00015007710756172543,
 'max_depth': 23,
 'max_leaves': 1598,
 'subsample': 0.8406224713599797,
 'colsample_bytree': 0.9175035245877109,
 'eval_metric': 'rmse',
 'seed': 42}

In [28]:
xg0 = predict_xgb(Xe_train_scaled, ye_train, Xe_val_scaled, ye_val, Xe_test_scaled, params_xgb0, '../objects/submission5/xgb0.pkl') # .txt')
xg0



array([2.7869961, 1.6420962, 0.6931938, ..., 2.2249656, 5.1218357,
       1.6599343], dtype=float32)

In [29]:
del lasso, val0, study_lgb0
gc.collect()

100

# Chilled water meter

### Transform data

In [30]:
Xc_train_scaled, Xc_val_scaled, Xc_test_scaled, yc_train, yc_val = transform_data(train_df[1], test_df[1])
Xc_train_scaled.head()



Train:  (2639152, 10) (2639152, 1)
Validation:  (879718, 10) (879718, 1)
Train:  (8724960, 10)


Unnamed: 0,dew_temperature,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,is_weekend,country,is_holiday
3054842,-0.652177,-0.716347,0.758045,0.383649,-0.257998,1.245823,-0.088512,-0.625362,-0.173847,-0.170259
1722336,1.320539,0.043689,-1.578794,-0.856238,0.075476,-0.00872,-0.379102,-0.625362,-0.173847,-0.170259
511899,-0.760676,1.278746,0.611098,0.434406,0.408949,-1.21348,-1.250872,-0.625362,-0.173847,-0.170259
1159847,1.261357,0.2812,0.758045,-0.292965,1.687265,-0.526468,1.073849,-0.625362,-0.173847,-0.170259
688211,1.152858,0.518711,-1.578794,-0.386131,0.075476,-0.994432,-1.541463,-0.625362,-0.173847,-0.170259


### Lasso Regression

In [31]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xc_train_scaled, yc_train)

val1 = lasso.predict(Xc_val_scaled)
print(lasso.score(Xc_val_scaled, yc_val))
print(np.sqrt(mean_squared_log_error(yc_val, val1)))

lasso1 = lasso.predict(Xc_test_scaled)
lasso1[lasso1 < 0] = 0
lasso1

0.28773575522597106
0.35824474141310614


array([5.03430741, 4.10588914, 5.00810388, ..., 4.46224075, 4.30459204,
       4.62817871])

### LightGBM

In [32]:
study_lgb1 = joblib.load('../objects/chilledwater/study_lgb1.pkl')
params_lgb1 = study_lgb1.best_trial.params
params_lgb1

{'learning_rate': 0.020424864262841822,
 'lambda_l1': 0.27986455703148144,
 'lambda_l2': 7.08804479307531,
 'max_depth': 32,
 'num_leaves': 1293,
 'min_data_in_leaf': 23,
 'bagging_fraction': 0.9143657877950819,
 'feature_fraction': 0.701556769695671}

In [33]:
params_lgb1['num_iterations'] = 10000
params_lgb1['early_stopping_round'] = 20
params_lgb1['metric'] = 'rmse'
params_lgb1['num_threads'] = -1
params_lgb1['seed'] = 42
params_lgb1

{'learning_rate': 0.020424864262841822,
 'lambda_l1': 0.27986455703148144,
 'lambda_l2': 7.08804479307531,
 'max_depth': 32,
 'num_leaves': 1293,
 'min_data_in_leaf': 23,
 'bagging_fraction': 0.9143657877950819,
 'feature_fraction': 0.701556769695671,
 'num_iterations': 10000,
 'early_stopping_round': 20,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [34]:
lgbm1 = predict_lgb(Xc_train_scaled, yc_train, Xc_val_scaled, yc_val, Xc_test_scaled, params_lgb1, '../objects/submission5/lgb1.pkl') # .txt')
lgbm1



array([4.11155291, 1.35262337, 2.06403015, ..., 1.12279785, 0.68395054,
       3.85404329])

### XGBoost

In [35]:
study_xgb1 = joblib.load('../objects/chilledwater/study_xgb1.pkl')
params_xgb1 = study_xgb1.best_trial.params
params_xgb1['eval_metric'] = 'rmse'
params_xgb1['seed'] = 42
params_xgb1

{'grow_policy': 'depthwise',
 'learning_rate': 0.03801178677708901,
 'alpha': 0.051064916441743415,
 'lambda': 0.890189928673286,
 'gamma': 0.053083931710043705,
 'max_depth': 16,
 'max_leaves': 1131,
 'subsample': 0.7245353487626778,
 'colsample_bytree': 0.777530288110325,
 'eval_metric': 'rmse',
 'seed': 42}

In [36]:
xg1 = predict_xgb(Xc_train_scaled, yc_train, Xc_val_scaled, yc_val, Xc_test_scaled, params_xgb1, '../objects/submission5/xgb1.pkl') # .txt')
xg1

array([3.9515536 , 1.0544503 , 2.9680653 , ..., 0.91247594, 0.6474498 ,
       4.0173216 ], dtype=float32)

In [37]:
del lasso, val1, study_lgb1
gc.collect()

100

# Steam meter

### Transform data

In [38]:
Xs_train_scaled, Xs_val_scaled, Xs_test_scaled, ys_train, ys_val = transform_data(train_df[2], test_df[2])
Xs_train_scaled.head()



Train:  (1722036, 10) (1722036, 1)
Validation:  (574013, 10) (574013, 1)
Train:  (5676480, 10)


Unnamed: 0,dew_temperature,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,is_weekend,country,is_holiday
1227863,1.149265,0.198853,0.693481,-0.370458,0.253185,0.14002,-1.520262,-0.631491,-0.181765,-0.177586
793790,1.05116,-0.024691,-1.856454,-0.815237,0.118055,-0.531292,-0.072679,-0.631491,-0.181765,-0.177586
2268147,0.649821,-1.410663,-0.188679,-0.675412,-0.152204,1.641147,-1.375504,-0.631491,-0.181765,-0.177586
970560,-0.188533,0.869485,1.900406,0.542175,-0.354898,-0.270226,1.66442,-0.631491,-0.181765,-0.177586
1921836,1.05116,-1.410663,-1.856454,0.355498,0.118055,1.174958,-1.66502,-0.631491,-0.181765,-0.177586


### Lasso Regression

In [39]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xs_train_scaled, ys_train)

val2 = lasso.predict(Xs_val_scaled)
print(lasso.score(Xs_val_scaled, ys_val))
print(np.sqrt(mean_squared_log_error(ys_val, val2)))

lasso2 = lasso.predict(Xs_test_scaled)
lasso2[lasso2 < 0] = 0
lasso2

0.3120525923046893
0.27449811897348836


array([6.13215746, 6.20630145, 6.23008381, ..., 5.370356  , 5.34441748,
       5.68163808])

### LightGBM

In [40]:
study_lgb2 = joblib.load('../objects/steam/study_lgb2.pkl')
params_lgb2 = study_lgb2.best_trial.params
params_lgb2

{'learning_rate': 0.04854268157540762,
 'lambda_l1': 0.0028266466239996284,
 'lambda_l2': 0.00010211303918218761,
 'max_depth': 56,
 'num_leaves': 409,
 'min_data_in_leaf': 8,
 'bagging_fraction': 0.5012612103606688,
 'feature_fraction': 0.8537936823847498}

In [41]:
params_lgb2['num_iterations'] = 10000
params_lgb2['early_stopping_round'] = 20
params_lgb2['metric'] = 'rmse'
params_lgb2['num_threads'] = -1
params_lgb2['seed'] = 42
params_lgb2

{'learning_rate': 0.04854268157540762,
 'lambda_l1': 0.0028266466239996284,
 'lambda_l2': 0.00010211303918218761,
 'max_depth': 56,
 'num_leaves': 409,
 'min_data_in_leaf': 8,
 'bagging_fraction': 0.5012612103606688,
 'feature_fraction': 0.8537936823847498,
 'num_iterations': 10000,
 'early_stopping_round': 20,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [42]:
lgbm2 = predict_lgb(Xs_train_scaled, ys_train, Xs_val_scaled, ys_val, Xs_test_scaled, params_lgb2, '../objects/submission5/lgb2.pkl') # .txt')
lgbm2



array([4.65322015, 5.14898015, 8.39147446, ..., 6.43008592, 6.2763424 ,
       3.26740959])

### XGBoost

In [43]:
study_xgb2 = joblib.load('../objects/steam/study_xgb2.pkl')
params_xgb2 = study_xgb2.best_trial.params
params_xgb2['eval_metric'] = 'rmse'
params_xgb2['seed'] = 42
params_xgb2

{'grow_policy': 'depthwise',
 'learning_rate': 0.045664576584020004,
 'alpha': 1.9071087639650297,
 'lambda': 0.00021496551872384197,
 'gamma': 0.15536767802205387,
 'max_depth': 59,
 'max_leaves': 1626,
 'subsample': 0.926581683911481,
 'colsample_bytree': 0.8620824662132568,
 'eval_metric': 'rmse',
 'seed': 42}

In [44]:
xg2 = predict_xgb(Xs_train_scaled, ys_train, Xs_val_scaled, ys_val, Xs_test_scaled, params_xgb2, '../objects/submission5/xgb2.pkl') # .txt')
xg2

array([3.588488 , 4.9324713, 8.480513 , ..., 6.526628 , 6.370153 ,
       3.5039847], dtype=float32)

In [45]:
del lasso, val2, study_lgb2
gc.collect()

100

# Hot water meter

### Transform data

In [46]:
Xh_train_scaled, Xh_val_scaled, Xh_test_scaled, yh_train, yh_val = transform_data(train_df[3], test_df[3])
Xh_train_scaled.head()



Train:  (689781, 10) (689781, 1)
Validation:  (229927, 10) (229927, 1)
Train:  (2540400, 10)


Unnamed: 0,dew_temperature,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,is_weekend,country,is_holiday
617696,1.246184,0.467919,0.356687,-0.296929,-0.002246,0.601279,0.219848,-0.630965,-0.044778,-0.182612
559728,1.653144,0.707877,-1.973171,-0.896851,2.317122,0.348563,0.654835,-0.630965,-0.044778,-0.182612
900844,-0.488753,0.227961,2.023683,-1.007877,-0.499254,1.648246,0.219848,1.584873,-0.044778,5.476081
230300,0.089559,0.467919,-0.291586,-0.583896,-0.941038,-0.896966,0.799831,-0.630965,-2.259887,-0.182612
268936,-0.842166,-0.011996,1.456112,-0.901935,0.605207,-0.770608,0.654835,-0.630965,-0.044778,-0.182612


### Lasso Regression

In [47]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xh_train_scaled, yh_train)

val3 = lasso.predict(Xh_val_scaled)
print(lasso.score(Xh_val_scaled, yh_val))
print(np.sqrt(mean_squared_log_error(yh_val, val3)))

lasso3 = lasso.predict(Xh_test_scaled)
lasso3[lasso3 < 0] = 0
lasso3

0.1817706489881239
0.40821794632661085


array([4.22411032, 4.52750956, 4.35278515, ..., 4.7686193 , 5.53930815,
       4.72149436])

### LightGBM

In [48]:
study_lgb3 = joblib.load('../objects/hotwater/study_lgb3.pkl')
params_lgb3 = study_lgb3.best_trial.params
params_lgb3

{'learning_rate': 0.023942434668217872,
 'lambda_l1': 0.23828824891615835,
 'lambda_l2': 0.00028375734074312625,
 'max_depth': 58,
 'num_leaves': 1581,
 'min_data_in_leaf': 42,
 'bagging_fraction': 0.500825513633077,
 'feature_fraction': 0.9607308095583501}

In [49]:
params_lgb3['num_iterations'] = 10000
params_lgb3['early_stopping_round'] = 20
params_lgb3['metric'] = 'rmse'
params_lgb3['num_threads'] = -1
params_lgb3['seed'] = 42

In [50]:
lgbm3 = predict_lgb(Xh_train_scaled, yh_train, Xh_val_scaled, yh_val, Xh_test_scaled, params_lgb3, '../objects/submission5/lgb3.pkl') # .txt')
lgbm3



array([2.30699633, 4.51591721, 4.87263313, ..., 8.39162638, 5.55925233,
       8.24464821])

### XGBoost

In [51]:
study_xgb3 = joblib.load('../objects/hotwater/study_xgb3.pkl')
params_xgb3 = study_xgb3.best_trial.params
params_xgb3['eval_metric'] = 'rmse'
params_xgb3['seed'] = 42
params_xgb3

{'grow_policy': 'lossguide',
 'learning_rate': 0.009806612868641755,
 'alpha': 0.009923410545051567,
 'lambda': 2.4314911765779557,
 'gamma': 0.00182351994095809,
 'max_depth': 51,
 'max_leaves': 1225,
 'subsample': 0.8446420358715253,
 'colsample_bytree': 0.8798084029338699,
 'eval_metric': 'rmse',
 'seed': 42}

In [52]:
xg3 = predict_xgb(Xh_train_scaled, yh_train, Xh_val_scaled, yh_val, Xh_test_scaled, params_xgb3, '../objects/submission5/xgb3.pkl') # .txt')
xg3

array([2.5913582, 4.331254 , 4.658316 , ..., 8.262454 , 5.6579776,
       8.212147 ], dtype=float32)

In [53]:
del lasso, val3, study_lgb3
gc.collect()

100

# Combine predictions

In [54]:
print('Test observations: ', [df.shape[0] for df in test_df])
print('Lasso predictions: ', list(map(len, [lasso0, lasso1, lasso2, lasso3])))
print('LightGBM predictions: ', list(map(len, [lgbm0, lgbm1, lgbm2, lgbm3])))
print('XGBoost predictions: ', list(map(len, [xg0, xg1, xg2, xg3])))

Test observations:  [24755760, 8724960, 5676480, 2540400]
Lasso predictions:  [24755760, 8724960, 5676480, 2540400]
LightGBM predictions:  [24755760, 8724960, 5676480, 2540400]
XGBoost predictions:  [24755760, 8724960, 5676480, 2540400]


In [56]:
# Electricity meter
pred0 = test_df[0][['dayofyear']].copy()
pred0['lasso'] = lasso0
pred0['lgb'] = lgbm0
pred0['xgb'] = xg0
pred0.drop('dayofyear', axis=1, inplace=True)
pred0

Unnamed: 0,lasso,lgb,xgb
0,3.707158,2.992823,2.786996
1,3.646317,1.470113,1.642096
2,3.579668,0.615314,0.693194
3,3.786011,3.572633,4.880510
4,4.285159,4.648524,3.513339
...,...,...,...
41697595,2.794025,1.770332,1.623312
41697596,3.791327,1.662047,1.750197
41697597,3.282316,2.183991,2.224966
41697598,3.816388,5.138987,5.121836


In [57]:
# Chilled water meter
pred1 = test_df[1][['dayofyear']].copy()
pred1['lasso'] = lasso1
pred1['lgb'] = lgbm1
pred1['xgb'] = xg1
pred1.drop('dayofyear', axis=1, inplace=True)
pred1

Unnamed: 0,lasso,lgb,xgb
8,5.034307,4.111553,3.951554
11,4.105889,1.352623,1.054450
16,5.008104,2.064030,2.968065
18,5.044468,2.847326,3.129877
20,4.116061,2.642188,3.276200
...,...,...,...
41697538,3.934950,3.563849,3.555144
41697541,4.176718,3.284388,3.397947
41697543,4.462241,1.122798,0.912476
41697545,4.304592,0.683951,0.647450


In [58]:
# Steam meter
pred2 = test_df[2][['dayofyear']].copy()
pred2['lasso'] = lasso2
pred2['lgb'] = lgbm2
pred2['xgb'] = xg2
pred2.drop('dayofyear', axis=1, inplace=True)
pred2

Unnamed: 0,lasso,lgb,xgb
16340505,6.132157,4.653220,3.588488
16340509,6.206301,5.148980,4.932471
16340515,6.230084,8.391474,8.480513
16340517,6.393150,5.166964,5.494358
16340521,6.245628,5.683478,5.765615
...,...,...,...
41697581,5.029608,6.074075,5.930480
41697584,5.335002,7.314515,7.026133
41697586,5.370356,6.430086,6.526628
41697588,5.344417,6.276342,6.370153


In [59]:
# Hot water meter
pred3 = test_df[3][['dayofyear']].copy()
pred3['lasso'] = lasso3
pred3['lgb'] = lgbm3
pred3['xgb'] = xg3
pred3.drop('dayofyear', axis=1, inplace=True)
pred3

Unnamed: 0,lasso,lgb,xgb
2260082,4.224110,2.306996,2.591358
2260086,4.527510,4.515917,4.331254
2260090,4.352785,4.872633,4.658316
2260092,4.696159,3.382443,3.129199
2260094,4.884155,5.566306,5.219331
...,...,...,...
41696866,4.815744,8.384931,8.318701
41697101,5.586433,5.460835,5.600766
41697116,4.768619,8.391626,8.262454
41697351,5.539308,5.559252,5.657978


In [60]:
# Log-transformed predictions
pred_transformed = pd.concat([pred0, pred1, pred2, pred3]).sort_index()
pred_transformed

Unnamed: 0,lasso,lgb,xgb
0,3.707158,2.992823,2.786996
1,3.646317,1.470113,1.642096
2,3.579668,0.615314,0.693194
3,3.786011,3.572633,4.880510
4,4.285159,4.648524,3.513339
...,...,...,...
41697595,2.794025,1.770332,1.623312
41697596,3.791327,1.662047,1.750197
41697597,3.282316,2.183991,2.224966
41697598,3.816388,5.138987,5.121836


In [61]:
del pred0, pred1, pred2, pred3
gc.collect()

104

# Transform predictions back to normal scale (and units for site 0)

In [62]:
# Transform back to normal scale
pred = pred_transformed.copy()
pred['lasso'] = np.expm1(pred.lasso)
pred['lgb'] = np.expm1(pred.lgb)
pred['xgb'] = np.expm1(pred.xgb)
pred.reset_index(inplace=True)
pred.columns = ['row_id', 'lasso', 'lgb', 'xgb']
pred

Unnamed: 0,row_id,lasso,lgb,xgb
0,0,39.737850,18.941893,15.232187
1,1,37.333240,3.349728,4.165987
2,2,34.861619,0.850238,1.000093
3,3,43.080201,34.610227,130.697861
4,4,71.614087,103.430784,32.560146
...,...,...,...,...
41697595,41697595,15.346683,4.872803,4.069853
41697596,41697596,43.315155,4.270085,4.755738
41697597,41697597,25.637402,7.881681,8.253164
41697598,41697598,44.439789,169.542920,166.642822


In [63]:
pred = pd.merge(mb[['row_id', 'site_id', 'meter']], pred, on='row_id', how='left')
pred

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,39.737850,18.941893,15.232187
1,1,0,0,37.333240,3.349728,4.165987
2,2,0,0,34.861619,0.850238,1.000093
3,3,0,0,43.080201,34.610227,130.697861
4,4,0,0,71.614087,103.430784,32.560146
...,...,...,...,...,...,...
41697595,41697595,15,0,15.346683,4.872803,4.069853
41697596,41697596,15,0,43.315155,4.270085,4.755738
41697597,41697597,15,0,25.637402,7.881681,8.253164
41697598,41697598,15,0,44.439789,169.542920,166.642822


In [64]:
# Site 0 meter 0 predictions in kWh
pred[(pred.site_id == 0) & (pred.meter == 0)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,39.73785,18.941893,15.232187
1,1,0,0,37.33324,3.349728,4.165987
2,2,0,0,34.861619,0.850238,1.000093
3,3,0,0,43.080201,34.610227,130.697861
4,4,0,0,71.614087,103.430784,32.560146


In [65]:
# Convert from kWh back to kBTU
pred = udf.convert_readings(pred, site_num=0, meter_type=0, conversion='kwh_to_kbtu', reading_col='lasso')
pred = udf.convert_readings(pred, site_num=0, meter_type=0, conversion='kwh_to_kbtu', reading_col='lgb')
pred = udf.convert_readings(pred, site_num=0, meter_type=0, conversion='kwh_to_kbtu', reading_col='xgb')
pred[(pred.site_id == 0) & (pred.meter == 0)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,135.577597,64.625949,51.969173
1,1,0,0,127.37355,11.428601,14.213514
2,2,0,0,118.940871,2.900843,3.412118
3,3,0,0,146.981029,118.083171,445.914948
4,4,0,0,244.332941,352.885147,111.088707


In [66]:
# Site 0 meter 1 predictions in tons
pred[(pred.site_id == 0) & (pred.meter == 1)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
8,8,0,1,152.593178,60.041436,51.016113
11,11,0,1,59.696689,2.867558,1.870397
16,16,0,1,148.620768,6.877654,18.454245
18,18,0,1,154.161727,16.241619,21.871162
20,20,0,1,60.31725,13.043896,25.474972


In [67]:
# Convert from kWh back to kBTU
pred = udf.convert_readings(pred, site_num=0, meter_type=1, conversion='ton_to_kbtu', reading_col='lasso')
pred = udf.convert_readings(pred, site_num=0, meter_type=1, conversion='ton_to_kbtu', reading_col='lgb')
pred = udf.convert_readings(pred, site_num=0, meter_type=1, conversion='ton_to_kbtu', reading_col='xgb')
pred[(pred.site_id == 0) & (pred.meter == 1)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
8,8,0,1,1831.11814,720.497229,612.193359
11,11,0,1,716.360264,34.410699,22.444761
16,16,0,1,1783.449211,82.531848,221.450928
18,18,0,1,1849.940725,194.899433,262.453949
20,20,0,1,723.807,156.526748,305.699646


In [68]:
del mb
gc.collect()

20

# Save Predictions

In [69]:
pred.describe()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
count,41697600.0,41697600.0,41697600.0,41697600.0,41697600.0,41697600.0
mean,20848800.0,8.086134,0.6642857,320.9549,356.2353,347.5202
std,12037060.0,5.134712,0.9278067,1637.012,905.9069,886.3033
min,0.0,0.0,0.0,2.544252,0.0,0.0
25%,10424400.0,3.0,0.0,40.42393,27.70319,26.76995
50%,20848800.0,9.0,0.0,77.27886,90.88669,88.21765
75%,31273200.0,13.0,1.0,180.7503,281.6429,268.6156
max,41697600.0,15.0,3.0,160618.2,53755.42,40266.55


In [70]:
path = '../submissions/sub5/'
pred.to_pickle(f'{path}preds.pkl')
pred = pd.read_pickle(f'{path}preds.pkl')
pred

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,135.577597,64.625949,51.969173
1,1,0,0,127.373550,11.428601,14.213514
2,2,0,0,118.940871,2.900843,3.412118
3,3,0,0,146.981029,118.083171,445.914948
4,4,0,0,244.332941,352.885147,111.088707
...,...,...,...,...,...,...
41697595,41697595,15,0,15.346683,4.872803,4.069853
41697596,41697596,15,0,43.315155,4.270085,4.755738
41697597,41697597,15,0,25.637402,7.881681,8.253164
41697598,41697598,15,0,44.439789,169.542920,166.642822


In [71]:
submission

Unnamed: 0,row_id,meter_reading
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
41697595,41697595,0
41697596,41697596,0
41697597,41697597,0
41697598,41697598,0


In [72]:
# Lasso predictions
lasso_pred = submission.copy()
lasso_pred['meter_reading'] = pred['lasso']
lasso_pred

Unnamed: 0,row_id,meter_reading
0,0,135.577597
1,1,127.373550
2,2,118.940871
3,3,146.981029
4,4,244.332941
...,...,...
41697595,41697595,15.346683
41697596,41697596,43.315155
41697597,41697597,25.637402
41697598,41697598,44.439789


In [73]:
# Lasso predictions
lgb_pred = submission.copy()
lgb_pred['meter_reading'] = pred['lgb']
lgb_pred

Unnamed: 0,row_id,meter_reading
0,0,64.625949
1,1,11.428601
2,2,2.900843
3,3,118.083171
4,4,352.885147
...,...,...
41697595,41697595,4.872803
41697596,41697596,4.270085
41697597,41697597,7.881681
41697598,41697598,169.542920


In [74]:
# Lasso predictions
xgb_pred = submission.copy()
xgb_pred['meter_reading'] = pred['xgb']
xgb_pred

Unnamed: 0,row_id,meter_reading
0,0,51.969173
1,1,14.213514
2,2,3.412118
3,3,445.914948
4,4,111.088707
...,...,...
41697595,41697595,4.069853
41697596,41697596,4.755738
41697597,41697597,8.253164
41697598,41697598,166.642822


In [75]:
lasso_pred.to_csv(f'{path}l1r.csv', index=False)
lasso_pred = pd.read_csv(f'{path}l1r.csv')
lasso_pred

Unnamed: 0,row_id,meter_reading
0,0,135.577597
1,1,127.373550
2,2,118.940871
3,3,146.981029
4,4,244.332941
...,...,...
41697595,41697595,15.346683
41697596,41697596,43.315155
41697597,41697597,25.637402
41697598,41697598,44.439789


In [76]:
lgb_pred.to_csv(f'{path}lgb.csv', index=False)
lgb_pred = pd.read_csv(f'{path}lgb.csv')
lgb_pred

Unnamed: 0,row_id,meter_reading
0,0,64.625949
1,1,11.428601
2,2,2.900843
3,3,118.083171
4,4,352.885147
...,...,...
41697595,41697595,4.872803
41697596,41697596,4.270085
41697597,41697597,7.881681
41697598,41697598,169.542920


In [77]:
xgb_pred.to_csv(f'{path}xgb.csv', index=False)
xgb_pred = pd.read_csv(f'{path}xgb.csv')
xgb_pred

Unnamed: 0,row_id,meter_reading
0,0,51.969173
1,1,14.213514
2,2,3.412118
3,3,445.914950
4,4,111.088710
...,...,...
41697595,41697595,4.069853
41697596,41697596,4.755738
41697597,41697597,8.253164
41697598,41697598,166.642820


### Test RMSLE:
##### Lasso regression:  (Public),  (Private)
##### LightGBM:  (Public),  (Private)
##### XGBoost:  (Public),  (Private)

# Combine LightGBM and XGBoost predictions

In [8]:
xl = pd.merge(lgb_pred, xgb_pred, on='row_id', how='left')
xl.columns = ['row_id', 'lgb', 'xgb']
xl.head()

Unnamed: 0,row_id,lgb,xgb
0,0,54.809699,51.882465
1,1,17.976733,16.754257
2,2,1.477563,3.712897
3,3,71.624491,373.01392
4,4,142.16725,117.07958


In [11]:
xl['xl55'] = xl[['lgb', 'xgb']].mean(axis=1)
xl['xl46'] = (xl.lgb * 0.4) + (xl.xgb * 0.6)
xl['xl2575'] = (xl.lgb * 0.25) + (xl.xgb * 0.75)
xl.head()

Unnamed: 0,row_id,lgb,xgb,xl55
0,0,54.809699,51.882465,53.346082
1,1,17.976733,16.754257,17.365495
2,2,1.477563,3.712897,2.59523
3,3,71.624491,373.01392,222.319205
4,4,142.16725,117.07958,129.623415


In [16]:
xl1 = xl[['row_id', 'xl55']]
xl1.columns = ['row_id', 'meter_reading']

xl1.to_csv(f'{path}xl1.csv', index=False)
xl1 = pd.read_csv(f'{path}xl1.csv')
xl1

Unnamed: 0,row_id,meter_reading
0,0,53.346082
1,1,17.365495
2,2,2.595230
3,3,222.319205
4,4,129.623415
...,...,...
41697595,41697595,4.897745
41697596,41697596,5.232897
41697597,41697597,7.730367
41697598,41697598,172.912913


In [17]:
xl2 = xl[['row_id', 'xl46']]
xl2.columns = ['row_id', 'meter_reading']

xl2.to_csv(f'{path}xl2.csv', index=False)
xl2 = pd.read_csv(f'{path}xl2.csv')
xl2

Unnamed: 0,row_id,meter_reading
0,0,53.053358
1,1,17.243247
2,2,2.818763
3,3,252.458148
4,4,127.114648
...,...,...
41697595,41697595,4.863569
41697596,41697596,5.207736
41697597,41697597,7.822719
41697598,41697598,172.888550


In [18]:
xl3 = xl[['row_id', 'xl2575']]
xl3.columns = ['row_id', 'meter_reading']

xl3.to_csv(f'{path}xl3.csv', index=False)
xl3 = pd.read_csv(f'{path}xl3.csv')
xl3

Unnamed: 0,row_id,meter_reading
0,0,52.614273
1,1,17.059876
2,2,3.154063
3,3,297.666563
4,4,123.351498
...,...,...
41697595,41697595,4.812304
41697596,41697596,5.169994
41697597,41697597,7.961248
41697598,41697598,172.852006
