# 14 feats

# Imports

In [1]:
%matplotlib inline

import src.utils as udf

import gc
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import Lasso

import lightgbm as lgb
import xgboost as xgb

# Data

In [3]:
path = '../data/'

In [4]:
train = pd.read_pickle(f'{path}from_mod/train.pkl')
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18264895 entries, 0 to 18264894
Data columns (total 18 columns):
building_id           uint16
meter                 uint8
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 1.1+ GB


In [5]:
train = udf.reduce_mem_usage(train)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18264895 entries, 0 to 18264894
Data columns (total 18 columns):
building_id           uint16
meter                 uint8
meter_reading         float32
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(7), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 1.1+ GB


In [6]:
test = pd.read_pickle(f'{path}from_sub/test.pkl')
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 17 columns):
building_id           uint16
meter                 uint8
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             int64
hour                  int64
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            int64
country               object
is_holiday            int64
dtypes: float32(6), int64(4), object(2), uint16(2), uint32(1), uint8(2)
memory usage: 3.5+ GB


In [7]:
test = udf.reduce_mem_usage(test)
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 17 columns):
building_id           uint16
meter                 uint8
dew_temperature       float32
sea_level_pressure    float32
wind_speed            float32
primary_use           object
square_feet           uint32
year_built            uint16
missing_year          uint8
dayofyear             uint16
hour                  uint8
wind_direction_x      float32
wind_direction_y      float32
rel_humidity          float32
is_weekend            uint8
country               object
is_holiday            uint8
dtypes: float32(6), object(2), uint16(3), uint32(1), uint8(5)
memory usage: 2.4+ GB


In [8]:
submission = pd.read_csv(f'{path}raw/test/sample_submission.csv')
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 2 columns):
row_id           int64
meter_reading    int64
dtypes: int64(2)
memory usage: 636.3 MB


In [9]:
submission = udf.reduce_mem_usage(submission)
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41697600 entries, 0 to 41697599
Data columns (total 2 columns):
row_id           uint32
meter_reading    uint8
dtypes: uint32(1), uint8(1)
memory usage: 198.8 MB


In [10]:
mb = pd.read_pickle(f'{path}from_sub/mb.pkl')
mb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 9 columns):
row_id          uint32
building_id     uint16
meter           uint8
timestamp       datetime64[ns]
site_id         uint8
primary_use     object
square_feet     uint32
year_built      uint16
missing_year    uint8
dtypes: datetime64[ns](1), object(1), uint16(2), uint32(2), uint8(3)
memory usage: 1.5+ GB


In [11]:
mb = udf.reduce_mem_usage(mb)
mb.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41697600 entries, 0 to 41697599
Data columns (total 9 columns):
row_id          uint32
building_id     uint16
meter           uint8
timestamp       datetime64[ns]
site_id         uint8
primary_use     object
square_feet     uint32
year_built      uint16
missing_year    uint8
dtypes: datetime64[ns](1), object(1), uint16(2), uint32(2), uint8(3)
memory usage: 1.5+ GB


In [12]:
del path
gc.collect()

22

# Features

In [13]:
train.head()

Unnamed: 0,building_id,meter,meter_reading,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,46,0,15.604556,19.4,1019.400024,0.0,Retail,9045,2016,0,1,0,0.0,0.0,100.0,0,US,1
1,74,0,12.603682,19.4,1019.400024,0.0,Parking,387638,1997,0,1,0,0.0,0.0,100.0,0,US,1
2,93,0,15.364478,19.4,1019.400024,0.0,Office,33370,1982,0,1,0,0.0,0.0,100.0,0,US,1
3,105,0,23.3036,2.4,1020.900024,3.1,Education,50623,1960,1,1,0,-0.5,-0.866025,90.549408,0,UK,1
4,106,0,0.3746,2.4,1020.900024,3.1,Education,5374,1960,1,1,0,-0.5,-0.866025,90.549408,0,UK,1


In [14]:
test.head()

Unnamed: 0,building_id,meter,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,missing_year,dayofyear,hour,wind_direction_x,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,0,0,12.8,1022.099976,2.1,Education,7432,2008,0,1,0,-0.642788,0.766044,83.409012,1,US,1
1,1,0,12.8,1022.099976,2.1,Education,2720,2004,0,1,0,-0.642788,0.766044,83.409012,1,US,1
2,2,0,12.8,1022.099976,2.1,Education,5376,1991,0,1,0,-0.642788,0.766044,83.409012,1,US,1
3,3,0,12.8,1022.099976,2.1,Education,23685,2002,0,1,0,-0.642788,0.766044,83.409012,1,US,1
4,4,0,12.8,1022.099976,2.1,Education,116607,1975,0,1,0,-0.642788,0.766044,83.409012,1,US,1


In [15]:
train.drop(['missing_year', 'wind_direction_x'], axis=1, inplace=True)
feats = train.drop('meter_reading', axis=1).columns
test = test[feats]
test.head()

Unnamed: 0,building_id,meter,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
0,0,0,12.8,1022.099976,2.1,Education,7432,2008,1,0,0.766044,83.409012,1,US,1
1,1,0,12.8,1022.099976,2.1,Education,2720,2004,1,0,0.766044,83.409012,1,US,1
2,2,0,12.8,1022.099976,2.1,Education,5376,1991,1,0,0.766044,83.409012,1,US,1
3,3,0,12.8,1022.099976,2.1,Education,23685,2002,1,0,0.766044,83.409012,1,US,1
4,4,0,12.8,1022.099976,2.1,Education,116607,1975,1,0,0.766044,83.409012,1,US,1


In [16]:
del feats
gc.collect()

66

# Split data

In [17]:
train_df = []
test_df = []

for m in range(4):
    df_train = train[train.meter == m].drop('meter', axis=1)
    df_test = test[test.meter == m].drop('meter', axis=1)
    train_df.append(df_train)
    test_df.append(df_test)
    print(f'Meter {m}:', df_train.shape, df_test.shape)

Meter 0: (11530268, 15) (24755760, 14)
Meter 1: (3518870, 15) (8724960, 14)
Meter 2: (2296049, 15) (5676480, 14)
Meter 3: (919708, 15) (2540400, 14)


In [18]:
del m, df_train, df_test
gc.collect()

22

# Functions

In [19]:
def transform_data(df_train, df_test):
    X_train = df_train.drop('meter_reading', axis=1)
    y_train = df_train[['meter_reading']]
    y_train = np.log1p(y_train)
    
    X_train, X_val, X_test, rare_dict = udf.rare_encoder(X_train, df_test, ['primary_use'])
    X_train, X_val, X_test, mean_dict = udf.mean_encoder(X_train, y_train, X_test, ['primary_use', 'country'])
    X_train_scaled, X_val, X_test_scaled = udf.scale_feats(X_train, X_test)
    
    X_train_scaled, X_val_scaled, y_train, y_val = train_test_split(X_train_scaled, y_train, test_size=0.25, random_state=42)
    
    print('Train: ', X_train_scaled.shape, y_train.shape)
    print('Validation: ', X_val_scaled.shape, y_val.shape)
    print('Test: ', X_test_scaled.shape)
    
    return X_train_scaled, X_val_scaled, X_test_scaled, y_train, y_val

In [20]:
def predict_lgb(X_train, y_train, X_val, y_val, X_test, params_dict, save_path):
    dtrain = lgb.Dataset(X_train, label=y_train)
    dval = lgb.Dataset(X_val, label=y_val)
    lgbm = lgb.train(params_dict, dtrain, valid_sets=[dtrain, dval], valid_names=['train', 'val'], verbose_eval=False)
    lgbm.save_model(save_path)
    
    pred = lgbm.predict(X_test)
    pred[pred < 0] = 0
    return pred

In [21]:
def predict_xgb(X_train, y_train, X_val, y_val, X_test, params_dict, save_path):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    dtest = xgb.DMatrix(X_test)
    xg = xgb.train(params_dict, dtrain, evals=[(dtrain, 'train'), (dval, 'val')], num_boost_round=1000, early_stopping_rounds=10, verbose_eval=False)
    xg.save_model(save_path)
    
    pred = xg.predict(dtest)
    pred[pred < 0] = 0
    return pred

In [22]:
gc.collect()

66

# Electricity meter

### Transform data

In [23]:
Xe_train_scaled, Xe_val_scaled, Xe_test_scaled, ye_train, ye_val = transform_data(train_df[0], test_df[0])
Xe_train_scaled.head()



Train:  (8647701, 14) (8647701, 1)
Validation:  (2882567, 14) (2882567, 1)
Train:  (24755760, 14)


Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
3820751,1.660735,-0.714804,0.377953,0.663585,-0.364128,0.032242,-2.496025,-0.553622,0.794318,-0.425474,-0.929986,-0.633757,0.18362,-0.180316
5192151,-1.138221,0.185423,-0.702006,-0.197577,1.047964,0.306161,0.58615,-0.144126,-0.650406,1.588679,-1.827506,-0.633757,0.18362,-0.180316
10784538,-0.288186,0.519215,0.782937,-0.886506,1.047964,-0.372759,-1.982329,1.484338,0.505373,-1.401239,0.421137,-0.633757,0.18362,-0.180316
3949783,1.724795,-0.097795,-0.715502,-1.532377,-0.364128,-0.651102,-1.608732,-0.51553,1.661153,0.09372,0.921203,-0.633757,0.18362,-0.180316
2900643,-1.071696,-0.209059,-0.971996,-0.197577,0.25291,-0.527437,-0.487941,-0.839318,0.505373,-0.941568,-1.764215,1.577891,0.18362,-0.180316


### Lasso Regression

In [24]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xe_train_scaled, ye_train)

val0 = lasso.predict(Xe_val_scaled)
print(lasso.score(Xe_val_scaled, ye_val))
print(np.sqrt(mean_squared_log_error(ye_val, val0)))

lasso0 = lasso.predict(Xe_test_scaled)
lasso0[lasso0 < 0] = 0
lasso0

0.4363158212083116
0.28294328293908033


array([3.54987727, 3.48938197, 3.42275486, ..., 3.40873006, 3.94546082,
       4.57793329])

### LightGBM

In [25]:
study_lgb0 = joblib.load('../objects/electricity/study_lgb.pkl')
params_lgb0 = study_lgb0.best_trial.params
params_lgb0

{'learning_rate': 0.07916336777546343,
 'lambda_l1': 0.008557356431137609,
 'lambda_l2': 0.0006037228650908533,
 'max_depth': 51,
 'num_leaves': 923,
 'min_child_samples': 7,
 'subsample': 0.7399597912518232,
 'feature_fraction': 0.7310599981838332}

In [26]:
params_lgb0['num_iterations'] = 10000
params_lgb0['early_stopping_round'] = 10
params_lgb0['metric'] = 'rmse'
params_lgb0['num_threads'] = -1
params_lgb0['seed'] = 42
params_lgb0

{'learning_rate': 0.07916336777546343,
 'lambda_l1': 0.008557356431137609,
 'lambda_l2': 0.0006037228650908533,
 'max_depth': 51,
 'num_leaves': 923,
 'min_child_samples': 7,
 'subsample': 0.7399597912518232,
 'feature_fraction': 0.7310599981838332,
 'num_iterations': 10000,
 'early_stopping_round': 10,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [27]:
lgbm0 = predict_lgb(Xe_train_scaled, ye_train, Xe_val_scaled, ye_val, Xe_test_scaled, params_lgb0, '../objects/submission3/lgb0.pkl') # .txt')
lgbm0



array([2.93922587, 1.90868692, 0.51269491, ..., 2.16748426, 5.14519045,
       1.60435027])

### XGBoost

In [28]:
study_xgb0 = joblib.load('../objects/electricity/study_xgb.pkl')
params_xgb0 = study_xgb0.best_trial.params
params_xgb0['eval_metric'] = 'rmse'
params_xgb0['seed'] = 42
params_xgb0

{'grow_policy': 'lossguide',
 'learning_rate': 0.014754037383886122,
 'alpha': 0.005261717289274988,
 'lambda': 0.10981652452119427,
 'gamma': 0.00015007710756172543,
 'max_depth': 23,
 'max_leaves': 1598,
 'subsample': 0.8406224713599797,
 'colsample_bytree': 0.9175035245877109,
 'eval_metric': 'rmse',
 'seed': 42}

In [29]:
xg0 = predict_xgb(Xe_train_scaled, ye_train, Xe_val_scaled, ye_val, Xe_test_scaled, params_xgb0, '../objects/submission3/xgb0.pkl') # .txt')
xg0



array([2.7443616, 1.747664 , 0.6911469, ..., 2.2844005, 5.142176 ,
       1.5813034], dtype=float32)

In [30]:
del lasso, val0, study_lgb0
gc.collect()

100

# Chilled water meter

### Transform data

In [31]:
Xc_train_scaled, Xc_val_scaled, Xc_test_scaled, yc_train, yc_val = transform_data(train_df[1], test_df[1])
Xc_train_scaled.head()



Train:  (2639152, 14) (2639152, 1)
Validation:  (879718, 14) (879718, 1)
Train:  (8724960, 14)


Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
3054842,0.29511,-0.652177,1.457773,-0.716347,0.758045,0.383649,-0.257998,1.245823,-0.088512,1.409013,-0.682122,-0.625362,-0.173847,-0.170259
1722336,0.817001,1.320539,-1.035706,0.043689,-1.578794,-0.856238,0.075476,-0.00872,-0.379102,-1.189395,0.178401,-0.625362,-0.173847,-0.170259
511899,-1.397081,-0.760676,-1.549518,1.278746,0.611098,0.434406,0.408949,-1.21348,-1.250872,-1.526436,-0.639142,-0.625362,-0.173847,-0.170259
1159847,-1.891861,1.261357,0.475493,0.2812,0.758045,-0.292965,1.687265,-0.526468,1.073849,1.508082,0.701131,-0.625362,-0.173847,-0.170259
688211,0.277036,1.152858,-0.92993,0.518711,-1.578794,-0.386131,0.075476,-0.994432,-1.541463,0.761156,1.517252,-0.625362,-0.173847,-0.170259


### Lasso Regression

In [32]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xc_train_scaled, yc_train)

val1 = lasso.predict(Xc_val_scaled)
print(lasso.score(Xc_val_scaled, yc_val))
print(np.sqrt(mean_squared_log_error(yc_val, val1)))

lasso1 = lasso.predict(Xc_test_scaled)
lasso1[lasso1 < 0] = 0
lasso1

0.3467343407312131
0.3463306230474696


array([4.24764178, 3.37926614, 4.24192387, ..., 4.60241533, 4.45527173,
       4.8197265 ])

### LightGBM

In [33]:
study_lgb1 = joblib.load('../objects/chilledwater/study_lgb1.pkl')
params_lgb1 = study_lgb1.best_trial.params
params_lgb1

{'learning_rate': 0.020424864262841822,
 'lambda_l1': 0.27986455703148144,
 'lambda_l2': 7.08804479307531,
 'max_depth': 32,
 'num_leaves': 1293,
 'min_data_in_leaf': 23,
 'bagging_fraction': 0.9143657877950819,
 'feature_fraction': 0.701556769695671}

In [34]:
params_lgb1['num_iterations'] = 10000
params_lgb1['early_stopping_round'] = 20
params_lgb1['metric'] = 'rmse'
params_lgb1['num_threads'] = -1
params_lgb1['seed'] = 42
params_lgb1

{'learning_rate': 0.020424864262841822,
 'lambda_l1': 0.27986455703148144,
 'lambda_l2': 7.08804479307531,
 'max_depth': 32,
 'num_leaves': 1293,
 'min_data_in_leaf': 23,
 'bagging_fraction': 0.9143657877950819,
 'feature_fraction': 0.701556769695671,
 'num_iterations': 10000,
 'early_stopping_round': 20,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [35]:
lgbm1 = predict_lgb(Xc_train_scaled, yc_train, Xc_val_scaled, yc_val, Xc_test_scaled, params_lgb1, '../objects/submission3/lgb1.pkl') # .txt')
lgbm1



array([3.95518567, 1.13198105, 2.47584771, ..., 1.0181702 , 0.31760869,
       1.59460085])

### XGBoost

In [36]:
study_xgb1 = joblib.load('../objects/chilledwater/study_xgb1.pkl')
params_xgb1 = study_xgb1.best_trial.params
params_xgb1['eval_metric'] = 'rmse'
params_xgb1['seed'] = 42
params_xgb1

{'grow_policy': 'depthwise',
 'learning_rate': 0.03801178677708901,
 'alpha': 0.051064916441743415,
 'lambda': 0.890189928673286,
 'gamma': 0.053083931710043705,
 'max_depth': 16,
 'max_leaves': 1131,
 'subsample': 0.7245353487626778,
 'colsample_bytree': 0.777530288110325,
 'eval_metric': 'rmse',
 'seed': 42}

In [37]:
xg1 = predict_xgb(Xc_train_scaled, yc_train, Xc_val_scaled, yc_val, Xc_test_scaled, params_xgb1, '../objects/submission3/xgb1.pkl') # .txt')
xg1

array([3.0228665, 0.9836829, 2.0785747, ..., 0.9429326, 0.5375627,
       1.9558679], dtype=float32)

In [38]:
del lasso, val1, study_lgb1
gc.collect()

100

# Steam meter

### Transform data

In [39]:
Xs_train_scaled, Xs_val_scaled, Xs_test_scaled, ys_train, ys_val = transform_data(train_df[2], test_df[2])
Xs_train_scaled.head()



Train:  (1722036, 14) (1722036, 1)
Validation:  (574013, 14) (574013, 1)
Train:  (5676480, 14)


Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
1227863,-0.055769,1.149265,0.300442,0.198853,0.693481,-0.370458,0.253185,0.14002,-1.520262,0.321329,0.846145,-0.631491,-0.181765,-0.177586
793790,-1.072452,1.05116,-1.038513,-0.024691,-1.856454,-0.815237,0.118055,-0.531292,-0.072679,-0.784003,-0.080153,-0.631491,-0.181765,-0.177586
2268147,-0.794255,0.649821,0.41202,-1.410663,-0.188679,-0.675412,-0.152204,1.641147,-1.375504,0.036405,0.650101,-0.631491,-0.181765,-0.177586
970560,-0.065885,-0.188533,-0.592195,0.869485,1.900406,0.542175,-0.354898,-0.270226,1.66442,0.321329,-0.984523,-0.631491,-0.181765,-0.177586
1921836,-1.224196,1.05116,0.41202,-1.410663,-1.856454,0.355498,0.118055,1.174958,-1.66502,0.036405,1.324216,-0.631491,-0.181765,-0.177586


### Lasso Regression

In [40]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xs_train_scaled, ys_train)

val2 = lasso.predict(Xs_val_scaled)
print(lasso.score(Xs_val_scaled, ys_val))
print(np.sqrt(mean_squared_log_error(ys_val, val2)))

lasso2 = lasso.predict(Xs_test_scaled)
lasso2[lasso2 < 0] = 0
lasso2

0.36091587429867217
0.26692922474173114


array([5.13084632, 5.2151377 , 5.24601396, ..., 5.7768725 , 5.7571963 ,
       6.0173939 ])

### LightGBM

In [41]:
study_lgb2 = joblib.load('../objects/steam/study_lgb2.pkl')
params_lgb2 = study_lgb2.best_trial.params
params_lgb2

{'learning_rate': 0.04854268157540762,
 'lambda_l1': 0.0028266466239996284,
 'lambda_l2': 0.00010211303918218761,
 'max_depth': 56,
 'num_leaves': 409,
 'min_data_in_leaf': 8,
 'bagging_fraction': 0.5012612103606688,
 'feature_fraction': 0.8537936823847498}

In [42]:
params_lgb2['num_iterations'] = 10000
params_lgb2['early_stopping_round'] = 20
params_lgb2['metric'] = 'rmse'
params_lgb2['num_threads'] = -1
params_lgb2['seed'] = 42
params_lgb2

{'learning_rate': 0.04854268157540762,
 'lambda_l1': 0.0028266466239996284,
 'lambda_l2': 0.00010211303918218761,
 'max_depth': 56,
 'num_leaves': 409,
 'min_data_in_leaf': 8,
 'bagging_fraction': 0.5012612103606688,
 'feature_fraction': 0.8537936823847498,
 'num_iterations': 10000,
 'early_stopping_round': 20,
 'metric': 'rmse',
 'num_threads': -1,
 'seed': 42}

In [43]:
lgbm2 = predict_lgb(Xs_train_scaled, ys_train, Xs_val_scaled, ys_val, Xs_test_scaled, params_lgb2, '../objects/submission3/lgb2.pkl') # .txt')
lgbm2



array([3.52633308, 5.08657529, 8.45465243, ..., 6.24808185, 6.05614676,
       2.8423114 ])

### XGBoost

In [44]:
study_xgb2 = joblib.load('../objects/steam/study_xgb2.pkl')
params_xgb2 = study_xgb2.best_trial.params
params_xgb2['eval_metric'] = 'rmse'
params_xgb2['seed'] = 42
params_xgb2

{'grow_policy': 'depthwise',
 'learning_rate': 0.045664576584020004,
 'alpha': 1.9071087639650297,
 'lambda': 0.00021496551872384197,
 'gamma': 0.15536767802205387,
 'max_depth': 59,
 'max_leaves': 1626,
 'subsample': 0.926581683911481,
 'colsample_bytree': 0.8620824662132568,
 'eval_metric': 'rmse',
 'seed': 42}

In [45]:
xg2 = predict_xgb(Xs_train_scaled, ys_train, Xs_val_scaled, ys_val, Xs_test_scaled, params_xgb2, '../objects/submission3/xgb2.pkl') # .txt')
xg2

array([3.9213645, 5.013207 , 8.410185 , ..., 6.4260616, 6.2009954,
       3.0311716], dtype=float32)

In [46]:
del lasso, val2, study_lgb2
gc.collect()

100

# Hot water meter

### Transform data

In [47]:
Xh_train_scaled, Xh_val_scaled, Xh_test_scaled, yh_train, yh_val = transform_data(train_df[3], test_df[3])
Xh_train_scaled.head()



Train:  (689781, 14) (689781, 1)
Validation:  (229927, 14) (229927, 1)
Train:  (2540400, 14)


Unnamed: 0,building_id,dew_temperature,sea_level_pressure,wind_speed,primary_use,square_feet,year_built,dayofyear,hour,wind_direction_y,rel_humidity,is_weekend,country,is_holiday
617696,0.902166,1.246184,-0.292844,0.467919,0.356687,-0.296929,-0.002246,0.601279,0.219848,-1.311659,-0.601921,-0.630965,-0.044778,-0.182612
559728,-1.234496,1.653144,-0.821912,0.707877,-1.973171,-0.896851,2.317122,0.348563,0.654835,1.222356,-0.514823,-0.630965,-0.044778,-0.182612
900844,0.914025,-0.488753,2.207,0.227961,2.023683,-1.007877,-0.499254,1.648246,0.219848,0.372354,-0.191426,1.584873,-0.044778,5.476081
230300,-1.372855,0.089559,1.426621,0.467919,-0.291586,-0.583896,-0.941038,-0.896966,0.799831,0.613943,1.249094,-0.630965,-2.259887,-0.182612
268936,-1.129738,-0.842166,-0.134121,-0.011996,1.456112,-0.901935,0.605207,-0.770608,0.654835,-0.367554,-1.585444,-0.630965,-0.044778,-0.182612


### Lasso Regression

In [48]:
lasso = Lasso(alpha=1e-6, random_state=42)
lasso.fit(Xh_train_scaled, yh_train)

val3 = lasso.predict(Xh_val_scaled)
print(lasso.score(Xh_val_scaled, yh_val))
print(np.sqrt(mean_squared_log_error(yh_val, val3)))

lasso3 = lasso.predict(Xh_test_scaled)
lasso3[lasso3 < 0] = 0
lasso3

0.36529208903063226
0.3856265743690515


array([3.97480874, 4.30642415, 4.16282654, ..., 5.49262889, 5.52568577,
       5.28825195])

### LightGBM

In [49]:
study_lgb3 = joblib.load('../objects/hotwater/study_lgb3.pkl')
params_lgb3 = study_lgb3.best_trial.params
params_lgb3

{'learning_rate': 0.023942434668217872,
 'lambda_l1': 0.23828824891615835,
 'lambda_l2': 0.00028375734074312625,
 'max_depth': 58,
 'num_leaves': 1581,
 'min_data_in_leaf': 42,
 'bagging_fraction': 0.500825513633077,
 'feature_fraction': 0.9607308095583501}

In [50]:
params_lgb3['num_iterations'] = 10000
params_lgb3['early_stopping_round'] = 20
params_lgb3['metric'] = 'rmse'
params_lgb3['num_threads'] = -1
params_lgb3['seed'] = 42

In [51]:
lgbm3 = predict_lgb(Xh_train_scaled, yh_train, Xh_val_scaled, yh_val, Xh_test_scaled, params_lgb3, '../objects/submission3/lgb3.pkl') # .txt')
lgbm3



array([2.20708412, 4.35435085, 4.64648194, ..., 8.19292046, 5.77224541,
       8.19296269])

### XGBoost

In [52]:
study_xgb3 = joblib.load('../objects/hotwater/study_xgb3.pkl')
params_xgb3 = study_xgb3.best_trial.params
params_xgb3['eval_metric'] = 'rmse'
params_xgb3['seed'] = 42
params_xgb3

{'grow_policy': 'lossguide',
 'learning_rate': 0.009806612868641755,
 'alpha': 0.009923410545051567,
 'lambda': 2.4314911765779557,
 'gamma': 0.00182351994095809,
 'max_depth': 51,
 'max_leaves': 1225,
 'subsample': 0.8446420358715253,
 'colsample_bytree': 0.8798084029338699,
 'eval_metric': 'rmse',
 'seed': 42}

In [53]:
xg3 = predict_xgb(Xh_train_scaled, yh_train, Xh_val_scaled, yh_val, Xh_test_scaled, params_xgb3, '../objects/submission3/xgb3.pkl') # .txt')
xg3

array([2.4618597, 4.476922 , 4.6057267, ..., 8.152203 , 5.746389 ,
       8.130064 ], dtype=float32)

In [54]:
del lasso, val3, study_lgb3
gc.collect()

100

# Combine predictions

In [55]:
print('Test observations: ', [df.shape[0] for df in test_df])
print('Lasso predictions: ', list(map(len, [lasso0, lasso1, lasso2, lasso3])))
print('LightGBM predictions: ', list(map(len, [lgbm0, lgbm1, lgbm2, lgbm3])))
print('XGBoost predictions: ', list(map(len, [xg0, xg1, xg2, xg3])))

Test observations:  [24755760, 8724960, 5676480, 2540400]
Lasso predictions:  [24755760, 8724960, 5676480, 2540400]
LightGBM predictions:  [24755760, 8724960, 5676480, 2540400]
XGBoost predictions:  [24755760, 8724960, 5676480, 2540400]


In [56]:
# Electricity meter
pred0 = test_df[0][['building_id']].copy()
pred0['lasso'] = lasso0
pred0['lgb'] = lgbm0
pred0['xgb'] = xg0
pred0.drop('building_id', axis=1, inplace=True)
pred0

Unnamed: 0,lasso,lgb,xgb
0,3.549877,2.939226,2.744362
1,3.489382,1.908687,1.747664
2,3.422755,0.512695,0.691147
3,3.628668,3.229814,4.861615
4,4.124407,4.185015,3.439973
...,...,...,...
41697595,2.919300,1.626395,1.650684
41697596,3.924178,1.745877,1.813093
41697597,3.408730,2.167484,2.284400
41697598,3.945461,5.145190,5.142176


In [57]:
# Chilled water meter
pred1 = test_df[1][['building_id']].copy()
pred1['lasso'] = lasso1
pred1['lgb'] = lgbm1
pred1['xgb'] = xg1
pred1.drop('building_id', axis=1, inplace=True)
pred1

Unnamed: 0,lasso,lgb,xgb
8,4.247642,3.955186,3.022866
11,3.379266,1.131981,0.983683
16,4.241924,2.475848,2.078575
18,4.299785,3.144893,2.911512
20,3.337733,2.455724,2.744730
...,...,...,...
41697538,3.997619,3.862111,3.701042
41697541,4.410354,3.453723,3.324516
41697543,4.602415,1.018170,0.942933
41697545,4.455272,0.317609,0.537563


In [58]:
# Steam meter
pred2 = test_df[2][['building_id']].copy()
pred2['lasso'] = lasso2
pred2['lgb'] = lgbm2
pred2['xgb'] = xg2
pred2.drop('building_id', axis=1, inplace=True)
pred2

Unnamed: 0,lasso,lgb,xgb
16340505,5.130846,3.526333,3.921365
16340509,5.215138,5.086575,5.013207
16340515,5.246014,8.454652,8.410185
16340517,5.378871,5.288428,4.991770
16340521,5.267886,5.708070,5.728630
...,...,...,...
41697581,5.399248,5.578244,5.554851
41697584,5.610836,6.866227,6.906217
41697586,5.776872,6.248082,6.426062
41697588,5.757196,6.056147,6.200995


In [59]:
# Hot water meter
pred3 = test_df[3][['building_id']].copy()
pred3['lasso'] = lasso3
pred3['lgb'] = lgbm3
pred3['xgb'] = xg3
pred3.drop('building_id', axis=1, inplace=True)
pred3

Unnamed: 0,lasso,lgb,xgb
2260082,3.974809,2.207084,2.461860
2260086,4.306424,4.354351,4.476922
2260090,4.162827,4.646482,4.605727
2260092,4.615836,3.316648,3.236039
2260094,4.879496,5.375963,5.278156
...,...,...,...
41696866,5.733527,8.326604,8.252348
41697101,5.730063,5.625056,5.621520
41697116,5.492629,8.192920,8.152203
41697351,5.525686,5.772245,5.746389


In [60]:
# Log-transformed predictions
pred_transformed = pd.concat([pred0, pred1, pred2, pred3]).sort_index()
pred_transformed

Unnamed: 0,lasso,lgb,xgb
0,3.549877,2.939226,2.744362
1,3.489382,1.908687,1.747664
2,3.422755,0.512695,0.691147
3,3.628668,3.229814,4.861615
4,4.124407,4.185015,3.439973
...,...,...,...
41697595,2.919300,1.626395,1.650684
41697596,3.924178,1.745877,1.813093
41697597,3.408730,2.167484,2.284400
41697598,3.945461,5.145190,5.142176


In [61]:
del pred0, pred1, pred2, pred3
gc.collect()

110

# Transform predictions back to normal scale (and units for site 0)

In [62]:
# Transform back to normal scale
pred = pred_transformed.copy()
pred['lasso'] = np.expm1(pred.lasso)
pred['lgb'] = np.expm1(pred.lgb)
pred['xgb'] = np.expm1(pred.xgb)
pred.reset_index(inplace=True)
pred.columns = ['row_id', 'lasso', 'lgb', 'xgb']
pred

Unnamed: 0,row_id,lasso,lgb,xgb
0,0,33.809045,17.901209,14.554681
1,1,31.765691,5.744227,4.741176
2,2,29.653746,0.669785,0.996003
3,3,36.662598,24.274947,128.232773
4,4,60.831159,64.694516,30.186119
...,...,...,...,...
41697595,41697595,17.528321,4.085506,4.210545
41697596,41697596,49.611458,4.730928,5.129375
41697597,41697597,29.226834,7.736278,8.819798
41697598,41697598,50.700157,170.604164,170.087677


In [63]:
pred = pd.merge(mb[['row_id', 'site_id', 'meter']], pred, on='row_id', how='left')
pred

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,33.809045,17.901209,14.554681
1,1,0,0,31.765691,5.744227,4.741176
2,2,0,0,29.653746,0.669785,0.996003
3,3,0,0,36.662598,24.274947,128.232773
4,4,0,0,60.831159,64.694516,30.186119
...,...,...,...,...,...,...
41697595,41697595,15,0,17.528321,4.085506,4.210545
41697596,41697596,15,0,49.611458,4.730928,5.129375
41697597,41697597,15,0,29.226834,7.736278,8.819798
41697598,41697598,15,0,50.700157,170.604164,170.087677


In [64]:
# Site 0 meter 0 predictions in kWh
pred[(pred.site_id == 0) & (pred.meter == 0)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,33.809045,17.901209,14.554681
1,1,0,0,31.765691,5.744227,4.741176
2,2,0,0,29.653746,0.669785,0.996003
3,3,0,0,36.662598,24.274947,128.232773
4,4,0,0,60.831159,64.694516,30.186119


In [65]:
# Convert from kWh back to kBTU
pred = udf.convert_readings(pred, site_num=0, meter_type=0, conversion='kwh_to_kbtu', reading_col='lasso')
pred = udf.convert_readings(pred, site_num=0, meter_type=0, conversion='kwh_to_kbtu', reading_col='lgb')
pred = udf.convert_readings(pred, site_num=0, meter_type=0, conversion='kwh_to_kbtu', reading_col='xgb')
pred[(pred.site_id == 0) & (pred.meter == 0)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,115.3497,61.075344,49.657658
1,1,0,0,108.378185,19.598155,16.175943
2,2,0,0,101.172649,2.285173,3.398165
3,3,0,0,125.085452,82.821265,437.504578
4,4,0,0,207.543749,220.72475,102.988998


In [66]:
# Site 0 meter 1 predictions in tons
pred[(pred.site_id == 0) & (pred.meter == 1)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
8,8,0,1,68.940283,51.205386,19.550114
11,11,0,1,28.349225,2.101795,1.674287
16,16,0,1,68.541512,10.891784,6.993068
18,18,0,1,72.683948,22.21718,17.384573
20,20,0,1,27.155226,10.654863,14.560409


In [67]:
# Convert from kWh back to kBTU
pred = udf.convert_readings(pred, site_num=0, meter_type=1, conversion='ton_to_kbtu', reading_col='lasso')
pred = udf.convert_readings(pred, site_num=0, meter_type=1, conversion='ton_to_kbtu', reading_col='lgb')
pred = udf.convert_readings(pred, site_num=0, meter_type=1, conversion='ton_to_kbtu', reading_col='xgb')
pred[(pred.site_id == 0) & (pred.meter == 1)].head()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
8,8,0,1,827.283401,614.464631,234.601364
11,11,0,1,340.190701,25.221543,20.091446
16,16,0,1,822.498146,130.701403,83.916809
18,18,0,1,872.207374,266.606164,208.614868
20,20,0,1,325.862714,127.858358,174.724899


In [68]:
del mb
gc.collect()

0

# Save Predictions

In [69]:
pred.describe()

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
count,41697600.0,41697600.0,41697600.0,41697600.0,41697600.0,41697600.0
mean,20848800.0,8.086134,0.6642857,307.9083,368.592,356.2933
std,12037060.0,5.134712,0.9278067,1501.366,982.1016,911.5507
min,0.0,0.0,0.0,0.2421764,0.0,0.0
25%,10424400.0,3.0,0.0,38.14342,26.33986,25.94249
50%,20848800.0,9.0,0.0,75.24997,87.82993,86.63894
75%,31273200.0,13.0,1.0,185.6735,280.5522,277.1358
max,41697600.0,15.0,3.0,171499.2,27882.64,16165.23


In [70]:
path = '../submissions/sub3/'
pred.to_pickle(f'{path}preds.pkl')
pred = pd.read_pickle(f'{path}preds.pkl')
pred

Unnamed: 0,row_id,site_id,meter,lasso,lgb,xgb
0,0,0,0,115.349700,61.075344,49.657658
1,1,0,0,108.378185,19.598155,16.175943
2,2,0,0,101.172649,2.285173,3.398165
3,3,0,0,125.085452,82.821265,437.504578
4,4,0,0,207.543749,220.724750,102.988998
...,...,...,...,...,...,...
41697595,41697595,15,0,17.528321,4.085506,4.210545
41697596,41697596,15,0,49.611458,4.730928,5.129375
41697597,41697597,15,0,29.226834,7.736278,8.819798
41697598,41697598,15,0,50.700157,170.604164,170.087677


In [71]:
submission

Unnamed: 0,row_id,meter_reading
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
41697595,41697595,0
41697596,41697596,0
41697597,41697597,0
41697598,41697598,0


In [72]:
# Lasso predictions
lasso_pred = submission.copy()
lasso_pred['meter_reading'] = pred['lasso']
lasso_pred

Unnamed: 0,row_id,meter_reading
0,0,115.349700
1,1,108.378185
2,2,101.172649
3,3,125.085452
4,4,207.543749
...,...,...
41697595,41697595,17.528321
41697596,41697596,49.611458
41697597,41697597,29.226834
41697598,41697598,50.700157


In [73]:
# Lasso predictions
lgb_pred = submission.copy()
lgb_pred['meter_reading'] = pred['lgb']
lgb_pred

Unnamed: 0,row_id,meter_reading
0,0,61.075344
1,1,19.598155
2,2,2.285173
3,3,82.821265
4,4,220.724750
...,...,...
41697595,41697595,4.085506
41697596,41697596,4.730928
41697597,41697597,7.736278
41697598,41697598,170.604164


In [74]:
# Lasso predictions
xgb_pred = submission.copy()
xgb_pred['meter_reading'] = pred['xgb']
xgb_pred

Unnamed: 0,row_id,meter_reading
0,0,49.657658
1,1,16.175943
2,2,3.398165
3,3,437.504578
4,4,102.988998
...,...,...
41697595,41697595,4.210545
41697596,41697596,5.129375
41697597,41697597,8.819798
41697598,41697598,170.087677


In [75]:
lasso_pred.to_csv(f'{path}l1r.csv', index=False)
lasso_pred = pd.read_csv(f'{path}l1r.csv')
lasso_pred

Unnamed: 0,row_id,meter_reading
0,0,115.349700
1,1,108.378185
2,2,101.172649
3,3,125.085452
4,4,207.543749
...,...,...
41697595,41697595,17.528321
41697596,41697596,49.611458
41697597,41697597,29.226834
41697598,41697598,50.700157


In [76]:
lgb_pred.to_csv(f'{path}lgb.csv', index=False)
lgb_pred = pd.read_csv(f'{path}lgb.csv')
lgb_pred

Unnamed: 0,row_id,meter_reading
0,0,61.075344
1,1,19.598155
2,2,2.285173
3,3,82.821265
4,4,220.724750
...,...,...
41697595,41697595,4.085506
41697596,41697596,4.730928
41697597,41697597,7.736278
41697598,41697598,170.604164


In [77]:
xgb_pred.to_csv(f'{path}xgb.csv', index=False)
xgb_pred = pd.read_csv(f'{path}xgb.csv')
xgb_pred

Unnamed: 0,row_id,meter_reading
0,0,49.657658
1,1,16.175943
2,2,3.398164
3,3,437.504580
4,4,102.989000
...,...,...
41697595,41697595,4.210545
41697596,41697596,5.129376
41697597,41697597,8.819797
41697598,41697598,170.087680


### Test RMSLE:
##### Lasso regression:  (Public),  (Private)
##### LightGBM:  (Public),  (Private)
##### XGBoost:  (Public),  (Private)

# Combine LightGBM and XGBoost predictions

In [8]:
xl = pd.merge(lgb_pred, xgb_pred, on='row_id', how='left')
xl.columns = ['row_id', 'lgb', 'xgb']
xl.head()

Unnamed: 0,row_id,lgb,xgb
0,0,54.809699,51.882465
1,1,17.976733,16.754257
2,2,1.477563,3.712897
3,3,71.624491,373.01392
4,4,142.16725,117.07958


In [11]:
xl['xl55'] = xl[['lgb', 'xgb']].mean(axis=1)
xl['xl46'] = (xl.lgb * 0.4) + (xl.xgb * 0.6)
xl['xl2575'] = (xl.lgb * 0.25) + (xl.xgb * 0.75)
xl.head()

Unnamed: 0,row_id,lgb,xgb,xl55
0,0,54.809699,51.882465,53.346082
1,1,17.976733,16.754257,17.365495
2,2,1.477563,3.712897,2.59523
3,3,71.624491,373.01392,222.319205
4,4,142.16725,117.07958,129.623415


In [16]:
xl1 = xl[['row_id', 'xl55']]
xl1.columns = ['row_id', 'meter_reading']

xl1.to_csv(f'{path}xl1.csv', index=False)
xl1 = pd.read_csv(f'{path}xl1.csv')
xl1

Unnamed: 0,row_id,meter_reading
0,0,53.346082
1,1,17.365495
2,2,2.595230
3,3,222.319205
4,4,129.623415
...,...,...
41697595,41697595,4.897745
41697596,41697596,5.232897
41697597,41697597,7.730367
41697598,41697598,172.912913


In [17]:
xl2 = xl[['row_id', 'xl46']]
xl2.columns = ['row_id', 'meter_reading']

xl2.to_csv(f'{path}xl2.csv', index=False)
xl2 = pd.read_csv(f'{path}xl2.csv')
xl2

Unnamed: 0,row_id,meter_reading
0,0,53.053358
1,1,17.243247
2,2,2.818763
3,3,252.458148
4,4,127.114648
...,...,...
41697595,41697595,4.863569
41697596,41697596,5.207736
41697597,41697597,7.822719
41697598,41697598,172.888550


In [18]:
xl3 = xl[['row_id', 'xl2575']]
xl3.columns = ['row_id', 'meter_reading']

xl3.to_csv(f'{path}xl3.csv', index=False)
xl3 = pd.read_csv(f'{path}xl3.csv')
xl3

Unnamed: 0,row_id,meter_reading
0,0,52.614273
1,1,17.059876
2,2,3.154063
3,3,297.666563
4,4,123.351498
...,...,...
41697595,41697595,4.812304
41697596,41697596,5.169994
41697597,41697597,7.961248
41697598,41697598,172.852006
