In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import warnings
warnings.filterwarnings("ignore")

In [11]:
# Directory holding the input CSV files. Paths are built by plain string
# concatenation, so the trailing slash is required.
DATA_DIR = 'data/'

# Fraction of the training rows sampled to fit the feature pre-processing pipeline.
SUBSAMPLE_FEATURE_PREPROC = 0.01

# Fraction of the distinct timestamps assigned to the training split;
# the remaining timestamps form the validation split.
SUBSAMPLE_TRAIN = 0.5

In [12]:
# List the files available in the data directory (IPython shell escape;
# {DATA_DIR} is interpolated from the Python variable).
!ls {DATA_DIR}

building_metadata.csv  test.csv   weather_test.csv
sample_submission.csv  train.csv  weather_train.csv


# Load data 

In [13]:
# Read the five input tables in one pass; the unpacking order matches the
# order of the file names below.
building_metadata, weather_train, weather_test, train, test = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in (
        'building_metadata.csv',
        'weather_train.csv',
        'weather_test.csv',
        'train.csv',
        'test.csv',
    )
)

In [14]:
# Human-readable description for each integer `meter` code (0..3).
map_meter2desc = dict(enumerate(('electricity', 'chilledwater', 'steam', 'hotwater')))

In [15]:
# Numeric columns taken directly from the merged table.
numerical = [
    'square_feet', 'year_built', 'floor_count',
    'air_temperature', 'cloud_coverage', 'dew_temperature',
    'precip_depth_1_hr', 'sea_level_pressure', 'wind_speed',
]
# Numeric columns derived from `wind_direction` by the pre-processing pipeline.
numerical_aug = ['wind_direction_sin', 'wind_direction_cos']

# Categorical columns taken directly from the merged table.
categorical = ['building_id', 'meter', 'primary_use']
# Categorical columns derived from `timestamp` by the pre-processing pipeline.
categorical_aug = ['month']

# Column order of the pre-processed feature matrix. It must match the order
# in which the pipeline's FeatureUnion concatenates its branches.
feature_names = [*numerical, *categorical, *numerical_aug, *categorical_aug]

# Combine tables

In [16]:
# Attach building metadata to every meter reading, then the weather record
# of the building's site at the same timestamp (both are inner joins).
df = (
    train
    .merge(building_metadata, on='building_id')
    .merge(weather_train, on=['site_id', 'timestamp'])
)

In [17]:
# Regression target on a log scale: log1p(x) == log(x + 1). The vectorised
# NumPy call replaces a per-row Python lambda and is more accurate for
# readings close to zero.
df['log_meter_reading'] = np.log1p(df['meter_reading'])
# Parse the timestamp strings so date-based filtering and `.dt` accessors work.
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Remove the problematic data (site 0 electricity readings before 2016-05-21)

In [18]:
# Flag the electricity (meter 0) readings of site 0 taken before 2016-05-21,
# drop them, and renumber the remaining rows from 0.
is_problematic = (
    (df['meter'] == 0)
    & (df['site_id'] == 0)
    & (df['timestamp'] < "2016-05-21")
)
df = df[~is_problematic].reset_index(drop=True)

# Feature pre-processing

In [19]:
# Preview the first five rows of the merged training table.
df.head(n=5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,log_meter_reading
0,7,1,2016-02-29 09:00:00,1857.26,0,Education,121074,1989.0,,12.8,,8.9,0.0,1021.9,0.0,0.0,7.527396
1,9,1,2016-02-29 09:00:00,590.945,0,Office,27000,2010.0,,12.8,,8.9,0.0,1021.9,0.0,0.0,6.383414
2,13,1,2016-02-29 09:00:00,1224.1,0,Education,99380,2000.0,,12.8,,8.9,0.0,1021.9,0.0,0.0,7.110778
3,14,1,2016-02-29 09:00:00,1435.15,0,Education,86250,2013.0,,12.8,,8.9,0.0,1021.9,0.0,0.0,7.269721
4,15,1,2016-02-29 09:00:00,422.104,0,Office,83957,1974.0,,12.8,,8.9,0.0,1021.9,0.0,0.0,6.047618


In [22]:
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.compose import make_column_transformer

def _wind_direction_components(frame):
    """Return the sine and cosine of `wind_direction` (in degrees) as two columns.

    Encoding the angle as (sin, cos) keeps 0 and 360 degrees adjacent. The
    computation is vectorised over the whole column instead of calling a
    Python lambda per row; missing directions stay NaN and are imputed by the
    next pipeline step.
    """
    angle = frame['wind_direction'] / 360 * 2 * np.pi
    return pd.concat(
        (
            np.sin(angle).rename('wind_direction_sin'),
            np.cos(angle).rename('wind_direction_cos'),
        ),
        axis='columns',
    )


# Pre-processing pipeline: four parallel branches whose outputs are concatenated
# in the order numerical, categorical, wind direction, month (this order must
# match `feature_names`), then wrapped back into a float DataFrame.
# NOTE(review): OrdinalEncoder raises on categories it did not see during fit,
# so the rows used to fit this pipeline must cover every building_id,
# primary_use and month that is later transformed.
feature_preproc = make_pipeline(
    FeatureUnion([
        ('numeric_features', make_pipeline(
            FunctionTransformer(lambda frame: frame[numerical], validate=False),
            # Negative precipitation values are clipped to 0 and missing ones set to 0.
            FunctionTransformer(
                lambda frame: frame.assign(
                    precip_depth_1_hr=lambda f: f['precip_depth_1_hr'].clip(lower=0).fillna(0),
                ),
                validate=False,
            ),
            SimpleImputer(strategy="median"),
            StandardScaler(),
        )),
        ('categorical_features', make_pipeline(
            FunctionTransformer(lambda frame: frame[categorical], validate=False),
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(),
        )),
        ('wind_direction', make_pipeline(
            FunctionTransformer(_wind_direction_components, validate=False),
            SimpleImputer(strategy="median"),
        )),
        ('month', make_pipeline(
            FunctionTransformer(lambda frame: frame['timestamp'].dt.month.to_frame(), validate=False),
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(),
        )),
    ]),
    # Re-attach column names so downstream code can refer to features by name.
    FunctionTransformer(
        lambda values: pd.DataFrame(values, columns=feature_names).astype(float),
        validate=False,
    ),
)

# Fit the pipeline on a reproducible random subsample of the training rows.
n_fit_rows = int(SUBSAMPLE_FEATURE_PREPROC * len(df))
feature_preproc.fit(df.sample(n=n_fit_rows, random_state=42));



In [56]:
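# NOTE: superseded row-level random split, kept for reference only;
# the active split in the next cell samples whole timestamps instead.
# np.random.seed(42)
# idx_train = np.random.choice(df.shape[0], int(SUBSAMPLE_TRAIN * df.shape[0]), replace=False)

# idx = np.zeros(df.shape[0]).astype(bool)
# idx[idx_train] = True
# idx_train = idx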

In [67]:
# Split by timestamp rather than by row: every reading that shares a timestamp
# ends up on the same side of the train/validation split.
np.random.seed(42)
unique_timestamps = df['timestamp'].unique()
n_train_timestamps = int(SUBSAMPLE_TRAIN * df['timestamp'].nunique())
timestamp_train = np.random.choice(unique_timestamps, n_train_timestamps, replace=False)

# Boolean row mask: True for training rows, False for validation rows.
idx_train = df['timestamp'].isin(timestamp_train).to_numpy()

In [69]:
# Training split: rows whose timestamp was selected for training.
X_train = feature_preproc.transform(df.loc[idx_train])
y_train = df.loc[idx_train, 'log_meter_reading']

In [70]:
# Validation split: the complement of the training mask.
X_val = feature_preproc.transform(df.loc[~idx_train])
y_val = df.loc[~idx_train, 'log_meter_reading']

In [72]:
# Preview the first five rows of the pre-processed training features.
X_train.head(n=5)

Unnamed: 0,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_speed,building_id,meter,primary_use,wind_direction_sin,wind_direction_cos,month
0,0.110083,1.136783,-0.121318,-0.283217,-0.523995,0.123019,-0.106223,0.844127,-1.48913,7.0,1.0,0.0,0.0,1.0,1.0
1,-0.693621,2.257966,-0.121318,-0.283217,-0.523995,0.123019,-0.106223,0.844127,-1.48913,9.0,1.0,6.0,0.0,1.0,1.0
2,-0.075255,1.724069,-0.121318,-0.283217,-0.523995,0.123019,-0.106223,0.844127,-1.48913,13.0,1.0,0.0,0.0,1.0,1.0
3,-0.187429,2.418135,-0.121318,-0.283217,-0.523995,0.123019,-0.106223,0.844127,-1.48913,14.0,1.0,0.0,0.0,1.0,1.0
4,-0.207019,0.335938,-0.121318,-0.283217,-0.523995,0.123019,-0.106223,0.844127,-1.48913,15.0,1.0,6.0,0.0,1.0,1.0


# Train

In [76]:
import lightgbm as lgb

# Wrap both splits as LightGBM datasets, marking the categorical columns by name.
d_train = lgb.Dataset(
    X_train,
    label=y_train,
    categorical_feature=categorical + categorical_aug,
)
d_val = lgb.Dataset(
    X_val,
    label=y_val,
    categorical_feature=categorical + categorical_aug,
)

# Gradient-boosted trees, regression objective on the log target, scored with RMSE.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric={'rmse'},
    subsample=1,
    subsample_freq=1,
    learning_rate=0.25,
    num_leaves=31,
)

# Train for up to 20000 rounds, stopping once the validation score has not
# improved for 100 rounds; progress is logged every 500 rounds.
model = lgb.train(
    params=params,
    train_set=d_train,
    num_boost_round=20000,
    valid_sets=(d_train, d_val),
    early_stopping_rounds=100,
    verbose_eval=500,
)

Training until validation scores don't improve for 100 rounds
[500]	training's rmse: 0.782742	valid_1's rmse: 0.818587
[1000]	training's rmse: 0.733363	valid_1's rmse: 0.791136
[1500]	training's rmse: 0.704043	valid_1's rmse: 0.777903
[2000]	training's rmse: 0.685779	valid_1's rmse: 0.769517
[2500]	training's rmse: 0.6699	valid_1's rmse: 0.7633
[3000]	training's rmse: 0.65615	valid_1's rmse: 0.758774
[3500]	training's rmse: 0.644877	valid_1's rmse: 0.755763
[4000]	training's rmse: 0.635252	valid_1's rmse: 0.752956
[4500]	training's rmse: 0.626657	valid_1's rmse: 0.750859
[5000]	training's rmse: 0.618844	valid_1's rmse: 0.748881
[5500]	training's rmse: 0.611122	valid_1's rmse: 0.74675
[6000]	training's rmse: 0.60415	valid_1's rmse: 0.745192
[6500]	training's rmse: 0.597781	valid_1's rmse: 0.743727
[7000]	training's rmse: 0.592511	valid_1's rmse: 0.742654
[7500]	training's rmse: 0.586859	valid_1's rmse: 0.741662
[8000]	training's rmse: 0.581245	valid_1's rmse: 0.74063
[8500]	training's r

In [77]:
# Re-train on all data for a fixed number of rounds.
# At this point `model` is still the early-stopped booster from the previous
# cell, so its best iteration is read before the name is rebound. This keeps
# the round count in sync with the validation run instead of relying on a
# number copied from an old log. A booster trained without early stopping
# (e.g. when this cell is re-run) is expected to report a non-positive
# best_iteration; in that case fall back to the round count recorded from the
# original run.
num_boost_round_final = model.best_iteration if model.best_iteration > 0 else 17746

d_train = lgb.Dataset(
    feature_preproc.transform(df),
    label=df['log_meter_reading'],
    categorical_feature=categorical + categorical_aug,
)

model = lgb.train(
    params,
    d_train,
    num_boost_round=num_boost_round_final,
)

In [None]:
# In-sample predictions (log scale) on the same rows the final model was trained on.
y_pred = model.predict(
    feature_preproc.transform(df),
)

In [87]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the log target, measured on the training rows.
# Note this is MSE, not the RMSE reported in the training log.
mean_squared_error(y_true=df['log_meter_reading'], y_pred=y_pred)

0.29987854396921737

# Feature importance

In [88]:
# Pair every feature name with the importance reported by the booster and
# list the most important features first.
importance_table = pd.DataFrame({
    'feature_name': feature_names,
    'feature_importance': model.feature_importance(),
})
importance_table.sort_values(by='feature_importance', ascending=False)

Unnamed: 0,feature_name,feature_importance
9,building_id,100350
7,sea_level_pressure,83059
5,dew_temperature,67025
3,air_temperature,65436
10,meter,49520
14,month,39372
8,wind_speed,30335
12,wind_direction_sin,29832
13,wind_direction_cos,27173
0,square_feet,23134


# Save model

In [89]:
import dill as pickle

# Persist the booster in LightGBM's own model format.
model.save_model('model/model_lightgbm.p')

# Persist the fitted pre-processing pipeline; `pickle` here is dill, imported under that alias.
with open('model/feature_preproc.p', 'wb') as preproc_file:
    pickle.dump(feature_preproc, preproc_file)

# Load model

In [90]:
import dill as pickle
import lightgbm as lgb

# Restore the booster from the saved model file.
model = lgb.Booster(model_file='model/model_lightgbm.p')

# Restore the fitted pre-processing pipeline.
# NOTE: unpickling executes arbitrary code; only load files written by this notebook.
with open('model/feature_preproc.p', 'rb') as preproc_file:
    feature_preproc = pickle.load(preproc_file)

# Predict on the test set

In [105]:
# Left joins keep every test row even when its building or weather record is
# missing. Missing weather values are filled by the imputers inside
# `feature_preproc`, so such rows still receive a model prediction instead of
# being dropped here and back-filled with a constant later.
df = (
    test
    .merge(building_metadata, on='building_id', how='left')
    .merge(weather_test, on=['site_id', 'timestamp'], how='left')
)

In [106]:
# Parse the timestamp strings so the pipeline's month feature can use `.dt`.
df['timestamp'] = df['timestamp'].pipe(pd.to_datetime)

In [93]:
# Apply the fitted pre-processing pipeline to the merged test table.
X_test = feature_preproc.transform(
    df,
)

In [108]:
# Preview the first five rows of the pre-processed test features.
X_test.head(n=5)

Unnamed: 0,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_speed,building_id,meter,primary_use,wind_direction_sin,wind_direction_cos,month
0,-0.860796,2.151186,-0.121318,0.171592,1.43669,0.397364,-0.106223,0.771564,0.104617,0.0,0.0,0.0,0.984808,-0.173648,0.0
1,-0.901052,1.937628,-0.121318,0.171592,1.43669,0.397364,-0.106223,0.771564,0.104617,1.0,0.0,0.0,0.984808,-0.173648,0.0
2,-0.878361,1.243562,-0.121318,0.171592,1.43669,0.397364,-0.106223,0.771564,0.104617,2.0,0.0,0.0,0.984808,-0.173648,0.0
3,-0.721942,1.830848,-0.121318,0.171592,1.43669,0.397364,-0.106223,0.771564,0.104617,3.0,0.0,0.0,0.984808,-0.173648,0.0
4,0.07192,0.389327,-0.121318,0.171592,1.43669,0.397364,-0.106223,0.771564,0.104617,4.0,0.0,0.0,0.984808,-0.173648,0.0


In [109]:
# Predict the log-scale meter reading for every test row.
y_pred = model.predict(
    X_test,
)

In [110]:
# Undo the log(x + 1) target transform. expm1 is its exact inverse and stays
# accurate near zero; a meter reading cannot be negative, so predictions
# below zero are clipped to 0.
meter_reading_pred = np.clip(np.expm1(y_pred), 0, None)

# Align predictions with the original test row order via row_id.
result = test[['row_id']].merge(
    pd.DataFrame({'row_id': df['row_id'], 'meter_reading': np.round(meter_reading_pred, 4)}),
    on='row_id', how='left',
)
# Default value for missing rows
result['meter_reading'] = result['meter_reading'].fillna(0.0)

In [111]:
# Write the submission file without the DataFrame index column.
result.to_csv(
    'output/submit.csv',
    index=False,
)