In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import dill as pickle

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Project-relative folders. The trailing slash is required because file paths
# are built by plain string concatenation (e.g. DATA_DIR + 'train.csv').
DATA_DIR, MODEL_DIR, OUTPUT_DIR = 'data/', 'model/', 'output/'

In [3]:
# Sanity check: list the files present in DATA_DIR before loading anything.
!ls {DATA_DIR}

building_metadata.csv  test.csv   weather_test.csv
sample_submission.csv  train.csv  weather_train.csv


# Load data 

In [4]:
# Read the three input tables. `val` and `weather_val` are loaded from the
# train CSVs, which this notebook uses as its validation set.
building_metadata, weather_val, val = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in ('building_metadata.csv', 'weather_train.csv', 'train.csv')
)

In [5]:
# Human-readable label for each integer meter code (0..3).
map_meter2desc = dict(enumerate(('electricity', 'chilledwater', 'steam', 'hotwater')))

In [6]:
# Columns handled as numeric features. Order matters: this list forms the
# leading part of the feature-name list built in the inference cell.
numerical = [
    'square_feet', 'year_built', 'floor_count',
    'air_temperature', 'cloud_coverage', 'dew_temperature',
    'precip_depth_1_hr', 'sea_level_pressure', 'wind_speed',
]

# Columns handled as categorical features. The inference cell indexes this
# list by position to turn encoder names such as "x0_..." into "primary_use_...".
categorical = ['primary_use']

# Combine tables

In [7]:
# Attach building attributes and site weather to every meter reading, parse
# the timestamp, and tag each row with a sequential id used later to line up
# predictions with ground truth.
# Both merges use pandas' default inner join, so readings without a matching
# building or weather row are dropped here.
df = (
    val
    .merge(building_metadata, on='building_id')
    .merge(weather_val, on=['site_id', 'timestamp'])
    .assign(
        timestamp=lambda merged: pd.to_datetime(merged['timestamp']),
        row_id=lambda merged: range(merged.shape[0]),
    )
)

# Inference for each meter type

In [8]:
# Predict meter readings separately for each meter type (one preprocessor and
# one model per meter), then reassemble the predictions in the row order of `df`.
#
# Per-meter frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copies the accumulated
# frame on every iteration.
results_per_meter = []

for meter in (0, 1, 2, 3):
    print(map_meter2desc[meter])

    df_meter = df.query('meter == {}'.format(meter))

    # Load feature preprocessor.
    # NOTE(security): unpickling executes arbitrary code; only load artifacts
    # from a trusted MODEL_DIR.
    with open(MODEL_DIR + 'feature_preproc_meter{}.p'.format(meter), 'rb') as file:
        feature_preproc = pickle.load(file)

    # The last step of the second branch of the final step's transformer_list
    # supplies the encoded categorical column names ("x0_<value>", ...).
    encoder = feature_preproc.steps[-1][-1].transformer_list[1][-1].steps[-1][-1]
    # get_feature_names() was removed in scikit-learn 1.2. Keep using it when
    # it exists (identical behaviour for the pickled artifacts) and fall back
    # to its replacement otherwise.
    get_encoded_names = getattr(encoder, 'get_feature_names', None) or encoder.get_feature_names_out
    encoded_names = [
        # Replace the positional "x<i>" prefix with the categorical column name.
        re.sub(r"^(?:x)([0-9])", lambda m: categorical[int(m.group(1))], x)
        for x in get_encoded_names().tolist()
    ]

    # Column order must match the order of the preprocessor's output columns.
    feature_names = numerical \
        + encoded_names \
        + ['precip_depth_1_hr_isnan'] \
        + ['wind_direction_sin', 'wind_direction_cos'] \
        + ['month_sin', 'month_cos']

    # Load regression model (same trust caveat as the preprocessor above).
    with open(MODEL_DIR + 'model_xgboost_meter{}.p'.format(meter), 'rb') as file:
        model = pickle.load(file)

    y_pred = model.predict(pd.DataFrame(feature_preproc.transform(df_meter), columns=feature_names))

    # Predictions are inverted with expm1, the exact inverse of the log1p
    # applied at validation time (more accurate than exp(y) - 1 near zero).
    result_meter = pd.DataFrame({'row_id': df_meter['row_id'], 'meter_reading': np.round(np.expm1(y_pred), 4)})

    results_per_meter.append(result_meter)

# Merge: stack the per-meter predictions, then left-join onto every row_id of
# `df` so the output has exactly one row per input row.
result = pd.concat(results_per_meter)
result = df[['row_id']].merge(result, on='row_id', how='left')

# Default value for missing rows
result['meter_reading'] = result['meter_reading'].fillna(0.0)

electricity
chilledwater
steam
hotwater


# Validate

In [9]:
# Exclude meter-0 readings at site 0 dated before 2016-05-21 from scoring
# (NOTE(review): presumably known-bad readings; confirm the rationale).
is_excluded = (
    (df['meter'] == 0)
    & (df['site_id'] == 0)
    & (df['timestamp'] < "2016-05-21")
)

# Pair ground truth with predictions by row_id; overlapping `meter_reading`
# columns become `meter_reading_true` / `meter_reading_pred`.
# Note: this rebinds `df`, so the cell cannot be re-run without re-running
# the cells above first.
df = df.loc[~is_excluded, ['row_id', 'meter_reading']].merge(
    result, on='row_id', suffixes=('_true', '_pred')
)

In [10]:
from sklearn.metrics import mean_squared_error

# Mean squared error on log1p-transformed readings (i.e. MSLE, not its root).
log_true = np.log1p(df['meter_reading_true'])
log_pred = np.log1p(df['meter_reading_pred'])
mean_squared_error(log_true, log_pred)

2.063445372266943