In [1]:
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings("ignore")

In [2]:
# Directory holding the input CSVs. File names are appended by plain string
# concatenation, so the trailing slash is required.
DATA_DIR = 'data/'
# Fraction of the meter-filtered rows sampled to fit the feature pre-processing pipeline.
SUBSAMPLE_FEATURE_PREPROC = 0.01
# Fraction of rows assigned to the training split; the remaining rows are used for validation.
SUBSAMPLE_TRAIN = 0.9
# Meter code modelled by this run (a key of map_meter2desc; 3 is 'hotwater').
METER = 3

In [3]:
# List the files available in the data directory.
!ls {DATA_DIR}

building_metadata.csv  test.csv   weather_test.csv
sample_submission.csv  train.csv  weather_train.csv


# Load data 

In [4]:
# Read every input table from DATA_DIR. The names on the left are bound in the
# same order as the file names listed on the right.
building_metadata, weather_train, weather_test, train, test = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in (
        'building_metadata.csv',
        'weather_train.csv',
        'weather_test.csv',
        'train.csv',
        'test.csv',
    )
)

In [5]:
# Meter code -> energy type; a name's position in the tuple is its meter code.
map_meter2desc = dict(enumerate(('electricity', 'chilledwater', 'steam', 'hotwater')))

In [6]:
# Numeric model inputs, grouped by the kind of quantity they describe.
# The concatenation order below is the column order fed to the pipeline.
_building_numeric = ['square_feet', 'year_built', 'floor_count']
_weather_numeric = [
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    'precip_depth_1_hr',
    'sea_level_pressure',
    'wind_direction',
    'wind_speed',
]
numerical = _building_numeric + _weather_numeric

# Columns that are one-hot encoded rather than scaled.
categorical = ['primary_use']

# Combine tables

In [7]:
# Attach building attributes and the matching weather record to every training
# reading. Both merges are inner joins (the pandas default), so readings with
# no matching (site_id, timestamp) weather row are dropped.
df = (
    train
    .merge(building_metadata, on='building_id', how='inner')
    .merge(weather_train, on=['site_id', 'timestamp'], how='inner')
)

In [8]:
# Model the target on a log(1 + x) scale. np.log1p runs vectorised over the
# whole column (no per-row Python lambda) and is more accurate for readings
# close to zero than np.log(x + 1).
df['log_meter_reading'] = np.log1p(df['meter_reading'])
# Parse the timestamp column into datetimes (done after the merge on 'timestamp').
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [9]:
# Keep only the readings of the meter type selected by METER.
is_selected_meter = df['meter'] == METER
df = df[is_selected_meter]

# Remove the problematic data (meter 0 readings at site 0 before 2016-05-21)

In [10]:
# Drop the rows that satisfy all three conditions at once: meter 0, site 0,
# and a timestamp earlier than 2016-05-21.
is_problematic = (
    (df['meter'] == 0)
    & (df['site_id'] == 0)
    & (df['timestamp'] < pd.Timestamp("2016-05-21"))
)
df = df[~is_problematic]

# Feature pre-processing

In [11]:
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer

# Two parallel branches whose outputs are concatenated column-wise:
#   numeric columns     -> median imputation        -> standardisation
#   categorical columns -> most-frequent imputation -> one-hot encoding
# NOTE: `sparse=` and `get_feature_names()` are the spellings this notebook was
# written against; newer scikit-learn releases rename them to `sparse_output=`
# and `get_feature_names_out()`.
feature_preproc = make_pipeline(
    FeatureUnion([
        ('numeric_features', make_pipeline(
            make_column_transformer(('passthrough', numerical)),
            SimpleImputer(strategy="median"),
            StandardScaler(),
        )),
        ('categorical_features', make_pipeline(
            make_column_transformer(('passthrough', categorical)),
            SimpleImputer(strategy="most_frequent"),
            # The encoder is fitted on a small subsample, so a rare category can
            # be absent from it. handle_unknown='ignore' encodes such a value as
            # an all-zero row instead of raising inside transform().
            OneHotEncoder(sparse=False, handle_unknown='ignore'),
        )),
    ]),
)

# Fit on a fixed-seed subsample so the imputation/scaling statistics and the
# encoder's category list are identical on every run.
preproc_sample = df.sample(
    int(SUBSAMPLE_FEATURE_PREPROC * df.shape[0]),
    random_state=0,
)
feature_preproc.fit(preproc_sample[numerical + categorical]);

# Output column names: numeric columns keep their names; one-hot columns are
# named "<source column>_<category>" by handing the source column names to the
# fitted encoder (the last step of the second FeatureUnion branch).
fitted_encoder = feature_preproc.steps[-1][-1].transformer_list[1][-1].steps[-1][-1]
feature_names = numerical + fitted_encoder.get_feature_names(categorical).tolist()

In [12]:
# Random train/validation split expressed as a boolean mask over the rows of
# df (True = training row, False = validation row). A dedicated seeded
# generator makes the split identical on every run, so the reported train and
# validation errors are reproducible.
n_rows = df.shape[0]
split_rng = np.random.RandomState(0)
train_positions = split_rng.choice(n_rows, int(SUBSAMPLE_TRAIN * n_rows), replace=False)

idx_train = np.zeros(n_rows, dtype=bool)
idx_train[train_positions] = True

In [13]:
# Select the training rows once, then derive the transformed feature matrix
# and the log-scale target from that selection.
train_rows = df.iloc[idx_train]
X_train = pd.DataFrame(
    feature_preproc.transform(train_rows[numerical + categorical]),
    columns=feature_names,
)
y_train = train_rows['log_meter_reading']

In [14]:
# Preview the first rows of the transformed training features.
X_train.head()

Unnamed: 0,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,primary_use_Education,primary_use_Entertainment/public assembly,primary_use_Food sales and service,primary_use_Healthcare,primary_use_Lodging/residential,primary_use_Office,primary_use_Public services,primary_use_Technology/science
0,-1.092873,-0.073289,-0.80468,-1.161514,-0.457978,-0.257321,-0.107227,0.872074,0.567538,-0.026294,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.572417,-0.952847,0.80903,-1.161514,-0.457978,-0.257321,-0.107227,0.872074,0.567538,-0.026294,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.822346,-0.073289,0.80903,-1.161514,-0.457978,-0.257321,-0.107227,0.872074,0.567538,-0.026294,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.13398,-0.659661,3.229594,-1.161514,-0.457978,-0.257321,-0.107227,0.872074,0.567538,-0.026294,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.261265,-0.659661,6.457013,-1.161514,-0.457978,-0.257321,-0.107227,0.872074,0.567538,-0.026294,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train

In [15]:
from xgboost import XGBRegressor

# Gradient-boosted tree regressor, constructed with the library's default hyper-parameters.
model = XGBRegressor()

In [16]:
# Print the current date/time just before the model is fitted.
!date

Sun Nov  3 11:51:07 EST 2019


In [17]:
# Fit the regressor on the transformed training features against the log-scale target.
model.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [18]:
# Print the current date/time again once the fit has finished.
!date

Sun Nov  3 11:52:14 EST 2019


In [19]:
# In-sample predictions: the model is evaluated on the rows it was fitted on.
y_pred = model.predict(X_train)

In [20]:
from sklearn.metrics import mean_squared_error

# Training error: mean squared error between the log-scale target and the
# in-sample predictions.
mean_squared_error(y_true=y_train, y_pred=y_pred)

3.4341690123592357

# Validate

In [21]:
# Validation rows are exactly the rows left out of the training mask.
val_rows = df.iloc[~idx_train]
X_val = pd.DataFrame(
    feature_preproc.transform(val_rows[numerical + categorical]),
    columns=feature_names,
)
y_val = val_rows['log_meter_reading']

In [22]:
# Sanity check of the split: shape of the validation matrix and number of training rows.
X_val.shape, idx_train.sum()

((126143, 18), 1135283)

In [23]:
# Predictions for the held-out validation rows (rebinds y_pred).
y_pred = model.predict(X_val)

In [24]:
from sklearn.metrics import mean_squared_error

# Validation error: mean squared error between the held-out log-scale target
# and the predictions made for those rows.
mean_squared_error(y_true=y_val, y_pred=y_pred)

3.436085755807605

# Test

In [25]:
# Build the test feature table. Left joins keep every test row even when no
# building or weather record matches, so no row_id is silently dropped from
# the submission. Unmatched rows get NaN features, which the imputers inside
# feature_preproc fill in.
df = (
    test
    .merge(building_metadata, on='building_id', how='left')
    .merge(weather_test, on=['site_id', 'timestamp'], how='left')
)

In [None]:
# Keep only the test rows of the meter type selected by METER.
is_selected_meter = df['meter'] == METER
df = df[is_selected_meter]

In [None]:
# Apply the already-fitted pre-processing pipeline to the test rows.
test_features = feature_preproc.transform(df[numerical + categorical])
X_test = pd.DataFrame(test_features, columns=feature_names)

In [24]:
# Log-scale predictions for the test rows (rebinds y_pred).
y_pred = model.predict(X_test)

In [25]:
# Undo the log(1 + x) target transform. np.expm1 is the exact inverse of
# np.log1p; the result is floored at zero because the regressor can output
# negative log-scale values, which would otherwise become negative readings.
predicted_reading = np.clip(np.expm1(y_pred), 0, None)
submit = pd.DataFrame({'id': df['row_id'], 'meter_reading': np.round(predicted_reading, 4)})

In [26]:
# Create the output directory on first use; to_csv does not create missing directories.
os.makedirs('output', exist_ok=True)
submit.to_csv('output/result_meter{}_test.csv'.format(METER), index=False)

# Save model

In [25]:
import dill as pickle

# Create the target directory on first use; open(..., 'wb') fails when it is missing.
os.makedirs('model', exist_ok=True)

# Persist the trained regressor and the fitted pre-processing pipeline side by
# side, one file each, with the meter code in the file name.
for path_template, artifact in (
    ('model/model_xgboost_meter{}.p', model),
    ('model/feature_preproc_meter{}.p', feature_preproc),
):
    with open(path_template.format(METER), 'wb') as file:
        pickle.dump(artifact, file)