In [1]:
import os
import re

import dill as pickle
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import make_column_transformer
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, OrdinalEncoder, StandardScaler

import warnings
warnings.filterwarnings("ignore")

In [2]:
def interpolate(df):
    """Fill gaps in the numeric columns of `df` by time-weighted interpolation.

    The frame is copied, its `timestamp` column is parsed into a `ts`
    DatetimeIndex, and every numeric column is interpolated against that index
    (`method='time'`).  `limit_direction='both'` also fills gaps at the start
    and at the end of the frame.

    Only numeric columns are interpolated: non-numeric columns (such as the
    original `timestamp` strings) are passed through untouched, because
    interpolating object-dtype columns is deprecated in recent pandas releases.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `timestamp` column that `pd.to_datetime` can parse.

    Returns
    -------
    pandas.DataFrame
        A copy of `df`, indexed by `ts`, with the numeric gaps filled.
        The input frame is not modified.
    """
    df = df.copy()
    df['ts'] = pd.to_datetime(df['timestamp'])
    df = df.set_index('ts')
    numeric_cols = df.select_dtypes(include='number').columns
    # Nothing to interpolate when the frame has no numeric column at all.
    if len(numeric_cols):
        df[numeric_cols] = df[numeric_cols].interpolate(method='time', limit_direction='both')
    return df

In [3]:
# Directory the input CSV files are read from.
DATA_DIR = 'data/'
# Fraction of the training rows used to fit the feature pre-processing pipeline.
SUBSAMPLE_FEATURE_PREPROC = 0.01

# Meter code -> readable meter name.
map_meter2desc = dict(enumerate(['electricity', 'chilledwater', 'steam', 'hotwater']))

# Numeric columns taken from the merged table as they are.
# ('year_built' is intentionally not listed here.)
numerical = [
    'square_feet', 'floor_count',
    'air_temperature', 'cloud_coverage', 'dew_temperature',
    'precip_depth_1_hr', 'sea_level_pressure', 'wind_speed',
]
# Numeric columns derived during pre-processing.
numerical_aug = ['wind_direction_sin', 'wind_direction_cos', 'year_since_built']

# Categorical columns taken from the merged table as they are.
categorical = ['building_id', 'meter', 'primary_use']
# Categorical columns derived from the timestamp during pre-processing.
categorical_aug = ['month', 'day', 'hour']

# Final column order of the feature matrix.
feature_names = [*numerical, *categorical, *numerical_aug, *categorical_aug]

# Load data 

In [4]:
# Read the five input tables from DATA_DIR, in this order.
building_metadata, weather_train, weather_test, train, test = [
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in (
        'building_metadata.csv',
        'weather_train.csv',
        'weather_test.csv',
        'train.csv',
        'test.csv',
    )
]

# Combine tables

In [5]:
# Attach the static building attributes to every meter reading.
df = train.merge(building_metadata, on='building_id', how='left')

# Interpolate weather data
# Start from every (site_id, timestamp) pair present in the readings, attach the raw
# weather observations, then fill the gaps one site at a time.  The groups are
# iterated explicitly rather than via `groupby(...).apply(...)` so each per-site frame
# keeps its `site_id` column whatever the pandas version (newer releases stop passing
# the grouping column to the applied function).
site_hours = df[['site_id', 'timestamp']].drop_duplicates()\
    .merge(weather_train, on=['site_id', 'timestamp'], how='left')
weather_train_full = pd.concat(
    [interpolate(site_weather) for _, site_weather in site_hours.groupby(by='site_id')]
).reset_index(drop=True)
df = df.merge(weather_train_full, on=['site_id', 'timestamp'], how='left')

# Transform data type
# Vectorised log(x + 1); this is the target the model is trained on.
df['log_meter_reading'] = np.log1p(df['meter_reading'])
df['timestamp'] = pd.to_datetime(df['timestamp'])

# Remove the problematic data
df = df.query('not (meter == 0 and building_id <= 104 and timestamp < "2016-05-21")').reset_index(drop=True)

# Feature pre-processing

In [6]:
# Pre-processing pipeline.  A FeatureUnion builds each feature group from the merged
# frame and a final step wraps the stacked array back into a DataFrame.  The groups
# must stay in the same order as `feature_names`
# (numerical + categorical + numerical_aug + categorical_aug) because the final step
# labels the columns by position.
feature_preproc = make_pipeline(
    FeatureUnion([
        ('numeric_features', make_pipeline(
            FunctionTransformer(lambda x: x[numerical], validate=False),
            # Negative precipitation depths are clipped to 0 and missing ones set to 0.
            FunctionTransformer(
                lambda x: x.assign(precip_depth_1_hr=x['precip_depth_1_hr'].clip(lower=0).fillna(0)),
                validate=False,
            ),
            SimpleImputer(strategy="median"),
        )),
        ('categorical_features', make_pipeline(
            FunctionTransformer(lambda x: x[categorical], validate=False),
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(),
        )),
        ('wind_direction', make_pipeline(
            # Wind direction in degrees -> (sin, cos) pair, computed on the whole
            # column at once instead of element by element.
            FunctionTransformer(lambda x: pd.concat((
                    np.sin(x['wind_direction'] / 360 * 2*np.pi),
                    np.cos(x['wind_direction'] / 360 * 2*np.pi),
                ), axis='columns'), validate=False),
            SimpleImputer(strategy="median"),
        )),
        ('year_since_build', make_pipeline(
            # Building age at the time of the reading.
            FunctionTransformer(lambda x: (x['timestamp'].dt.year - x['year_built']).to_frame(), validate=False),
            SimpleImputer(strategy="median"),
        )),
        ('month', make_pipeline(
            FunctionTransformer(lambda x: x['timestamp'].dt.month.to_frame(), validate=False),
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(),
        )),
        ('day', make_pipeline(
            FunctionTransformer(lambda x: x['timestamp'].dt.day.to_frame(), validate=False),
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(),
        )),
        ('hour', make_pipeline(
            FunctionTransformer(lambda x: x['timestamp'].dt.hour.to_frame(), validate=False),
            SimpleImputer(strategy="most_frequent"),
            OrdinalEncoder(),
        )),
     ]),
    # Name the columns and cast them: categorical features become pandas 'category',
    # everything else float.
    FunctionTransformer(
        lambda x: pd.DataFrame(x, columns=feature_names).astype({
            name: 'category' if name in categorical + categorical_aug else float
            for name in feature_names
        }),
        validate=False,
    ),
)

# The imputers and encoders are fitted on a random subsample of the rows only.
# NOTE(review): a category value (e.g. a building_id) absent from the subsample is
# unknown to its OrdinalEncoder at transform time -- confirm the sample covers them all.
feature_preproc.fit(df.sample(int(SUBSAMPLE_FEATURE_PREPROC * df.shape[0]), random_state=42));

# Train

In [7]:
# Feature matrix from the fitted pipeline; target is the log-transformed reading.
X_train = feature_preproc.transform(df)
y_train = df['log_meter_reading']

X_train.head()

Unnamed: 0,square_feet,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_speed,building_id,meter,primary_use,wind_direction_sin,wind_direction_cos,year_since_built,month,day,hour
0,50623.0,5.0,3.8,0.0,2.4,0.0,1020.9,3.1,105.0,0.0,0.0,-0.866025,-0.5,48.0,0.0,0.0,0.0
1,5374.0,4.0,3.8,0.0,2.4,0.0,1020.9,3.1,106.0,0.0,0.0,-0.866025,-0.5,48.0,0.0,0.0,0.0
2,5374.0,4.0,3.8,0.0,2.4,0.0,1020.9,3.1,106.0,3.0,0.0,-0.866025,-0.5,48.0,0.0,0.0,0.0
3,97532.0,10.0,3.8,0.0,2.4,0.0,1020.9,3.1,107.0,0.0,0.0,-0.866025,-0.5,11.0,0.0,0.0,0.0
4,81580.0,5.0,3.8,0.0,2.4,0.0,1020.9,3.1,108.0,0.0,0.0,-0.866025,-0.5,103.0,0.0,0.0,0.0


In [None]:
# Fixed LightGBM settings shared by every grid candidate.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    feature_fraction=0.85,
    random_state=42,
)

model = lgb.LGBMRegressor(**params)

# Hyper-parameter grid: 4 x 4 x 4 = 64 candidates.
gridParams = dict(
    learning_rate=[0.05, 0.1, 0.2, 0.5],
    n_estimators=[50, 100, 200, 500],
    num_leaves=[15, 31, 63, 127],
)

grid_search = GridSearchCV(
    estimator=model,
    param_grid=gridParams,
    # Negated MSE, so that a larger score means a better candidate.
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    cv=3,
    n_jobs=4,
    verbose=5,
    # No refit on the full data at the end of the search.
    refit=False,
    return_train_score=False,
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  5.0min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 58.1min


In [22]:
# One row per grid candidate: its hyper-parameters plus the mean cross-validated score.
pd.DataFrame(grid_search.cv_results_['params']).assign(
    mean_test_score=grid_search.cv_results_['mean_test_score'],
)

Unnamed: 0,learning_rate,n_estimators,num_leaves,mean_test_score
0,0.05,50,15,-2.235905
1,0.05,50,31,-1.907500
2,0.05,50,63,-1.749959
3,0.05,50,127,-1.628794
4,0.05,100,15,-1.847954
...,...,...,...,...
59,0.50,200,127,-1.524543
60,0.50,500,15,-1.415630
61,0.50,500,31,-1.479727
62,0.50,500,63,-1.484545


In [31]:
# Winning hyper-parameters and their mean CV score; the score is a negated MSE
# (the grid-search scorer is built with greater_is_better=False), so closer to 0 is better.
grid_search.best_params_, grid_search.best_score_

({'learning_rate': 0.05, 'n_estimators': 500, 'num_leaves': 127},
 -1.3218098968333896)

In [None]:
# Train on the complete training set, combining the fixed settings with the
# hyper-parameters selected by the grid search.
model = lgb.train(
    params={**params, **grid_search.best_params_},
    train_set=lgb.Dataset(X_train, label=y_train),
)

# Feature importance

In [24]:
# Features ranked by the importance reported by the trained booster, highest first.
(
    pd.DataFrame(dict(
        feature_name=feature_names,
        feature_importance=model.feature_importance(),
    ))
    .sort_values('feature_importance', ascending=False)
)

Unnamed: 0,feature_name,feature_importance
8,building_id,19180
14,month,8100
15,day,6548
9,meter,5675
2,air_temperature,3881
0,square_feet,3863
4,dew_temperature,3487
16,hour,3103
6,sea_level_pressure,2514
13,year_since_built,1282


# Save model

In [None]:
# Create the target directory first so saving also works on a fresh checkout.
os.makedirs('model', exist_ok=True)

# Booster, written in LightGBM's own model-file format.
model.save_model('model/model_lightgbm.p')

# Fitted pre-processing pipeline.  `pickle` here is dill (see the imports); the
# pipeline is built from lambdas.
with open('model/feature_preproc.p', 'wb') as file:
    pickle.dump(feature_preproc, file)

# Load model

In [25]:
# Restore the fitted pre-processing pipeline (`pickle` is dill here).
# Unpickling executes code from the file: only load artefacts you produced yourself.
with open('model/feature_preproc.p', 'rb') as fh:
    feature_preproc = pickle.load(fh)

# Restore the trained booster from its model file.
model = lgb.Booster(model_file='model/model_lightgbm.p')

# Test

In [26]:
# Attach the building attributes.  This is an inner join: test rows whose building_id
# is missing from the metadata are dropped here.
df = test.merge(building_metadata, on='building_id')

# Interpolate weather data
# Same procedure as for the training data: take every (site_id, timestamp) pair in the
# test rows, attach the raw weather observations, then fill the gaps one site at a
# time.  The groups are iterated explicitly rather than via `groupby(...).apply(...)`
# so each per-site frame keeps its `site_id` column whatever the pandas version.
site_hours = df[['site_id', 'timestamp']].drop_duplicates()\
    .merge(weather_test, on=['site_id', 'timestamp'], how='left')
weather_test_full = pd.concat(
    [interpolate(site_weather) for _, site_weather in site_hours.groupby(by='site_id')]
).reset_index(drop=True)
df = df.merge(weather_test_full, on=['site_id', 'timestamp'], how='left')

# Transform data type
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [27]:
# Apply the already-fitted pre-processing pipeline to the test rows (transform only,
# no re-fitting).
X_test = feature_preproc.transform(df)

X_test.head()

Unnamed: 0,square_feet,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_speed,building_id,meter,primary_use,wind_direction_sin,wind_direction_cos,year_since_built,month,day,hour
0,7432.0,3.0,17.8,4.0,11.7,0.0,1021.4,3.6,0.0,0.0,0.0,0.984808,-0.173648,9.0,0.0,0.0,0.0
1,7432.0,3.0,17.8,2.0,12.8,0.0,1022.0,3.1,0.0,0.0,0.0,0.766044,-0.642788,9.0,0.0,0.0,1.0
2,7432.0,3.0,16.1,0.0,12.8,0.0,1021.9,3.1,0.0,0.0,0.0,0.642788,-0.766044,9.0,0.0,0.0,2.0
3,7432.0,3.0,17.2,0.0,13.3,0.0,1022.2,3.1,0.0,0.0,0.0,0.642788,-0.766044,9.0,0.0,0.0,3.0
4,7432.0,3.0,16.7,2.0,13.3,0.0,1022.3,2.6,0.0,0.0,0.0,0.766044,-0.642788,9.0,0.0,0.0,4.0


In [28]:
# Predictions are on the log(meter_reading + 1) scale the model was trained on.
y_pred = model.predict(X_test)

In [29]:
# Map the log-scale predictions back to meter readings (inverse of log(x + 1)) and
# line them up with the original test rows by row_id.
# Predictions below 0 on the log scale would back-transform to negative readings, so
# they are clipped to 0 (assumes meter readings are never negative).
result = test[['row_id']].merge(
    pd.DataFrame({
        'row_id': df['row_id'],
        'meter_reading': np.round(np.clip(np.expm1(y_pred), 0, None), 4),
    }),
    on='row_id', how='left',
)
# Default value for missing rows
result['meter_reading'] = result['meter_reading'].fillna(0.0)

In [30]:
# Create the output directory first so writing also works on a fresh checkout.
os.makedirs('output', exist_ok=True)
result.to_csv('output/submit.csv', index=False)