In [1]:
import sys
sys.path.insert(0, '/home/jovyan/work/Molecular_Properties/model-pipeline/src')
import pandas as pd
import warnings
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#from model_construction import prepare_data_split, run_model, show_varimp
from bearinmind_pipeline.data_preprocessing import dataPreprocessing as dp
#from bearinmind_pipeline.model_construction import modelBuilder as mb
import bearinmind_pipeline.model_construction as mc
#from bearinmind_pipeline import data_preprocessing, model_construction, results_blend
import gc
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import os
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from fbprophet import Prophet
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
np.set_printoptions(suppress=True)
from multiprocessing import Pool, cpu_count
import time


Using TensorFlow backend.


In [2]:
import datetime
from tqdm import tqdm_notebook, tqdm 

In [3]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')
mapping = pd.read_csv('data/building_metadata.csv')
weather_train = pd.read_csv('data/weather_train.csv')
weather_test = pd.read_csv('data/weather_test.csv')

In [4]:
df_train = df_train.merge(mapping, how = "left", on = "building_id")

In [5]:
df_test = df_test.merge(mapping, how = "left", on = "building_id")

In [6]:
df_train = df_train.merge(weather_train, how = "left", on = ["site_id", "timestamp"])
df_test = df_test.merge(weather_test, how = "left", on =  ["site_id", "timestamp"])

In [7]:
def rmsle(y, yhat):
    score = np.sqrt(np.mean(np.power(np.log(yhat+1)-np.log(y.reset_index(drop=True)+1), 2)))
    return(score)

In [8]:
df_train["timestamp"] = pd.to_datetime(df_train["timestamp"])

In [None]:
df_test["timestamp"] = pd.to_datetime(df_test["timestamp"])

In [None]:
df_train["air_temperature"] = df_train["air_temperature"].fillna(df_train["air_temperature"].mean())
df_test["air_temperature"] = df_test["air_temperature"].fillna(df_test["air_temperature"].mean())
df_train["dew_temperature"] = df_train["dew_temperature"].fillna(df_train["dew_temperature"].mean())
df_test["dew_temperature"] = df_test["dew_temperature"].fillna(df_test["dew_temperature"].mean())
df_train["sea_level_pressure"] = df_train["sea_level_pressure"].fillna(df_train["sea_level_pressure"].mean())
df_test["sea_level_pressure"] = df_test["sea_level_pressure"].fillna(df_test["sea_level_pressure"].mean())
df_train["wind_direction"] = df_train["wind_direction"].fillna(df_train["wind_direction"].mean())
df_test["wind_direction"] = df_test["wind_direction"].fillna(df_test["wind_direction"].mean())
df_train["wind_speed"] = df_train["wind_speed"].fillna(df_train["wind_speed"].mean())
df_test["wind_speed"] = df_test["wind_speed"].fillna(df_test["wind_speed"].mean())

In [None]:
def run_prophet(build_idx):
    #for i in build_idx:
        scores = []
        df_train_build = df_train[df_train["building_id"] == build_idx]
        df_test_build = df_test[df_test["building_id"] == build_idx]

        #print(f'building {i} is being trained ...')
        
        
        for j in df_train_build["meter"].unique():
            #print(f'Jumping to {j} meter ...')

            df_train_build_meter = df_train_build[df_train_build["meter"] == j][["timestamp", "meter_reading", 
                                                                                 "air_temperature", 
                                                                                 "dew_temperature", 
                                                                                 "sea_level_pressure",
                                                                                 "wind_direction", 
                                                                                 "wind_speed"]].reset_index(drop=True)
            df_test_build_meter = df_test_build[df_test_build["meter"] == j][["row_id","timestamp",
                                                                             "air_temperature", 
                                                                             "dew_temperature", 
                                                                             "sea_level_pressure",
                                                                             "wind_direction", 
                                                                             "wind_speed"]]

            df_train_build_meter.columns = ["ds", "y", "air_temperature", "dew_temperature",
                                                                        "sea_level_pressure",
                                                                        "wind_direction", 
                                                                        "wind_speed"]
            df_test_build_meter.columns = ["row_id", "ds", "air_temperature", "dew_temperature", 
                                                                              "sea_level_pressure",
                                                                              "wind_direction", 
                                                                              "wind_speed"]

            z = Prophet(interval_width=0.95, daily_seasonality=True, weekly_seasonality=True,
               seasonality_mode = 'multiplicative')
            z.add_regressor('air_temperature')
            z.add_regressor('dew_temperature')
            z.add_regressor('sea_level_pressure')
            z.add_regressor('wind_direction')
            z.add_regressor('wind_speed')
            z.add_seasonality(name='hourly', period=24, fourier_order=2)
            z.add_seasonality(name='monthly', period=30.5, fourier_order=5)
            z.fit(df_train_build_meter)

            submit = z.predict(df_test_build_meter[["ds", "air_temperature", 
                                   "dew_temperature", 
                                   "sea_level_pressure",
                                   "wind_direction", 
                                   "wind_speed"
                                  ]])

            #submit_pred = fit_model.predict(X_test)
            submit= pd.DataFrame(submit["yhat"])
            submit.columns = ['meter_reading']
            submit = pd.concat([df_test_build_meter['row_id'].reset_index(drop=True), submit], axis = 1)
            #submit_pred.append(submit)
            
            #gc.collect()
            
            print(f'Building {build_idx} is finished ...')
            
            del df_train_build_meter
            del df_test_build_meter

            return(submit)

In [None]:
dict_scores = {}
submit_pred = []
#test_build_idx = range(15)
build_idx = list(df_train["building_id"].unique())

In [None]:
gc.collect()

51

In [None]:
#cpu = int(cpu_count()/3)
start_time = time.time()
cpu = 40

p = Pool(cpu)
predictions = list(tqdm(p.imap(run_prophet, build_idx), total=len(build_idx)))
p.close()
p.join()

print("--- %s seconds ---" % (time.time() - start_time))

  0%|          | 0/1449 [00:00<?, ?it/s]

In [None]:
list(predictions)

In [None]:
final_submit = pd.concat(predictions, axis = 0).reset_index(drop = True)

In [325]:
final_submit.to_csv("submission_prophet.csv", index = False)

Unnamed: 0,row_id,meter_reading
0,0,155.096
1,129,157.546
2,258,155.900
3,387,157.603
4,516,156.628
...,...,...
35035,2259435,-204.551
35036,2259564,-203.025
35037,2259693,-205.838
35038,2259822,-204.293


In [None]:
########## Experiment ###############

In [306]:
### Loop to run on all buildings and meters ###
start_time = time.time()
dict_scores = {}
submit_pred = []
build_idx = df_train["building_id"].unique()
for i in tqdm_notebook(test_build_idx):
    scores = []
    df_train_build = df_train[df_train["building_id"] == i]
    df_test_build = df_test[df_test["building_id"] == i]

    #print(f'building {i} is being trained ...')
    for j in df_train_build["meter"].unique():
        #print(f'Jumping to {j} meter ...')
        
        df_train_build_meter = df_train_build[df_train_build["meter"] == j][["timestamp", "meter_reading", 
                                                                             "air_temperature", 
                                                                             "dew_temperature", 
                                                                             "sea_level_pressure",
                                                                             "wind_direction", 
                                                                             "wind_speed"]].reset_index(drop=True)
        df_test_build_meter = df_test_build[df_test_build["meter"] == j][["row_id","timestamp",
                                                                         "air_temperature", 
                                                                         "dew_temperature", 
                                                                         "sea_level_pressure",
                                                                         "wind_direction", 
                                                                         "wind_speed"]]
        
        df_train_build_meter.columns = ["ds", "y", "air_temperature", "dew_temperature",
                                                                    "sea_level_pressure",
                                                                    "wind_direction", 
                                                                    "wind_speed"]
        df_test_build_meter.columns = ["row_id", "ds", "air_temperature", "dew_temperature", 
                                                                          "sea_level_pressure",
                                                                          "wind_direction", 
                                                                          "wind_speed"]

        z = Prophet(interval_width=0.95, daily_seasonality=True, weekly_seasonality=True,
           seasonality_mode = 'multiplicative')
        z.add_regressor('air_temperature')
        z.add_regressor('dew_temperature')
        z.add_regressor('sea_level_pressure')
        z.add_regressor('wind_direction')
        z.add_regressor('wind_speed')
        z.add_seasonality(name='hourly', period=24, fourier_order=2)
        z.add_seasonality(name='monthly', period=30.5, fourier_order=5)
        z.fit(df_train_build_meter)
        
        submit = z.predict(df_test_build_meter[["ds", "air_temperature", 
                               "dew_temperature", 
                               "sea_level_pressure",
                               "wind_direction", 
                               "wind_speed"
                              ]])
        
        #submit_pred = fit_model.predict(X_test)
        submit= pd.DataFrame(submit["yhat"])
        submit.columns = ['meter_reading']
        submit = pd.concat([df_test_build_meter['row_id'].reset_index(drop=True), submit], axis = 1)
        submit_pred.append(submit)

print("--- %s seconds ---" % (time.time() - start_time))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

--- 129.63623976707458 seconds ---


In [321]:
final_submit = pd.concat(submit_pred, axis = 0).reset_index(drop = True)

In [322]:
final_submit['meter_reading'].isnull().sum()

0

In [323]:
final_submit

Unnamed: 0,row_id,meter_reading
0,0,155.096
1,129,157.546
2,258,155.900
3,387,157.603
4,516,156.628
...,...,...
35035,2259436,33.147
35036,2259565,31.234
35037,2259694,31.262
35038,2259823,31.239
