# Import libraries and load data

In [1]:
import os
import time

import joblib
import numpy as np
import pandas as pd

# models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# model evaluate
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold, cross_val_score

In [2]:
# Read the training split and key every row by its "date" column.
training_df = pd.read_csv("datasets/training.csv").set_index("date")

# Target is the "Appliances" column; features are every column after the
# first one (positional slice, so it relies on the column order of the CSV).
y_train = training_df["Appliances"]
X_train = training_df.iloc[:, 1:]

# Last expression of the cell -> rendered as the cell output.
training_df

Unnamed: 0_level_0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.00,45.566667,...,18.2000,48.900000,17.033333,45.530000,6.600000,733.5,92.000000,7.000000,63.000000,5.300000
2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.00,45.992500,...,18.2000,48.863333,17.066667,45.560000,6.483333,733.6,92.000000,6.666667,59.166667,5.200000
2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.89,45.723333,...,18.1000,48.590000,17.000000,45.400000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000
2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.89,45.530000,...,18.1000,48.590000,17.000000,45.400000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000
2016-01-11 17:50:00,50,40,19.890000,46.026667,19.200000,44.500000,19.790000,44.933333,18.89,45.730000,...,18.1000,48.590000,17.000000,45.290000,6.016667,734.0,92.000000,5.333333,43.833333,4.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-05-27 17:00:00,110,0,25.600000,47.193333,25.968571,42.528571,27.390000,41.030000,24.70,45.626667,...,24.6000,50.863333,23.200000,46.751429,23.000000,755.2,55.000000,3.000000,22.000000,13.400000
2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.70,45.590000,...,24.7000,50.074000,23.200000,46.790000,22.733333,755.2,55.666667,3.333333,23.666667,13.333333
2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.70,45.590000,...,24.7000,49.790000,23.200000,46.790000,22.600000,755.2,56.000000,3.500000,24.500000,13.300000
2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.70,45.790000,...,24.6625,49.518750,23.200000,46.817500,22.333333,755.2,56.666667,3.833333,26.166667,13.233333


# Train Models

In [3]:
# Candidate regressors, keyed by the short name used in logs and result tables.
models = {
    "LR": LinearRegression(n_jobs=-1),
    # TODO: change RF  n_estimator to optimal RMSE
    "RF": RandomForestRegressor(n_estimators=5, n_jobs=-1, verbose=1, random_state=1337),
    # TODO: change GBM n_estimator to optimal RMSE
    "GBM": GradientBoostingRegressor(n_estimators=5, max_depth=5, verbose=1, random_state=1337),
}

# 10 splits x 3 repeats -> 30 RMSE scores per model; seeded so runs are repeatable.
rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1337)

# Make sure the output locations exist so the cell also runs on a fresh checkout.
os.makedirs("models", exist_ok=True)
os.makedirs("results/training", exist_ok=True)

# Mean cross-validated RMSE per model name.
training_results = {}
for name, model in models.items():
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes.
    start = time.perf_counter()
    print("#########", name, "#########")

    # The scorer returns negated RMSE (higher is better); flip the sign back.
    neg_rmse_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=rkf,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
    )
    rmse_scores = -neg_rmse_scores
    training_results[name] = rmse_scores.mean()
    end = time.perf_counter()

    print(np.round(rmse_scores, 3))
    print(f"-> Runtime: {end-start:.3f} seconds, Mean RMSE: {np.round(rmse_scores.mean(), 3)}\n")

    # cross_val_score fits clones only; fit the estimator itself on the full
    # training set so it can be persisted and reused by later cells.
    model.fit(X_train, y_train)
    # The file name is the estimator's string form, unchanged from the original.
    # NOTE(review): that string contains spaces/parentheses and may be wrapped
    # over several lines for long parameter lists - confirm, and consider a
    # sanitized name if these files must be portable.
    joblib.dump(model, f"models/{model}.pkl")

# One-row summary table (row label "RMSE", one column per model).
traing_results_df = pd.DataFrame(training_results, index=["RMSE"])
# Single quotes inside the f-string keep this valid on Python versions before 3.12.
traing_results_df.to_csv(f"results/training/{models['LR']}_{models['RF']}_{models['GBM']}.csv")
traing_results_df

######### LR #########
[ 94.234  94.151  87.045  84.367 101.149  90.718  99.48   89.221  97.187
  97.768  87.527  91.569  88.36  104.043 102.328  90.564  88.529  92.737
  99.057  90.226  96.605  86.106  85.81   97.608 102.328  92.121  90.723
  92.186  98.202  93.888]
-> Runtime: 4.473 seconds, Mean RMSE: 93.528

######### RF #########


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.8s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.1s remaining:    3.2s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.3s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.4s remaining:    3.6s
[Parallel(n_jobs=-1)]: Done 

[82.542 84.823 74.767 71.737 84.039 78.937 84.267 75.971 81.117 83.522
 73.375 80.272 72.552 85.369 80.646 74.203 76.756 81.856 84.497 76.609
 77.09  82.096 73.078 81.    84.332 74.175 80.517 79.045 81.653 79.603]
-> Runtime: 11.125 seconds, Mean RMSE: 79.348



[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    0.6s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.6s finished


######### GBM #########
[ 97.488  97.675  90.219  88.588 104.151  92.201 103.414  92.771  98.453
 101.265  90.401  94.807  90.848 105.792 104.11   92.696  93.779  96.025
 105.593  90.958  99.218  88.343  85.907 100.249 103.452  95.638  96.8
  96.085 102.844  96.488]
-> Runtime: 4.310 seconds, Mean RMSE: 96.542

      Iter       Train Loss   Remaining Time 
         1       10109.6321            0.58s
         2        9755.4868            0.45s
         3        9455.9143            0.30s
         4        9188.0192            0.15s
         5        8955.7233            0.00s


Unnamed: 0,LR,RF,GBM
RMSE,93.527881,79.348269,96.541892


# Test Models

In [4]:
# Read the held-out split and key every row by its "date" column.
testing_df = pd.read_csv("datasets/testing.csv").set_index("date")

# Target is the "Appliances" column; features are every column after the
# first one (positional slice, so it relies on the column order of the CSV).
y_test = testing_df["Appliances"]
X_test = testing_df.iloc[:, 1:]

# Last expression of the cell -> rendered as the cell output.
testing_df

Unnamed: 0_level_0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,45.890000,...,18.200000,48.730000,17.000000,45.500000,6.366667,733.700000,92.000000,6.333333,55.333333,5.100000
2016-01-11 18:50:00,580,60,20.066667,46.396667,19.426667,44.400000,19.790000,44.826667,19.000000,46.430000,...,18.066667,48.633333,16.890000,45.290000,5.983333,734.433333,91.166667,5.833333,40.000000,4.616667
2016-01-11 19:00:00,430,50,20.133333,48.000000,19.566667,44.400000,19.890000,44.900000,19.000000,46.363333,...,18.066667,48.560000,16.963333,45.290000,6.000000,734.500000,91.000000,6.000000,40.000000,4.600000
2016-01-11 19:10:00,250,40,20.260000,52.726667,19.730000,45.100000,19.890000,45.493333,19.000000,47.223333,...,18.033333,48.666667,16.890000,45.326667,6.000000,734.616667,90.500000,6.000000,40.000000,4.516667
2016-01-11 19:30:00,100,10,20.566667,53.893333,20.033333,46.756667,20.100000,48.466667,19.000000,48.490000,...,18.150000,49.200000,16.890000,45.326667,6.000000,734.850000,89.500000,6.000000,40.000000,4.350000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-05-27 15:00:00,60,0,25.500000,45.933333,26.277143,41.000000,28.356667,40.560000,24.666667,45.400000,...,23.963333,49.000000,23.100000,46.590000,21.800000,755.800000,59.000000,2.000000,21.000000,13.300000
2016-05-27 15:30:00,80,0,25.500000,45.590000,26.100000,41.000000,28.200000,40.126667,24.700000,45.163333,...,24.000000,48.790000,23.166667,46.590000,22.300000,755.650000,57.000000,2.000000,22.000000,13.250000
2016-05-27 16:30:00,220,0,25.426667,46.060000,26.000000,41.700000,28.000000,40.760000,24.700000,45.400000,...,24.356667,51.333333,23.200000,46.700000,22.900000,755.350000,55.000000,2.500000,22.500000,13.300000
2016-05-27 17:10:00,90,0,25.533333,46.860000,25.978000,42.534000,27.323333,41.090000,24.700000,45.626667,...,24.666667,50.445000,23.200000,46.745000,22.866667,755.200000,55.333333,3.166667,22.833333,13.366667


In [5]:
# Make sure the output location exists so the cell also runs on a fresh checkout.
os.makedirs("results/testing", exist_ok=True)

# Hold-out RMSE per model name. Uses the estimators in `models`, which the
# training cell fitted on the full training set.
testing_results = {}
for name, model in models.items():
    print(name)
    y_pred = model.predict(X_test)
    # RMSE = sqrt(MSE), i.e. the error is expressed in the target's own units.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    testing_results[name] = rmse
    print("   -> RMSE", np.round(rmse, 3))

# One-row summary table (row label "RMSE", one column per model).
testing_results_df = pd.DataFrame(testing_results, index=["RMSE"])
# Single quotes inside the f-string keep this valid on Python versions before 3.12.
testing_results_df.to_csv(f"results/testing/{models['LR']}_{models['RF']}_{models['GBM']}.csv")
testing_results_df

LR
   -> RMSE 94.422
RF
   -> RMSE 78.151
GBM
   -> RMSE 97.07


[Parallel(n_jobs=5)]: Using backend ThreadingBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    0.0s finished


Unnamed: 0,LR,RF,GBM
RMSE,94.42246,78.150972,97.069656


      Iter       Train Loss   Remaining Time 
         1       10220.3280            1.23s
         2        9880.0368            0.85s
         3        9571.6909            0.53s
         4        9307.7568            0.26s
         5        9069.6729            0.00s
      Iter       Train Loss   Remaining Time 
         1        9875.9247            1.06s
         2        9544.9999            0.75s
         3        9276.3753            0.47s
         4        8996.7679            0.23s
         5        8759.8012            0.00s
      Iter       Train Loss   Remaining Time 
         1       10002.7666            0.73s
         2        9670.8944            0.55s
         3        9379.8886            0.37s
         4        9115.7174            0.18s
         5        8894.8426            0.00s
      Iter       Train Loss   Remaining Time 
         1       10273.0461            1.01s
         2        9950.4136            0.77s
         3        9630.8855            0.51s
      