# Import libraries and load data

In [1]:
import os

import joblib
import numpy as np
import pandas as pd
import sklearn
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold, cross_val_score

# models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the training split and index it by the "date" column.
# Chained set_index (instead of inplace=True) keeps the cell a single,
# side-effect-free assignment.
training_df = pd.read_csv("datasets/training.csv").set_index("date")

# Pick target and features by column name rather than by position, so the
# target cannot end up inside X_train if the CSV column order ever changes.
y_train = training_df["Appliances"]
X_train = training_df.drop(columns="Appliances")
training_df

Unnamed: 0_level_0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.00,45.566667,...,18.2000,48.900000,17.033333,45.530000,6.600000,733.5,92.000000,7.000000,63.000000,5.300000
2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.00,45.992500,...,18.2000,48.863333,17.066667,45.560000,6.483333,733.6,92.000000,6.666667,59.166667,5.200000
2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.89,45.723333,...,18.1000,48.590000,17.000000,45.400000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000
2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.89,45.530000,...,18.1000,48.590000,17.000000,45.400000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000
2016-01-11 17:50:00,50,40,19.890000,46.026667,19.200000,44.500000,19.790000,44.933333,18.89,45.730000,...,18.1000,48.590000,17.000000,45.290000,6.016667,734.0,92.000000,5.333333,43.833333,4.800000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-05-27 17:00:00,110,0,25.600000,47.193333,25.968571,42.528571,27.390000,41.030000,24.70,45.626667,...,24.6000,50.863333,23.200000,46.751429,23.000000,755.2,55.000000,3.000000,22.000000,13.400000
2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.70,45.590000,...,24.7000,50.074000,23.200000,46.790000,22.733333,755.2,55.666667,3.333333,23.666667,13.333333
2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.70,45.590000,...,24.7000,49.790000,23.200000,46.790000,22.600000,755.2,56.000000,3.500000,24.500000,13.300000
2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.70,45.790000,...,24.6625,49.518750,23.200000,46.817500,22.333333,755.2,56.666667,3.833333,26.166667,13.233333


# Train Models

In [3]:
# Shared seed for the forest and the CV splitter.
SEED = 1337
# Directory that receives the pickled estimators.
MODEL_DIR = "models"

# Candidate regressors; the key is used in printed output and in file names.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForest": RandomForestRegressor(n_estimators=10, random_state=SEED),
}

# 10 splits repeated 3 times -> 30 RMSE scores per model.
rkf = RepeatedKFold(n_splits=10, n_repeats=3, random_state=SEED)

# joblib.dump does not create missing parent directories.
os.makedirs(MODEL_DIR, exist_ok=True)

training_results = {}
for name, model in models.items():
    print(name)

    # The scorer returns negated RMSE (higher is better), so flip the sign.
    neg_rmse_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=rkf,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
    )
    rmse_scores = -neg_rmse_scores
    training_results[name] = rmse_scores

    print(np.round(rmse_scores, 2))
    print("-> Mean RMSE", np.round(rmse_scores.mean(), 2), '\n')

    # Fit on the full training set; the instances in `models` are reused for
    # prediction by the testing cell.
    model.fit(X_train, y_train)

    # Only estimators exposing n_estimators are persisted (the value is part of
    # the file name). Checking the attribute instead of the model's name keeps
    # the loop from raising AttributeError when another estimator without
    # n_estimators is added to `models`.
    if hasattr(model, "n_estimators"):
        joblib.dump(model, f"{MODEL_DIR}/{name}_{model.n_estimators}.pkl")

LinearRegression
[ 94.23  94.15  87.04  84.37 101.15  90.72  99.48  89.22  97.19  97.77
  87.53  91.57  88.36 104.04 102.33  90.56  88.53  92.74  99.06  90.23
  96.61  86.11  85.81  97.61 102.33  92.12  90.72  92.19  98.2   93.89]
-> Mean RMSE 93.53 

RandomForest
[78.4  80.9  70.37 67.8  82.26 72.96 80.6  70.72 78.08 78.96 70.82 76.92
 68.77 79.01 76.82 71.52 73.78 77.38 81.35 73.14 74.28 79.21 68.15 77.64
 80.14 71.23 77.24 73.92 80.18 73.99]
-> Mean RMSE 75.55 



In [4]:
# The file name carries the RandomForest n_estimators value. The lookup is
# pulled out of the f-string because reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
rf_n_estimators = models["RandomForest"].n_estimators

# DataFrame.to_csv does not create missing parent directories.
os.makedirs("results/training", exist_ok=True)

# One column per model, one row per cross-validation fold.
traing_results_df = pd.DataFrame(training_results)
traing_results_df.to_csv(f"results/training/training_results_{rf_n_estimators}_.csv")
traing_results_df

Unnamed: 0,LinearRegression,RandomForest
0,94.234284,78.404597
1,94.15105,80.899157
2,87.044941,70.369099
3,84.366899,67.797064
4,101.148619,82.261432
5,90.718328,72.961796
6,99.47983,80.603266
7,89.220983,70.718198
8,97.187303,78.080386
9,97.767544,78.961259


# Test Models

In [5]:
# Load the testing split and index it by the "date" column (chained rather
# than inplace=True, mirroring how the training split is prepared).
testing_df = pd.read_csv("datasets/testing.csv").set_index("date")

# Select target and features by name so the feature matrix never depends on
# the position of the "Appliances" column in the CSV.
y_test = testing_df["Appliances"]
X_test = testing_df.drop(columns="Appliances")

In [6]:
# Score every fitted model on the test split and collect one RMSE per model.
testing_results = {}
for model_name, fitted_model in models.items():
    print(model_name)
    predictions = fitted_model.predict(X_test)
    # RMSE is the square root of the mean squared error.
    test_rmse = np.sqrt(mean_squared_error(y_test, predictions))
    testing_results[model_name] = test_rmse
    print("   -> RMSE", np.round(test_rmse, 2))

LinearRegression
   -> RMSE 94.42
RandomForest
   -> RMSE 73.57


In [7]:
# The file name carries the RandomForest n_estimators value. The lookup is
# pulled out of the f-string because reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
rf_n_estimators = models["RandomForest"].n_estimators

# DataFrame.to_csv does not create missing parent directories.
os.makedirs("results/testing", exist_ok=True)

# Single-row frame: one RMSE value per model.
testing_results_df = pd.DataFrame(testing_results, index=["RMSE"])
testing_results_df.to_csv(f"results/testing/testing_results_{rf_n_estimators}_.csv")
testing_results_df

Unnamed: 0,LinearRegression,RandomForest
RMSE,94.42246,73.565985


# TODO
- Adjust `n_estimators` for RandomForest and GDM