 # Power Load Prediction for West Zone - XGBoost Model (Monthly)

In [1]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot
import pandas as pd
%matplotlib inline
plt.rcParams.update({'figure.figsize': (10, 7), 'figure.dpi': 120})

In [2]:
power_data_west = pd.read_csv("westdata_augmented.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
power_data_west['end'] = pd.to_datetime(power_data_west['end'], infer_datetime_format=True)
indexedDataset = power_data_west.set_index(['end'])

In [4]:
power_data_west['end'].min()

Timestamp('2019-01-29 00:15:00')

In [5]:
power_data_west['end'].max()

Timestamp('2022-01-25 16:00:00')

In [6]:
power_data_west.head()

Unnamed: 0.1,Unnamed: 0,index_x,service,start,end,usage,cost,temp,event_flag,demand,sfm,index_y,dt,dt_iso,humidity,humidity_temp,pt_date
0,0,0,west,2019-01-29 00:00:00,2019-01-29 00:15:00,211.2,,54.0,,844.8,2019-01-29 00,680,1548748800,2019-01-29 08,21,63.86,2019-01-29 00
1,1,1,west,2019-01-29 00:15:00,2019-01-29 00:30:00,210.4,,54.0,,841.6,2019-01-29 00,680,1548748800,2019-01-29 08,21,63.86,2019-01-29 00
2,2,2,west,2019-01-29 00:30:00,2019-01-29 00:45:00,208.0,,54.0,,832.0,2019-01-29 00,680,1548748800,2019-01-29 08,21,63.86,2019-01-29 00
3,3,3,west,2019-01-29 00:45:00,2019-01-29 01:00:00,212.0,,54.0,,848.0,2019-01-29 00,680,1548748800,2019-01-29 08,21,63.86,2019-01-29 00
4,4,4,west,2019-01-29 01:00:00,2019-01-29 01:15:00,203.2,,55.0,,812.8,2019-01-29 01,681,1548752400,2019-01-29 09,21,63.41,2019-01-29 01


In [7]:
power_data_west['month'] = power_data_west['end'].dt.strftime('%b')

In [8]:
from calendar import month_abbr

lower_ma = [m.lower() for m in month_abbr]

# one-liner with Pandas
power_data_west['month_int'] = power_data_west['month'].str.lower().map(lambda m: lower_ma.index(m)).astype('Int8')

In [9]:
power_data_west_mod = power_data_west[['end','demand', 'temp', 'humidity', 'month_int']].copy() 

In [10]:
power_data_west_mod.head()

Unnamed: 0,end,demand,temp,humidity,month_int
0,2019-01-29 00:15:00,844.8,54.0,21,1
1,2019-01-29 00:30:00,841.6,54.0,21,1
2,2019-01-29 00:45:00,832.0,54.0,21,1
3,2019-01-29 01:00:00,848.0,54.0,21,1
4,2019-01-29 01:15:00,812.8,55.0,21,1


In [11]:
power_data_west_mod.describe()

Unnamed: 0,demand,temp,humidity,month_int
count,104596.0,104588.0,104608.0,104608.0
mean,1238.634456,67.39232,52.635401,6.54298
std,302.399419,16.59371,22.815913,3.43464
min,0.0,29.0,3.0,1.0
25%,1008.0,54.0,34.0,4.0
50%,1232.0,66.0,49.0,7.0
75%,1472.0,79.0,70.0,10.0
max,2124.8,114.0,100.0,12.0


In [12]:
power_data_west_monthly = power_data_west_mod.resample('M', on='end').agg({'demand':'sum', 'temp':'max', 'humidity':'max', 'month_int':'max'})

In [13]:
power_data_west_monthly.describe()

Unnamed: 0,demand,temp,humidity,month_int
count,37.0,37.0,37.0,37.0
mean,3501519.0,92.378378,92.675676,6.351351
std,943465.9,14.234839,8.61863,3.568506
min,277216.0,65.0,66.0,1.0
25%,2839475.0,81.0,92.0,3.0
50%,3485974.0,91.0,96.0,6.0
75%,4200819.0,105.0,98.0,9.0
max,4855107.0,114.0,100.0,12.0


In [14]:
power_data_west_monthly

Unnamed: 0_level_0,demand,temp,humidity,month_int
end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-31,277216.0,71.0,92,1
2019-02-28,2218374.4,73.0,97,2
2019-03-31,2509120.0,82.0,96,3
2019-04-30,2498371.21,96.0,95,4
2019-05-31,3713468.8,91.0,100,5
2019-06-30,4560950.4,107.0,82,6
2019-07-31,4615408.0,109.0,72,7
2019-08-31,4078729.6,107.0,81,8
2019-09-30,3485974.4,102.0,95,9
2019-10-31,3125310.4,90.0,91,10


In [15]:
xgboost_data = power_data_west_monthly[['demand', 'temp', 'humidity', 'month_int']].copy() 

dataset.kW.plot()

## XG Boost Model

In [16]:
xgboost_data.head(10)

Unnamed: 0_level_0,demand,temp,humidity,month_int
end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-31,277216.0,71.0,92,1
2019-02-28,2218374.4,73.0,97,2
2019-03-31,2509120.0,82.0,96,3
2019-04-30,2498371.21,96.0,95,4
2019-05-31,3713468.8,91.0,100,5
2019-06-30,4560950.4,107.0,82,6
2019-07-31,4615408.0,109.0,72,7
2019-08-31,4078729.6,107.0,81,8
2019-09-30,3485974.4,102.0,95,9
2019-10-31,3125310.4,90.0,91,10


In [17]:
for i in range(1, 6):
    xgboost_data["demand_lag_{}".format(i)] = xgboost_data['demand'].shift(i)
    xgboost_data["temp_lag_{}".format(i)] = xgboost_data['temp'].shift(i)
    xgboost_data["humidity_lag_{}".format(i)] = xgboost_data['humidity'].shift(i)

In [18]:
xgboost_data.head(10)

Unnamed: 0_level_0,demand,temp,humidity,month_int,demand_lag_1,temp_lag_1,humidity_lag_1,demand_lag_2,temp_lag_2,humidity_lag_2,demand_lag_3,temp_lag_3,humidity_lag_3,demand_lag_4,temp_lag_4,humidity_lag_4,demand_lag_5,temp_lag_5,humidity_lag_5
end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2019-01-31,277216.0,71.0,92,1,,,,,,,,,,,,,,,
2019-02-28,2218374.4,73.0,97,2,277216.0,71.0,92.0,,,,,,,,,,,,
2019-03-31,2509120.0,82.0,96,3,2218374.4,73.0,97.0,277216.0,71.0,92.0,,,,,,,,,
2019-04-30,2498371.21,96.0,95,4,2509120.0,82.0,96.0,2218374.4,73.0,97.0,277216.0,71.0,92.0,,,,,,
2019-05-31,3713468.8,91.0,100,5,2498371.21,96.0,95.0,2509120.0,82.0,96.0,2218374.4,73.0,97.0,277216.0,71.0,92.0,,,
2019-06-30,4560950.4,107.0,82,6,3713468.8,91.0,100.0,2498371.21,96.0,95.0,2509120.0,82.0,96.0,2218374.4,73.0,97.0,277216.0,71.0,92.0
2019-07-31,4615408.0,109.0,72,7,4560950.4,107.0,82.0,3713468.8,91.0,100.0,2498371.21,96.0,95.0,2509120.0,82.0,96.0,2218374.4,73.0,97.0
2019-08-31,4078729.6,107.0,81,8,4615408.0,109.0,72.0,4560950.4,107.0,82.0,3713468.8,91.0,100.0,2498371.21,96.0,95.0,2509120.0,82.0,96.0
2019-09-30,3485974.4,102.0,95,9,4078729.6,107.0,81.0,4615408.0,109.0,72.0,4560950.4,107.0,82.0,3713468.8,91.0,100.0,2498371.21,96.0,95.0
2019-10-31,3125310.4,90.0,91,10,3485974.4,102.0,95.0,4078729.6,107.0,81.0,4615408.0,109.0,72.0,4560950.4,107.0,82.0,3713468.8,91.0,100.0


In [19]:
xgboost_data.tail(10)

Unnamed: 0_level_0,demand,temp,humidity,month_int,demand_lag_1,temp_lag_1,humidity_lag_1,demand_lag_2,temp_lag_2,humidity_lag_2,demand_lag_3,temp_lag_3,humidity_lag_3,demand_lag_4,temp_lag_4,humidity_lag_4,demand_lag_5,temp_lag_5,humidity_lag_5
end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2021-04-30,3342217.6,97.0,100,4,3699497.59,86.0,98.0,3039411.2,74.0,99.0,3484392.0,76.0,98.0,3459385.6,79.0,98.0,3399993.6,90.0,96.0
2021-05-31,4185731.2,103.0,92,5,3342217.6,97.0,100.0,3699497.59,86.0,98.0,3039411.2,74.0,99.0,3484392.0,76.0,98.0,3459385.6,79.0,98.0
2021-06-30,4855107.24,114.0,92,6,4185731.2,103.0,92.0,3342217.6,97.0,100.0,3699497.59,86.0,98.0,3039411.2,74.0,99.0,3484392.0,76.0,98.0
2021-07-31,3984480.0,114.0,79,7,4855107.24,114.0,92.0,4185731.2,103.0,92.0,3342217.6,97.0,100.0,3699497.59,86.0,98.0,3039411.2,74.0,99.0
2021-08-31,4376488.0,110.0,77,8,3984480.0,114.0,79.0,4855107.24,114.0,92.0,4185731.2,103.0,92.0,3342217.6,97.0,100.0,3699497.59,86.0,98.0
2021-09-30,4704580.8,112.0,96,9,4376488.0,110.0,77.0,3984480.0,114.0,79.0,4855107.24,114.0,92.0,4185731.2,103.0,92.0,3342217.6,97.0,100.0
2021-10-31,4200819.2,99.0,97,10,4704580.8,112.0,96.0,4376488.0,110.0,77.0,3984480.0,114.0,79.0,4855107.24,114.0,92.0,4185731.2,103.0,92.0
2021-11-30,4068856.0,80.0,98,11,4200819.2,99.0,97.0,4704580.8,112.0,96.0,4376488.0,110.0,77.0,3984480.0,114.0,79.0,4855107.24,114.0,92.0
2021-12-31,3493072.0,81.0,99,12,4068856.0,80.0,98.0,4200819.2,99.0,97.0,4704580.8,112.0,96.0,4376488.0,110.0,77.0,3984480.0,114.0,79.0
2022-01-31,2618046.38,75.0,98,1,3493072.0,81.0,99.0,4068856.0,80.0,98.0,4200819.2,99.0,97.0,4704580.8,112.0,96.0,4376488.0,110.0,77.0


In [20]:
xgboost_data.dtypes

demand            float64
temp              float64
humidity            int64
month_int            Int8
demand_lag_1      float64
temp_lag_1        float64
humidity_lag_1    float64
demand_lag_2      float64
temp_lag_2        float64
humidity_lag_2    float64
demand_lag_3      float64
temp_lag_3        float64
humidity_lag_3    float64
demand_lag_4      float64
temp_lag_4        float64
humidity_lag_4    float64
demand_lag_5      float64
temp_lag_5        float64
humidity_lag_5    float64
dtype: object

In [21]:
from sklearn import preprocessing
lbl = preprocessing.LabelEncoder()
xgboost_data['month_int'] = lbl.fit_transform(xgboost_data['month_int'].astype(str))

In [22]:
xgboost_data.dtypes

demand            float64
temp              float64
humidity            int64
month_int           int64
demand_lag_1      float64
temp_lag_1        float64
humidity_lag_1    float64
demand_lag_2      float64
temp_lag_2        float64
humidity_lag_2    float64
demand_lag_3      float64
temp_lag_3        float64
humidity_lag_3    float64
demand_lag_4      float64
temp_lag_4        float64
humidity_lag_4    float64
demand_lag_5      float64
temp_lag_5        float64
humidity_lag_5    float64
dtype: object

In [23]:
xgboost_data

Unnamed: 0_level_0,demand,temp,humidity,month_int,demand_lag_1,temp_lag_1,humidity_lag_1,demand_lag_2,temp_lag_2,humidity_lag_2,demand_lag_3,temp_lag_3,humidity_lag_3,demand_lag_4,temp_lag_4,humidity_lag_4,demand_lag_5,temp_lag_5,humidity_lag_5
end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2019-01-31,277216.0,71.0,92,0,,,,,,,,,,,,,,,
2019-02-28,2218374.4,73.0,97,4,277216.0,71.0,92.0,,,,,,,,,,,,
2019-03-31,2509120.0,82.0,96,5,2218374.4,73.0,97.0,277216.0,71.0,92.0,,,,,,,,,
2019-04-30,2498371.21,96.0,95,6,2509120.0,82.0,96.0,2218374.4,73.0,97.0,277216.0,71.0,92.0,,,,,,
2019-05-31,3713468.8,91.0,100,7,2498371.21,96.0,95.0,2509120.0,82.0,96.0,2218374.4,73.0,97.0,277216.0,71.0,92.0,,,
2019-06-30,4560950.4,107.0,82,8,3713468.8,91.0,100.0,2498371.21,96.0,95.0,2509120.0,82.0,96.0,2218374.4,73.0,97.0,277216.0,71.0,92.0
2019-07-31,4615408.0,109.0,72,9,4560950.4,107.0,82.0,3713468.8,91.0,100.0,2498371.21,96.0,95.0,2509120.0,82.0,96.0,2218374.4,73.0,97.0
2019-08-31,4078729.6,107.0,81,10,4615408.0,109.0,72.0,4560950.4,107.0,82.0,3713468.8,91.0,100.0,2498371.21,96.0,95.0,2509120.0,82.0,96.0
2019-09-30,3485974.4,102.0,95,11,4078729.6,107.0,81.0,4615408.0,109.0,72.0,4560950.4,107.0,82.0,3713468.8,91.0,100.0,2498371.21,96.0,95.0
2019-10-31,3125310.4,90.0,91,1,3485974.4,102.0,95.0,4078729.6,107.0,81.0,4615408.0,109.0,72.0,4560950.4,107.0,82.0,3713468.8,91.0,100.0


In [24]:
from sklearn.ensemble import RandomForestRegressor

In [25]:
def timeseries_train_test_split(X, y, test_size):
    
    # get the index after which test set starts
    test_index = int(len(X)*(1-test_size))
    
    X_train = X.iloc[:test_index]
    y_train = y.iloc[:test_index]
    X_test = X.iloc[test_index:]
    y_test = y.iloc[test_index:]
    
    return X_train, X_test, y_train, y_test

In [26]:
y = xgboost_data.dropna()['demand']
X = xgboost_data.dropna().drop(['demand', 'temp', 'humidity'], axis=1)

In [27]:
X_train, X_test, y_train, y_test = timeseries_train_test_split(X, y, test_size=0.3)

In [28]:
X_train.shape

(22, 16)

In [29]:
y_train.shape

(22,)

In [30]:
X_test.shape

(10, 16)

In [31]:
y_test.shape

(10,)

In [32]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor

print('Starting Model Training')

xgboost_model = XGBRegressor(n_estimators=1000, max_depth=10, eta=0.1, subsample=0.7, colsample_bytree=0.8)

xgboost_model.fit(X_train, y_train)

print(' Model Training Completed')

Starting Model Training


AttributeError: dlsym(0x219017b20, XGDMatrixCreateFromDense): symbol not found

In [None]:
from sklearn.metrics import r2_score, median_absolute_error, mean_absolute_error
from sklearn.metrics import median_absolute_error, mean_squared_error, mean_squared_log_error
from sklearn.metrics import mean_squared_error



def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def mean_absolute_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred)))

def mse_error(y_true, y_pred): 
    return mean_squared_error(y_true, y_pred, squared=True)

def rmse_error(y_true, y_pred): 
    return mean_squared_error(y_true, y_pred, squared=False)

In [None]:
def plotModelTrainingResults(model, X_train=X_train, X_test=X_test):
  
    prediction = model.predict(X_train)
    
    plt.figure(figsize=(15, 10))
    plt.plot(prediction, "g", label="prediction", linewidth=2.0)
    plt.plot(y_train.values, label="actual", linewidth=2.0, alpha=0.5)

    mape_error_var = mean_absolute_percentage_error(prediction, y_train)
    mae_error_var = mean_absolute_error(prediction, y_train)
    mse_error_var = mse_error(prediction, y_train)
    rmse_error_var = rmse_error(prediction, y_train)
    
    plt.title("Mean absolute percentage error {0:.2f}%".format(mae_error_var))
    plt.legend(loc="best")
    plt.tight_layout()
    plt.grid(True);
    
    results = pd.DataFrame({'Actual':y_train, 
                            'Prediction':prediction  
                           })
    
    return results, mape_error_var, mae_error_var, mse_error_var, rmse_error_var
    
train_results, train_mape_error_var, train_mae_error_var, train_mse_error_var, train_rmse_error_var = plotModelTrainingResults(xgboost_model)

In [None]:
train_results

In [None]:
train_results.to_csv('Monthly_XGBoost_Model_Train_Results.csv')

In [None]:
print('MAPE: ' , train_mape_error_var)
print('MAE: ', train_mae_error_var)
print('MSE: ', train_mse_error_var)
print('RMSE: ', train_rmse_error_var)

In [None]:
def plotModelPredictions(model, X_train=X_train, X_test=X_test):
  
    prediction = model.predict(X_test)
    
    plt.figure(figsize=(15, 10))
    plt.plot(prediction, "g", label="prediction", linewidth=2.0)
    plt.plot(y_test.values, label="actual", linewidth=2.0, alpha=0.5)

    mape_error_var = mean_absolute_percentage_error(prediction, y_test)
    mae_error_var = mean_absolute_error(prediction, y_test)
    mse_error_var = mse_error(prediction, y_test)
    rmse_error_var = rmse_error(prediction, y_test)
    
    plt.title("Mean absolute percentage error {0:.2f}%".format(mae_error_var))
    plt.legend(loc="best")
    plt.tight_layout()
    plt.grid(True);
    
    results = pd.DataFrame({'Actual':y_test, 
                            'Prediction':prediction  
                           })
    
    return results, mape_error_var, mae_error_var, mse_error_var, rmse_error_var
    
results, mape_error_var, mae_error_var, mse_error_var, rmse_error_var = plotModelPredictions(xgboost_model)

In [None]:
results.tail(10)

In [None]:
print('MAPE: ' , mape_error_var)
print('MAE: ', mae_error_var)
print('MSE: ', mse_error_var)
print('RMSE: ', rmse_error_var)

In [358]:
import joblib
joblib.dump(xgboost_model, "./xgboost_model_monthly.joblib")

['./xgboost_model_monthly.joblib']

In [359]:
results.to_csv('Monthly_XGBoost_Model_Test_Results.csv')