In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer

from statsmodels.tsa.statespace.sarimax import SARIMAX
import pmdarima as pm

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("../data/historic_demand_2009_2023.csv")

In [4]:
elec_df = df[192_816: 194_788]

In [5]:
elec_df = elec_df[["settlement_date", "embedded_wind_generation"]]

In [6]:
elec_df.head()

Unnamed: 0,settlement_date,embedded_wind_generation
192816,2020-01-01,1244
192817,2020-01-01,1188
192818,2020-01-01,1156
192819,2020-01-01,1125
192820,2020-01-01,1106


In [7]:
wind_df = pd.read_csv("../data/Hull_2020-01-01_to_2020-02-10.csv")
wind_df["windgust"].tail()


979    53.6
980    49.7
981    57.8
982    65.0
983    78.4
Name: windgust, dtype: float64

In [8]:
wind_df = wind_df[["datetime","windgust", "windspeed"]]

In [9]:
wind_df['windgust'] = wind_df['windgust'].fillna(0)

In [10]:
wind_df.head()

Unnamed: 0,datetime,windgust,windspeed
0,2020-01-01T00:00:00,0.0,1.7
1,2020-01-01T01:00:00,0.0,4.1
2,2020-01-01T02:00:00,0.0,4.0
3,2020-01-01T03:00:00,0.0,4.3
4,2020-01-01T04:00:00,0.0,4.3


In [11]:
wind_df_long = pd.DataFrame(np.repeat(wind_df.values, 2, axis=0))
wind_df_long.columns = wind_df.columns
wind_df_long.head()

Unnamed: 0,datetime,windgust,windspeed
0,2020-01-01T00:00:00,0.0,1.7
1,2020-01-01T00:00:00,0.0,1.7
2,2020-01-01T01:00:00,0.0,4.1
3,2020-01-01T01:00:00,0.0,4.1
4,2020-01-01T02:00:00,0.0,4.0


In [12]:
wind_df_long["windspeed"] = wind_df_long["windspeed"]*100

In [13]:
wind_df_long.head()

Unnamed: 0,datetime,windgust,windspeed
0,2020-01-01T00:00:00,0.0,170.0
1,2020-01-01T00:00:00,0.0,170.0
2,2020-01-01T01:00:00,0.0,410.0
3,2020-01-01T01:00:00,0.0,410.0
4,2020-01-01T02:00:00,0.0,400.0


In [31]:
test_begin_point = 1550

In [32]:
endog_train = elec_df["embedded_wind_generation"][:test_begin_point].to_numpy()
endog_train = np.array(endog_train, dtype=float)
len(endog_train)

1550

In [33]:
endog_test_df = elec_df["embedded_wind_generation"][test_begin_point:-4]
endog_test = np.array(endog_test_df, dtype=float)
len(endog_test)

418

In [34]:
exog_train = wind_df_long["windspeed"][:test_begin_point].to_numpy()
exog_train = np.array(exog_train, dtype=float)
exog_train = exog_train
len(exog_train)

1550

In [35]:
exog_test = wind_df_long["windspeed"][test_begin_point:].to_numpy()
exog_test = np.array(exog_test, dtype=float)
exog_test = exog_test
len(exog_test)

418

In [36]:
# import pmdarima as pm
# sarimax = pm.auto_arima(elec_df["embedded_wind_generation"], exogenous=wind_df_long["windspeed"],
#                            start_p=0, start_q=0,
#                            test='adf',
#                            max_p=2, max_q=2, m=12,
#                            start_P=0, seasonal=True,
#                            d=None, D=1, trace=True,
#                            suppress_warnings=True, 
#                            stepwise=True)

In [37]:
SARIMAX(endog=endog_train,
        exog=exog_train,
        order=(2, 0, 2),seasonal_order=(2,1,0,12)
       )

<statsmodels.tsa.statespace.sarimax.SARIMAX at 0x1541348b0>

In [38]:
mod = SARIMAX(endog_train, order=(1, 0, 0),seasonal_order=(2,1,0,12), exog=exog_train, alpha=0.02, beta=0.02)
mod = mod.fit(maxiter=300)

RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f=  7.01819D+00    |proj g|=  8.28595D-01


 This problem is unconstrained.



At iterate    5    f=  6.75014D+00    |proj g|=  3.11250D-02

At iterate   10    f=  6.73331D+00    |proj g|=  7.16262D-03

At iterate   15    f=  6.69953D+00    |proj g|=  1.64016D-01

At iterate   20    f=  6.60263D+00    |proj g|=  4.97657D-02

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    5     24     30      1     0     0   8.103D-06   6.601D+00
  F =   6.6006166277252722     

CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            


In [39]:
print(mod.summary())

                                     SARIMAX Results                                      
Dep. Variable:                                  y   No. Observations:                 1550
Model:             SARIMAX(1, 0, 0)x(2, 1, 0, 12)   Log Likelihood              -10230.956
Date:                            Tue, 21 Mar 2023   AIC                          20471.912
Time:                                    19:59:28   BIC                          20498.603
Sample:                                         0   HQIC                         20481.843
                                           - 1550                                         
Covariance Type:                              opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1             0.0004      0.014      0.027      0.979      -0.028       0.029
ar.L1          1.0000      0.004   

In [47]:
# Forecast
forecast_length = 100
results = mod.get_forecast(forecast_length, exog = exog_test[:forecast_length], alpha=0.05)
forecast = results.predicted_mean
confidence_int = results.conf_int()

In [48]:
def plot_forecast(fc, train, test, windspeed, upper=None, lower=None):
    is_confidence_int = isinstance(upper, np.ndarray) and isinstance(lower, np.ndarray)
    # Prepare plot series
    fc_series = pd.Series(fc, index=test.index)
    wind_series = pd.Series(windspeed)
    wind_series = wind_series
    lower_series = pd.Series(upper, index=test.index) if is_confidence_int else None
    upper_series = pd.Series(lower, index=test.index) if is_confidence_int else None

    # Plot
    plt.figure(figsize=(10,4), dpi=100)
    plt.plot(train, label='training generation', color='black')
    plt.plot(test, label='actual generation', color='black', ls='--')
    plt.plot(fc_series, label='forecast generation', color='orange')
    plt.plot(wind_series, label='wind speed in hull', color='blue')
    if is_confidence_int:
        plt.fill_between(lower_series.index, lower_series, upper_series, color='k', alpha=.15)
    plt.title('Forecast vs Actuals')
    plt.legend(loc='upper left', fontsize=8);

In [None]:
endog_test = pd.DataFrame(endog_test)
endog_test.index = endog_test.index +1500

In [49]:
forecast_recons = pd.Series(forecast, index=endog_test.index[])

plot_forecast(forecast_recons,endog_train, endog_test, wind_df_long["windspeed"])

ValueError: Length of values (100) does not match length of index (418)