In [1]:
import pandas as pd 
from sklearn.pipeline import Pipeline
import plotly.express as px
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from datetime import datetime

# Raise pandas' "display.max_columns" option to 100 for frames rendered in this notebook.
pd.set_option("display.max_columns", 100)

In [2]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin


## Main Class
class HourlyCrossValidator:
    """Run the kNN hyper-parameter search once per hourly target column.

    Attributes:
        columns: target columns that get their own search.
        max_neighbors: upper bound forwarded to ``kNNCrossValidator``.
        max_depth: upper bound on the lag depth forwarded to ``kNNCrossValidator``.
    """

    columns = ["energy_demand_kWh", "peak_power_W"]

    def __init__(self, max_neighbors, max_depth):
        self.max_neighbors = max_neighbors
        self.max_depth = max_depth

    def cross_validate(self, df):
        """Return ``{column: best parameters}`` for every entry of ``columns``.

        Each value is whatever ``kNNCrossValidator.cross_validate_one``
        returns for that column when given ``df``.
        """
        # One fresh helper per target column; insertion order follows ``columns``.
        tuned = {
            target: kNNCrossValidator(
                self.max_neighbors, self.max_depth, target
            ).cross_validate_one(df)
            for target in self.columns
        }

        # same data, different units, so same parameters (the very same dict
        # object is shared between the two keys)
        tuned["avg_power_demand_W"] = tuned["energy_demand_kWh"]

        return tuned
    

## Helper Class
class kNNCrossValidator:
    """Grid-search a kNN regressor's neighbor count and lag depth for one column.

    Lag features of ``col_name`` are built with ``CreateLagFeatures``; the rows
    are then split in their existing order (no shuffling) into train,
    validation and test parts, and a ``GridSearchCV`` with a single
    train/validation fold scores every combination by negative mean squared
    error.

    Class attributes (override on the class or on an instance to tune the search):
        test_size: fraction of rows held out as the final test split.
        validation_size: fraction of the remaining rows used as the validation fold.
        min_neighbors: smallest ``n_neighbors`` value tried.
        min_depth: smallest number of lag features tried.
        n_jobs: number of parallel jobs handed to ``GridSearchCV``.
    """

    test_size = 0.2
    validation_size = 0.2
    min_neighbors = 10  # start searching at 10 neighbors
    min_depth = 40  # start searching at 40 lags as features
    n_jobs = 8

    def __init__(self, max_neighbors, max_depth, col_name):
        self.max_neighbors = max_neighbors
        self.max_depth = max_depth
        self.col_name = col_name

    def cross_validate_one(self, df) -> dict:
        """Search the grid for ``col_name`` and return the winning settings.

        Args:
            df: frame containing the ``col_name`` column.

        Returns:
            ``{"best_depth": ..., "best_n_neighbors": ...}`` taken from
            ``GridSearchCV.best_params_``.

        Raises:
            ValueError: if ``max_neighbors < min_neighbors`` or
                ``max_depth < min_depth`` (the search range would be empty).
        """
        # Fail fast, before the (potentially expensive) lag-feature creation.
        self.__check_search_space()

        # create features
        lagged = self.__create_all_lag_features(df)

        # create validation pipeline
        validation_pipeline = Pipeline([
            ("subset_features", SubsetLags()),
            ("estimator", KNeighborsRegressor())
        ])

        # create parameter grid (both ranges are inclusive of the maximum)
        params = {
            "estimator__n_neighbors": np.arange(self.min_neighbors, self.max_neighbors + 1),
            "subset_features__num_lags": np.arange(self.min_depth, self.max_depth + 1)
        }

        # split data; only the combined train+validation part and the size of
        # the train part are needed here (the test part is not evaluated)
        X_train_validation, X_train, _, _, y_train_validation, _, _, _ = self.__train_test_split(lagged)

        # single predefined fold: fit on the train rows, score on the validation rows
        n_train = len(X_train)
        single_fold = (
            np.arange(0, n_train),
            np.arange(n_train, len(X_train_validation)),
        )

        # create grid and iteratively search
        grid = GridSearchCV(
            estimator=validation_pipeline,
            param_grid=params,
            scoring="neg_mean_squared_error",
            n_jobs=self.n_jobs,
            verbose=4,
            cv=[single_fold]
        )
        grid.fit(X_train_validation, y_train_validation)

        return {
            "best_depth": grid.best_params_["subset_features__num_lags"],
            "best_n_neighbors": grid.best_params_["estimator__n_neighbors"]
        }

    def __check_search_space(self):
        """Raise ``ValueError`` when either parameter range would be empty."""
        if self.max_neighbors < self.min_neighbors:
            raise ValueError(
                f"max_neighbors must be >= {self.min_neighbors}, got {self.max_neighbors}"
            )
        if self.max_depth < self.min_depth:
            raise ValueError(
                f"max_depth must be >= {self.min_depth}, got {self.max_depth}"
            )

    def __create_all_lag_features(self, df) -> pd.DataFrame:
        """Return ``df`` extended with lag columns up to ``max_depth``."""
        # create pipeline, pass data, create features
        pipeline = Pipeline([
            ("create_features", CreateLagFeatures(
                self.max_depth, self.col_name))
        ])
        return pipeline.fit_transform(df)

    def __train_test_split(self, df):
        """Split ``df`` in row order into train, validation and test parts.

        Returns:
            ``(X_train_validation, X_train, X_validation, X_test,
            y_train_validation, y_train, y_validation, y_test)`` where the X
            frames hold the columns whose name matches ``"lag"`` and the y
            frames hold the single ``col_name`` column.
        """
        # split into train+validation and test
        X_train_validation, X_test, y_train_validation, y_test = train_test_split(
            df.filter(regex="lag"),  # select only "lag-" features
            df[[self.col_name]],
            test_size=self.test_size,
            shuffle=False  # time series split
        )

        # split into train and validation
        X_train, X_validation, y_train, y_validation = train_test_split(
            X_train_validation,
            y_train_validation,
            test_size=self.validation_size,
            shuffle=False
        )

        return X_train_validation, X_train, X_validation, X_test, y_train_validation, y_train, y_validation, y_test



## Pipeline Classes
class CreateLagFeatures(BaseEstimator, TransformerMixin):
    """Transformer that appends shifted copies of one column as ``lag<k>`` columns.

    For every ``k`` from 1 to ``num_lag_depths`` a column named ``lag<k>`` is
    added that holds ``col_name`` shifted down by ``24 * k`` rows (one day per
    step if the rows are hourly; the row frequency is not checked here).
    Rows containing any missing value are dropped from the result.

    Args:
        num_lag_depths: number of lag columns to create.
        col_name: name of the column to shift.
    """

    def __init__(self, num_lag_depths, col_name):
        super().__init__()
        # Stored under the constructor argument's own name: BaseEstimator's
        # get_params() / clone() / repr() look the value up by that name and
        # raise AttributeError otherwise.
        self.num_lag_depths = num_lag_depths
        self.col_name = col_name

    @property
    def num_lags_depths(self):
        """Alias kept for code that used the previous attribute spelling."""
        return self.num_lag_depths

    @num_lags_depths.setter
    def num_lags_depths(self, value):
        self.num_lag_depths = value

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame made of ``X`` plus its lag columns, NaN rows removed."""
        return self.__create_lag_features(X, self.num_lag_depths, self.col_name)

    @staticmethod
    def __create_lag_features(df, num_lag_depths, col_name):
        source = df[col_name]
        lag_columns = [
            source.shift(24 * lag_depth).rename(f"lag{lag_depth}")
            for lag_depth in range(1, num_lag_depths + 1)
        ]
        # One concat for all lags instead of re-copying the growing frame on
        # every iteration; concat returns a new object, so ``df`` is untouched.
        return pd.concat([df, *lag_columns], axis=1).dropna()


class SubsetLags(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the columns ``lag1`` .. ``lag<num_lags>``.

    Args:
        num_lags: how many lag columns to keep, counted from ``lag1``.
    """

    def __init__(self, num_lags=1):
        super().__init__()
        self.num_lags = num_lags

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` restricted to the first ``num_lags`` lag columns, in order."""
        wanted_columns = []
        for depth in np.arange(1, self.num_lags + 1):
            wanted_columns.append(f"lag{depth}")
        return X[wanted_columns]


import pandas as pd
from sklearn.model_selection import train_test_split
import pmdarima as pm


# Main Class
class DailyCrossValidator:
    """Run the SARIMA order search once per daily target column.

    Attributes:
        columns: target columns that get their own search.
    """

    columns = ["energy_demand_kWh", "peak_power_W"]

    @classmethod
    def cross_validate(cls, df):
        """Return ``{column: search result}`` for every entry of ``columns``.

        Each value is whatever ``SARIMACrossValidator.cross_validate_one``
        returns for that column when given ``df``.
        """
        # One fresh helper per target column; insertion order follows ``columns``.
        searched = {
            target: SARIMACrossValidator(target).cross_validate_one(df)
            for target in cls.columns
        }

        # same data, different units, so same parameters (the very same
        # object is shared between the two keys)
        searched["avg_power_demand_W"] = searched["energy_demand_kWh"]

        return searched


# Helper Class
class SARIMACrossValidator:
    """Search seasonal ARIMA orders for a single column with ``pm.auto_arima``.

    Attributes:
        test_size: fraction of rows (taken from the end, no shuffling) that is
            kept out of the search.
        col_name: column the model is fitted on.
    """

    test_size = 0.2

    def __init__(self, col_name):
        super().__init__()
        self.col_name = col_name

    def cross_validate_one(self, df: pd.DataFrame):
        """Run the stepwise search on the training part of ``col_name``.

        Returns:
            The object returned by ``pm.auto_arima``.
        """
        # Only the training target is used; the other split parts are discarded.
        _, _, target_train, _ = self.__train_test_split(df)

        search_options = dict(
            start_p=0, start_q=0,
            max_p=3, max_q=3, max_Q=3, max_P=3,
            d=0, D=1, m=7,
            X=None,
            seasonal=True, trace=True, stepwise=True,
        )
        return pm.auto_arima(target_train, **search_options)

    def __train_test_split(self, df: pd.DataFrame):
        """Split the ``col_name`` column in row order into train and test parts.

        Returns:
            ``(X_train, X_test, y_train, y_test)``; the X frames have no
            columns because only ``col_name`` is kept before it is dropped.
        """
        # take only needed column
        target_only = df[[self.col_name]]

        return tuple(train_test_split(
            target_only.drop(columns=[self.col_name]),
            target_only[[self.col_name]],
            test_size=self.test_size,
            shuffle=False
        ))


import pandas as pd
import statsmodels.api as sm 

class CreateDailyForecasts:
    """Produce one-step-ahead ARIMA forecasts for the daily demand columns.

    Attributes:
        columns: columns of the input frame that are forecast.
        forecast_path: CSV file holding the previously produced forecasts.
        default_order: ARIMA ``order`` used when none is supplied for a column.
        default_seasonal_order: ARIMA ``seasonal_order`` used when none is
            supplied for a column.
    """

    columns = ["avg_power_demand_W", "energy_demand_kWh", "peak_power_W"]

    forecast_path = "forecastdata/dailyforecasts.csv"
    default_order = (1, 0, 0)
    default_seasonal_order = (1, 1, 1, 7)

    def __init__(self):
        """No instance state; every method is callable on the class itself."""
        pass

    @classmethod
    def run_daily_forecast(cls, df, best_params: dict):
        """Fit one ARIMA model per column on all of ``df`` and forecast one step.

        Args:
            df: frame containing every column listed in ``columns``.
            best_params: ``{column: {"order": ..., "seasonal_order": ...}}``.
                Columns that are missing, or entries without those settings,
                fall back to ``default_order`` / ``default_seasonal_order``.

        Returns:
            The forecasts read from ``forecast_path`` with the new forecast
            rows appended. Nothing is written back to disk here.
        """
        existing_forecasts = pd.read_csv(cls.forecast_path, index_col="time", parse_dates=True)

        column_forecasts = []
        for column in cls.columns:
            order, seasonal_order = cls._resolve_orders(best_params, column)

            # train on ALL available data
            train = df[[column]].copy()
            # create ARIMA model
            fitted_model = sm.tsa.arima.ARIMA(train, order=order, seasonal_order=seasonal_order).fit()
            # forecast one step ahead
            one_step = fitted_model.forecast()
            # to_frame(name=...) relabels the values. Building a DataFrame from
            # a named Series while requesting a different column name selects a
            # non-existent column instead, so the forecast values are lost.
            column_forecasts.append(pd.Series(one_step).to_frame(name=column + "_predictions"))

        # combine the per-column forecasts side by side in a single concat
        new_forecasts = pd.concat(column_forecasts, axis=1)
        # keep the index name of the stored forecasts so the combined frame
        # does not lose it
        new_forecasts.index.name = existing_forecasts.index.name

        # append new forecasts to the existing set of forecasts
        return pd.concat([existing_forecasts, new_forecasts], axis=0)

    @classmethod
    def _resolve_orders(cls, best_params, column):
        """Return ``(order, seasonal_order)`` for ``column``, with defaults as fallback."""
        entry = best_params.get(column) if isinstance(best_params, dict) else None
        if isinstance(entry, dict):
            order = entry.get("order")
            seasonal_order = entry.get("seasonal_order")
        else:
            # NOTE(review): also accepts objects exposing ``order`` /
            # ``seasonal_order`` attributes (presumably the fitted models
            # returned by DailyCrossValidator) - confirm against the caller.
            order = getattr(entry, "order", None)
            seasonal_order = getattr(entry, "seasonal_order", None)

        return (
            tuple(order) if order is not None else cls.default_order,
            tuple(seasonal_order) if seasonal_order is not None else cls.default_seasonal_order,
        )

    @staticmethod
    def save_empty_prediction_df():
        """Write an empty forecast table to ``forecast_path`` and return it.

        The table has one ``<column>_predictions`` column per entry of
        ``columns`` and an empty index named ``time``.
        """
        prediction_columns = [name + "_predictions" for name in CreateDailyForecasts.columns]
        empty_df = pd.DataFrame(columns=prediction_columns, index=pd.Index([], name="time"))
        empty_df.to_csv(CreateDailyForecasts.forecast_path)
        return empty_df


In [2]:
# Read both demand tables the same way: "time" becomes the index and pandas
# is asked to parse it as dates.
hourlydemand, dailydemand = (
    pd.read_csv(csv_path, index_col="time", parse_dates=True)
    for csv_path in ("data/hourlydemand.csv", "data/dailydemand.csv")
)

In [9]:
# Per-column "order" / "seasonal_order" settings; the key names match the
# keyword arguments given to sm.tsa.arima.ARIMA in CreateDailyForecasts.
# NOTE(review): presumably copied from the DailyCrossValidator search output - confirm.
{'energy_demand_kWh': {"order":(2, 0, 1), "seasonal_order":(0, 1, 2, 7)},
 'peak_power_W': {"order":(1, 0, 0), "seasonal_order":(0, 1, 2, 7)},
 'avg_power_demand_W': {"order":(2, 0, 1), "seasonal_order":(0, 1, 2, 7)}
}

{'energy_demand_kWh': {'order': (2, 0, 1), 'seasonal_order': (0, 1, 2, 7)},
 'peak_power_W': {'order': (1, 0, 0), 'seasonal_order': (0, 1, 2, 7)},
 'avg_power_demand_W': {'order': (2, 0, 1), 'seasonal_order': (0, 1, 2, 7)}}

In [10]:
# Per-column "best_depth" / "best_n_neighbors" values, i.e. the dictionary
# shape that HourlyCrossValidator.cross_validate returns.
# NOTE(review): presumably pasted from an earlier search run - confirm.
{'energy_demand_kWh': {'best_depth': 57, 'best_n_neighbors': 25},
 'peak_power_W': {'best_depth': 57, 'best_n_neighbors': 23},
 'avg_power_demand_W': {'best_depth': 57, 'best_n_neighbors': 25}}

{'energy_demand_kWh': {'best_depth': 57, 'best_n_neighbors': 25},
 'peak_power_W': {'best_depth': 57, 'best_n_neighbors': 23},
 'avg_power_demand_W': {'best_depth': 57, 'best_n_neighbors': 25}}

In [11]:
# Keep only the rows whose "day" value is "Friday", then display the result.
df = dailydemand[dailydemand["day"].eq("Friday")]
df

Unnamed: 0_level_0,avg_power_demand_W,energy_demand_kWh,peak_power_W,day,month
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-06,0.000000,0.000000,0.0,Friday,November
2020-11-13,407.930556,9.790333,1277.0,Friday,November
2020-11-20,0.000000,0.000000,0.0,Friday,November
2020-11-27,0.000000,0.000000,0.0,Friday,November
2020-12-04,586.625000,14.079000,1482.0,Friday,December
...,...,...,...,...,...
2023-03-24,8251.256944,198.030167,36628.0,Friday,March
2023-03-31,751.975694,18.047417,12388.0,Friday,March
2023-04-07,6193.690972,148.648583,19607.0,Friday,April
2023-04-14,5319.187500,127.660500,12860.0,Friday,April


In [12]:
# Histogram of the "peak_power_W" column of the frame selected above.
px.histogram(data_frame=df, x="peak_power_W")