Playground

Goal: Create an ML pipeline for each dataframe.

- 5-Min: If it's not too slow, maybe(?) I can use k-NN. 
- Hourly: k-NN. 
- Daily: ARIMA.
- Montly: Average(?, fuck lol idk)

In [8]:
import pandas as pd

In [9]:
df = pd.read_csv("data/hourlydemand.csv")
df.set_index("time", inplace=True)
df.index = pd.to_datetime(df.index)
df.head()

Unnamed: 0_level_0,avg_power_demand_W,energy_demand_kWh,peak_power_W,day
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-11-05 10:00:00,6335.0,3.1675,6335.0,Thursday
2020-11-05 11:00:00,527.916667,0.527917,6335.0,Thursday
2020-11-05 12:00:00,0.0,0.0,0.0,Thursday
2020-11-05 13:00:00,0.0,0.0,0.0,Thursday
2020-11-05 14:00:00,0.0,0.0,0.0,Thursday


In [15]:
import numpy as np 
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


class kNNCrossValidator:

    def __init__(self, max_depth, max_neighbors, df:pd.DataFrame, granularity:int):
        self.max_depth = max_depth
        self.max_neighbors = max_neighbors
        self.granularity = granularity
        self.__kNN_df = self.__create_df(df)
        self.best_params = {
            "best_depth":None,
            "best_n_neighbors":None
        }

    def __create_df(self, df):
        df = self.__create_lag_features(df, self.max_depth, self.granularity)
        return df

    def __X_y_split(self):
        X = self.__kNN_df.drop(
                columns=[
                    self.__kNN_df.columns[0] 
                ]
            ) 
        y = self.__kNN_df[[self.__kNN_df.columns[0]]]
        return X, y

    def cross_validate(self):
        X, y = self.__X_y_split()
        pipeline = Pipeline(
            [
                ("subset_features", SubsetLags()),
                ("estimator", KNeighborsRegressor())
            ]
        )
        params = {"estimator__n_neighbors":np.arange(1,self.max_neighbors+1), "subset_features__num_lags":np.arange(1,self.max_depth+1)}
        grid = GridSearchCV(
            estimator=pipeline,
            param_grid=params,
            scoring="neg_mean_squared_error",
            n_jobs=6,
            verbose=0,
            cv=[(np.arange(0,int(0.8*len(self.__kNN_df))), np.arange(int(0.8*len(self.__kNN_df)), len(self.__kNN_df)))]
        )
        grid.fit(X, y)
        self.best_params["best_depth"] = grid.best_params_["estimator__n_neighbors"]
        self.best_params["best_n_neighbors"] = grid.best_params_["subset_features__num_lags"]

    def get_params(self):
        """
        Return order: best depth, best # of neighbors
        """
        return self.best_params["best_depth"], self.best_params["best_n_neighbors"]

    def __create_lag_features(self, df, num_lag_depths, num_pts_per_day):
        """Function takes in a dataframe at a below daily granularity, and returns a 
        new dataframe with lagged values; a "lag1" column will have the previous day's power demand at the same time, 
        a "lag2" column will have two days ago's power demand at the same time, etc. Function will drop rows with 
        any 'NaN' vlaues. 
        ~~~
        Parameters:
        df: Dataframe to append features.
        num_lag_depths: Number of lagged features to create.
        num_pts_per_day: Number of data points per day. For example, an dataframe with one data point per hour will have one a parameter value of 24. 
        """
        
        with_lags_df = df.copy(deep=True)
        for lag_depth in np.arange(1,num_lag_depths+1):
            column = with_lags_df[df.columns[0]].shift(num_pts_per_day*lag_depth)
            with_lags_df = pd.concat([with_lags_df, column.rename("lag" + f"{lag_depth}")], axis=1)
        return with_lags_df.dropna()

    def __prepare_TS(self, df):
        """
        Function takes in a dataframe and sets the index to the "time" column, 
        changes that index to a pd.datetime-like object.
        """
        df.set_index("time", inplace=True)
        df.index = pd.to_datetime(df.index)
        if len(df.columns == 1):
            raise Exception("There should only be a prediction quantity. Dataframe must be Nx1!")
        return df

class SubsetLags(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_lags=1):
        super().__init__()
        self.num_lags = num_lags
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return self.__select_subset_lags(X, self.num_lags)
        
    def __select_subset_lags(self, X, num_features):
        """Function takes in a dataframe with a 'energy_demand_kWh' column at select granularity with "lag1", "lag2",..., "lagN" features. 
        Function will select the first "num_features" of lags."""
        return X[[f"lag{depth}" for depth in np.arange(1, num_features+1)]]

In [16]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
import pandas as pd 
import numpy as np 

class GenerateHourlyDayAheadForecast(BaseEstimator, TransformerMixin):

    def __init__(self, granularity=24, best_depth=None, best_neighbors=None, max_depth=20, max_neighbors=10,) -> None:
        self.max_depth = max_depth
        self.max_neighbors = max_neighbors
        self.best_depth = best_depth
        self.best_neighbors = best_neighbors

        self.granularity = granularity 
        super().__init__()

    def fit(self, X, y=None):

        # X = self.__prepare_TS(X) # Can be another step in the pipeline? 

        if self.best_depth == None and self.best_neighbors == None:
            # cross validate 
            CV = kNNCrossValidator(
                max_depth=self.max_depth,
                max_neighbors=self.max_neighbors,
                df=X,
                granularity=self.granularity
            )
            CV.cross_validate()
            self.best_depth, self.best_neighbors = CV.get_params()

        return self

    def transform(self, X):
        # add features to dataframe
        X = self.__create_lag_features(
            df=X, 
            num_lag_depths=self.best_depth, 
            col_name=X.columns[0],
            num_pts_per_day=self.granularity,
            )
        # create estimator 
        estimator = KNeighborsRegressor(
            n_neighbors = self.best_neighbors,
            n_jobs=8
            )
        # train test split 
        X_train, X_test, y_train, y_test = self.__X_y_split(X)
        estimator.fit(X_train, y_train)
        
        return pd.DataFrame(index=X_test.index, data={f"{X.columns[0]} forecast":estimator.predict(X_test).reshape(-1)})
    
    def __X_y_split(self, df, test_days=7):
        """
        """
        X_train, X_test, y_train, y_test = train_test_split(
            df.drop(columns=[df.columns[0]]),
            df[[df.columns[0]]],
            test_size = self.granularity*test_days, # last week of data
            shuffle=False
        )
        return X_train, X_test, y_train, y_test

    def __append_forecast_index(self, df, extra_days=1):
        """
        Adds extra datetime indicies at the end of a time series dataframe. 
        """
        appendage = pd.DataFrame(index=df.tail(24).index + pd.Timedelta(days=extra_days))
        return pd.concat([df, appendage])
    
    def __create_lag_features(self, df, num_lag_depths, col_name, num_pts_per_day):
        """
        Function takes in a dataframe at a below daily granularity, and returns a 
        new dataframe with lagged values; a "lag1" column will have the previous day's power demand at the same time, 
        a "lag2" column will have two days ago's power demand at the same time, etc. Function will drop rows with 
        any 'NaN' vlaues. 
        ~~~
        Parameters:
        df: Dataframe to append features.
        col_name: Column to use.
        num_lag_depths: Number of lagged features to create.
        num_pts_per_day: Number of data points per day. For example, an dataframe with one data point per hour will have one a parameter value of 24. 
        """
        
        with_lags_df = df.copy(deep=True)
        for lag_depth in np.arange(1,num_lag_depths+1):
            column = with_lags_df[col_name].shift(num_pts_per_day*lag_depth)
            with_lags_df = pd.concat([with_lags_df, column.rename("lag" + f"{lag_depth}")], axis=1)
        return with_lags_df.dropna()

    def __prepare_TS(self, df):
        """
        Function takes in a dataframe and sets the index to the "time" column, 
        changes that index to a pd.datetime-like object.
        """
        df.set_index("time", inplace=True)
        df.index = pd.to_datetime(df.index)
        return df


In [17]:
pipe = Pipeline(
    [
        ("forecasts", GenerateHourlyDayAheadForecast())
    ]
)

pipe.fit_transform(df[["avg_power_demand_W"]])

Unnamed: 0_level_0,avg_power_demand_W forecast
time,Unnamed: 1_level_1
2023-01-03 17:00:00,2629.854167
2023-01-03 18:00:00,0.000000
2023-01-03 19:00:00,0.000000
2023-01-03 20:00:00,0.000000
2023-01-03 21:00:00,0.000000
...,...
2023-01-10 12:00:00,18046.135417
2023-01-10 13:00:00,18031.843750
2023-01-10 14:00:00,13076.958333
2023-01-10 15:00:00,10643.916667
