In [1]:
import pandas as pd 
from sklearn.pipeline import Pipeline
import plotly.express as px
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Raise the display limit so DataFrames up to 100 columns wide are rendered without truncation.
pd.set_option("display.max_columns", 100)

In [56]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd


class SortDropCast(BaseEstimator, TransformerMixin):
    """
    First cleaning step of the pipeline.

    Orders the sessions by "connectTime", removes the "user_email" and
    "slrpPaymentId" columns, renumbers the rows from zero, and converts
    "cumEnergy_Wh" and "peakPower_W" to float.
    """

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    @staticmethod
    def transform(X) -> pd.DataFrame:
        """Return a sorted, trimmed and re-indexed copy of ``X`` with float energy/power columns."""
        ordered = X.sort_values(by="connectTime")
        trimmed = ordered.drop(columns=["user_email", "slrpPaymentId"])
        renumbered = trimmed.reset_index(drop=True)
        return renumbered.astype({"cumEnergy_Wh": float, "peakPower_W": float})


class HelperFeatureCreation(BaseEstimator, TransformerMixin):
    """
    Pipeline step that derives per-session helper features.

    Rows containing 0 in "peakPower_W" or "cumEnergy_Wh" are dropped, the
    columns "connectTime", "startChargeTime", "Deadline" and "lastUpdate" are
    parsed as datetimes, and the following columns are added:

    * "finishChargeTime" - "lastUpdate" where ``regular == 1``, otherwise "Deadline".
    * "trueDurationHrs" - hours between "startChargeTime" and
      "finishChargeTime", rounded to 3 decimals.
    * "true_peakPower_W" - "cumEnergy_Wh" divided by "trueDurationHrs".
    * "Overstay" - how long "lastUpdate" is past "Deadline" (never negative).
    * "Overstay_h" - "Overstay" expressed in hours.

    Records whose duration is greater than 24 hours are dropped. The resulting
    frame is written to "data/raw_data.csv" and returned.
    """

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    @classmethod
    def transform(cls, X) -> pd.DataFrame:
        """Return a filtered copy of ``X`` with the helper columns added (also saved to CSV)."""
        nonzero = (X["peakPower_W"] != 0) & (X["cumEnergy_Wh"] != 0)
        X = X.loc[nonzero].copy(deep=True)

        for column in ("connectTime", "startChargeTime", "Deadline", "lastUpdate"):
            X[column] = pd.to_datetime(X[column])

        # "regular" sessions end at their last update, all others at their deadline.
        X["finishChargeTime"] = X["lastUpdate"].where(
            X["regular"] == 1, X["Deadline"])

        # total_seconds() is required here: the `.seconds` attribute only holds the
        # seconds *component* (0..86399) and silently discards whole days, which made
        # the 24-hour filter below a no-op and wrapped negative spans to large values.
        charge_span = X["finishChargeTime"] - X["startChargeTime"]
        X["trueDurationHrs"] = (charge_span.dt.total_seconds() / 3600).round(3)
        X["true_peakPower_W"] = X["cumEnergy_Wh"] / X["trueDurationHrs"]

        # filter out bad rows (this occurs when there is a very low peak power and high energy delivered)
        # NOTE(review): sessions whose finish precedes their start now yield a negative
        # duration and pass this filter; confirm whether they should be dropped as well.
        X = X.loc[X["trueDurationHrs"] <= 24].copy()

        # Overstay is the time past the deadline, floored at zero (missing values become zero too).
        zero = pd.Timedelta(days=0, seconds=0)
        overstay = X["lastUpdate"] - X["Deadline"]
        X["Overstay"] = overstay.where(overstay > zero, zero)
        X["Overstay_h"] = X["Overstay"].dt.total_seconds() / 3600

        X.to_csv("data/raw_data.csv")

        return X
        

class CreateSessionTimeSeries(BaseEstimator, TransformerMixin):
    """
    This pipeline step will create a time series for each session and add
    them together. A dataframe with 5-min granularity will be returned, with
    one column, "avg_power_demand_W".
    """

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    def transform(self, X) -> pd.DataFrame:
        """
        Build one constant-power series per row of ``X`` and sum them into
        5-minute bins.

        Raises:
            ValueError: if ``X`` contains no sessions.
        """
        # Iterate explicitly instead of calling DataFrame.apply for its side effects.
        # The list is still stored on the instance because the previous
        # implementation exposed it as ``self.rows``.
        self.rows = [self.__create_ts(session) for _, session in X.iterrows()]
        if not self.rows:
            raise ValueError(
                "Cannot build a session time series: no sessions were provided.")

        stacked = pd.concat(self.rows, axis=0).sort_index()
        # Overlapping sessions land in the same bin and their power is added up.
        return stacked.resample("5min").sum()

    @staticmethod
    def __create_ts(session):
        """
        This helper function takes in a session, with a "startChargeTime", "finishChargeTime",
        and a "true_peakPower_W" entry. Function will return a time series at 5-min granularity
        holding that power value at every step.
        """
        # NOTE(review): date_range includes both end points, so a session spanning
        # k whole 5-minute steps contributes k + 1 samples; confirm this is intended.
        date_range = pd.date_range(
            start=session["startChargeTime"], end=session["finishChargeTime"], freq="5min")
        temp_df = pd.DataFrame(index=date_range)
        temp_df["avg_power_demand_W"] = session["true_peakPower_W"]
        return temp_df


class FeatureCreation(BaseEstimator, TransformerMixin):
    """
    This pipeline step will create an "energy_demand_kWh" and "peak_power_W" column.
    The name of the dataframe's index will be set to "time", and "day" and "month" columns
    will be created from that index. The input dataframe is left untouched; a modified
    copy is returned.
    """

    # Divisor applied after the W -> kW conversion to turn the power column into
    # energy per row (12 five-minute rows make up one hour).
    SAMPLES_PER_HOUR = 12

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    @staticmethod
    def transform(X) -> pd.DataFrame:
        """Return a copy of ``X`` (which needs a datetime index) with the derived columns added."""
        # Work on a copy so the caller's frame (and its index name) is not modified.
        out = X.copy()
        power_w = out["avg_power_demand_W"]

        out["energy_demand_kWh"] = (power_w / 1000) / FeatureCreation.SAMPLES_PER_HOUR
        # for the highest granularity, peak power is equal to the power demand
        # (different for different granularities though)
        out["peak_power_W"] = power_w

        # Index.rename returns a new index, so the input's index object is not renamed.
        out.index = out.index.rename("time")
        out["day"] = out.index.day_name()
        out["month"] = out.index.month_name()
        return out


class SaveToCsv(BaseEstimator, TransformerMixin):
    """
    This pipeline step takes the incoming dataframe and creates new granularities
    (hourly, daily, and monthly). Each dataframe, including the incoming one, is
    saved as a CSV file in the "data/" directory and returned in a dict.
    """

    def __init__(self) -> None:
        # How each column is combined when rows are grouped into coarser bins.
        self.agg_key = {
            "avg_power_demand_W": "mean",
            "energy_demand_kWh": "sum",
            "peak_power_W": "max",
            "day": "first",
            "month": "first"
        }
        # File stems, in the same order as the frames built in transform().
        self.dataframe_names = [
            "fivemindemand",
            "hourlydemand",
            "dailydemand",
            "monthlydemand"
        ]
        super().__init__()

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    def transform(self, X) -> dict:
        """
        Resample ``X`` to hourly, daily and monthly bins, write all four frames
        to "data/<name>.csv", and return them keyed by name.
        """
        # create new granularities
        # Lower-case "h" replaces the "H" alias, which is deprecated in pandas >= 2.2.
        hourlydemand = X.resample("1h").agg(self.agg_key)
        dailydemand = X.resample("24h").agg(self.agg_key)
        # The month-end alias was renamed from "M" to "ME" in pandas 2.2; passing the
        # offset object avoids depending on either spelling.
        monthlydemand = X.resample(pd.offsets.MonthEnd()).agg(self.agg_key)

        new_dataframes = {
            "fivemindemand": X,
            "hourlydemand": hourlydemand,
            "dailydemand": dailydemand,
            "monthlydemand": monthlydemand
        }

        # save to file system
        for name, dataframe in zip(self.dataframe_names, new_dataframes.values()):
            dataframe.to_csv(f"data/{name}.csv")
        return new_dataframes


In [2]:
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd


class HourlyForecast(BaseEstimator, TransformerMixin):
    """
    Forecast the next 24 hours of every column listed in ``columns``.

    For each column a KNeighborsRegressor is fitted on lag features (the
    column's value 24, 48, ... hours earlier) and used to predict the 24 rows
    appended after the end of the input.

    Parameters
    ----------
    best_params : dict
        Maps each column name to a dict with the keys "best_n_neighbors"
        (number of neighbours for the regressor) and "best_depth" (number of
        lag features to build).
    """

    columns = ["avg_power_demand_W", "energy_demand_kWh", "peak_power_W"]

    # Number of rows forecast, and the row offset between consecutive lag features.
    HORIZON_HOURS = 24

    def __init__(self, best_params: dict) -> None:
        super().__init__()
        self.best_params = best_params

    def fit(self, X, y=None):
        # Regressors are fitted inside transform(); nothing is learned here.
        return self

    def transform(self, X, y=None):
        """
        Return a dataframe indexed by the 24 future timestamps with one
        "<column>_predictions" column per entry of ``columns``.
        """
        horizon = self.HORIZON_HOURS

        # add 24 hours to end of data
        X = self.__create_24hrs_future(X, horizon)
        # copy prediction interval
        forecasts = pd.DataFrame(index=X.index[-horizon:])

        for column in self.columns:
            # get params
            params = self.best_params.get(column)

            # create regressor
            regressor = KNeighborsRegressor(
                n_neighbors=params["best_n_neighbors"])

            # isolate column, create features
            df = X[[column]].copy()
            df = self.__create_lag_features(
                df, params["best_depth"], col_name=column, horizon=horizon)

            # split into training set and test set
            X_train, X_test, y_train, _ = self.__train_test_split(
                df, col_name=column, horizon=horizon)

            # train regressor and predict 24 hours ahead
            regressor.fit(X_train, y_train)
            forecasts[column + "_predictions"] = regressor.predict(X_test).reshape(-1)

        return forecasts

    @staticmethod
    def __create_24hrs_future(df, horizon=24):
        """
        Append ``horizon`` empty rows whose timestamps are the last ``horizon``
        index values shifted forward by ``horizon`` hours.
        """
        future_index = df.index[-horizon:] + pd.Timedelta(hours=horizon)
        # reindex keeps every column's dtype and fills the new rows with NaN, instead
        # of concatenating an all-NA object-dtype placeholder frame.
        return df.reindex(df.index.append(future_index))

    @staticmethod
    def __create_lag_features(df, num_lag_depths, col_name, horizon=24):
        """
        Add columns "lag1" ... "lag<num_lag_depths>", where "lag<k>" is
        ``col_name`` shifted down by ``horizon * k`` rows, and drop the rows
        for which any lag value is missing.
        """
        lags = {
            f"lag{lag_depth}": df[col_name].shift(horizon * lag_depth)
            for lag_depth in range(1, num_lag_depths + 1)
        }
        # All lag columns are attached in one step rather than one concat per lag.
        df_with_lags = df.assign(**lags)
        # only rows with NaN as features are dropped, NaN in true value column is OK
        return df_with_lags.dropna(subset=df_with_lags.columns.drop(col_name))

    @staticmethod
    def __train_test_split(df, col_name, horizon=24):
        """
        Withhold the last 24 hours to predict the next 24 hours.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            df.drop(columns=col_name),
            df[[col_name]],
            test_size=horizon,  # withhold last 24 hours
            shuffle=False
        )
        return X_train, X_test, y_train, y_test


In [6]:
# `update_ml_parameters` comes from the project-local `app` module (not shown in this notebook).
from app import update_ml_parameters

# Per the captured output below, the call prints fetch/clean/fit progress and returns a dict
# with "best_params" (per-column "best_depth" / "best_n_neighbors") and "last_validated_time".
best = update_ml_parameters()
best

Fetching data...
Cleaning data...
Done!
Fitting 1 folds for each of 336 candidates, totalling 336 fits
Fitting 1 folds for each of 336 candidates, totalling 336 fits
{'energy_demand_kWh': {'best_depth': 57, 'best_n_neighbors': 25}, 'peak_power_W': {'best_depth': 57, 'best_n_neighbors': 24}, 'avg_power_demand_W': {'best_depth': 57, 'best_n_neighbors': 25}}


{'best_params': {'energy_demand_kWh': {'best_depth': 57,
   'best_n_neighbors': 25},
  'peak_power_W': {'best_depth': 57, 'best_n_neighbors': 24},
  'avg_power_demand_W': {'best_depth': 57, 'best_n_neighbors': 25}},
 'last_validated_time': '04/02/23 17:36:38'}

In [5]:
# Hard-coded parameters matching the values displayed in the `update_ml_parameters()` cell
# output, so that call does not have to be repeated.
# NOTE(review): this cell's execution counter (In [5]) is lower than the preceding
# cell's (In [6]), i.e. the cells were run out of order; verify with Restart & Run All.
best = {'best_params': {'energy_demand_kWh': {'best_depth': 57,
   'best_n_neighbors': 25},
  'peak_power_W': {'best_depth': 57, 'best_n_neighbors': 24},
  'avg_power_demand_W': {'best_depth': 57, 'best_n_neighbors': 25}},
 'last_validated_time': '04/02/23 17:36:38'}
# Load the hourly demand CSV, using its "time" column as a parsed datetime index.
df = pd.read_csv("data/hourlydemand.csv", parse_dates=True, index_col="time")
df

Unnamed: 0_level_0,avg_power_demand_W,energy_demand_kWh,peak_power_W,day,month
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-05 10:00:00,879.000000,0.439500,879.0,Thursday,November
2020-11-05 11:00:00,879.000000,0.879000,879.0,Thursday,November
2020-11-05 12:00:00,879.000000,0.879000,879.0,Thursday,November
2020-11-05 13:00:00,879.000000,0.879000,879.0,Thursday,November
2020-11-05 14:00:00,219.750000,0.219750,879.0,Thursday,November
...,...,...,...,...,...
2023-04-02 13:00:00,0.000000,0.000000,0.0,Sunday,April
2023-04-02 14:00:00,0.000000,0.000000,0.0,Sunday,April
2023-04-02 15:00:00,4984.166667,4.984167,5981.0,Sunday,April
2023-04-02 16:00:00,5981.000000,5.981000,5981.0,Sunday,April


In [6]:
# Single-step pipeline wrapping HourlyForecast, configured with the per-column parameters in `best`.
pipeline = Pipeline([
    ("estimator", HourlyForecast(best_params=best["best_params"]))
])

# HourlyForecast.fit is a no-op; transform returns one "<column>_predictions" column per
# forecast column, indexed by the 24 timestamps following the end of `df`.
forecasts = pipeline.fit_transform(df)
forecasts

Unnamed: 0_level_0,avg_power_demand_W_predictions,energy_demand_kWh_predictions,peak_power_W_predictions
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-04-02 18:00:00,724.366667,0.58481,1379.041667
2023-04-02 19:00:00,1091.01,1.09101,782.0
2023-04-02 20:00:00,1040.526667,1.040527,950.833333
2023-04-02 21:00:00,407.486667,0.407487,607.958333
2023-04-02 22:00:00,468.393333,0.468393,231.375
2023-04-02 23:00:00,0.0,0.0,9.958333
2023-04-03 00:00:00,0.0,0.0,0.0
2023-04-03 01:00:00,0.0,0.0,0.0
2023-04-03 02:00:00,40.933333,0.040933,242.916667
2023-04-03 03:00:00,0.0,0.0,127.916667
