In [1]:
import pandas as pd 
from sklearn.pipeline import Pipeline
import plotly.express as px
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from datetime import datetime

pd.set_option("display.max_columns", 100)

In [3]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin


## Main Class
class HourlyCrossValidator:

    columns = ["energy_demand_kWh", "peak_power_W"]

    def __init__(self, max_neighbors, max_depth):
        self.max_neighbors = max_neighbors
        self.max_depth = max_depth

    def cross_validate(self, df):

        best_params = {}

        for column in self.columns:
            # initalize cross validator, cross validate  
            params = kNNCrossValidator(self.max_neighbors, self.max_depth, column).cross_validate_one(df)
            best_params[column] = params

        best_params["avg_power_demand_W"] = best_params["energy_demand_kWh"] # same data, different units, so same parameters

        return best_params
    

## Helper Class
class kNNCrossValidator:

    test_size = 0.2

    def __init__(self, max_neighbors, max_depth, col_name):
        super().__init__()
        self.max_neighbors = max_neighbors
        self.max_depth = max_depth
        self.col_name = col_name


    def cross_validate_one(self, df) -> dict:
        # create features
        df = self.__create_all_lag_features(df)

        # create validation pipeline
        validation_pipeline = Pipeline([
            ("subset_features", SubsetLags()),
            ("estimator", KNeighborsRegressor())
        ])

        # create parameter grid
        params = {
            "estimator__n_neighbors": np.arange(10, self.max_neighbors+1), # start searching at 10 neighbors
            "subset_features__num_lags": np.arange(40, self.max_depth+1) # start searching at 40 lags as features
        }

        # split data into train, validation, and test
        X_train_validation, X_train, X_validation, X_test, y_train_validation, y_train, y_validation, y_test = self.__train_test_split(df)

        # create grid and iteratively search
        grid = GridSearchCV(
            estimator=validation_pipeline,
            param_grid=params,
            scoring="neg_mean_squared_error",
            n_jobs=8,
            verbose=4,
            cv=[(np.arange(0, len(X_train)), np.arange(len(X_train), len(X_train_validation)))]
        )
        grid.fit(X_train_validation, y_train_validation)

        best_params = {
            "best_depth": grid.best_params_["subset_features__num_lags"],
            "best_n_neighbors": grid.best_params_["estimator__n_neighbors"]
        }

        return best_params


    def __create_all_lag_features(self, df) -> pd.DataFrame:
        # create pipeline, pass data, create features
        pipeline = Pipeline([
            ("create_features", CreateLagFeatures(
                self.max_depth, self.col_name))
        ])
        df_with_features = pipeline.fit_transform(df)

        return df_with_features


    def __train_test_split(self, df):

        # split into train+validation and test
        X_train_validation, X_test, y_train_validation, y_test = train_test_split(
            df.filter(regex="lag"),  # select only "lag-" features
            df[[self.col_name]],
            test_size=self.test_size,
            shuffle=False  # time series split
        )

        # split into train and validation
        X_train, X_validation, y_train, y_validation = train_test_split(
            X_train_validation,
            y_train_validation,
            test_size=0.2,
            shuffle=False
        )

        return X_train_validation, X_train, X_validation, X_test, y_train_validation, y_train, y_validation, y_test



## Pipeline Classes
class CreateLagFeatures(BaseEstimator, TransformerMixin):

    def __init__(self, num_lag_depths, col_name):
        super().__init__()
        self.num_lags_depths = num_lag_depths
        self.col_name = col_name

    def fit(self, X, y=None):
        return self 
    
    def transform(self, X):
        return self.__create_lag_features(X, self.num_lags_depths, self.col_name)
    
    @staticmethod
    def __create_lag_features(df, num_lag_depths, col_name):
        df_with_lags = df.copy(deep=True)
        for lag_depth in np.arange(1,num_lag_depths+1):
            column = df_with_lags[col_name].shift(24*lag_depth)
            df_with_lags = pd.concat([df_with_lags, column.rename("lag" + f"{lag_depth}")], axis=1)
        return df_with_lags.dropna()


class SubsetLags(BaseEstimator, TransformerMixin):
    
    def __init__(self, num_lags=1):
        super().__init__()
        self.num_lags = num_lags
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return self.__select_subset_lags(X, self.num_lags)
    
    @staticmethod
    def __select_subset_lags(df, num_lags):
        return df[[f"lag{depth}" for depth in np.arange(1, num_lags+1)]]


import pandas as pd
from sklearn.model_selection import train_test_split
import pmdarima as pm


# Main Class
class DailyCrossValidator:

    columns = ["energy_demand_kWh", "peak_power_W"]

    @classmethod
    def cross_validate(cls, df):

        best_params = {}

        for column in cls.columns:
            # initalize cross validator, cross validate
            params = SARIMACrossValidator(column).cross_validate_one(df)
            best_params[column] = params

        # same data, different units, so same parameters
        best_params["avg_power_demand_W"] = best_params["energy_demand_kWh"]

        return best_params


# Helper Class
class SARIMACrossValidator:

    test_size = 0.2

    def __init__(self, col_name):
        super().__init__()
        self.col_name = col_name

    def cross_validate_one(self, df: pd.DataFrame):

        X_train, X_test, y_train, y_test = self.__train_test_split(df)

        stepwise_fit = pm.auto_arima(y_train,
                                     start_p=0, start_q=0,
                                     max_p=3, max_q=3, max_Q=3, max_P=3,
                                     d=0, D=1, m=7,
                                     X=None,
                                     seasonal=True, trace=True, stepwise=True)

        return stepwise_fit

    def __train_test_split(self, df: pd.DataFrame):
        # take only needed column 
        df = df[[self.col_name]]
        
        X_train, X_test, y_train, y_test = train_test_split(
            df.drop(columns=[self.col_name]),
            df[[self.col_name]],
            test_size=self.test_size,
            shuffle=False
        )

        return X_train, X_test, y_train, y_test


import pandas as pd
import statsmodels.api as sm 

class CreateDailyForecasts:

    columns = ["avg_power_demand_W", "energy_demand_kWh", "peak_power_W"]

    def __init__():
        pass 
    
    @classmethod
    def run_daily_forecast(cls, df, best_params: dict):

        existing_forecasts = pd.read_csv("forecastdata/dailyforecasts.csv", index_col="time", parse_dates=True)
        new_forecasts = pd.DataFrame()

        for column in cls.columns:
            
            # train on ALL available data
            train = df[[column]].copy() 
            # create ARIMA model 
            best_model_arima = sm.tsa.arima.ARIMA(train, order=(1,0,0), seasonal_order=(1,1,1,7)).fit()
            # forecast on day ahead, convert to a dataframe
            one_column_forecast = best_model_arima.forecast()
            one_column_forecast = pd.DataFrame(one_column_forecast, columns=[column+'_predictions']) 
            new_forecasts = pd.concat([new_forecasts, one_column_forecast], axis=1)

        # append new forecasts existing set of forecasts
        forecasts = pd.concat([existing_forecasts, new_forecasts], axis=0)

        return forecasts

    
    @staticmethod
    def save_empty_prediction_df():
        empty_df = pd.DataFrame(columns=["avg_power_demand_W_predictions", "energy_demand_kWh_predictions", "peak_power_W_predictions"], index=pd.Index([], name="time"))
        empty_df.to_csv("forecastdata/dailyforecasts.csv")
        return empty_df


In [1]:
import redis 
import pickle
redis_client = redis.Redis(host='localhost', port=6360)
df = pickle.loads(redis_client.get("daily_forecasts"))
df

Unnamed: 0_level_0,avg_power_demand_W_predictions,energy_demand_kWh_predictions,peak_power_W_predictions
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-05-11,5192.02441,119.769066,16937.413808


In [27]:
import pandas as pd
import statsmodels.api as sm 

df = pickle.loads(redis_client.get("dailydemand"))
df 

Unnamed: 0_level_0,avg_power_demand_W,energy_demand_kWh,peak_power_W,day,month
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-05,249.592593,3.369500,879.0,Thursday,November
2020-11-06,0.000000,0.000000,0.0,Friday,November
2020-11-07,0.000000,0.000000,0.0,Saturday,November
2020-11-08,0.000000,0.000000,0.0,Sunday,November
2020-11-09,0.000000,0.000000,0.0,Monday,November
...,...,...,...,...,...
2023-05-07,0.000000,0.000000,0.0,Sunday,May
2023-05-08,4128.437500,99.082500,12934.0,Monday,May
2023-05-09,6369.315972,152.863583,18923.0,Tuesday,May
2023-05-10,3080.510417,73.932250,15203.0,Wednesday,May


In [24]:
pickle.loads(redis_client.get("daily_forecasts"))

Unnamed: 0_level_0,avg_power_demand_W_predictions,energy_demand_kWh_predictions,peak_power_W_predictions
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-05-11,5192.02441,119.769066,16937.413808


In [41]:
#existing_forecasts = pd.read_csv("forecastdata/dailyforecasts.csv", index_col="time", parse_dates=True)
existing_forecasts = pickle.loads(redis_client.get("daily_forecasts"))
new_forecasts = pd.DataFrame()

for column in ["avg_power_demand_W", "energy_demand_kWh", "peak_power_W"]:
    
    # train on ALL available data
    train = df[[column]].copy() 

    # create ARIMA model 
    best_model_arima = sm.tsa.arima.ARIMA(train, order=(1,0,1), seasonal_order=(1,1,1,7)).fit()
    
    # forecast on day ahead, convert to a dataframe
    one_column_forecast = best_model_arima.forecast(7)
    one_column_forecast = pd.DataFrame(one_column_forecast)
    one_column_forecast.rename(columns={one_column_forecast.columns[0]: column+'_predictions'}, inplace=True)  
    new_forecasts = pd.concat([new_forecasts, one_column_forecast], axis=1)
    new_forecasts.index.name = "time"



In [63]:
df = pickle.loads(redis_client.get("monthlydemand"))
df

Unnamed: 0_level_0,avg_power_demand_W,energy_demand_kWh,peak_power_W,day,month
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-30,313.452594,192.303167,11612.0,Thursday,November
2020-12-31,399.305444,297.08325,12908.0,Tuesday,December
2021-01-31,549.877128,409.108583,12021.0,Friday,January
2021-02-28,564.586186,379.401917,10105.0,Monday,February
2021-03-31,1143.15681,850.508667,16055.0,Monday,March
2021-04-30,2047.83669,1474.442417,19787.0,Thursday,April
2021-05-31,1568.521169,1166.97975,17241.0,Saturday,May
2021-06-30,2258.041551,1625.789917,27095.0,Tuesday,June
2021-07-31,2209.987343,1644.230583,23971.0,Thursday,July
2021-08-31,2644.3722,1967.412917,24607.0,Sunday,August


In [66]:
df["peak_power_W"][0]

29825.0

In [48]:
df.index.month_name()

Index(['November', 'December', 'January', 'February', 'March', 'April', 'May',
       'June', 'July', 'August', 'September', 'October', 'November',
       'December', 'January', 'February', 'March', 'April', 'May', 'June',
       'July', 'August', 'September', 'October', 'November', 'December',
       'January', 'February', 'March', 'April', 'May'],
      dtype='object', name='time')

In [42]:
new_forecasts

Unnamed: 0_level_0,avg_power_demand_W_predictions,energy_demand_kWh_predictions,peak_power_W_predictions
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-05-12,3752.679779,90.826229,18765.565238
2023-05-13,25.375707,0.717514,6235.976188
2023-05-14,36.178211,0.954205,5999.842836
2023-05-15,5397.506081,129.426661,20769.412666
2023-05-16,4503.515468,108.4447,18757.902182
2023-05-17,4158.186519,99.682433,18724.066881
2023-05-18,4386.069212,104.607192,16440.908117


In [22]:
pd.DataFrame(best_model_arima.forecast(1))

Unnamed: 0,0
2023-05-12,18765.565238


In [19]:
pd.DataFrame(best_model_arima.forecast(7)).rename({"predicted_mean":"avg"})

Unnamed: 0,predicted_mean
2023-05-12,18765.565238
2023-05-13,6235.976188
2023-05-14,5999.842836
2023-05-15,20769.412666
2023-05-16,18757.902182
2023-05-17,18724.066881
2023-05-18,16440.908117


In [14]:
new_forecasts

Unnamed: 0_level_0,avg_power_demand_W_predictions,energy_demand_kWh_predictions,peak_power_W_predictions
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [5]:
best_model_arima.forecast(7)

2023-05-12    18765.565238
2023-05-13     6235.976188
2023-05-14     5999.842836
2023-05-15    20769.412666
2023-05-16    18757.902182
2023-05-17    18724.066881
2023-05-18    16440.908117
Freq: D, Name: predicted_mean, dtype: float64

In [14]:
pickle.loads(redis_client.get("daily_params"))

{'energy_demand_kWh': ARIMA(order=(2, 0, 1), scoring_args={}, seasonal_order=(0, 1, 2, 7),
 'peak_power_W': ARIMA(order=(1, 0, 0), scoring_args={}, seasonal_order=(1, 1, 1, 7),
 'avg_power_demand_W': ARIMA(order=(2, 0, 1), scoring_args={}, seasonal_order=(0, 1, 2, 7),

In [82]:
from datetime import datetime
datetime.now().strftime('%H:%M:%S')

'19:20:17'

In [20]:
df["finishChargeTime"] = pd.to_datetime(df["finishChargeTime"])
df = df.sort_values(by="finishChargeTime")
df["cumEnergy_Wh"].cumsum(axis=0) / 1000

0           3.281
1          36.739
2          51.955
3          66.333
4          78.817
          ...    
2915    56575.227
2916    56585.206
2912    56613.743
2914    56624.438
2917    56629.084
Name: cumEnergy_Wh, Length: 2918, dtype: float64

In [19]:
hourlydemand = pd.read_csv("data/hourlydemand.csv", index_col="time", parse_dates=True)
dailydemand = pd.read_csv("data/dailydemand.csv", index_col="time", parse_dates=True)
monthlydemand = pd.read_csv("data/monthlydemand.csv", index_col="time", parse_dates=True)

In [16]:
dailydemand.index.hour

Int64Index([0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ...
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
           dtype='int64', name='time', length=905)

In [13]:
hourlydemand[hourlydemand.index.hour == 7]

Unnamed: 0_level_0,avg_power_demand_W,energy_demand_kWh,peak_power_W,day,month
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-06 07:00:00,0.000000,0.000000,0.0,Friday,November
2020-11-07 07:00:00,0.000000,0.000000,0.0,Saturday,November
2020-11-08 07:00:00,0.000000,0.000000,0.0,Sunday,November
2020-11-09 07:00:00,0.000000,0.000000,0.0,Monday,November
2020-11-10 07:00:00,0.000000,0.000000,0.0,Tuesday,November
...,...,...,...,...,...
2023-04-23 07:00:00,0.000000,0.000000,0.0,Sunday,April
2023-04-24 07:00:00,0.000000,0.000000,0.0,Monday,April
2023-04-25 07:00:00,6445.000000,6.445000,6445.0,Tuesday,April
2023-04-26 07:00:00,4581.666667,4.581667,5498.0,Wednesday,April


In [26]:
px.line(monthlydemand, y="peak_power_W")

In [13]:
j = hourlydemand.to_json(date_format="iso", orient="table")
j = pd.read_json(j, orient="table")
j

Unnamed: 0_level_0,avg_power_demand_W,energy_demand_kWh,peak_power_W,day,month
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-05 10:00:00,879.000000,0.439500,879.0,Thursday,November
2020-11-05 11:00:00,879.000000,0.879000,879.0,Thursday,November
2020-11-05 12:00:00,879.000000,0.879000,879.0,Thursday,November
2020-11-05 13:00:00,879.000000,0.879000,879.0,Thursday,November
2020-11-05 14:00:00,293.000000,0.293000,879.0,Thursday,November
...,...,...,...,...,...
2023-04-27 08:00:00,0.000000,0.000000,0.0,Thursday,April
2023-04-27 09:00:00,0.000000,0.000000,0.0,Thursday,April
2023-04-27 10:00:00,2401.500000,2.401500,3202.0,Thursday,April
2023-04-27 11:00:00,5282.833333,5.282833,12602.0,Thursday,April


In [9]:
{'energy_demand_kWh': {"order":(2, 0, 1), "seasonal_order":(0, 1, 2, 7)},
 'peak_power_W': {"order":(1, 0, 0), "seasonal_order":(0, 1, 2, 7)},
 'avg_power_demand_W': {"order":(2, 0, 1), "seasonal_order":(0, 1, 2, 7)}
}

{'energy_demand_kWh': {'order': (2, 0, 1), 'seasonal_order': (0, 1, 2, 7)},
 'peak_power_W': {'order': (1, 0, 0), 'seasonal_order': (0, 1, 2, 7)},
 'avg_power_demand_W': {'order': (2, 0, 1), 'seasonal_order': (0, 1, 2, 7)}}

In [10]:
{'energy_demand_kWh': {'best_depth': 57, 'best_n_neighbors': 25},
 'peak_power_W': {'best_depth': 57, 'best_n_neighbors': 23},
 'avg_power_demand_W': {'best_depth': 57, 'best_n_neighbors': 25}}

{'energy_demand_kWh': {'best_depth': 57, 'best_n_neighbors': 25},
 'peak_power_W': {'best_depth': 57, 'best_n_neighbors': 23},
 'avg_power_demand_W': {'best_depth': 57, 'best_n_neighbors': 25}}

In [11]:
df = dailydemand.loc[dailydemand["day"] == "Friday"]
df

Unnamed: 0_level_0,avg_power_demand_W,energy_demand_kWh,peak_power_W,day,month
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-06,0.000000,0.000000,0.0,Friday,November
2020-11-13,407.930556,9.790333,1277.0,Friday,November
2020-11-20,0.000000,0.000000,0.0,Friday,November
2020-11-27,0.000000,0.000000,0.0,Friday,November
2020-12-04,586.625000,14.079000,1482.0,Friday,December
...,...,...,...,...,...
2023-03-24,8251.256944,198.030167,36628.0,Friday,March
2023-03-31,751.975694,18.047417,12388.0,Friday,March
2023-04-07,6193.690972,148.648583,19607.0,Friday,April
2023-04-14,5319.187500,127.660500,12860.0,Friday,April


In [12]:
px.histogram(df, x="peak_power_W")