In [1]:
import pandas as pd 
from sklearn.pipeline import Pipeline
import plotly.express as px
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from datetime import datetime

# Raise pandas' "display.max_columns" option to 100 so wide frames are not truncated when displayed.
pd.set_option("display.max_columns", 100)

In [35]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin


## Main Class
class HourlyCrossValidator:
    """Run the kNN hyper-parameter search once per hourly target column.

    Attributes
    ----------
    columns : list of str
        Target columns that are cross-validated individually.
    max_neighbors
        Upper bound handed to ``kNNCrossValidator`` for the neighbour search.
    max_depth
        Upper bound handed to ``kNNCrossValidator`` for the lag-depth search.
    """

    columns = ["energy_demand_kWh", "peak_power_W"]

    def __init__(self, max_neighbors, max_depth):
        self.max_depth = max_depth
        self.max_neighbors = max_neighbors

    def cross_validate(self, df):
        """Return the best parameters found for each target column.

        Parameters
        ----------
        df
            Data passed unchanged to ``kNNCrossValidator.cross_validate_one``.

        Returns
        -------
        dict
            One entry per name in ``columns`` plus ``"avg_power_demand_W"``,
            which reuses the ``"energy_demand_kWh"`` result.
        """
        # one independent search per target column
        results = {
            target: kNNCrossValidator(
                self.max_neighbors, self.max_depth, target
            ).cross_validate_one(df)
            for target in self.columns
        }

        # same data, different units, so same parameters
        results["avg_power_demand_W"] = results["energy_demand_kWh"]

        return results
    

## Helper Class
class kNNCrossValidator:
    """Grid-search kNN hyper-parameters for a single target column.

    Lag features of ``col_name`` are created up to ``max_depth``. A grid search
    over the number of neighbours (10 .. ``max_neighbors``) and the number of
    lag features (40 .. ``max_depth``) is then scored on one chronological
    train/validation fold.
    """

    test_size = 0.2

    def __init__(self, max_neighbors, max_depth, col_name):
        super().__init__()
        self.col_name = col_name
        self.max_depth = max_depth
        self.max_neighbors = max_neighbors

    def cross_validate_one(self, df) -> dict:
        """Search the grid and return the winning depth and neighbour count.

        Returns
        -------
        dict
            ``{"best_depth": ..., "best_n_neighbors": ...}`` taken from the
            grid search's ``best_params_``.
        """
        lagged = self.__create_all_lag_features(df)

        # Chronological split; only the train+validation part and the size of
        # the train part are needed by the search below.
        (X_fit, X_train, _X_validation, _X_test,
         y_fit, _y_train, _y_validation, _y_test) = self.__train_test_split(lagged)

        # Single predefined fold: the first len(X_train) rows are fitted on,
        # the remaining rows of the train+validation set are scored.
        n_train = len(X_train)
        n_fit = len(X_fit)
        single_fold = [(np.arange(0, n_train), np.arange(n_train, n_fit))]

        search_space = {
            # start searching at 10 neighbors
            "estimator__n_neighbors": np.arange(10, self.max_neighbors + 1),
            # start searching at 40 lags as features
            "subset_features__num_lags": np.arange(40, self.max_depth + 1),
        }

        search = GridSearchCV(
            estimator=Pipeline([
                ("subset_features", SubsetLags()),
                ("estimator", KNeighborsRegressor()),
            ]),
            param_grid=search_space,
            scoring="neg_mean_squared_error",
            n_jobs=8,
            verbose=4,
            cv=single_fold,
        )
        search.fit(X_fit, y_fit)

        winner = search.best_params_
        return {
            "best_depth": winner["subset_features__num_lags"],
            "best_n_neighbors": winner["estimator__n_neighbors"],
        }

    def __create_all_lag_features(self, df) -> pd.DataFrame:
        """Return ``df`` after running it through ``CreateLagFeatures`` for ``max_depth`` lags."""
        feature_step = CreateLagFeatures(self.max_depth, self.col_name)
        feature_pipeline = Pipeline([("create_features", feature_step)])
        return feature_pipeline.fit_transform(df)

    def __train_test_split(self, df):
        """Split chronologically into train+validation / test, then train / validation.

        Returns
        -------
        tuple
            ``(X_train_validation, X_train, X_validation, X_test,
            y_train_validation, y_train, y_validation, y_test)``.
        """
        features = df.filter(regex="lag")  # select only "lag-" features
        target = df[[self.col_name]]

        # outer split: hold out the last ``test_size`` share as test data
        X_train_validation, X_test, y_train_validation, y_test = train_test_split(
            features, target, test_size=self.test_size, shuffle=False
        )

        # inner split: last 20% of the remainder becomes the validation set
        inner_parts = train_test_split(
            X_train_validation, y_train_validation, test_size=0.2, shuffle=False
        )
        X_train, X_validation, y_train, y_validation = inner_parts

        return (
            X_train_validation, X_train, X_validation, X_test,
            y_train_validation, y_train, y_validation, y_test,
        )



## Pipeline Classes
class CreateLagFeatures(BaseEstimator, TransformerMixin):
    """Append lagged copies of one column as features ``lag1`` .. ``lagN``.

    Parameters
    ----------
    num_lag_depths : int
        Number of lag columns to create. Column ``lag<k>`` is ``col_name``
        shifted down by ``24 * k`` rows.
    col_name : str
        Name of the column the lags are computed from.
    """

    def __init__(self, num_lag_depths, col_name):
        super().__init__()
        # scikit-learn's get_params()/clone()/repr read each __init__ parameter
        # back from an attribute with exactly the same name.
        self.num_lag_depths = num_lag_depths
        self.col_name = col_name

    @property
    def num_lags_depths(self):
        """Alias for ``num_lag_depths`` kept for code that used the old attribute name."""
        return self.num_lag_depths

    @num_lags_depths.setter
    def num_lags_depths(self, value):
        self.num_lag_depths = value

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the lag columns appended and incomplete rows dropped."""
        return self.__create_lag_features(X, self.num_lag_depths, self.col_name)

    @staticmethod
    def __create_lag_features(df, num_lag_depths, col_name):
        source = df[col_name]
        # Build every lag column first and concatenate once, instead of
        # re-concatenating the growing frame on each iteration.
        lag_columns = [
            source.shift(24 * lag_depth).rename("lag" + f"{lag_depth}")
            for lag_depth in np.arange(1, num_lag_depths + 1)
        ]
        df_with_lags = pd.concat([df, *lag_columns], axis=1)
        # dropna removes every row with a missing value in any column, which
        # includes the leading rows that have no value for the deepest lag.
        return df_with_lags.dropna()


class SubsetLags(BaseEstimator, TransformerMixin):
    """Keep only the lag columns ``lag1`` .. ``lag<num_lags>`` of the input frame.

    Parameters
    ----------
    num_lags : int, default 1
        How many lag columns, counted from ``lag1``, are kept.
    """

    def __init__(self, num_lags=1):
        super().__init__()
        self.num_lags = num_lags

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return the columns ``lag1`` .. ``lag<num_lags>`` of ``X``."""
        return self.__select_subset_lags(X, self.num_lags)

    @staticmethod
    def __select_subset_lags(df, num_lags):
        wanted_columns = []
        for depth in np.arange(1, num_lags + 1):
            wanted_columns.append(f"lag{depth}")
        return df[wanted_columns]


import pandas as pd
from sklearn.model_selection import train_test_split
import pmdarima as pm


# Main Class
class DailyCrossValidator:
    """Run ``SARIMACrossValidator`` once per daily target column."""

    columns = ["energy_demand_kWh", "peak_power_W"]

    @classmethod
    def cross_validate(cls, df):
        """Return the cross-validation result for each target column.

        Parameters
        ----------
        df
            Data passed unchanged to ``SARIMACrossValidator.cross_validate_one``.

        Returns
        -------
        dict
            One entry per name in ``columns`` plus ``"avg_power_demand_W"``,
            which reuses the ``"energy_demand_kWh"`` result.
        """
        # initialize one cross validator per column and run it
        results = {
            target: SARIMACrossValidator(target).cross_validate_one(df)
            for target in cls.columns
        }

        # same data, different units, so same parameters
        results["avg_power_demand_W"] = results["energy_demand_kWh"]

        return results


# Helper Class
class SARIMACrossValidator:
    """Fit a stepwise ``pm.auto_arima`` search on the training part of one column."""

    test_size = 0.2

    def __init__(self, col_name):
        super().__init__()
        self.col_name = col_name

    def cross_validate_one(self, df: pd.DataFrame):
        """Return the object produced by ``pm.auto_arima`` for ``col_name``.

        Only the training part of the chronological split is passed to the
        search; the test part is not used here.
        """
        _X_train, _X_test, y_train, _y_test = self.__train_test_split(df)

        search_options = dict(
            start_p=0, start_q=0,
            max_p=3, max_q=3, max_Q=3, max_P=3,
            d=0, D=1, m=7,
            X=None,
            seasonal=True, trace=True, stepwise=True,
        )
        return pm.auto_arima(y_train, **search_options)

    def __train_test_split(self, df: pd.DataFrame):
        """Split ``col_name`` chronologically; the X parts carry no columns."""
        # take only needed column
        target_only = df[[self.col_name]]
        no_features = target_only.drop(columns=[self.col_name])
        target = target_only[[self.col_name]]

        X_train, X_test, y_train, y_test = train_test_split(
            no_features, target, test_size=self.test_size, shuffle=False
        )
        return X_train, X_test, y_train, y_test


import pandas as pd
import statsmodels.api as sm 

class CreateDailyForecasts:
    """Fit one ARIMA model per target column and append its forecast to the stored ones.

    Forecasts are kept in ``forecastdata/dailyforecasts.csv`` with one
    ``<column>_predictions`` column per entry of ``columns``.
    """

    columns = ["avg_power_demand_W", "energy_demand_kWh", "peak_power_W"]

    def __init__(self):
        # ``self`` was missing, which made the class impossible to instantiate.
        pass

    @classmethod
    def run_daily_forecast(cls, df, best_params: dict):
        """Return the stored forecasts with the newly computed forecast rows appended.

        Parameters
        ----------
        df
            Frame containing every column named in ``columns``; each model is
            trained on all rows of its column.
        best_params : dict
            NOTE(review): currently unused, the model orders below are
            hard-coded. Confirm whether the cross-validated orders should be
            applied here.

        Returns
        -------
        pandas.DataFrame
            Existing forecasts read from the CSV followed by the new ones.
            Nothing is written back to disk by this method.
        """
        existing_forecasts = pd.read_csv("forecastdata/dailyforecasts.csv", index_col="time", parse_dates=True)

        column_forecasts = []
        for column in cls.columns:
            # train on ALL available data
            train = df[[column]].copy()
            # create ARIMA model
            best_model_arima = sm.tsa.arima.ARIMA(train, order=(1,0,0), seasonal_order=(1,1,1,7)).fit()
            # forecast with the default horizon
            one_column_forecast = best_model_arima.forecast()
            # Rename the forecast Series rather than building a DataFrame with
            # ``columns=[...]``: the latter selects a column by that name from
            # the (differently named) Series and loses the forecast values.
            column_forecasts.append(
                pd.Series(one_column_forecast).rename(column + '_predictions')
            )

        # assemble all columns in one concat instead of growing a frame in the loop
        new_forecasts = pd.concat(column_forecasts, axis=1) if column_forecasts else pd.DataFrame()

        # append new forecasts to the existing set of forecasts
        forecasts = pd.concat([existing_forecasts, new_forecasts], axis=0)

        return forecasts

    @staticmethod
    def save_empty_prediction_df():
        """Write an empty forecast table (header only) to the CSV and return it."""
        empty_df = pd.DataFrame(columns=["avg_power_demand_W_predictions", "energy_demand_kWh_predictions", "peak_power_W_predictions"], index=pd.Index([], name="time"))
        empty_df.to_csv("forecastdata/dailyforecasts.csv")
        return empty_df


In [2]:
import redis 
import pickle
# Client for the Redis server on localhost.
# NOTE(review): port 6360 is not Redis' usual default (6379) — confirm it matches the running server.
redis_client = redis.Redis(host='localhost', port=6360)

In [3]:
# Fetch the pickled raw data stored under the "raw_data" key.
raw_payload = redis_client.get("raw_data")
if raw_payload is None:
    # Without this guard a missing key surfaces as an opaque TypeError from pickle.loads(None).
    raise KeyError('Redis key "raw_data" not found; store the raw data in Redis before running this cell')
# NOTE: unpickling can execute arbitrary code — only read from a Redis instance you trust.
df = pickle.loads(raw_payload)
df.head()

Unnamed: 0.1,Unnamed: 0,vehicle_maxChgRate_W,peakPower_W,sch_centsPerHr,connectTime,vehicle_model,Duration,userId,regular,startChargeTime,sch_centsPerOverstayHr,sch_centsPerKwh,choice,siteId,estCost,DurationHrs,dcosId,lastUpdate,power,stationId,defaultDeadline,scheduled,cumEnergy_Wh,reg_centsPerHr,Deadline,energyReq_Wh,finishChargeTime,trueDurationHrs,true_peakPower_W,Overstay,Overstay_h
0,0,6600,6335.0,9.0,2020-11-05 10:30:16,500e,0 days 03:43:57,605,1,2020-11-05 10:31:09,200.0,15.0,REGULAR,23,5.35224,3.73249,24,2020-11-05 14:15:06,"[{'power_W': Decimal('6259'), 'timestamp': Dec...",7,1969-12-31T16:00:00,0,3281.0,130.0,,,2020-11-05 14:15:06,3.732,879.0,0 days 00:00:00,0.0
1,1,24000,7005.0,3.0,2020-11-11 07:39:55,Model 3,0 days 06:50:07,486,1,2020-11-11 07:39:59,200.0,15.0,REGULAR,23,10.75291,6.83527,26,2020-11-11 14:30:06,"[{'power_W': Decimal('0'), 'timestamp': Decima...",3,2020-11-12T03:11:00,0,33458.0,150.0,,,2020-11-11 14:30:06,6.835,4895.0,0 days 00:00:00,0.0
2,2,3600,3450.0,3.0,2020-11-13 16:19:55,Volt,0 days 20:40:02,620,0,2020-11-13 16:20:06,300.0,12.0,SCHEDULED,25,29.32211,20.66722,30,2020-11-14 13:00:08,"[{'power_W': Decimal('0'), 'timestamp': Decima...",12,2020-11-14T04:11:00,1,15216.0,180.0,2020-11-14 04:15:00,18400.0,2020-11-14 04:15:00,11.915,1277.0,0 days 08:45:08,8.752222
3,3,7200,6889.0,3.0,2020-11-14 23:47:06,Bolt,0 days 02:12:51,618,1,2020-11-14 23:47:16,400.0,18.0,REGULAR,23,3.82125,2.21416,31,2020-11-15 02:00:07,"[{'power_W': Decimal('6889'), 'timestamp': Dec...",6,1969-12-31T16:00:00,0,14378.0,150.0,,,2020-11-15 02:00:07,2.214,6494.0,0 days 00:00:00,0.0
4,4,6000,6852.0,,2020-11-16 11:38:44,B-Class Electric Drive,0 days 03:12:45,623,1,2020-11-16 11:42:22,,,REGULAR,23,,3.21249,32,2020-11-16 14:55:07,"[{'power_W': Decimal('6813'), 'timestamp': Dec...",9,2020-11-17T04:11:00,0,12484.0,,,,2020-11-16 14:55:07,3.212,3887.0,0 days 00:00:00,0.0


In [4]:
# Count how often each (sch_centsPerHr, reg_centsPerHr, choice) combination occurs.
df2 = (
    df[["sch_centsPerHr", "reg_centsPerHr", "choice"]]
    .value_counts()
    .to_frame("counts")
    .reset_index()
)
df2

Unnamed: 0,sch_centsPerHr,reg_centsPerHr,choice,counts
0,250.0,137.0,REGULAR,390
1,200.0,150.0,REGULAR,337
2,127.0,184.0,SCHEDULED,210
3,227.0,137.0,REGULAR,117
4,127.0,184.0,REGULAR,81
...,...,...,...,...
316,120.0,300.0,REGULAR,1
317,120.0,100.0,SCHEDULED,1
318,115.0,125.0,REGULAR,1
319,112.0,75.0,SCHEDULED,1


In [5]:
# Scatter of the two hourly prices, coloured by choice, with marker size scaled by the count.
fig = px.scatter(
    df2,
    x="reg_centsPerHr",
    y="sch_centsPerHr",
    color="choice",
    size="counts",
    size_max=100,
)
fig