In [1]:
import pandas as pd 
from sklearn.pipeline import Pipeline
import plotly.express as px
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# Let pandas display up to 100 columns before truncating wide frames in output.
pd.set_option("display.max_columns", 100)

In [56]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd


class SortDropCast(BaseEstimator, TransformerMixin):
    """
    First pipeline step: tidy the raw session records.

    Rows are ordered by "connectTime", the "user_email" and "slrpPaymentId"
    columns are removed, the index is renumbered from zero, and the
    "cumEnergy_Wh" and "peakPower_W" columns are converted to float.
    """

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    @staticmethod
    def transform(X) -> pd.DataFrame:
        """Return a sorted, trimmed and float-cast copy of X."""
        ordered = X.sort_values(by="connectTime")
        trimmed = ordered.drop(columns=["user_email", "slrpPaymentId"])
        result = trimmed.reset_index(drop=True)
        for numeric_column in ("cumEnergy_Wh", "peakPower_W"):
            result[numeric_column] = result[numeric_column].astype(float)
        return result


class HelperFeatureCreation(BaseEstimator, TransformerMixin):
    """
    Pipeline step that filters unusable sessions and derives helper columns.

    Records that contain 0 for "peakPower_W" or "cumEnergy_Wh" are dropped.
    The columns "connectTime", "startChargeTime", "Deadline" and "lastUpdate"
    are parsed to datetimes, and five columns are created:
    "finishChargeTime", "trueDurationHrs", "true_peakPower_W", "Overstay"
    and "Overstay_h". Records whose calculated charging duration is not
    positive or is greater than a day are dropped. The data (with the new
    columns) at this stage is saved to "data/raw_data.csv".
    """

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    @classmethod
    def transform(cls, X) -> pd.DataFrame:
        """
        Return a filtered copy of X with the derived columns added.

        Side effect: the returned frame is also written to
        "data/raw_data.csv" (the "data" directory is created if missing).
        """
        from pathlib import Path

        has_power = X["peakPower_W"] != 0
        has_energy = X["cumEnergy_Wh"] != 0
        X = X.loc[has_power & has_energy].copy(deep=True)

        for column in ("connectTime", "startChargeTime", "Deadline", "lastUpdate"):
            X[column] = pd.to_datetime(X[column])

        X["finishChargeTime"] = cls._finish_charge_time(X)
        X["trueDurationHrs"] = cls._duration_hours(X)
        X["true_peakPower_W"] = X["cumEnergy_Wh"] / X["trueDurationHrs"]

        # Drop implausible sessions. Zero-length sessions would carry an
        # infinite average power, negative ones a negative power, and
        # anything longer than a day is treated as bad data. Rows with a
        # missing duration fail both comparisons and are dropped too.
        plausible = (X["trueDurationHrs"] > 0) & (X["trueDurationHrs"] <= 24)
        X = X.loc[plausible].copy()

        # Overstay is how long the session was still updating after its
        # deadline, floored at zero. A missing deadline (NaT) counts as
        # zero overstay because the NaT comparison is False.
        zero = pd.Timedelta(0)
        overstay = X["lastUpdate"] - X["Deadline"]
        X["Overstay"] = overstay.where(overstay > zero, zero)
        # total_seconds() keeps whole days; `.dt.seconds` would only return
        # the sub-day remainder and under-report multi-day overstays.
        X["Overstay_h"] = X["Overstay"].dt.total_seconds() / 3600

        Path("data").mkdir(parents=True, exist_ok=True)
        X.to_csv("data/raw_data.csv")

        return X

    @staticmethod
    def _finish_charge_time(X) -> pd.Series:
        """Return "lastUpdate" where "regular" == 1, otherwise "Deadline"."""
        return X["lastUpdate"].where(X["regular"] == 1, X["Deadline"])

    @staticmethod
    def _duration_hours(X) -> pd.Series:
        """
        Return hours from "startChargeTime" to "finishChargeTime", rounded to
        3 decimals. Uses the full elapsed time (days included) so that
        sessions longer than a day are not wrapped into the 0-24 h range.
        """
        elapsed = X["finishChargeTime"] - X["startChargeTime"]
        return (elapsed.dt.total_seconds() / 3600).round(3)
        

class CreateSessionTimeSeries(BaseEstimator, TransformerMixin):
    """
    This pipeline step creates a time series for each session and adds them
    together. A dataframe with 5-min granularity is returned, with one
    column, "avg_power_demand_W".
    """

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    def transform(self, X) -> pd.DataFrame:
        """
        Build one constant-power series per session and sum them per 5 minutes.

        X must provide the columns "startChargeTime", "finishChargeTime" and
        "true_peakPower_W". When X has no rows, an empty frame with a
        DatetimeIndex and the "avg_power_demand_W" column is returned
        (pd.concat would raise on an empty list).
        """
        session_frames = [
            self._session_frame(start, finish, power)
            for start, finish, power in zip(
                X["startChargeTime"], X["finishChargeTime"], X["true_peakPower_W"]
            )
        ]
        if not session_frames:
            return pd.DataFrame(
                columns=["avg_power_demand_W"],
                index=pd.DatetimeIndex([]),
                dtype=float,
            )

        combined = pd.concat(session_frames, axis=0).sort_index()
        # Lower-case "5min" matches the alias used for date_range below; the
        # upper-case "5MIN" spelling is deprecated in recent pandas.
        return combined.resample("5min").sum()

    @staticmethod
    def _session_frame(start, finish, power) -> pd.DataFrame:
        """
        Return a frame indexed every 5 minutes from `start` to `finish`
        (inclusive) whose "avg_power_demand_W" column holds `power` in
        every row.
        """
        date_range = pd.date_range(start=start, end=finish, freq="5min")
        frame = pd.DataFrame(index=date_range)
        frame["avg_power_demand_W"] = power
        return frame


class FeatureCreation(BaseEstimator, TransformerMixin):
    """
    This pipeline step creates an "energy_demand_kWh" and a "peak_power_W"
    column. The dataframe's index is named "time", and "day" and "month"
    columns are created from it.
    """

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    @staticmethod
    def transform(X) -> pd.DataFrame:
        """
        Return a copy of X with the derived columns added.

        X must have a DatetimeIndex and an "avg_power_demand_W" column.
        The input frame is left untouched.
        """
        # Work on a copy so the caller's frame (columns and index name) is
        # not modified as a side effect.
        X = X.copy()

        # Each row covers a 5-minute interval, i.e. 1/12 of an hour, so
        # energy in kWh is power in kW divided by 12.
        intervals_per_hour = 12
        X["energy_demand_kWh"] = (X["avg_power_demand_W"] / 1000) / intervals_per_hour
        # for the highest granularity, peak power is equal to the power demand
        # (different for different granularities though)
        X["peak_power_W"] = X["avg_power_demand_W"]
        X.index = X.index.rename("time")
        X["day"] = X.index.day_name()
        X["month"] = X.index.month_name()
        return X


class SaveToCsv(BaseEstimator, TransformerMixin):
    """
    This pipeline step takes the 5-minute dataframe and creates new
    granularities (hourly, daily, and monthly). Each dataframe is saved as
    "<name>.csv" inside `output_dir` (default "data").

    Parameters
    ----------
    output_dir : directory the CSV files are written to; it is created if
        it does not exist.
    """

    def __init__(self, output_dir: str = "data") -> None:
        self.output_dir = output_dir
        # How each column is aggregated when moving to a coarser granularity.
        self.agg_key = {
            "avg_power_demand_W": "mean",
            "energy_demand_kWh": "sum",
            "peak_power_W": "max",
            "day": "first",
            "month": "first"
        }
        # Kept for backward compatibility; the file names now come straight
        # from the keys of the dict built in transform().
        self.dataframe_names = [
            "fivemindemand",
            "hourlydemand",
            "dailydemand",
            "monthlydemand"
        ]
        super().__init__()

    def fit(self, X, y=None):
        # Stateless step: nothing is learned from the data.
        return self

    def transform(self, X) -> dict:
        """
        Resample X, write every granularity to CSV, and return them.

        Returns a dict with the keys "fivemindemand" (X itself),
        "hourlydemand", "dailydemand" and "monthlydemand".
        """
        from pathlib import Path

        # Offset objects are used instead of alias strings because the
        # "H" and "M" aliases are deprecated in recent pandas, while the
        # replacement "ME" is unknown to older versions.
        resample_rules = {
            "hourlydemand": pd.offsets.Hour(1),
            "dailydemand": pd.offsets.Hour(24),
            "monthlydemand": pd.offsets.MonthEnd(1),
        }

        new_dataframes = {"fivemindemand": X}
        for name, rule in resample_rules.items():
            new_dataframes[name] = X.resample(rule).agg(self.agg_key)

        # save to file system
        target_dir = Path(self.output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        for name, dataframe in new_dataframes.items():
            dataframe.to_csv(target_dir / f"{name}.csv")
        return new_dataframes


In [57]:
# Chain the five steps in order: clean the raw records, derive per-session
# helper columns, expand sessions into a 5-minute time series, add
# energy/calendar features, then resample and write the CSV files.
pipe = Pipeline([
    ("1", SortDropCast()),
    ("2", HelperFeatureCreation()),
    ("3", CreateSessionTimeSeries()),
    ("4", FeatureCreation()),
    ("5", SaveToCsv())
])

In [58]:
from datacleaning.FetchData import FetchData

# Load the raw session records that are fed into the pipeline.
# NOTE(review): judging by its name, this presumably scans the backing data
# store and also saves a copy — confirm in datacleaning/FetchData.
raw = FetchData.scan_save_all_records()

In [59]:
# Run every step. Despite its name, `df` is a dict of DataFrames keyed by
# granularity (the return value of the last step, SaveToCsv); CSV files are
# written as a side effect of steps 2 and 5.
df = pipe.fit_transform(raw)
df

{'fivemindemand':                      avg_power_demand_W  energy_demand_kWh  peak_power_W  \
 time                                                                       
 2020-11-05 10:30:00          879.153269           0.073263    879.153269   
 2020-11-05 10:35:00          879.153269           0.073263    879.153269   
 2020-11-05 10:40:00          879.153269           0.073263    879.153269   
 2020-11-05 10:45:00          879.153269           0.073263    879.153269   
 2020-11-05 10:50:00          879.153269           0.073263    879.153269   
 ...                                 ...                ...           ...   
 2023-03-20 18:50:00         6382.719468           0.531893   6382.719468   
 2023-03-20 18:55:00         6382.719468           0.531893   6382.719468   
 2023-03-20 19:00:00         6382.719468           0.531893   6382.719468   
 2023-03-20 19:05:00         6382.719468           0.531893   6382.719468   
 2023-03-20 19:10:00         6382.719468           0.531893

In [63]:
# Inspect the 5-minute granularity frame from the pipeline result.
df["fivemindemand"]

Unnamed: 0_level_0,avg_power_demand_W,energy_demand_kWh,peak_power_W,day,month
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-11-05 10:30:00,879.153269,0.073263,879.153269,Thursday,November
2020-11-05 10:35:00,879.153269,0.073263,879.153269,Thursday,November
2020-11-05 10:40:00,879.153269,0.073263,879.153269,Thursday,November
2020-11-05 10:45:00,879.153269,0.073263,879.153269,Thursday,November
2020-11-05 10:50:00,879.153269,0.073263,879.153269,Thursday,November
...,...,...,...,...,...
2023-03-20 18:50:00,6382.719468,0.531893,6382.719468,Monday,March
2023-03-20 18:55:00,6382.719468,0.531893,6382.719468,Monday,March
2023-03-20 19:00:00,6382.719468,0.531893,6382.719468,Monday,March
2023-03-20 19:05:00,6382.719468,0.531893,6382.719468,Monday,March


In [64]:
# Read back the per-session file written by HelperFeatureCreation.
# The index was saved without a name and no index_col is given here, so it
# comes back as an "Unnamed: ..." column.
pd.read_csv("data/raw_data.csv")

Unnamed: 0.1,Unnamed: 0,vehicle_maxChgRate_W,peakPower_W,sch_centsPerHr,connectTime,vehicle_model,Duration,userId,regular,Deadline,startChargeTime,sch_centsPerOverstayHr,sch_centsPerKwh,choice,siteId,estCost,DurationHrs,dcosId,lastUpdate,energyReq_Wh,power,stationId,defaultDeadline,scheduled,cumEnergy_Wh,reg_centsPerHr,finishChargeTime,trueDurationHrs,true_peakPower_W,Overstay,Overstay_h
0,0,6600,6335.0,9.0,2020-11-05 10:30:16,500e,0 days 03:43:57,605,1,,2020-11-05 10:31:09,200.0,15.0,REGULAR,23,5.35224,3.73249,24,2020-11-05 14:15:06,,"[{'power_W': Decimal('6259'), 'timestamp': Dec...",7,1969-12-31T16:00:00,0,3281.0,130.0,2020-11-05 14:15:06,3.732,879.153269,0 days 00:00:00,0.000000
1,1,24000,7005.0,3.0,2020-11-11 07:39:55,Model 3,0 days 06:50:07,486,1,,2020-11-11 07:39:59,200.0,15.0,REGULAR,23,10.75291,6.83527,26,2020-11-11 14:30:06,,"[{'power_W': Decimal('0'), 'timestamp': Decima...",3,2020-11-12T03:11:00,0,33458.0,150.0,2020-11-11 14:30:06,6.835,4895.098756,0 days 00:00:00,0.000000
2,2,3600,3450.0,3.0,2020-11-13 16:19:55,Volt,0 days 20:40:02,620,0,2020-11-14 04:15:00,2020-11-13 16:20:06,300.0,12.0,SCHEDULED,25,29.32211,20.66722,30,2020-11-14 13:00:08,18400.0,"[{'power_W': Decimal('0'), 'timestamp': Decima...",12,2020-11-14T04:11:00,1,15216.0,180.0,2020-11-14 04:15:00,11.915,1277.045741,0 days 08:45:08,8.752222
3,3,7200,6889.0,3.0,2020-11-14 23:47:06,Bolt,0 days 02:12:51,618,1,,2020-11-14 23:47:16,400.0,18.0,REGULAR,23,3.82125,2.21416,31,2020-11-15 02:00:07,,"[{'power_W': Decimal('6889'), 'timestamp': Dec...",6,1969-12-31T16:00:00,0,14378.0,150.0,2020-11-15 02:00:07,2.214,6494.128275,0 days 00:00:00,0.000000
4,4,6000,6852.0,,2020-11-16 11:38:44,B-Class Electric Drive,0 days 03:12:45,623,1,,2020-11-16 11:42:22,,,REGULAR,23,,3.21249,32,2020-11-16 14:55:07,,"[{'power_W': Decimal('6813'), 'timestamp': Dec...",9,2020-11-17T04:11:00,0,12484.0,,2020-11-16 14:55:07,3.212,3886.674969,0 days 00:00:00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2741,3349,50000,6325.0,200.0,2023-03-20 09:16:39,Bolt EUV,0 days 06:43:21,1271,1,,2023-03-20 09:16:47,300.0,0.0,REGULAR,25,10.58375,6.72250,4561,2023-03-20 16:00:08,,"[{'power_W': Decimal('6280'), 'timestamp': Dec...",12,2023-03-20T18:00:00,0,40397.0,150.0,2023-03-20 16:00:08,6.723,6008.775844,0 days 00:00:00,0.000000
2742,3350,3600,6675.0,200.0,2023-03-20 11:01:35,Volt,0 days 04:22:46,1261,0,2023-03-20 16:15:00,2023-03-20 11:07:23,300.0,0.0,SCHEDULED,25,10.75388,4.37944,4562,2023-03-20 15:30:09,18400.0,"[{'power_W': Decimal('6664'), 'timestamp': Dec...",17,1969-12-31T16:00:00,1,15166.0,150.0,2023-03-20 16:15:00,5.127,2958.065145,0 days 00:00:00,0.000000
2743,3351,6500,6750.0,200.0,2023-03-20 12:01:15,Mach-E,0 days 07:13:20,1208,1,,2023-03-20 12:01:48,300.0,0.0,REGULAR,25,11.33333,7.22222,4563,2023-03-20 19:15:08,,"[{'power_W': Decimal('6699'), 'timestamp': Dec...",11,2023-03-20T20:00:00,0,46096.0,150.0,2023-03-20 19:15:08,7.222,6382.719468,0 days 00:00:00,0.000000
2744,3352,7000,3889.0,200.0,2023-03-20 13:32:02,Leaf,0 days 03:35:42,1067,1,,2023-03-20 13:34:26,300.0,0.0,REGULAR,25,5.89250,3.59500,4564,2023-03-20 17:10:08,,"[{'power_W': Decimal('947'), 'timestamp': Deci...",18,2023-03-20T15:00:00,0,7255.0,150.0,2023-03-20 17:10:08,3.595,2018.080668,0 days 00:00:00,0.000000
