Testing Pipeline

In [2]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from app_utils import round_format_UNIX_time, ohe_day_name, ohe_federal_holiday

In [3]:

class CleanSession(BaseEstimator, TransformerMixin):
    """Unpack each session's serialized power log into one row per reading.

    The ``power`` column is expected to hold strings of the form
    ``"[{'power_W': Decimal('...'), 'timestamp': Decimal('...')}, ...]"``.
    ``transform`` strips the wrapper text, parses the remaining integers and
    joins the resulting ``time`` / ``power_demand`` columns back onto the
    input frame, so a session with N readings becomes N rows.
    """

    # Matches everything in a log string that is NOT a number or the ", "
    # separator: the leading "[{'power_W': Decimal('", the
    # "'timestamp': Decimal('" key, and each closing "')" with optional "}" / "]".
    pattern = r"(\[?\{'power_W':\sDecimal\(')|('timestamp':\sDecimal\(')|('\)\}?\]?)"

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` joined with per-reading ``time`` and ``power_demand``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a ``power`` column of serialized log strings.

        Returns
        -------
        pandas.DataFrame
            ``X`` left-joined (on its index) with one row per reading.
            Sessions whose ``power`` value is missing keep a single row with
            NaN in ``time`` and ``power_demand``.
        """
        # Skip sessions with no recorded log. Without this a NaN flows through
        # the .str calls untouched and the int-casting comprehension below
        # raises "TypeError: 'float' object is not iterable".
        # astype(str) is a no-op for string values; it keeps the .str accessor
        # usable when every value was missing and the column is float-typed.
        raw_logs = X["power"].dropna().astype(str)

        # pattern match, remove pattern instances, cast to int
        # after the replace each log reads "power, time, power, time, ..."
        flat_logs = raw_logs.str.replace(self.pattern, "", regex=True)
        power_and_time = flat_logs.str.split(", ")
        power_and_time = power_and_time.apply(lambda lst: [int(val) for val in lst])

        # even positions are power readings, odd positions are timestamps;
        # explode() unnests them to one row per reading (index repeats per session)
        power_vals = power_and_time.apply(lambda x: x[::2]).explode()
        # round_format_UNIX_time comes from app_utils; per the original comment
        # it rounds each timestamp to a clean 5-minute mark
        time_vals = power_and_time.apply(lambda x: x[1::2]).explode().apply(round_format_UNIX_time)

        # create df w/ time and power
        temp = pd.DataFrame({"time": time_vals, "power_demand": power_vals})

        # join w/ original dataframe (left join on index keeps log-less sessions)
        return X.join(temp)

class ExtractUpsampleGroupby(BaseEstimator, TransformerMixin):
    """Aggregate per-reading rows into a regular 5-minute power-demand series."""

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X):
        """Sum ``power_demand`` per ``time`` value, then resample to 5-minute bins.

        Expects ``X`` to carry ``time`` and ``power_demand`` columns. Returns a
        frame indexed by a DatetimeIndex named ``time`` with a single
        ``power_demand`` column.
        """
        # keep only the two relevant fields and total the demand per timestamp
        demand_by_time = (
            X[["time", "power_demand"]]
            .groupby("time")
            .sum()
            .sort_values(by="time")
        )

        # resample() needs a datetime-like index
        demand_by_time.index = pd.to_datetime(demand_by_time.index)

        # upsample to 5-min bins; bins with no readings sum to 0
        return demand_by_time.resample("5min").sum()

class OHEDaysHolidays(BaseEstimator, TransformerMixin):
    """Add holiday and day-of-week features using the ``app_utils`` helpers.

    NOTE(review): the helpers are defined outside this notebook. Judging by
    the pipeline output they add a ``Federal Holiday`` flag, a ``day`` name
    column and one indicator column per weekday -- confirm against app_utils.
    """

    def fit(self, X, y=None):
        """Stateless transformer: nothing is learned, returns ``self``."""
        return self

    def transform(self, X):
        """Apply ``ohe_federal_holiday`` and then ``ohe_day_name`` to ``X``."""
        with_holidays = ohe_federal_holiday(X)
        return ohe_day_name(with_holidays)

In [4]:
from sklearn.pipeline import Pipeline

# Stages run in order: unpack the raw power logs, aggregate them to a
# 5-minute demand series, then append holiday / weekday features.
# NOTE(review): "thingy" is a placeholder step name; consider something
# descriptive once nothing depends on the current name.
pipe = Pipeline(
    steps=[
        ("clean_session", CleanSession()),
        ("cleantime", ExtractUpsampleGroupby()),
        ("thingy", OHEDaysHolidays()),
    ]
)

In [5]:
# Load the raw session export; the "power" column holds each session's
# serialized power log, which CleanSession parses.
df = pd.read_csv("data/slrpEV11052020-09222022.csv")
# NOTE(review): the "Unnamed: 0" column in the preview looks like an index
# that was written to the CSV -- confirm before switching to index_col=0.
df.head()

Unnamed: 0,dcosId,userId,vehicle_model,vehicle_maxChgRate_W,siteId,stationId,connectTime,startChargeTime,Deadline,energyReq_Wh,...,sch_centsPerOverstayHr,Duration,DurationHrs,choice,regular,scheduled,cumEnergy_Wh,peakPower_W,power,lastUpdate
0,24,605,500e,6600,23,7,2020-11-05T10:30:16,2020-11-05T10:31:09,,,...,200.0,0 days 03:43:57,3.73249,REGULAR,1,0,3281.0,6335,"[{'power_W': Decimal('6259'), 'timestamp': Dec...",2020-11-05T14:15:06
1,26,486,Model 3,24000,23,3,2020-11-11T07:39:55,2020-11-11T07:39:59,,,...,200.0,0 days 06:50:07,6.83527,REGULAR,1,0,33458.0,7005,"[{'power_W': Decimal('0'), 'timestamp': Decima...",2020-11-11T14:30:06
2,30,620,Volt,3600,25,12,2020-11-13T16:19:55,2020-11-13T16:20:06,2020-11-14T04:15:00,18400.0,...,300.0,0 days 20:40:02,20.66722,SCHEDULED,0,1,15216.0,3450,"[{'power_W': Decimal('0'), 'timestamp': Decima...",2020-11-14T13:00:08
3,31,618,Bolt EV,7200,23,6,2020-11-14T23:47:06,2020-11-14T23:47:16,,,...,400.0,0 days 02:12:51,2.21416,REGULAR,1,0,14378.0,6889,"[{'power_W': Decimal('6889'), 'timestamp': Dec...",2020-11-15T02:00:07
4,32,623,B-Class Electric Drive,6000,23,9,2020-11-16T11:38:44,2020-11-16T11:42:22,,,...,,0 days 03:12:45,3.21249,REGULAR,1,0,12484.0,6852,"[{'power_W': Decimal('6813'), 'timestamp': Dec...",2020-11-16T14:55:07


In [6]:
# Run all three stages on the raw sessions. Every stage's fit() just returns
# self, so this is effectively a chained transform().
pipe.fit_transform(df)

Unnamed: 0_level_0,power_demand,Federal Holiday,day,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-11-05 10:40:00,6259,0,Thursday,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2020-11-05 10:45:00,6269,0,Thursday,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2020-11-05 10:50:00,6298,0,Thursday,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2020-11-05 10:55:00,6318,0,Thursday,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2020-11-05 11:00:00,6335,0,Thursday,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2022-08-22 16:55:00,0,0,Monday,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2022-08-22 17:00:00,0,0,Monday,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2022-08-22 17:05:00,0,0,Monday,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2022-08-22 17:10:00,0,0,Monday,0.0,1.0,0.0,0.0,0.0,0.0,0.0
