<h1>Split time series data</h1>

In [1]:
import os 
import random
import numpy as np 
import pandas as pd

In [2]:
pd.options.mode.chained_assignment = None

In [3]:
df = pd.read_csv("../data/transformed/watermain_breaks_dataset.csv")

In [4]:
df['first_break'] = pd.to_datetime(df['first_break'])
df['most_recent_break'] = pd.to_datetime(df['most_recent_break'])

See current date range

In [5]:
df['first_break'].agg(['max', 'min'])

max   2023-08-07 14:04:00
min   2013-01-02 08:49:00
Name: first_break, dtype: datetime64[ns]

In [6]:
df['most_recent_break'].agg(['max', 'min'])

max   2023-08-07 14:04:00
min   2013-01-02 08:49:00
Name: most_recent_break, dtype: datetime64[ns]

<h2>Split Data</h2>

Using January 1st, 2019 as cutoff

In [7]:
#how much of the total data we want in train set
TRAIN_SIZE = 0.85
VALIDATION_SIZE = 0.15
TRAIN_TEST_CUTOFF = pd.to_datetime('2019-01-01')

In [8]:
install_after_cutoff = df[pd.to_datetime(df['INSTALLDAT']) > TRAIN_TEST_CUTOFF]
train_eligible = df[pd.to_datetime(df['INSTALLDAT']) <= TRAIN_TEST_CUTOFF]

In [9]:
TRAIN_N = int(np.round(train_eligible.shape[0] * TRAIN_SIZE))

In [10]:
facility_ids = train_eligible['FACILITYID'].to_list()

random.Random(42).shuffle(facility_ids)

train_facility_ids = facility_ids[:TRAIN_N]
test_facility_ids = facility_ids[TRAIN_N:]

In [11]:
test = df[df['FACILITYID'].isin(test_facility_ids)]
train = df[df['FACILITYID'].isin(train_facility_ids)]

In [12]:
# add pipes installed after cutoff to test 
test = pd.concat([install_after_cutoff, test])

In [13]:
VAL_N = int(np.round(test.shape[0] * VALIDATION_SIZE))
test_facility_ids = test['FACILITYID'].to_list()
random.Random(42).shuffle(test_facility_ids)

val_facility_ids = test_facility_ids[:VAL_N]

In [14]:
val = test[test['FACILITYID'].isin(val_facility_ids)]
test = test[~test['FACILITYID'].isin(val_facility_ids)]

<h3>Process Dates</h3>

In [15]:
train['all_breaks'] = train['all_breaks'].apply(str)

In [16]:
train['all_breaks'] = train['all_breaks'].apply(str)
train['all_breaks'] = train['all_breaks'].apply(lambda s: ",".join([t for t in s.split(",") if pd.to_datetime(t) <= TRAIN_TEST_CUTOFF]))

In [17]:
train.shape, val.shape, test.shape

((20826, 14), (681, 14), (3856, 14))

<h2>Save to files</h2>

In [18]:
train.to_csv("../data/transformed/watermain_breaks_train.csv", index = False)

In [19]:
test.to_csv("../data/transformed/watermain_breaks_test.csv", index = False)

In [20]:
val.to_csv("../data/transformed/watermain_breaks_validation.csv", index = False)

In [21]:
train

Unnamed: 0,ENABLED,FACILITYID,LOCATION,INSTALLDAT,SUBTYPE,MATERIAL,LENGTH,DIAMETER,STATUS,PressureSy,break_status,all_breaks,first_break,most_recent_break
862,1,00-071340,Xlot 2625 Jackson Ave,2019-01-01,1,DI,18.0,12.0,IS,WH,has never broken,,NaT,NaT
863,1,00-071770,Hayward St,2018-12-01,1,DI,11.0,12.0,IS,GED,has never broken,,NaT,NaT
864,1,00-071764,2505 Hayward St,2018-12-01,3,DI,3.0,8.0,IS,GED,has never broken,,NaT,NaT
865,1,00-071761,Hayward St,2018-12-01,1,DI,6.0,12.0,IS,GED,has never broken,,NaT,NaT
866,1,00-071759,Hayward St,2018-12-01,1,DI,1.0,8.0,IS,GED,has never broken,,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27474,1,00-071704,,1950-01-01,1,CI,71.0,6.0,IS,SHE,has broken,2013-02-01 15:02:00,2013-02-01 15:02:00,2013-02-01 15:02:00
27475,1,00-072287,Ashley St,2009-12-08,1,DI,19.0,10.0,IS,GRA,has broken,,2021-02-05 11:20:00,2021-02-05 11:20:00
27476,1,00-073475,Briarcliff St,1962-03-31,1,CI,43.0,6.0,IS,NEH,has broken,2018-12-11 06:06:00,2018-12-11 06:06:00,2018-12-11 06:06:00
27477,1,00-073478,Pomona Rd,1960-01-01,1,CI,0.0,6.0,IS,WH,has broken,,2021-11-20 06:30:00,2021-11-20 06:30:00
