<h1>Split time series data</h1>

In [1]:
import os 
import random
import numpy as np 
import pandas as pd

In [2]:
# Disable pandas' chained-assignment check (SettingWithCopyWarning) for this session.
pd.set_option("mode.chained_assignment", None)

In [3]:
# Load the transformed water main breaks dataset into the working frame.
df = pd.read_csv(
    "../data/transformed/watermain_breaks_dataset.csv",
)

In [4]:
# Parse both break-timestamp columns into pandas datetimes.
for date_col in ('first_break', 'most_recent_break'):
    df[date_col] = pd.to_datetime(df[date_col])

Inspect the current date range of the break timestamps.

In [5]:
# Latest and earliest first-break timestamps.
df['first_break'].aggregate(['max', 'min'])

max   2023-08-07 14:04:00
min   2013-01-02 08:49:00
Name: first_break, dtype: datetime64[ns]

In [6]:
# Latest and earliest most-recent-break timestamps.
df['most_recent_break'].aggregate(['max', 'min'])

max   2023-08-07 14:04:00
min   2013-01-02 08:49:00
Name: most_recent_break, dtype: datetime64[ns]

<h2>Split Data</h2>

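Facilities are split at random (with a fixed seed) into train, validation and test sets by `FACILITYID`. January 1st, 2019 is used as the cutoff: break dates after it are removed from the training set's `all_breaks` history.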

In [7]:
# Fraction of all rows assigned to the training set.
TRAIN_SIZE = 0.7
# Fraction of the held-out (non-train) rows used for validation; the rest become test.
VALIDATION_SIZE = 0.4
# Break dates later than this timestamp are dropped from the training history.
TRAIN_TEST_CUTOFF = pd.to_datetime("2019-01-01")

# Number of entries taken from the shuffled facility-id list for the training set.
TRAIN_N = int(np.round(len(df) * TRAIN_SIZE))

In [8]:
# One id per row of df, shuffled with a fixed seed so the split is reproducible.
facility_ids = df['FACILITYID'].tolist()
random.Random(42).shuffle(facility_ids)

# The first TRAIN_N shuffled ids go to train; the remainder is held out.
train_facility_ids, test_facility_ids = facility_ids[:TRAIN_N], facility_ids[TRAIN_N:]

In [9]:
# Build the held-out and training partitions by facility id.
# Each partition is taken as an explicit copy: `train` has its `all_breaks`
# column rewritten later on, and assigning into a filtered selection of `df`
# is exactly what pandas' chained-assignment warning flags. Independent copies
# make that later write unambiguous without relying on the warning being off.
test = df[df['FACILITYID'].isin(test_facility_ids)].copy()
train = df[df['FACILITYID'].isin(train_facility_ids)].copy()

In [10]:
# Size of the validation set, as a share of the held-out rows.
VAL_N = int(np.round(len(test) * VALIDATION_SIZE))

# Re-read the held-out ids in frame order and reshuffle them with the same fixed seed.
test_facility_ids = test['FACILITYID'].tolist()
random.Random(42).shuffle(test_facility_ids)

# The leading VAL_N shuffled ids form the validation set.
val_facility_ids = test_facility_ids[:VAL_N]

In [11]:
# Rows of the held-out frame whose facility id was drawn for validation.
in_validation = test['FACILITYID'].isin(val_facility_ids)

# Validation takes the flagged rows; test keeps everything else.
val = test[in_validation]
test = test[~in_validation]

<h3>Process Dates</h3>

In [12]:
def _breaks_through_cutoff(raw_breaks):
    """Return the comma-separated break entries dated on or before TRAIN_TEST_CUTOFF.

    The value is stringified first, then split on ","; each piece is kept
    verbatim (no stripping) when it parses to a timestamp <= the cutoff.
    """
    pieces = str(raw_breaks).split(",")
    return [piece for piece in pieces if pd.to_datetime(piece) <= TRAIN_TEST_CUTOFF]


# Only the training frame is truncated at the cutoff; val and test are left untouched.
# NOTE(review): all_breaks now holds Python lists, so to_csv will write their repr
# for train while val/test keep the raw comma-separated strings — confirm that
# downstream readers expect this difference.
train['all_breaks'] = train['all_breaks'].apply(_breaks_through_cutoff)

In [13]:
# (rows, columns) of the train, validation and test partitions, in that order.
tuple(frame.shape for frame in (train, val, test))

((19235, 14), (3298, 14), (4946, 14))

<h2>Save to files</h2>

In [14]:
# Write the training partition, omitting the row index.
train.to_csv(
    "../data/transformed/watermain_breaks_train.csv",
    index=False,
)

In [15]:
# Write the test partition, omitting the row index.
test.to_csv(
    "../data/transformed/watermain_breaks_test.csv",
    index=False,
)

In [16]:
# Write the validation partition, omitting the row index.
val.to_csv(
    "../data/transformed/watermain_breaks_validation.csv",
    index=False,
)