<h1>Split time series data</h1>

In [1]:
import os 
import random
import numpy as np 
import pandas as pd

In [2]:
# Read the full water-main dataset from the transformed-data folder.
df = pd.read_csv(filepath_or_buffer="../data/transformed/watermain_breaks_dataset.csv")

In [3]:
# Convert both break-timestamp columns to pandas datetimes so the later cells
# can use `.dt` accessors and compare them against a cutoff date.
for date_col in ('first_break', 'most_recent_break'):
    df[date_col] = pd.to_datetime(df[date_col])

Check the date range covered by the break timestamps

In [4]:
# Latest and earliest `first_break` timestamp in the dataset.
df.loc[:, 'first_break'].aggregate(['max', 'min'])

max   2023-08-07 14:04:00
min   2013-01-02 08:49:00
Name: first_break, dtype: datetime64[ns]

In [5]:
# Latest and earliest `most_recent_break` timestamp in the dataset.
df.loc[:, 'most_recent_break'].aggregate(['max', 'min'])

max   2023-08-07 14:04:00
min   2013-01-02 08:49:00
Name: most_recent_break, dtype: datetime64[ns]

In [6]:
# Row count per calendar year of the first break (value_counts sorts by count,
# largest first; rows without a first break are not counted).
first_break_year = df['first_break'].dt.year
first_break_year.value_counts()

first_break
2014.0    89
2013.0    80
2015.0    78
2019.0    68
2016.0    65
2022.0    64
2017.0    61
2018.0    58
2021.0    49
2020.0    47
2023.0    29
Name: count, dtype: int64

<h2>Split Data</h2>

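Break records are split by time, using January 1st, 2019 as the cutoff: first breaks on or before that date go to the training set, later ones go to the test and validation sets.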

In [7]:
# A row counts as a "break" row when its `first_break` timestamp is present.
has_first_break = df['first_break'].notna()
breaks = df[has_first_break]
no_breaks = df[~has_first_break]

In [8]:
#how much of the total data we want in train set
TRAIN_SIZE = 0.7
TEST_N = int(np.round(breaks.shape[0] * TRAIN_SIZE))

In [9]:
facility_ids = breaks['FACILITYID'].to_list()
#setting random state
random.Random(42).shuffle(facility_ids)

In [10]:
train_facility_ids = facility_ids[:TEST_N]
test_facility_ids = facility_ids[TEST_N:]

<h3>Train Set</h3>

In [11]:
# First breaks up to and including this instant belong to the train set.
train_test_cutoff = pd.Timestamp('2019-01-01')

In [12]:
# Break rows whose first break happened on or before the cutoff.
breaks_train = breaks.loc[breaks['first_break'].le(train_test_cutoff)]
# Rows of the full dataset whose FACILITYID is in the train ID list.
no_breaks_train = df.loc[df['FACILITYID'].isin(train_facility_ids)]
# Stack both parts (original row index is kept as-is).
train = pd.concat((breaks_train, no_breaks_train))

<h3>Test & Validation Set</h3>

In [13]:
# Break rows whose first break happened strictly after the cutoff.
breaks_test = breaks.loc[breaks['first_break'].gt(train_test_cutoff)]
# Rows of the full dataset whose FACILITYID is in the test ID list.
no_breaks_test = df.loc[df['FACILITYID'].isin(test_facility_ids)]
# Stack both parts (original row index is kept as-is).
test = pd.concat((breaks_test, no_breaks_test))

In [14]:
#how much of the test set we want to use for validation
VALIDATION_SIZE = 0.4
VALIDATION_N = int(np.round(test.shape[0] * VALIDATION_SIZE))

In [15]:
val = test[:VALIDATION_N]
test = test[VALIDATION_N:]

<h2>Save to files</h2>

In [16]:
# Write the training split to disk; the DataFrame index is not exported.
train.to_csv(path_or_buf="../data/transformed/watermain_breaks_train.csv", index=False)

In [17]:
# Write the test split to disk; the DataFrame index is not exported.
test.to_csv(path_or_buf="../data/transformed/watermain_breaks_test.csv", index=False)

In [18]:
# Write the validation split to disk; the DataFrame index is not exported.
val.to_csv(path_or_buf="../data/transformed/watermain_breaks_validation.csv", index=False)