<h1>Split time series data</h1>

In [1]:
import os 
import random
import numpy as np 
import pandas as pd

In [2]:
df = pd.read_csv("../data/transformed/watermain_breaks_dataset.csv")

In [3]:
df['first_break'] = pd.to_datetime(df['first_break'])
df['most_recent_break'] = pd.to_datetime(df['most_recent_break'])

See current date range

In [4]:
df['first_break'].agg(['max', 'min'])

max   2023-08-07 14:04:00
min   2013-01-02 08:49:00
Name: first_break, dtype: datetime64[ns]

In [5]:
df['most_recent_break'].agg(['max', 'min'])

max   2023-08-07 14:04:00
min   2013-01-02 08:49:00
Name: most_recent_break, dtype: datetime64[ns]

<h2>Split Data</h2>

Using January 1st, 2019 as cutoff

In [6]:
#how much of the total data we want in train set
TRAIN_SIZE = 0.7
VALIDATION_SIZE = 0.4
TRAIN_TEST_CUTOFF = pd.to_datetime('2019-01-01')

TRAIN_N = int(np.round(df.shape[0] * TRAIN_SIZE))

In [7]:
facility_ids = df['FACILITYID'].to_list()

random.Random(42).shuffle(facility_ids)

train_facility_ids = facility_ids[:TRAIN_N]
test_facility_ids = facility_ids[TRAIN_N:]

In [8]:
test = df[df['FACILITYID'].isin(test_facility_ids)]
train = df[df['FACILITYID'].isin(train_facility_ids)]

In [9]:
VAL_N = int(np.round(test.shape[0] * VALIDATION_SIZE))
test_facility_ids = test['FACILITYID'].to_list()
random.Random(42).shuffle(test_facility_ids)

val_facility_ids = test_facility_ids[:VAL_N]

In [10]:
val = test[test['FACILITYID'].isin(val_facility_ids)]
test = test[~test['FACILITYID'].isin(val_facility_ids)]

<h3>Process Dates</h3>

In [11]:
train['all_breaks'] = train['all_breaks'].apply(str)
train['all_breaks'] = train['all_breaks'].apply(lambda s: [t for t in s.split(",") if pd.to_datetime(t) <= TRAIN_TEST_CUTOFF])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['all_breaks'] = train['all_breaks'].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['all_breaks'] = train['all_breaks'].apply(lambda s: [t for t in s.split(",") if pd.to_datetime(t) <= TRAIN_TEST_CUTOFF])


In [12]:
train

Unnamed: 0,ENABLED,FACILITYID,LOCATION,INSTALLDAT,SUBTYPE,MATERIAL,LENGTH,DIAMETER,STATUS,PressureSy,break_status,all_breaks,first_break,most_recent_break
0,1,00-77046,Exmoor Rd,2023-10-12,3,DI,7.0,6.0,IS,GED,has never broken,[],NaT,NaT
1,1,00-77049,Newcastle Rd,2023-10-05,3,DI,9.0,6.0,IS,GED,has never broken,[],NaT,NaT
2,1,00-76773,Melrose Ave & Tuomy Rd,2023-08-15,3,DI,20.0,6.0,IS,GED,has never broken,[],NaT,NaT
4,1,00-77041,Olivia Ave,2023-07-18,3,DI,13.0,6.0,IS,GRA,has never broken,[],NaT,NaT
7,1,00-77008,E Washington St,2023-05-01,1,DI,21.0,12.0,IS,GRA,has never broken,[],NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27474,1,00-071704,,1950-01-01,1,CI,71.0,6.0,IS,SHE,has broken,[2013-02-01 15:02:00],2013-02-01 15:02:00,2013-02-01 15:02:00
27475,1,00-072287,Ashley St,2009-12-08,1,DI,19.0,10.0,IS,GRA,has broken,[],2021-02-05 11:20:00,2021-02-05 11:20:00
27476,1,00-073475,Briarcliff St,1962-03-31,1,CI,43.0,6.0,IS,NEH,has broken,[2018-12-11 06:06:00],2018-12-11 06:06:00,2018-12-11 06:06:00
27477,1,00-073478,Pomona Rd,1960-01-01,1,CI,0.0,6.0,IS,WH,has broken,[],2021-11-20 06:30:00,2021-11-20 06:30:00


In [None]:
no_breaks = df[pd.isnull(df['first_break'])]
breaks = df[~pd.isnull(df['first_break'])]

In [None]:
TEST_N = int(np.round(no_breaks.shape[0] * TRAIN_SIZE))

no_breaks_facility_ids = no_breaks['FACILITYID'].to_list()
#setting random state
random.Random(42).shuffle(no_breaks_facility_ids)

train_no_breaks_facility_ids = no_breaks_facility_ids[:TEST_N]
test_no_breaks_facility_ids = no_breaks_facility_ids[TEST_N:]

In [None]:
TEST_N = int(np.round(breaks.shape[0] * TRAIN_SIZE))

breaks_facility_ids = breaks['FACILITYID'].to_list()
#setting random state
random.Random(42).shuffle(breaks_facility_ids)

train_breaks_facility_ids = breaks_facility_ids[:TEST_N]
test_breaks_facility_ids = breaks_facility_ids[TEST_N:]

<h3>Train Set</h3>

In [None]:
breaks_train = breaks[breaks['first_break'].isin(train_breaks_facility_ids)]
no_breaks_train = df[df['FACILITYID'].isin(train_no_breaks_facility_ids)]
train = pd.concat([breaks_train, no_breaks_train])

In [None]:
s = train['all_breaks'].apply(prune_breaks)

In [None]:
train['all_breaks'] = train['all_breaks'].astype(str).apply(lambda s: s.split(","))
train['all_breaks'] = train['all_breaks'].apply(lambda s: [t for t in s if pd.to_datetime(t) <= train_test_cutoff])
train['all_breaks'] = train['all_breaks'].apply(lambda s: ",".join(s))

<h3>Test & Validation Set</h3>

In [None]:
breaks_test = breaks[breaks['first_break'].isin(test_breaks_facility_ids)]
no_breaks_test = df[df['FACILITYID'].isin(test_no_breaks_facility_ids)]
test = pd.concat([breaks_test, no_breaks_test])

In [None]:
#how much of the test set we want to use for validation
VALIDATION_SIZE = 0.4
VALIDATION_N = int(np.round(test.shape[0] * VALIDATION_SIZE))

In [None]:
val = test[:VALIDATION_N]
test = test[VALIDATION_N:]

<h2>Save to files</h2>

In [None]:
train.to_csv("../data/transformed/watermain_breaks_train.csv", index = False)

In [None]:
test.to_csv("../data/transformed/watermain_breaks_test.csv", index = False)

In [None]:
val.to_csv("../data/transformed/watermain_breaks_validation.csv", index = False)