<h1>Recreating Results of Syracuse Paper</h1>

In [18]:
import os
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("../data/transformed/watermain_breaks_train.csv")
val = pd.read_csv("../data/transformed/watermain_breaks_validation.csv")
test = pd.read_csv("../data/transformed/watermain_breaks_test.csv")

<h2>Feature Engineering</h2>

2015-2018 is our target period

Outcome: if the pipe will breaks in the next 3 years

<h3>Format Dates</h3>

In [3]:
#looking from 2015 trying to predict next 3 years
CUTOFF = pd.to_datetime('01-01-2016')

In [4]:
df['first_break'] = pd.to_datetime(df['first_break'])
df['most_recent_break'] = pd.to_datetime(df['most_recent_break'])
df['INSTALLDAT'] = pd.to_datetime(df['INSTALLDAT'])

In [5]:
df['all_breaks'] = df['all_breaks'].astype(str).apply(lambda s: s.split(","))
df['all_breaks'] = df['all_breaks'].apply(lambda s: [pd.to_datetime(t) for t in s])

Remove any information from after the prediction cutoff

In [6]:
df['breaks_before_cutoff'] = df['all_breaks'].apply(lambda s: [t for t in s if t <= CUTOFF])
df['breaks_after_cutoff'] = df['all_breaks'].apply(lambda s: [t for t in s if t > CUTOFF])

In [20]:
df['first_break'] = df['first_break'].apply(lambda s: np.where(s <= CUTOFF, s, pd.NaT))
df['most_recent_break'] = df['most_recent_break'].apply(lambda s: np.where(s <= CUTOFF, s, pd.NaT))

In [7]:
df['will_break'] = (df['breaks_after_cutoff'].apply(len) > 0).astype(int)

In [8]:
df['will_break'].mean()

0.006550558877047049

<h3>Add Features & Transform Categoricals</h3>

In [9]:
df['installation_year'] = df['INSTALLDAT'].dt.year

In [10]:
df['n_previous_breaks'] = df['breaks_before_cutoff'].apply(len)

In [11]:
pressure_dummies = pd.get_dummies(df['PressureSy'], drop_first = True)
df = pd.concat([df, pressure_dummies], axis = 1)

In [12]:
status_dummies = pd.get_dummies(df['STATUS'], drop_first = True)
df = pd.concat([df, status_dummies], axis = 1)

In [13]:
#subtype is really a categorical 
df['SUBTYPE'] = df['SUBTYPE'].map({1: 'Distribution Main', 2: 'Transmission Main', 3: 'Hydrant Lead', 4: 'Raw Water'})
subtype_dummies = pd.get_dummies(df['SUBTYPE'], drop_first = True)
df = pd.concat([df, subtype_dummies], axis = 1)

In [15]:
df

Unnamed: 0,ENABLED,FACILITYID,LOCATION,INSTALLDAT,SUBTYPE,MATERIAL,LENGTH,DIAMETER,STATUS,PressureSy,...,OTH,SCIO,SEH,SHE,WH,WNPRZ,IS,Hydrant Lead,Raw Water,Transmission Main
0,1,00-77046,Exmoor Rd,2023-10-12,Hydrant Lead,DI,7.0,6.0,IS,GED,...,False,False,False,False,False,False,True,True,False,False
1,1,00-77049,Newcastle Rd,2023-10-05,Hydrant Lead,DI,9.0,6.0,IS,GED,...,False,False,False,False,False,False,True,True,False,False
2,1,00-76773,Melrose Ave & Tuomy Rd,2023-08-15,Hydrant Lead,DI,20.0,6.0,IS,GED,...,False,False,False,False,False,False,True,True,False,False
3,1,00-77041,Olivia Ave,2023-07-18,Hydrant Lead,DI,13.0,6.0,IS,GRA,...,False,False,False,False,False,False,True,True,False,False
4,1,00-77008,E Washington St,2023-05-01,Distribution Main,DI,21.0,12.0,IS,GRA,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19230,1,00-071704,,1950-01-01,Distribution Main,CI,71.0,6.0,IS,SHE,...,False,False,False,True,False,False,True,False,False,False
19231,1,00-072287,Ashley St,2009-12-08,Distribution Main,DI,19.0,10.0,IS,GRA,...,False,False,False,False,False,False,True,False,False,False
19232,1,00-073475,Briarcliff St,1962-03-31,Distribution Main,CI,43.0,6.0,IS,NEH,...,False,False,False,False,False,False,True,False,False,False
19233,1,00-073478,Pomona Rd,1960-01-01,Distribution Main,CI,0.0,6.0,IS,WH,...,False,False,False,False,True,False,True,False,False,False


<h3>Drop Extra Columns</h3>

In [None]:
to_drop = ['ENABLED', 'PressureSy', 'STATUS', 'INSTALLDAT', 'SUBTYPE']

df = df.drop(columns = to_drop)