<h1>Recreating Results of Syracuse Paper</h1>

In [1]:
import sys  
sys.path.insert(1, '../src')

In [32]:
import os
import numpy as np
import pandas as pd

from utils import * 
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score, auc
from sklearn.preprocessing import StandardScaler

In [3]:
pd.options.mode.chained_assignment = None

In [4]:
df = pd.read_csv("../data/transformed/watermain_breaks_train.csv")
val = pd.read_csv("../data/transformed/watermain_breaks_validation.csv")
test = pd.read_csv("../data/transformed/watermain_breaks_test.csv")

<h2>Prepare Data</h2>

2015-2018 is our target period

Outcome: if the pipe will breaks in the next 3 years

In [5]:
#looking from 2015 trying to predict next 3 years
CUTOFF = pd.to_datetime('01-01-2016')
TEST_VAL_CUTOFF = pd.to_datetime('01-01-2019')

In [6]:
df = process_date_cols(df, CUTOFF)

In [7]:
cols = ['PressureSy', 'STATUS', 'MATERIAL', 'SUBTYPE']
df = svm_data_transform_pipeline(df, CUTOFF, cols)

In [8]:
val = process_date_cols(val, TEST_VAL_CUTOFF)
val = svm_data_transform_pipeline(val, TEST_VAL_CUTOFF, cols)


test = process_date_cols(test, TEST_VAL_CUTOFF)
test = svm_data_transform_pipeline(test, TEST_VAL_CUTOFF, cols)

<h3>Handling some differences in dummy variables</h3>

In [9]:
all_cols = list(test.columns) 
all_cols.extend(list(df.columns))
all_cols.extend(list(val.columns))

for col in all_cols: 
    if col not in list(test.columns):
        test[col] = 0
    if col not in list(val.columns):
        val[col] = 0
    if col not in list(df.columns):
        df[col] = 0

<h2>Split</h2>

In [14]:
X_train = df.drop(columns = ['will_break'])
y_train = df['will_break']

X_test = test.drop(columns = ['will_break'])
y_test = test['will_break']

X_val = val.drop(columns = ['will_break'])
y_val = val['will_break']

<h2>Scaling</h2>

In [15]:
numeric_cols = list(X_train.columns)
numeric_cols.remove('installation_year')

In [16]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled = scaler.transform(X_test[numeric_cols])
X_val_scaled = scaler.transform(X_val[numeric_cols])

X_train = np.concatenate([X_train_scaled, np.array(X_train['installation_year'].values).reshape(-1, 1)], axis = 1)
X_test = np.concatenate([X_test_scaled, np.array(X_test['installation_year'].values).reshape(-1, 1)], axis = 1)
X_val = np.concatenate([X_val_scaled, np.array(X_val['installation_year'].values).reshape(-1, 1)], axis = 1)

<h2>Cross Validation</h2>

In [21]:
svc = SVC()
svc.fit(X_train, y_train)

In [22]:
svc.score(X_test, y_test)

0.9908883826879271

In [26]:
pred = svc.predict(X_val)

In [28]:
print(classification_report(pred, y_val))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99       525
           1       0.00      0.00      0.00         0

    accuracy                           0.98       525
   macro avg       0.50      0.49      0.50       525
weighted avg       1.00      0.98      0.99       525



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
f1_score(pred, y_val)

0.0

In [33]:
auc(pred, y_val)

0.0