In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported local
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import join_data as jd

In [None]:
from helpers import *

# Load and join the public data

In [None]:
# Run the join_data pipeline on the four dated CSV exports stored under ../data/.
mergeddfpublic = jd.pipeline(
    directory='../data/',
    FSfilename='FSR_221022.csv',
    FIfilename='FI_221022.csv',
    FWOfilename='FWO_221022.csv',
    FRAfilename='FRA_221024.csv',
)

In [None]:
# Display the column labels of the merged frame (inspection only; nothing is stored).
mergeddfpublic.columns

# Calculate work order delays

First, combine multiple reports of the same incident so that there is one row per unique incident. If an incident has multiple inspections or work orders, take the date of the first inspection/work order.

In [None]:
# Collapse the merged frame to one row per IncidentGlobalID.
# Date columns keep their earliest value ('min'); descriptive columns keep the
# first value that groupby's 'first' returns for the incident.
nodups = (
    mergeddfpublic[
        [
            'IncidentGlobalID',
            'SRCategory',
            'SRCreatedDate',
            'InspectionDate',
            'WOClosedDate',
            'Risk_coded',
            'RiskRating',
            'Borough',
            'InspectionTPCondition',
            'InspectionTPStructure',
            'TreePointDBH',
            'ActualFinishDate',
        ]
    ]
    .groupby('IncidentGlobalID')
    .agg(
        {
            'SRCreatedDate': 'min',
            'InspectionDate': 'min',
            'WOClosedDate': 'min',
            'SRCategory': 'first',
            'Risk_coded': 'first',
            'Borough': 'first',
            'RiskRating': 'first',
            'InspectionTPCondition': 'first',
            'InspectionTPStructure': 'first',
            'TreePointDBH': 'first',
            'ActualFinishDate': 'min',
        }
    )
    .reset_index()
)

In [None]:
# Name of the column used as the work-order finish time when work_delay is computed.
# The trailing comment records the alternative column that could be used instead.
wofinishdatecolumn = 'ActualFinishDate' # 'WOClosedDate'

In [None]:
# Inspection delay: days elapsed between the service request and the first inspection.
# Seconds are converted to days by dividing by 3600 and then by 24.
nodups['inspection_delay'] = (
    nodups['InspectionDate']
    .sub(nodups['SRCreatedDate'])
    .dt.total_seconds()
    .div(3600)
    .div(24)
)

# Work delay: days elapsed between the first inspection and the work-order finish
# column selected by `wofinishdatecolumn`.
nodups['work_delay'] = (
    nodups[wofinishdatecolumn]
    .sub(nodups['InspectionDate'])
    .dt.total_seconds()
    .div(3600)
    .div(24)
)

In [None]:
# nodups[['SRCreatedDate','InspectionDate', 'WOClosedDate']].describe(datetime_is_numeric=True)

# Work order delays for entire dataset

In [None]:
# Median work delay (days) for every Risk_coded x Borough combination.
(
    nodups
    .groupby(['Risk_coded', 'Borough'])['work_delay']
    .median()
)

In [None]:
# Median work delay (days) for every SRCategory x Borough combination.
(
    nodups
    .groupby(['SRCategory', 'Borough'])['work_delay']
    .median()
)

# If an incident was inspected but has no work order, treat it as having a very long work order delay (10,000 days)

In [None]:
# New frame in which missing work delays are replaced by a 10000-day sentinel,
# so incidents without a finished work order count as extremely delayed.
nodups_imputed = nodups.assign(work_delay=nodups['work_delay'].fillna(10000))

In [None]:
# Keep only incidents that were inspected, i.e. rows with a non-missing inspection_delay.
nodups_imputed = nodups_imputed[nodups_imputed['inspection_delay'].notna()]

In [None]:
# Median work delay (days, with the imputed sentinel) per Risk_coded x Borough.
(
    nodups_imputed
    .groupby(['Risk_coded', 'Borough'])['work_delay']
    .median()
)

In [None]:
# Median work delay (days, with the imputed sentinel) per SRCategory x Borough.
(
    nodups_imputed
    .groupby(['SRCategory', 'Borough'])['work_delay']
    .median()
)

# Only look at 2017 - 2020 like in the paper, and other filtering (e.g., where reports are defined, subset of categories)

In [None]:
# Two filters applied in sequence:
#   1. keep only the service-request categories analysed in the paper;
#   2. drop rows missing any of the listed fields -- when these are NA there was
#      no data to calculate the reporting delay.
nodups = (
    nodups
    .loc[
        nodups['SRCategory'].isin(
            ['Hazard', 'Remove Tree', 'Root/Sewer/Sidewalk', 'Prune', 'Illegal Tree Damage']
        )
    ]
    .dropna(
        subset=[
            'RiskRating',
            'Borough',
            'SRCategory',
            'Risk_coded',
            'InspectionTPCondition',
            'TreePointDBH',
        ]
    )
)

In [None]:
# Incidents whose service request was created on or after 2017-06-30 and before 2020-07-01.
nodups_rightdate = nodups.loc[
    (nodups['SRCreatedDate'] >= '2017-06-30')
    & (nodups['SRCreatedDate'] < '2020-07-01')
]

# Same window, with missing work delays replaced by the 10000-day sentinel and
# restricted to inspected incidents (non-missing inspection_delay).
nodups_right_dateimputed = (
    nodups_rightdate
    .assign(work_delay=nodups_rightdate['work_delay'].fillna(10000))
    .dropna(subset=['inspection_delay'])
)

In [None]:
# Without imputation: median work delay per Risk_coded x Borough among inspected
# incidents in the date window.
(
    nodups_rightdate
    .dropna(subset=['inspection_delay'])
    .groupby(['Risk_coded', 'Borough'])['work_delay']
    .median()
)

In [None]:
# With imputation: median work delay per Risk_coded x Borough in the date window.
(
    nodups_right_dateimputed
    .groupby(['Risk_coded', 'Borough'])['work_delay']
    .median()
)

## Just split by year

In [None]:
# Display-only check: boolean mask marking which rows of nodups fall inside the
# 2017-06-30 (inclusive) to 2020-07-01 (exclusive) window. The result is not stored.
(nodups['SRCreatedDate'] >= '2017-06-30') & (nodups['SRCreatedDate'] < '2020-07-01')

In [None]:
# Print the median (imputed) work delay per Risk_coded x Borough for each calendar
# year of SRCreatedDate. The frame was already limited to reports created from
# 2017-06-30 up to (not including) 2020-07-01, so 2017 and 2020 are partial years.
for year in range(2017, 2021):
    print(year)
    # Half-open interval [Jan 1 of `year`, Jan 1 of `year + 1`). An upper bound of
    # "<= '{year}-12-31'" is compared as midnight at the START of Dec 31, which
    # silently drops every report created later on that day.
    yearquery = f"SRCreatedDate >= '{year}-01-01' and SRCreatedDate < '{year + 1}-01-01'"
    print(nodups_right_dateimputed.query(yearquery).groupby(['Risk_coded', 'Borough'])['work_delay'].median())

In [None]:
# Plot the median (imputed) work delay for Risk code "A" incidents, one line per
# Borough, by calendar year of the service request.
#
# The frame has no 'year' column (it is built from an explicit column list that
# does not include one), so grouping by 'year' raised a KeyError. Derive it from
# SRCreatedDate before grouping.
risk_a_median_by_year = (
    nodups_right_dateimputed
    .query('Risk_coded == "A"')
    .assign(year=lambda frame: frame['SRCreatedDate'].dt.year)
    .groupby(['year', 'Borough'])['work_delay']
    .median()
    .reset_index()
)
sns.lineplot(x='year', y='work_delay', hue='Borough', data=risk_a_median_by_year)