In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Automatically reload imported modules before each cell runs, so edits to
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import join_data as jd

In [None]:
from helpers import *

In [None]:
# Prefix for saved plot filenames: interpolated into the savefig path below
# and passed as `label` to plot_bar_by_type.
filesavelabel = 'reportfeaturesonly_'

# Load and join the public data

In [None]:
# Run the join_data pipeline on the four dated CSV exports in ../data/.
mergeddf = jd.pipeline(
    directory='../data/',
    FSfilename='FSR_221022.csv',
    FIfilename='FI_221022.csv',
    FWOfilename='FWO_221022.csv',
    FRAfilename='FRA_221024.csv',
)

### Load + join model estimates for reporting delays

In [None]:
# CSV of model-projected reporting delays; read with pd.read_csv in the next cell.
predicted_delays_filename = './data_est_report_delays/reports_delay_20230605_projected.csv'

In [None]:
# Read the model-predicted reporting delays and rename the columns that are
# later used for merging (Delay -> reporting_delay, BoroughCode -> Borough).
delay_column_names = {'Delay': 'reporting_delay', 'BoroughCode': 'Borough'}
predicted_report_delays = pd.read_csv(predicted_delays_filename).rename(columns=delay_column_names)
predicted_report_delays.head()

In [None]:
# Distinct (Borough, SRCategory, reporting_delay) combinations in the predictions.
(
    predicted_report_delays
    .loc[:, ['Borough', 'SRCategory', 'reporting_delay']]
    .drop_duplicates()
)

## Merge report delays with rest of public data

In [None]:
# Attach the estimated reporting delay to every row, keyed on (Borough, SRCategory).
merge_keys = ['Borough', 'SRCategory']
delay_lookup = predicted_report_delays[merge_keys + ['reporting_delay']].drop_duplicates()
mergeddf = pd.merge(
    # Drop any reporting_delay left over from a previous run of this cell;
    # otherwise a re-run produces reporting_delay_x / reporting_delay_y and
    # the cells below can no longer find `reporting_delay`.
    mergeddf.drop(columns=['reporting_delay'], errors='ignore'),
    delay_lookup,
    on=merge_keys,
    how='left',
    # Fail loudly if a (Borough, SRCategory) pair maps to more than one delay,
    # which would otherwise silently duplicate rows of the public data.
    validate='many_to_one',
)

In [None]:
# Non-null counts of incident IDs and reporting delays per category; shows
# which categories received a delay from the merge.
(
    mergeddf
    .groupby('SRCategory')[['IncidentGlobalID', 'reporting_delay']]
    .count()
)

In [None]:
# Keep only rows that received a reporting delay from the merge, i.e. the
# main categories; every other category is dropped.
has_reporting_delay = mergeddf['reporting_delay'].notna()
mergeddf = mergeddf[has_reporting_delay]

# Delay analysis final data preparation

First, combine multiple reports of the same incident so that there is one row per unique incident. The earliest report, inspection, and work-order-closed dates are taken as the incident's dates.

In [None]:
# Collapse multiple reports into one row per incident: the earliest of each
# date column and the first value of each descriptive column.
incident_columns = [
    'IncidentGlobalID', 'SRCategory', 'SRCreatedDate', 'InspectionDate',
    'WOClosedDate', 'Risk_coded', 'Borough', 'reporting_delay',
]
per_incident_agg = {
    'SRCreatedDate': 'min',
    'InspectionDate': 'min',
    'WOClosedDate': 'min',
    'SRCategory': 'first',
    'Risk_coded': 'first',
    'Borough': 'first',
    'reporting_delay': 'first',
}
nodups = (
    mergeddf[incident_columns]
    .groupby('IncidentGlobalID')
    .agg(per_incident_agg)
    .reset_index()
)

# Elapsed time between stages, converted from seconds to days.
report_to_inspection = nodups['InspectionDate'] - nodups['SRCreatedDate']
inspection_to_work_closed = nodups['WOClosedDate'] - nodups['InspectionDate']
nodups['inspection_delay'] = report_to_inspection.dt.total_seconds() / 3600 / 24
nodups['work_delay'] = inspection_to_work_closed.dt.total_seconds() / 3600 / 24

In [None]:
# Restrict to incidents reported in the window [2017-06-30, 2020-07-01).
# NOTE(review): the window opens on 2017-06-30 rather than 2017-07-01 — confirm this is intended.
on_or_after_start = nodups['SRCreatedDate'] >= '2017-06-30'
before_end = nodups['SRCreatedDate'] < '2020-07-01'
nodups_rightdate = nodups[on_or_after_start & before_end]

In [None]:
# Summary statistics of the three date columns, to sanity-check the date window.
# NOTE(review): `datetime_is_numeric` was removed in pandas 2.0 (datetimes are
# always summarised numerically there); drop the argument when upgrading pandas.
nodups_rightdate[['SRCreatedDate','InspectionDate', 'WOClosedDate']].describe(datetime_is_numeric=True)

In [None]:
# Histogram of work delays for Bronx incidents in the Hazard category.
is_bronx = nodups_rightdate['Borough'] == "Bronx"
is_hazard = nodups_rightdate['SRCategory'] == "Hazard"
nodups_rightdate[is_bronx & is_hazard]['work_delay'].hist()

In [None]:
# Median work delay per category and borough.
(
    nodups_rightdate
    .groupby(['SRCategory', 'Borough'])['work_delay']
    .median()
)

In [None]:
# Number of incidents with a non-null work delay per risk code and borough.
(
    nodups_rightdate
    .groupby(['Risk_coded', 'Borough'])['work_delay']
    .count()
)

In [None]:
# Number of incidents with a non-null work delay per category and borough.
(
    nodups_rightdate
    .groupby(['SRCategory', 'Borough'])['work_delay']
    .count()
)

In [None]:
# Median work delay per risk code and borough.
(
    nodups_rightdate
    .groupby(['Risk_coded', 'Borough'])['work_delay']
    .median()
)

In [None]:
# Non-null count of every column in the date-filtered, one-row-per-incident table.
nodups_rightdate.count()

In [None]:
# Median of each delay per category and borough.
# The columns are selected with a list: tuple-style selection after groupby
# (['a', 'b', 'c'] without the inner brackets) is deprecated in pandas 1.x and
# raises in pandas 2.0.
nodups_rightdate.groupby(['SRCategory', 'Borough'])[['reporting_delay', 'inspection_delay', 'work_delay']].median()

In [None]:
# Fraction of incidents with a non-missing value for each delay, per category
# and borough (1 minus the share of NaNs).
# The columns are selected with a list: tuple-style selection after groupby is
# deprecated in pandas 1.x and raises in pandas 2.0.
nodups_rightdate.groupby(['SRCategory', 'Borough'])[['reporting_delay', 'inspection_delay', 'work_delay']].agg(lambda x: 1 - np.mean(np.isnan(x)))

In [None]:
# Fraction of incidents with a non-missing value for each delay, per category
# and borough, as a flat table (group keys become ordinary columns).
# The columns are selected with a list: tuple-style selection after groupby is
# deprecated in pandas 1.x and raises in pandas 2.0.
addressed = (
    nodups_rightdate
    .groupby(['SRCategory', 'Borough'])[['reporting_delay', 'inspection_delay', 'work_delay']]
    .agg(lambda x: 1 - np.mean(np.isnan(x)))
    .reset_index()
)
addressed.head()

In [None]:
# Reshape to long format: the three delay columns become rows, one row per
# (SRCategory, Borough, delay type).
addressed = addressed.melt(
    id_vars=['SRCategory', 'Borough'],
    value_vars=['reporting_delay', 'inspection_delay', 'work_delay'],
    var_name='delay_type',
    value_name='percent_addressed',
)

# Plotting

## What fraction of incidents are actually addressed

In [None]:
# Give the columns and action values the display names used on the plot.
addressed = addressed.rename({'percent_addressed': 'Fraction addressed', 'delay_type': 'Action'}, axis=1)
# regex=False makes these literal replacements regardless of the installed
# pandas version's default for `regex`.
addressed.loc[:, 'Action'] = (
    addressed.loc[:, 'Action']
    .str.replace('inspection_delay', 'Inspection', regex=False)
    .str.replace('work_delay', 'Work order', regex=False)
)

# Bar chart for the Hazard category only: fraction of incidents inspected and
# with a work order, per borough. Reporting-delay rows are excluded.
plot = sns.catplot(
    data=addressed.query('SRCategory == "Hazard" and Action!="reporting_delay"'), kind="bar",
    x="Borough", y="Fraction addressed", hue="Action",
    errorbar="sd", palette=['skyblue', 'green'], legend_out=False,
    order=['Manhattan', 'Queens', 'Staten Island', 'Bronx', 'Brooklyn']
)

# Use the public FacetGrid.legend property rather than the private `_legend`
# attribute; it is None when no legend was drawn.
legend = plot.legend
if legend is not None:
    legend.set_frame_on(False)
plt.ylim(0, 1.1)
plt.xlabel(None)
plt.savefig(f'plots/{filesavelabel}hazard_fractionaddressed.pdf', bbox_inches='tight')

## Delays conditional on addressed

In [None]:
# Plot delays by category and borough without imputing missing work orders;
# the file-save prefix is passed through as the label.
plot_bar_by_type(
    nodups_rightdate,
    typecol='SRCategory',
    othergroupby='Borough',
    impute_missing_work_order=False,
    label=f'{filesavelabel}',
)

In [None]:
# nodups_rightdate = nodups_rightdate.sort_values(by = 'Risk_coded', ascending = True)

In [None]:
# plot_bar_by_type(nodups_rightdate.dropna(subset = ['inspection_delay']), typecol = 'Risk_coded', othergroupby = 'Borough', impute_missing_work_order = False)

In [None]:
# plot_bar_by_type(nodups_rightdate.dropna(subset = ['inspection_delay']), typecol = 'Risk_coded', othergroupby = 'Borough', do_inspection_correction = False, do_work_delay = True, impute_missing_work_order = True)