In [123]:
import pandas as pd
import numpy as np

In [2]:
# read the data
df = pd.read_csv('Incidents_Responded_to_by_Fire_Companies.csv', low_memory=False)

In [3]:
# get the column descriptions
df.columns

Index(['IM_INCIDENT_KEY', 'FIRE_BOX', 'INCIDENT_TYPE_DESC',
       'INCIDENT_DATE_TIME', 'ARRIVAL_DATE_TIME', 'UNITS_ONSCENE',
       'LAST_UNIT_CLEARED_DATE_TIME', 'HIGHEST_LEVEL_DESC',
       'TOTAL_INCIDENT_DURATION', 'ACTION_TAKEN1_DESC', 'ACTION_TAKEN2_DESC',
       'ACTION_TAKEN3_DESC', 'PROPERTY_USE_DESC', 'STREET_HIGHWAY', 'ZIP_CODE',
       'BOROUGH_DESC', 'FLOOR', 'CO_DETECTOR_PRESENT_DESC',
       'FIRE_ORIGIN_BELOW_GRADE_FLAG', 'STORY_FIRE_ORIGIN_COUNT',
       'FIRE_SPREAD_DESC', 'DETECTOR_PRESENCE_DESC', 'AES_PRESENCE_DESC',
       'STANDPIPE_SYS_PRESENT_FLAG'],
      dtype='object')

In [4]:
# Q1: what is the most common type of incident?
# check the column incident type description to get the most common type
df.INCIDENT_TYPE_DESC.describe()

count                               2277779
unique                                  182
top       300 - Rescue, EMS incident, other
freq                                 823378
Name: INCIDENT_TYPE_DESC, dtype: object

In [481]:
# Q1: What proportion of FDNY responses in this dataset correspond to the most common type of incident?
# double checking to make sure that the counts are correct:
sum(df.INCIDENT_TYPE_DESC==df.INCIDENT_TYPE_DESC[2])
df.INCIDENT_TYPE_DESC.size
823378/2277779

0.3614828304238471

In [None]:
# Q2: How many times more likely is an incident in Staten Island a false call compared to in Manhattan?
# Definition of a false call: A false call is an incident for which 'INCIDENT_TYPE_DESC' is 
# '710 - Malicious, mischievous false call, other'.
# 1. separate by location, use borough column
boroughs = df.BOROUGH_DESC.unique()
# staten mask
staten = df.BOROUGH_DESC == boroughs[4]
# manhattan mask
manhattan = df.BOROUGH_DESC == boroughs[1]
# use the masks to count '710 - Malicious, mischievous false call, other' for each location
sfalse = sum(df[staten].INCIDENT_TYPE_DESC == '710 - Malicious, mischievous false call, other')
mfalse = sum(df[manhattan].INCIDENT_TYPE_DESC == '710 - Malicious, mischievous false call, other')
sfalse/mfalse

In [None]:
# Q3.1: Compute what proportion of all incidents are cooking fires for every hour of the day by normalizing the 
# number of cooking fires in a given hour by the total number of incidents that occured in that hour.
# Definition of a cooking fire: 'INCIDENT_TYPE_DESC' is '113 - Cooking fire, confined to container'
# Note: round incident times down.
# 1. cooking fire mask
cooking = df.INCIDENT_TYPE_DESC == '113 - Cooking fire, confined to container'

In [None]:
# 2. get # of incidents for every hour
# convert str to datetime
ts = pd.to_datetime(df.INCIDENT_DATE_TIME, format='%m/%d/%Y %I:%M:%S %p')

In [None]:
# round down to the nearest hour
ts = ts.dt.floor('H')
# count total number of incidents for every hour (index is the date time)
tscounts = ts.value_counts()
# get all of the unique date hours
tsu = ts.unique()
# tscounts[tscounts.index==ts[0]]

In [None]:
# 3. get # of cooking fire incidents for every hour
# get date and time for all cooking fires
ctime = df[cooking].INCIDENT_DATE_TIME
# convert str to datetime
ctime = pd.to_datetime(ctime, format='%m/%d/%Y %I:%M:%S %p')
# round down to the nearest hour
ctime = ctime.dt.floor('H')
# count total number of incidents for every hour (index is the date time)
ccount = ctime.value_counts()

In [None]:
# normalize cooking fire by the total number of incidents
# merge by index (date time, common to both)
tscounts = tscounts.rename('total_count')
ccount = ccount.rename('cooking_count')

In [None]:
# join into a dataframe
incidents = pd.DataFrame(tscounts).join(ccount)
# nans to 0
incidents = incidents.fillna(value=0)

In [None]:
# Q3.2: Find the hour of the day that has the highest proportion of cooking fires and submit that proportion of 
# cooking fires.
ratios = incidents.cooking_count/incidents.total_count
ratios.max()

In [37]:
# Q4: What is the ratio of the average number of units that arrive to a scene of an incident classified as
# '111 - Building fire' to the number that arrive for '651 - Smoke scare, odor of smoke'?

# 1. get the average number incidents '111 - Building fire'
bfavg = (df.INCIDENT_TYPE_DESC == '111 - Building fire').sum()/df.shape[0]
# 2. get the average number of incidents '651 - Smoke scare, odor of smoke'
ssavg = (df.INCIDENT_TYPE_DESC == '651 - Smoke scare, odor of smoke').sum()/df.shape[0]
# 3. get the ratio of building fire avg to smoke scare avg
bfavg/ssavg

0.08789046762106846

In [30]:
# Q5.1: Check the distribution of the number of minutes it takes between the time a '111 - Building fire' incident
# has been logged into the Computer Aided Dispatch system and the time at which the first unit arrives on scene.

bfidx = df.INCIDENT_TYPE_DESC == '111 - Building fire'
bftimes = df[bfidx][['INCIDENT_DATE_TIME', 'ARRIVAL_DATE_TIME']]

In [54]:
# convert str to datetime
bftimes.INCIDENT_DATE_TIME = pd.to_datetime(bftimes.INCIDENT_DATE_TIME, format='%m/%d/%Y %I:%M:%S %p')
bftimes.ARRIVAL_DATE_TIME = pd.to_datetime(bftimes.ARRIVAL_DATE_TIME, format='%m/%d/%Y %I:%M:%S %p')

In [72]:
# find the difference between the two times
difftime = bftimes.ARRIVAL_DATE_TIME - bftimes.INCIDENT_DATE_TIME
# convert to seconds
difftime = difftime.dt.seconds

In [133]:
# Q5.2: What is the third quartile of that distribution. Note: the number of minutes can be fractional
# (ie, do not round).
# there are nans
print(difftime.quantile(0.75)/60)
# remove nans
print(difftime.dropna().quantile(0.75)/60)
# np.percentile(difftime.dropna(), 75)/60

4.15
4.15


In [228]:
# Q6: What is the coefficient of determination (R squared) between the number of residents at each zip code and the
# number of inicidents whose type is classified as '111 - Building fire' at each of those zip codes.
# 1. calculate the number of building fire incidents at each zip code
bfidx = df.INCIDENT_TYPE_DESC == '111 - Building fire'
zipbf = df[bfidx]['ZIP_CODE']
zipcounts = zipbf.value_counts()

In [229]:
# convert zip codes from string to int
# then merge with census data
zipcounts.index = zipcounts.index.astype(int)

In [230]:
# 2. load in the census data
census = pd.read_csv('2010+Census+Population+By+Zipcode+(ZCTA).csv')

In [231]:
# join zip counts to census data
census = census.join(zipcounts, on='Zip Code ZCTA')
census = census.dropna()
census.columns

In [235]:
from scipy import stats

# x = number of residents
x = census['2010 Census Population']
# y = number of incidents
y = census['ZIP_CODE']
slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)

In [237]:
# Q6: calculate r^2
print("r-squared:", r_value**2)

r-squared: 0.5973393948657464


In [246]:
# Q7.1: For this question, only consider incidents that have information about whether a CO detector was present or
# not.
cotf = df[~pd.isnull(df['CO_DETECTOR_PRESENT_DESC'])]

In [256]:
# Q7.2: compute the proportion of incidents that lasted 20-30, 30-40, 40-50, 50-60, and 60-70 minutes (both interval
# boundary values included) by dividing the number of incidents in each time interval with the total number of
# incidents.
cotf = cotf[['TOTAL_INCIDENT_DURATION', 'CO_DETECTOR_PRESENT_DESC']]

In [355]:
# must include both edges, so I can't use pd.cut or np.digitize or histogram...
def countinrange(x, y, b):
        idx = (x >= b[0])&(x <=b [1])
        return [idx.sum(), (y[idx]=='Yes').sum(), (y[idx]=='No').sum()]

In [384]:
# bin total incident duration into 20-30, 30-40, 40-50, 50-60, and 60-70 minutes
# total incident duration is in seconds
binranges = np.array([(20, 30), (30, 40), (40, 50), (50, 60), (60, 70)])*60
# for each bin, count the total number of incidents in the time range, and the number of events with and without CO
# detector
bincounts = np.zeros([5, 3])
c = 0
# c1: total count, c2: CO detector present, c3: CO detector absent
for bins in binranges:
    bincounts[c] = countinrange(cotf.TOTAL_INCIDENT_DURATION, cotf.CO_DETECTOR_PRESENT_DESC, bins)
    c += 1

In [393]:
# calculate the proportion of incidents in each bin
print(bincounts)
freqbin = bincounts[:,0]/bincounts[:,0].sum()

[[7682. 6501. 1181.]
 [2741. 2155.  586.]
 [1276.  926.  350.]
 [ 730.  481.  249.]
 [ 393.  235.  158.]]


array([0.5991265 , 0.2137732 , 0.09951646, 0.0569334 , 0.03065044])

In [395]:
# Q7.3: For each bin, compute the ratio of the 'CO detector absent' frequency to the 'CO detector present' frequency.
coapr = bincounts[:, 2]/bincounts[:, 1]
coapr

array([0.18166436, 0.27192575, 0.37796976, 0.51767152, 0.67234043])

In [404]:
# Q7.4: Perform a linear regression of this ratio to the mid-point of the bins.
midpoint = np.array([25, 35, 45, 55, 65])
slope, intercept, r_value, p_value, std_err = stats.linregress(midpoint, coapr)

In [408]:
# Q7.5: From this, what is the predicted ratio for events lasting 39 minutes?
predratio = slope*39 + intercept
predratio

0.33068849004143824

In [482]:
# Q8: Calculate the chi-square test statistic for testing whether an incident is more likely to last longer than 
# 60 minutes when CO detector is not present.
# hypothesis: incident lasts longer than 60 min when CO detector is not present vs CO detector is present
# filter out for all incidents > 60 min
cotfgt60 = cotf[cotf.TOTAL_INCIDENT_DURATION > (60*60)]
n = cotfgt60.shape[0] # total number of incidents > 60 min
o = (cotfgt60.CO_DETECTOR_PRESENT_DESC=='No').sum() # number of incidents without CO detector
e = n/2 # equal chance of CO detector present or not
x2 = (o - e)**2/e
x2

0.17001180637544275