# crash_04_counting_injured_number

This notebook counts the hourly number of injured/killed people for each segment and each intersection (node & short segment). The notebook is organized into the following chapters:

- 1. short segment
- 2. node
- 3. segment <br><br>

The chart of the number of injured/killed people in the dashboard (https://workzone-collision-analysis.github.io/capstone/dashboard/) was drawn from the csv files created by this notebook.

## 1. short segment

In [1]:
# import libraries
import pandas as pd
import numpy as np
import geopandas as gpd

In [2]:
# Prerequisite: run the 'shst_02_extract_short_segments' notebook first to get 'shst_short_segment_centroid.shp'.
# Load the shapefile of the short segment centroids.
short_segment_centroid_path = '../data/sharedstreets_geometry/short_segment/shst_short_segment_centroid.shp'
gdf_short_segment = gpd.read_file(short_segment_centroid_path)

In [3]:
# Prerequisite: run the 'crash_02_define_Intersection_crash' notebook first to get 'crash_short_segment.shp'.
# Load the short-segment crash dataset.
crash_short_segment_path = '../data/cleaned_data/crash_seperated/crash_short_segment/crash_short_segment.shp'
gdf_crash_short_segment = gpd.read_file(crash_short_segment_path)

In [4]:
# list the column names as read from the shapefile; the output shows shortened names
# such as 'collision_' and 'number_o_1', which the following cell renames
gdf_crash_short_segment.columns

Index(['collision_', 'nearest_id', 'crash_date', 'crash_time', 'borough',
       'zip_code', 'on_street_', 'cross_stre', 'off_street', 'number_of_',
       'number_o_1', 'number_o_2', 'number_o_3', 'number_o_4', 'number_o_5',
       'number_o_6', 'number_o_7', 'contributi', 'contribu_1', 'contribu_2',
       'contribu_3', 'contribu_4', 'vehicle_ty', 'vehicle__1', 'vehicle__2',
       'vehicle__3', 'vehicle__4', 'geometry'],
      dtype='object')

In [5]:
# give the shortened shapefile columns readable names
# NOTE(review): assumes 'number_of_' / 'number_o_1' are the injured / killed person counts — confirm against the source schema
short_segment_column_names = {
    'collision_': 'collision_id',
    'number_of_': 'number_of_injured',
    'number_o_1': 'number_of_killed',
}
gdf_crash_short_segment = gdf_crash_short_segment.rename(columns=short_segment_column_names)

In [6]:
# convert the crash_time column to pandas datetimes so that the `.dt` accessor can be used on it
# NOTE(review): no explicit format is passed, so pandas infers it from the values — confirm how crash_time is stored
gdf_crash_short_segment['crash_time'] = pd.to_datetime(gdf_crash_short_segment['crash_time'])

In [7]:
# create an 'hour' column holding the hour component of crash_time, i.e. the hour when the crash occurred
gdf_crash_short_segment['hour'] = gdf_crash_short_segment['crash_time'].dt.hour

In [8]:
# sum the injured counts for every ('nearest_id', 'hour') pair
# ('nearest_id' is the SharedStreets geometry id of the short segment)
injured_columns = ['nearest_id', 'hour', 'number_of_injured']
injured_group_keys = ['nearest_id', 'hour']
df_crash_short_segment_hourly_injured = (
    gdf_crash_short_segment[injured_columns]
    .groupby(injured_group_keys, as_index=False)
    .sum()
)

In [9]:
# the 24 hours of a day (0-23) and the distinct short-segment ids
hourly_range = list(range(24))
list_short_segment_id = gdf_short_segment['id'].drop_duplicates().tolist()

In [10]:
# build one template dataframe per short segment: 24 rows (one per hour)
# with the columns 'hour' and 'id'
list_empty_dataframe = [
    pd.DataFrame({'hour': hourly_range}).assign(id=short_segment_id)
    for short_segment_id in list_short_segment_id
]

In [11]:
# stack the per-segment template dataframes into one long (hour, id) table
df_hourly_injured = pd.concat(list_empty_dataframe)

In [12]:
# left-join the hourly injured totals onto the template table: every (id, hour) row is kept,
# and number_of_injured is NaN where a short segment has no matching crash row for that hour
df_hourly_injured = pd.merge(
    df_hourly_injured,
    df_crash_short_segment_hourly_injured,
    how='left',
    left_on=['id', 'hour'],
    right_on=['nearest_id', 'hour'],
)

In [13]:
# 'nearest_id' was only needed as a merge key, so remove it
df_hourly_injured = df_hourly_injured.drop(columns=['nearest_id'])

In [14]:
# fill NaN values with 0 (NaN marks the (id, hour) rows that had no match in the left merge)
df_hourly_injured = df_hourly_injured.fillna(0)

In [15]:
# store the injured counts as integers
df_hourly_injured = df_hourly_injured.astype({'number_of_injured': int})

In [16]:
# repeat the same steps for the number of killed people
killed_columns = ['nearest_id', 'hour', 'number_of_killed']
df_crash_short_segment_hourly_killed = (
    gdf_crash_short_segment[killed_columns]
    .groupby(['nearest_id', 'hour'], as_index=False)
    .sum()
)
df_hourly_killed = (
    pd.concat(list_empty_dataframe, axis=0)
    # keep every (id, hour) template row; rows without a match get NaN
    .merge(
        df_crash_short_segment_hourly_killed,
        left_on=['id', 'hour'],
        right_on=['nearest_id', 'hour'],
        how='left',
    )
    # the merge key from the right side is no longer needed
    .drop(columns=['nearest_id'])
    .fillna(0)
    .astype({'number_of_killed': int})
)

In [17]:
# To minimize the size of the csv file, store the hourly injured counts of each short segment as one list.
# The position inside the list is the only record of the hour once the lists are exported, so sort by
# (id, hour) explicitly instead of relying on the row order left behind by the earlier concat/merge steps.
df_hourly_injured_list = (
    df_hourly_injured
    .sort_values(['id', 'hour'])
    .groupby('id')['number_of_injured']
    .apply(list)
)
# every id must carry exactly one value per hour of the day
assert df_hourly_injured_list.map(len).eq(24).all(), 'expected 24 hourly values per id'

In [18]:
# check that the lists were created in time (hour) order by previewing the first ids;
# the following cell reads the same values for one id directly from df_hourly_injured for comparison
df_hourly_injured_list.head()

id
00010fd3ee560483c21bb98e414741c7    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, ...
0006da955dbe286a729ac6847ec22e6f    [0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, ...
00217e287463d24d9b0e8d20efa9e511    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
00520114b0a7f9d36eafa7e42f03196e    [0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, ...
0059632c4bd2f573e9c2beed50983686    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
Name: number_of_injured, dtype: object

In [19]:
# hourly injured counts of one short segment, read straight from the long table
is_sample_segment = df_hourly_injured['id'] == '00010fd3ee560483c21bb98e414741c7'
df_hourly_injured.loc[is_sample_segment, 'number_of_injured'].tolist()

[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 3, 0, 1, 0, 0, 2, 0, 0, 0, 2, 0]

In [20]:
# To minimize the size of the csv file, store the hourly killed counts of each short segment as one list.
# The position inside the list is the only record of the hour once the lists are exported, so sort by
# (id, hour) explicitly instead of relying on the row order left behind by the earlier concat/merge steps.
df_hourly_killed_list = (
    df_hourly_killed
    .sort_values(['id', 'hour'])
    .groupby('id')['number_of_killed']
    .apply(list)
)
# every id must carry exactly one value per hour of the day
assert df_hourly_killed_list.map(len).eq(24).all(), 'expected 24 hourly values per id'

In [21]:
# display the hourly killed lists (one list per short-segment id)
df_hourly_killed_list

id
00010fd3ee560483c21bb98e414741c7    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
0006da955dbe286a729ac6847ec22e6f    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
00217e287463d24d9b0e8d20efa9e511    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
00520114b0a7f9d36eafa7e42f03196e    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
0059632c4bd2f573e9c2beed50983686    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                                                          ...                        
ffa76979180bc70451bbd5be891b4b13    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
ffb39397b1f0b732ac9f22c5190a8513    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
ffd7ebaa31da02967746ade3e93cf756    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
ffe965dbb2a41f3288a679d2c0451c49    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
ffedf771d90e31a9e03ac80241e58b50    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: number_of_killed, Length: 5340, dtype: object

In [22]:
# join the injured and killed lists on their shared 'id' index
df_list = df_hourly_injured_list.to_frame().merge(
    df_hourly_killed_list.to_frame(),
    left_index=True,
    right_index=True,
)

In [23]:
# export; the file holds both the injured and the killed lists, indexed by short-segment id
short_segment_csv_path = '../data/cleaned_data/crash_aggregation/crash_short_segment_hourly_injured.csv'
df_list.to_csv(short_segment_csv_path)

## 2. node

This part follows the same process as the 'short segment' part.

In [24]:
# Prerequisite: run the 'shst_02_extract_short_segments' notebook first to get 'shst_node_filtered.shp'.
# Load the shapefile of the node points.
node_path = '../data/sharedstreets_geometry/node_filtered/shst_node_filtered.shp'
gdf_node = gpd.read_file(node_path)

In [25]:
# Prerequisite: run the 'crash_02_define_Intersection_crash' notebook first to get 'crash_intersection.shp'.
# Load the intersection crash dataset.
crash_intersection_path = '../data/cleaned_data/crash_seperated/crash_intersection/crash_intersection.shp'
gdf_crash_node = gpd.read_file(crash_intersection_path)

In [26]:
# list the column names as read from the shapefile; the output shows shortened names
# such as 'nearest_no' and 'number_o_1', which the following cell renames
gdf_crash_node.columns

Index(['collision_', 'nearest_no', 'crash_date', 'crash_time', 'borough',
       'zip_code', 'on_street_', 'cross_stre', 'off_street', 'number_of_',
       'number_o_1', 'number_o_2', 'number_o_3', 'number_o_4', 'number_o_5',
       'number_o_6', 'number_o_7', 'contributi', 'contribu_1', 'contribu_2',
       'contribu_3', 'contribu_4', 'vehicle_ty', 'vehicle__1', 'vehicle__2',
       'vehicle__3', 'vehicle__4', 'geometry'],
      dtype='object')

In [27]:
# give the shortened shapefile columns readable names
# NOTE(review): assumes 'number_of_' / 'number_o_1' are the injured / killed person counts — confirm against the source schema
node_column_names = {
    'collision_': 'collision_id',
    'nearest_no': 'node_id',
    'number_of_': 'number_of_injured',
    'number_o_1': 'number_of_killed',
}
gdf_crash_node = gdf_crash_node.rename(columns=node_column_names)

In [28]:
# convert the crash_time column to pandas datetimes so that the `.dt` accessor can be used on it
# NOTE(review): no explicit format is passed, so pandas infers it from the values — confirm how crash_time is stored
gdf_crash_node['crash_time'] = pd.to_datetime(gdf_crash_node['crash_time'])

In [29]:
# create an 'hour' column holding the hour component of crash_time, i.e. the hour when the crash occurred
gdf_crash_node['hour'] = gdf_crash_node['crash_time'].dt.hour

In [30]:
# sum the injured counts for every ('node_id', 'hour') pair
node_injured_columns = ['node_id', 'hour', 'number_of_injured']
node_group_keys = ['node_id', 'hour']
df_crash_node_hourly_injured = (
    gdf_crash_node[node_injured_columns]
    .groupby(node_group_keys, as_index=False)
    .sum()
)

In [31]:
# the 24 hours of a day (0-23) and the distinct node ids
hourly_range = list(range(24))
list_node_id = gdf_node['node_id'].drop_duplicates().tolist()

In [32]:
# build one template dataframe per node: 24 rows (one per hour)
# with the columns 'hour' and 'id'
list_empty_dataframe = [
    pd.DataFrame({'hour': hourly_range}).assign(id=node_id)
    for node_id in list_node_id
]

In [33]:
# stack the per-node template dataframes into one long (hour, id) table
df_hourly_injured = pd.concat(list_empty_dataframe)

In [34]:
# left-join the hourly injured totals onto the template table: every (id, hour) row is kept,
# and number_of_injured is NaN where a node has no matching crash row for that hour
df_hourly_injured = pd.merge(
    df_hourly_injured,
    df_crash_node_hourly_injured,
    how='left',
    left_on=['id', 'hour'],
    right_on=['node_id', 'hour'],
)

In [35]:
# 'node_id' was only needed as a merge key, so remove it
df_hourly_injured = df_hourly_injured.drop(columns=['node_id'])

In [36]:
# fill NaN values with 0 (NaN marks the (id, hour) rows that had no match in the left merge)
df_hourly_injured = df_hourly_injured.fillna(0)

In [37]:
# store the injured counts as integers
df_hourly_injured = df_hourly_injured.astype({'number_of_injured': int})

In [38]:
# repeat the same steps for the number of killed people
node_killed_columns = ['node_id', 'hour', 'number_of_killed']
df_crash_node_hourly_killed = (
    gdf_crash_node[node_killed_columns]
    .groupby(['node_id', 'hour'], as_index=False)
    .sum()
)
df_hourly_killed = (
    pd.concat(list_empty_dataframe, axis=0)
    # keep every (id, hour) template row; rows without a match get NaN
    .merge(
        df_crash_node_hourly_killed,
        left_on=['id', 'hour'],
        right_on=['node_id', 'hour'],
        how='left',
    )
    # the merge key from the right side is no longer needed
    .drop(columns=['node_id'])
    .fillna(0)
    .astype({'number_of_killed': int})
)

In [39]:
# To minimize the size of the csv file, store the hourly injured counts of each node as one list.
# The position inside the list is the only record of the hour once the lists are exported, so sort by
# (id, hour) explicitly instead of relying on the row order left behind by the earlier concat/merge steps.
df_hourly_injured_list = (
    df_hourly_injured
    .sort_values(['id', 'hour'])
    .groupby('id')['number_of_injured']
    .apply(list)
)
# every id must carry exactly one value per hour of the day
assert df_hourly_injured_list.map(len).eq(24).all(), 'expected 24 hourly values per id'

In [40]:
# To minimize the size of the csv file, store the hourly killed counts of each node as one list.
# The position inside the list is the only record of the hour once the lists are exported, so sort by
# (id, hour) explicitly instead of relying on the row order left behind by the earlier concat/merge steps.
df_hourly_killed_list = (
    df_hourly_killed
    .sort_values(['id', 'hour'])
    .groupby('id')['number_of_killed']
    .apply(list)
)
# every id must carry exactly one value per hour of the day
assert df_hourly_killed_list.map(len).eq(24).all(), 'expected 24 hourly values per id'

In [41]:
# join the injured and killed lists on their shared 'id' index
df_list = df_hourly_injured_list.to_frame().merge(
    df_hourly_killed_list.to_frame(),
    left_index=True,
    right_index=True,
)

In [42]:
# export; the file holds both the injured and the killed lists, indexed by node id
node_csv_path = '../data/cleaned_data/crash_aggregation/crash_node_hourly_injured.csv'
df_list.to_csv(node_csv_path)

## 3. segment

In [43]:
# Prerequisite: run the 'shst_02_extract_short_segments' notebook first to get 'shst_segment_filtered.shp'.
# Load the shapefile of the filtered segments.
segment_path = '../data/sharedstreets_geometry/segment_filtered/shst_segment_filtered.shp'
gdf_segment = gpd.read_file(segment_path)

In [44]:
# preview the first rows of the segment shapefile ('id' is the column used as the segment id below)
gdf_segment.head()

Unnamed: 0,id,fromInters,toIntersec,forwardRef,backRefere,roadClass,length,geometry
0,db6792075ebbddc84479fda26174ca30,374b01a56e64379b8d7198962eaede90,2922a5babc5f921116a9fed4131a5bb1,48b7ab8e4cbafb2c1893cd682ded6704,a8475c8bd67f9e0ec8ce6a404aae41c1,Residential,299.421958,"LINESTRING (-73.91694 40.64668, -73.91625 40.6..."
1,42ccdc2b9ebc38f98c22bb0035045628,37db438d57f16f92e5ba91f1ad1793bb,374b01a56e64379b8d7198962eaede90,febaf06db79d8a16588d1c387a62fdb2,9db38906c3d8ae5df463e297be4e2b9b,Residential,256.577253,"LINESTRING (-73.91765 40.64623, -73.91732 40.6..."
2,84afb6627019b793945a7aab1feefe77,374b01a56e64379b8d7198962eaede90,5b6e4972c82ad4eb6d24c17b94b33b59,3f53ec240fc39c6b6810243b5b6fc830,fbbb71d35b794421e030d3ec9e1dcede,Residential,264.356932,"LINESTRING (-73.91694 40.64668, -73.91662 40.6..."
3,cce2402bd9841cb406b283d10a814940,5b6e4972c82ad4eb6d24c17b94b33b59,a3e9299f85dbecb22e829acc84c10e0e,3087b5f676af0e0026291f350b24859f,2b5b359caf490894be2d52ac51ebd0aa,Residential,256.824988,"LINESTRING (-73.91621 40.64715, -73.91550 40.6..."
4,9fed2268f9da5e9d263fdec0cb322aaa,5b6e4972c82ad4eb6d24c17b94b33b59,4701018d0dd721de27e208b19f0b20b6,4e2b3b2e438d9d7d504c1f0c76a5e4b0,564d3c4d11cb8340bfa4cc2b1ac2e6ff,Residential,378.440368,"LINESTRING (-73.91621 40.64715, -73.91611 40.6..."


In [45]:
# Prerequisite: run the 'crash_02_define_Intersection_crash' notebook first to get 'crash_segment.shp'.
# Load the segment crash dataset.
crash_segment_path = '../data/cleaned_data/crash_seperated/crash_segment/crash_segment.shp'
gdf_crash_segment = gpd.read_file(crash_segment_path)

In [46]:
# list the column names as read from the shapefile; the output shows shortened names
# such as 'geometry_i' and 'number_o_1', which the following cell renames
gdf_crash_segment.columns

Index(['collision_', 'geometry_i', 'crash_date', 'crash_time', 'borough',
       'zip_code', 'on_street_', 'cross_stre', 'off_street', 'number_of_',
       'number_o_1', 'number_o_2', 'number_o_3', 'number_o_4', 'number_o_5',
       'number_o_6', 'number_o_7', 'contributi', 'contribu_1', 'contribu_2',
       'contribu_3', 'contribu_4', 'vehicle_ty', 'vehicle__1', 'vehicle__2',
       'vehicle__3', 'vehicle__4', 'geometry'],
      dtype='object')

In [47]:
# give the shortened shapefile columns readable names
# NOTE(review): assumes 'number_of_' / 'number_o_1' are the injured / killed person counts — confirm against the source schema
segment_column_names = {
    'collision_': 'collision_id',
    'geometry_i': 'geometry_id',
    'number_of_': 'number_of_injured',
    'number_o_1': 'number_of_killed',
}
gdf_crash_segment = gdf_crash_segment.rename(columns=segment_column_names)

In [48]:
# convert the crash_time column to pandas datetimes so that the `.dt` accessor can be used on it
# NOTE(review): no explicit format is passed, so pandas infers it from the values — confirm how crash_time is stored
gdf_crash_segment['crash_time'] = pd.to_datetime(gdf_crash_segment['crash_time'])

In [49]:
# create an 'hour' column holding the hour component of crash_time, i.e. the hour when the crash occurred
gdf_crash_segment['hour'] =gdf_crash_segment['crash_time'].dt.hour

In [50]:
# sum the injured counts for every ('geometry_id', 'hour') pair
segment_injured_columns = ['geometry_id', 'hour', 'number_of_injured']
segment_group_keys = ['geometry_id', 'hour']
df_crash_segment_hourly_injured = (
    gdf_crash_segment[segment_injured_columns]
    .groupby(segment_group_keys, as_index=False)
    .sum()
)

In [51]:
# the 24 hours of a day (0-23) and the distinct segment ids
hourly_range = list(range(24))
list_segment_id = gdf_segment['id'].drop_duplicates().tolist()

In [52]:
# build one template dataframe per segment: 24 rows (one per hour)
# with the columns 'hour' and 'id'
list_empty_dataframe = [
    pd.DataFrame({'hour': hourly_range}).assign(id=segment_id)
    for segment_id in list_segment_id
]

In [53]:
# stack the per-segment template dataframes into one long (hour, id) table
df_hourly_injured = pd.concat(list_empty_dataframe)

In [54]:
# left-join the hourly injured totals onto the template table: every (id, hour) row is kept,
# and number_of_injured is NaN where a segment has no matching crash row for that hour
df_hourly_injured = pd.merge(
    df_hourly_injured,
    df_crash_segment_hourly_injured,
    how='left',
    left_on=['id', 'hour'],
    right_on=['geometry_id', 'hour'],
)

In [55]:
# 'geometry_id' was only needed as a merge key, so remove it
df_hourly_injured = df_hourly_injured.drop(columns=['geometry_id'])

In [56]:
# fill NaN values with 0 (NaN marks the (id, hour) rows that had no match in the left merge)
df_hourly_injured = df_hourly_injured.fillna(0)

In [57]:
# store the injured counts as integers
df_hourly_injured = df_hourly_injured.astype({'number_of_injured': int})

In [58]:
# repeat the same steps for the number of killed people
segment_killed_columns = ['geometry_id', 'hour', 'number_of_killed']
df_crash_segment_hourly_killed = (
    gdf_crash_segment[segment_killed_columns]
    .groupby(['geometry_id', 'hour'], as_index=False)
    .sum()
)
df_hourly_killed = (
    pd.concat(list_empty_dataframe, axis=0)
    # keep every (id, hour) template row; rows without a match get NaN
    .merge(
        df_crash_segment_hourly_killed,
        left_on=['id', 'hour'],
        right_on=['geometry_id', 'hour'],
        how='left',
    )
    # the merge key from the right side is no longer needed
    .drop(columns=['geometry_id'])
    .fillna(0)
    .astype({'number_of_killed': int})
)

In [59]:
# To minimize the size of the csv file, store the hourly injured counts of each segment as one list.
# The position inside the list is the only record of the hour once the lists are exported, so sort by
# (id, hour) explicitly instead of relying on the row order left behind by the earlier concat/merge steps.
df_hourly_injured_list = (
    df_hourly_injured
    .sort_values(['id', 'hour'])
    .groupby('id')['number_of_injured']
    .apply(list)
)
# every id must carry exactly one value per hour of the day
assert df_hourly_injured_list.map(len).eq(24).all(), 'expected 24 hourly values per id'

In [60]:
# To minimize the size of the csv file, store the hourly killed counts of each segment as one list.
# The position inside the list is the only record of the hour once the lists are exported, so sort by
# (id, hour) explicitly instead of relying on the row order left behind by the earlier concat/merge steps.
df_hourly_killed_list = (
    df_hourly_killed
    .sort_values(['id', 'hour'])
    .groupby('id')['number_of_killed']
    .apply(list)
)
# every id must carry exactly one value per hour of the day
assert df_hourly_killed_list.map(len).eq(24).all(), 'expected 24 hourly values per id'

In [61]:
# join the injured and killed lists on their shared 'id' index
df_list = df_hourly_injured_list.to_frame().merge(
    df_hourly_killed_list.to_frame(),
    left_index=True,
    right_index=True,
)

In [62]:
# export; the file holds both the injured and the killed lists, indexed by segment id
segment_csv_path = '../data/cleaned_data/crash_aggregation/crash_segment_hourly_injured.csv'
df_list.to_csv(segment_csv_path)