## Merging 511 events and Lion dataset 

This notebook adds LION street characteristics to each 511 event. 

Lion Usable attributes: 

1. Segment type (undivided street, both could mean a median lane maybe, ramp) 

2. Roadway type 

3. Traffic direction

4. Loc status 

5. Curve flag 

6. Radius? 

7. Min st width 

8. Max st width 

9. Bike lane (0,1) 

10. Bike lane traffic direction

11. number of travel lanes 

12. number of park lanes 

13. number of total lanes 

14. Posted Speed 

15. Snow removal priority

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np

In [2]:
# Prerequisite: run the 'lion_00_data_wrangling_&_filtration' notebook first;
# it produces the 'lion_filtered.shp' file read below.
# Load the filtered lion street segments.
gdf_lion = gpd.read_file('../data/cleaned_data/lion_filtered/lion_filtered.shp')

In [3]:
# list every attribute column available in the lion segments
gdf_lion.columns.tolist()

['objectid',
 'street',
 'saf_street',
 'feature_ty',
 'segment_ty',
 'inc_ex_fla',
 'rb_layer',
 'non_ped',
 'traf_dir',
 'traf_src',
 'spec_addr',
 'face_code',
 'seq_num',
 'street_cod',
 'saf_stre_1',
 'lgc1',
 'lgc2',
 'lgc3',
 'lgc4',
 'lgc5',
 'lgc6',
 'lgc7',
 'lgc8',
 'lgc9',
 'boe_lgc',
 'segment_id',
 'seg_count',
 'loc_status',
 'l_zip',
 'r_zip',
 'l_boro',
 'r_boro',
 'l_cd',
 'r_cd',
 'latomicpol',
 'ratomicpol',
 'lct2010',
 'lct2010_su',
 'rct2010',
 'rct2010_su',
 'lcb2010',
 'lcb2010_su',
 'rcb2010',
 'rcb2010_su',
 'lct2000',
 'lct2000_su',
 'rct2000',
 'rct2000_su',
 'lcb2000',
 'lcb2000_su',
 'rcb2000',
 'rcb2000_su',
 'lct1990',
 'lct1990_su',
 'rct1990',
 'rct1990_su',
 'l_assm_dis',
 'l_elect_di',
 'r_assm_dis',
 'r_elect_di',
 'split_elec',
 'l_schl_dis',
 'r_schl_dis',
 'split_schl',
 'l_sub_sect',
 'r_sub_sect',
 'san_dist_i',
 'map_from',
 'map_to',
 'boro_bndry',
 'mh_ri_flag',
 'x_from',
 'y_from',
 'x_to',
 'y_to',
 'arc_center',
 'arc_cent_1',
 'curve_f

In [5]:
# Prerequisite: run the '511_02_counting_crash' notebook first;
# it produces the '511_crashcount_0629.csv' file read below.
# Load the 511 events together with their crash counts.
df_511_crash = pd.read_csv('../data/cleaned_data/511_crashcount_0629.csv')

In [6]:
# Keep only the lion attributes used in this notebook, plus 'segment_id',
# which later becomes the index the 511 events are merged on.
lion_columns_to_keep = [
    'segment_id',
    'segment_ty',
    'rw_type',
    'traf_dir',
    'loc_status',
    'curve_flag',
    'radius',
    'street_wid',
    'bike_lane',
    'bike_trafd',
    'number_tra',
    'number_par',
    'number_tot',
    'posted_spe',
    'snow_prior',
]
gdf_lion_simplified = gdf_lion[lion_columns_to_keep]

In [7]:
# Replace the abbreviated lion column names with descriptive ones.
# Columns not listed here keep their original names.
descriptive_column_names = {
    'segment_ty': 'segment_type',
    'rw_type': 'roadway_type',
    'traf_dir': 'traffic_direction',
    'street_wid': 'street_width',
    'bike_trafd': 'bike_traffic_direction',
    'number_tra': 'number_travel',
    'number_par': 'number_park',
    'number_tot': 'number_total',
    'posted_spe': 'posted_speed',
}
gdf_lion_simplified = gdf_lion_simplified.rename(columns=descriptive_column_names)

In [9]:
# Cast segment_id from string to int.
# NOTE(review): presumably needed so the ids compare equal to the integer
# 'pp_segment_id' values of the lion/shst lookup table used in the merge - confirm.
gdf_lion_simplified['segment_id'] = gdf_lion_simplified['segment_id'].astype(int)

In [10]:
# Load the lookup table that maps SharedStreets geometry ids ('shstGeometryId')
# to lion segment ids ('pp_segment_id').
df_lion_shst_lookup = pd.read_csv('../data/cleaned_data/shst_lion_lookuptable.csv')

In [11]:
# preview the lookup table
df_lion_shst_lookup.head()

Unnamed: 0,shstGeometryId,pp_objectid,pp_segment_id
0,a5c5f21851e81c507b141aecc2c6235e,1,78126
1,8486febe149b7fbeba3b499188f5bc2a,2,79796
2,300959ea23f9820fbe692a4a89a67396,3,77356
3,d39717c7db5c64930d589e45414b839e,5,73490
4,8af91954050b3b92b77dccfa206cfc19,6,174633


In [12]:
def get_lion_segment_id(geometryid, lookup=None):
    '''
    This function returns the list of lion segment ids matched to a shst geometry id.
    
    ------
    Input :
        geometryid (string): Sharedstreet geometryid
        lookup (DataFrame, optional): table with a 'shstGeometryId' column and a
            'pp_segment_id' column. When omitted, the notebook-level
            df_lion_shst_lookup table is used.
    
    -----
    Output :
        list_segment_id (list): list of lion segment id that matched to sharedstreet segment,
            in lookup-table row order; an empty list when nothing matches
    
    '''
    # fall back to the notebook-level lookup table so existing one-argument calls keep working
    if lookup is None:
        lookup = df_lion_shst_lookup
    # select rows and column in a single .loc call instead of chained indexing
    is_match = lookup['shstGeometryId'] == geometryid
    list_segment_id = lookup.loc[is_match, 'pp_segment_id'].tolist()
    return list_segment_id

In [13]:
# For every 511 event, look up the lion segment ids matched to its shst geometry id.
# Each lookup filters the whole lookup table, so this cell does one scan per event.
df_511_crash['segment_id_list'] = df_511_crash['geometryId'].apply(get_lion_segment_id)

In [14]:
# number of lion segments matched to each 511 event (0 means no match)
df_511_crash['segment_count'] = df_511_crash['segment_id_list'].apply(len)

In [15]:
# keep only the 511 events that are matched with at least one lion segment
has_matched_segment = df_511_crash['segment_count'] != 0
df_511_crash = df_511_crash.loc[has_matched_segment]

In [16]:
# the helper count column was only needed for the filter above
df_511_crash = df_511_crash.drop(columns='segment_count')

In [17]:
# rows and columns remaining after dropping the unmatched events
df_511_crash.shape

(25358, 26)

In [18]:
# preview the 511 events with their matched lion segment id lists
df_511_crash.head()

Unnamed: 0,event_id,Event Type,Organizati,Facility N,Direction,City,County,State,Create Tim,Close Time,...,geometry,buffer_geometry_900ft,buffer_geometry_1800ft,buffer_geometry_2700ft,buffer_geometry_3600ft,crash_list_900ft,crash_list_1800ft,crash_list_2700ft,crash_list_3600ft,segment_id_list
0,0,construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T23:45:00,2019-10-06T10:17:00,...,POINT (-73.9452906062501 40.85016678170324),"POLYGON ((992680.4460278568 251695.5213804595,...","POLYGON ((993278.5442073554 252368.045084052, ...","POLYGON ((993495.2826707244 247638.1129178417,...","POLYGON ((993323.9419628344 246752.2208376681,...",[4218698],"[4218698, 4220414, 4220401]","[4216810, 4218698, 4220414, 4220401, 4218408, ...","[4218684, 4218686, 4218688, 4218046, 4216810, ...","[144212, 144263, 144262, 9014082, 9014081, 140..."
1,1,construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T23:33:00,2019-10-05T10:01:00,...,POINT (-73.9452906062501 40.85016678170324),"POLYGON ((992680.4460278568 251695.5213804595,...","POLYGON ((993278.5442073554 252368.045084052, ...","POLYGON ((993495.2826707244 247638.1129178417,...","POLYGON ((993323.9419628344 246752.2208376681,...",[4218698],"[4218698, 4220401]","[4216810, 4218698, 4220401, 4217050, 4216342]","[4218684, 4218686, 4218688, 4218046, 4216810, ...","[144212, 144263, 144262, 9014082, 9014081, 140..."
2,4,bridge construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T22:35:00,2019-10-04T14:47:00,...,POINT (-73.95270838629047 40.85161714904007),"POLYGON ((992680.4460278568 251695.5213804595,...","POLYGON ((993278.5442073554 252368.045084052, ...","POLYGON ((993495.2826707244 247638.1129178417,...","POLYGON ((993323.9419628344 246752.2208376681,...",[],[4220401],"[4220401, 4217050, 4216342, 4216810]","[4220401, 4215847, 4217050, 4216342, 4218398, ...","[144212, 144263, 144262, 9014082, 9014081, 140..."
3,6,construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T21:49:00,2019-10-06T10:17:00,...,POINT (-73.9452906062501 40.85016678170324),"POLYGON ((992680.4460278568 251695.5213804595,...","POLYGON ((993278.5442073554 252368.045084052, ...","POLYGON ((993495.2826707244 247638.1129178417,...","POLYGON ((993323.9419628344 246752.2208376681,...",[4218698],"[4218698, 4220414, 4220401]","[4216810, 4218698, 4220414, 4220401, 4218408, ...","[4218684, 4218686, 4218046, 4218688, 4216810, ...","[144212, 144263, 144262, 9014082, 9014081, 140..."
4,19,construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-27T08:08:00,2019-09-27T13:31:00,...,POINT (-73.9452906062501 40.85016678170324),"POLYGON ((992680.4460278568 251695.5213804595,...","POLYGON ((993278.5442073554 252368.045084052, ...","POLYGON ((993495.2826707244 247638.1129178417,...","POLYGON ((993323.9419628344 246752.2208376681,...",[],[],[4213587],"[4213694, 4213587]","[144212, 144263, 144262, 9014082, 9014081, 140..."


In [16]:
# Use segment id as the index of the lion table so the merge below can join on it (right_index=True).
# NOTE: not idempotent - after this cell has run, 'segment_id' is no longer a column,
# so running the cell a second time fails.
gdf_lion_simplified = gdf_lion_simplified.set_index('segment_id')

In [17]:
# Explode the 511 events: create one row for each segment id in the segment_id_list column.
# Rows that come from the same event keep the same index label.
# (Despite its name, this frame is not merged with lion yet; that happens in the merge cell.)
df_511_crash_lion_merged_unstack = df_511_crash.explode('segment_id_list')

In [18]:
# preview: the same event is repeated, once per matched segment id
df_511_crash_lion_merged_unstack.head(3)

Unnamed: 0,event_id,Event Type,Organizati,Facility N,Direction,City,County,State,Create Tim,Close Time,...,geometry,buffer_geometry_900ft,buffer_geometry_1800ft,buffer_geometry_2700ft,buffer_geometry_3600ft,crash_list_900ft,crash_list_1800ft,crash_list_2700ft,crash_list_3600ft,segment_id_list
0,0,construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T23:45:00,2019-10-06T10:17:00,...,POINT (-73.9452906062501 40.85016678170324),"POLYGON ((992680.4460278568 251695.5213804595,...","POLYGON ((993278.5442073554 252368.045084052, ...","POLYGON ((993495.2826707244 247638.1129178417,...","POLYGON ((993323.9419628344 246752.2208376681,...",[4218698],"[4218698, 4220414, 4220401]","[4216810, 4218698, 4220414, 4220401, 4218408, ...","[4218684, 4218686, 4218688, 4218046, 4216810, ...",144212
0,0,construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T23:45:00,2019-10-06T10:17:00,...,POINT (-73.9452906062501 40.85016678170324),"POLYGON ((992680.4460278568 251695.5213804595,...","POLYGON ((993278.5442073554 252368.045084052, ...","POLYGON ((993495.2826707244 247638.1129178417,...","POLYGON ((993323.9419628344 246752.2208376681,...",[4218698],"[4218698, 4220414, 4220401]","[4216810, 4218698, 4220414, 4220401, 4218408, ...","[4218684, 4218686, 4218688, 4218046, 4216810, ...",144263
0,0,construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T23:45:00,2019-10-06T10:17:00,...,POINT (-73.9452906062501 40.85016678170324),"POLYGON ((992680.4460278568 251695.5213804595,...","POLYGON ((993278.5442073554 252368.045084052, ...","POLYGON ((993495.2826707244 247638.1129178417,...","POLYGON ((993323.9419628344 246752.2208376681,...",[4218698],"[4218698, 4220414, 4220401]","[4216810, 4218698, 4220414, 4220401, 4218408, ...","[4218684, 4218686, 4218688, 4218046, 4216810, ...",144262


In [19]:
# Attach the lion attributes to every exploded row, matching the row's segment id
# against the lion table's index (segment id).
# merge() uses an inner join by default, so rows whose segment id is not present
# in the lion table are dropped here.
df_511_crash_lion_merged_unstack = df_511_crash_lion_merged_unstack.merge(gdf_lion_simplified, left_on='segment_id_list', right_index=True)

In [20]:
# preview the merged rows; the lion attributes are the right-most columns
df_511_crash_lion_merged_unstack.head(3)

Unnamed: 0,event_id,Event Type,Organizati,Facility N,Direction,City,County,State,Create Tim,Close Time,...,curve_flag,radius,street_width,bike_lane,bike_traffic_direction,number_travel,number_park,number_total,posted_speed,snow_prior
0,0,construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T23:45:00,2019-10-06T10:17:00,...,,0,30.0,,,2,,2,45,V
1,1,construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T23:33:00,2019-10-05T10:01:00,...,,0,30.0,,,2,,2,45,V
2,4,bridge construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T22:35:00,2019-10-04T14:47:00,...,,0,30.0,,,2,,2,45,V


In [21]:
# Fill null values with -1 in every column (511 columns and lion columns alike).
# NOTE(review): presumably also needed because the groupby below drops rows whose
# grouping keys are null by default - confirm before changing.
df_511_crash_lion_merged_unstack = df_511_crash_lion_merged_unstack.fillna(-1)

In [22]:
# the segment id was only needed as the merge key; remove it before aggregating
df_511_crash_lion_merged_unstack = df_511_crash_lion_merged_unstack.drop(columns=['segment_id_list'])

In [23]:
# check the column layout: 511 columns first, then the lion attributes
df_511_crash_lion_merged_unstack.columns

Index(['event_id', 'Event Type', 'Organizati', 'Facility N', 'Direction',
       'City', 'County', 'State', 'Create Tim', 'Close Time', 'Event Desc',
       'Responding', 'Latitude', 'Longitude', 'Duration', 'geometryId',
       'geometry', 'buffer_geometry_900ft', 'buffer_geometry_1800ft',
       'buffer_geometry_2700ft', 'buffer_geometry_3600ft', 'crash_list_900ft',
       'crash_list_1800ft', 'crash_list_2700ft', 'crash_list_3600ft',
       'segment_type', 'roadway_type', 'traffic_direction', 'loc_status',
       'curve_flag', 'radius', 'street_width', 'bike_lane',
       'bike_traffic_direction', 'number_travel', 'number_park',
       'number_total', 'posted_speed', 'snow_prior'],
      dtype='object')

In [29]:
# create the list.
# Here, the element of list are the columns in the 511 dataset (not in lion dataset).
# They are selected by name (every merged column that does not come from the lion
# table) instead of a positional [:25] slice, so the list stays correct when
# columns are added to or removed from either dataset.
lion_attribute_columns = set(gdf_lion_simplified.columns)
columns = [col for col in df_511_crash_lion_merged_unstack.columns
           if col not in lion_attribute_columns]
columns

['event_id',
 'Event Type',
 'Organizati',
 'Facility N',
 'Direction',
 'City',
 'County',
 'State',
 'Create Tim',
 'Close Time',
 'Event Desc',
 'Responding',
 'Latitude',
 'Longitude',
 'Duration',
 'geometryId',
 'geometry',
 'buffer_geometry_900ft',
 'buffer_geometry_1800ft',
 'buffer_geometry_2700ft',
 'buffer_geometry_3600ft',
 'crash_list_900ft',
 'crash_list_1800ft',
 'crash_list_2700ft',
 'crash_list_3600ft']

In [30]:
# Collapse back to one row per 511 event: group on the 511 columns and, for each
# lion attribute, keep the most frequent value among the event's matched segments.
# Nulls were filled with -1 earlier, so -1 is counted like any other value.
rows_grouped_by_event = df_511_crash_lion_merged_unstack.groupby(columns, as_index=False)
df_511_crash_lion = rows_grouped_by_event.agg(lambda values: values.value_counts().index[0])

In [31]:
# sanity check: the row count should equal the number of matched 511 events
df_511_crash_lion.shape

(25358, 39)

In [32]:
# preview the aggregated result: one row per 511 event
df_511_crash_lion.head(3)

Unnamed: 0,event_id,Event Type,Organizati,Facility N,Direction,City,County,State,Create Tim,Close Time,...,curve_flag,radius,street_width,bike_lane,bike_traffic_direction,number_travel,number_park,number_total,posted_speed,snow_prior
0,0,construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T23:45:00,2019-10-06T10:17:00,...,-1,0,40.0,-1,-1,4,-1,4,45,V
1,1,construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,Manhattan,New York,NY,2019-09-30T23:33:00,2019-10-05T10:01:00,...,-1,0,40.0,-1,-1,4,-1,4,45,V
2,2,bridge construction,Port Authority New York/New Jersey,George Washington Bridge,Westbound,ramp to Manhattan,New York,NY,2019-09-30T22:48:00,2019-10-04T14:47:00,...,-1,0,35.0,-1,-1,3,-1,3,50,V


In [33]:
# confirm the final column layout before saving
df_511_crash_lion.columns

Index(['event_id', 'Event Type', 'Organizati', 'Facility N', 'Direction',
       'City', 'County', 'State', 'Create Tim', 'Close Time', 'Event Desc',
       'Responding', 'Latitude', 'Longitude', 'Duration', 'geometryId',
       'geometry', 'buffer_geometry_900ft', 'buffer_geometry_1800ft',
       'buffer_geometry_2700ft', 'buffer_geometry_3600ft', 'crash_list_900ft',
       'crash_list_1800ft', 'crash_list_2700ft', 'crash_list_3600ft',
       'segment_type', 'roadway_type', 'traffic_direction', 'loc_status',
       'curve_flag', 'radius', 'street_width', 'bike_lane',
       'bike_traffic_direction', 'number_travel', 'number_park',
       'number_total', 'posted_speed', 'snow_prior'],
      dtype='object')

In [34]:
# save the 511 events enriched with lion attributes; the index is not written
df_511_crash_lion.to_csv('../data/cleaned_data/511_crash_lion_0629.csv', index=False)