# crash_02_define_Intersection_crash

In this notebook, we will separate the 'matched' crashes (the result of 'crash_02_data_wrangling_with_shst.ipynb') into intersection crashes and segment crashes. 'Intersection crashes' are crashes located near Sharedstreets node points or 'short segments' (please check 'shst_02_extract_short_segments.ipynb'), and 'segment crashes' are crashes that are located on a Sharedstreets segment but not near intersections or short segments. <br>

This notebook contains the following sections:

- 1. Filtering crashes which occurred nearby short segment
- 2. Filtering crashes which occurred nearby Sharedstreets nodes
- 3. Filtering crashes which occurred nearby Sharedstreets segments

In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from fiona.crs import from_epsg
from shapely.geometry import LineString

In [2]:
# import the matched crash dataset; the notebook introduction describes it as the
# result of 'crash_02_data_wrangling_with_shst.ipynb'
# NOTE(review): this comment previously named 'crash_01_data_wrangling' and
# '511_mv_collisions.csv', which is not the file read below -- confirm the prerequisite notebook
gdf_crashes = gpd.read_file('../data/cleaned_data/mv_collisions_shst_matched.geojson')

In [3]:
# preview the first three matched crashes
gdf_crashes.head(3)

Unnamed: 0,collision_id,geometry_id,crash_date,crash_time,borough,zip_code,on_street_name,cross_street_name,off_street_name,number_of_persons_injured,...,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,vehicle_type_code_1,vehicle_type_code_2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5,geometry
0,3531327,ba4520777941a56b87f97a1d35dc2e20,2016-10-01,20:20:00,manhattan,10038.0,,,20 park row,0.0,...,unspecified,,,,PASSENGER VEHICLE,PASSENGER VEHICLE,,,,POINT (-74.00772 40.71152)
1,3530538,da0bde3c3c147e230387851d1679e6bc,2016-10-01,01:40:00,,,gowanus expy (bqe),,,0.0,...,,,,,PASSENGER VEHICLE,,,,,POINT (-74.00712 40.65499)
2,3534839,aff3547e8a39ef2c07ad433655ca4d61,2016-10-01,22:30:00,manhattan,10032.0,west 163 street,broadway,,0.0,...,,,,,PASSENGER VEHICLE,PASSENGER VEHICLE,,,,POINT (-73.94216 40.83779)


In [4]:
# make sure that you run the 'shst_01_processing_sharedstreets_geometry' notebook to get 'shst_segment.shp'
# import the Sharedstreets segment geometries
gdf_shst_segment = gpd.read_file('../data/sharedstreets_geometry/segment/shst_segment.shp')

In [5]:
# extract the segment ids as a plain Python list (used below to filter the crashes)
list_shst_segment = gdf_shst_segment['id'].tolist()

In [6]:
# keep only the crashes whose matched segment id is present in shst_segment.shp
is_known_segment = gdf_crashes['geometry_id'].isin(list_shst_segment)
gdf_crashes = gdf_crashes[is_known_segment]

In [7]:
# copy the crash dataset and reproject it to the EPSG:2263 projection system (us-ft)
gdf_crashes_2263 = gdf_crashes.copy()
# declare the source coordinates as EPSG:4326 before reprojecting
gdf_crashes_2263.crs = from_epsg(4326)
gdf_crashes_2263 = gdf_crashes_2263.to_crs(epsg=2263)  

  return _prepare_from_string(" ".join(pjargs))


## 1. Filtering crashes which occurred nearby short segment

In [8]:
# make sure that you run the 'shst_02_extract_short_segments' notebook to get 'shst_short_segment.shp'
# import the shapefile of the short segments
gdf_short_segment = gpd.read_file('../data/sharedstreets_geometry/short_segment/shst_short_segment.shp')

In [9]:
# make sure that you run the 'shst_02_extract_short_segments' notebook to get 'shst_short_segment_centroid.shp'
# import the shapefile of the short segment centroids
gdf_short_segment_centroid = gpd.read_file('../data/sharedstreets_geometry/short_segment/shst_short_segment_centroid.shp')

In [10]:
# drop unnecessary columns: keep only the segment id and its geometry
gdf_short_segment = gdf_short_segment[['id','geometry']]

In [11]:
# create a copy of the short segments that will be projected to epsg2263
gdf_short_segment_2263 = gdf_short_segment.copy()

# change the coordinate system from epsg4326 to epsg2263
# epsg2263 is based on us-ft, so distances (e.g. buffer radii) are expressed in feet.
gdf_short_segment_2263.crs = from_epsg(4326)
gdf_short_segment_2263 = gdf_short_segment_2263.to_crs(epsg=2263)  

  return _prepare_from_string(" ".join(pjargs))


In [12]:
# buffer each short segment by 30ft
# NOTE: this overwrites the geometry column in place, so re-running this cell
# buffers the already-buffered shapes a second time
gdf_short_segment_2263['geometry'] = gdf_short_segment_2263['geometry'].buffer(30)

In [13]:
# using the sjoin function, find the crashes which occurred within a buffer zone.
# NOTE(review): newer geopandas releases appear to name this argument `predicate`
# instead of `op` -- confirm against the installed version
gdf_crashes_2263_short_segment = gpd.sjoin(gdf_crashes_2263, gdf_short_segment_2263, op='within')

In [14]:
# drop unnecessary columns: keep only the crash id and the matched short segment id
df_crashes_short_segment = gdf_crashes_2263_short_segment[['collision_id','id']]

In [15]:
# preview the crash / short segment pairs
df_crashes_short_segment.head()

Unnamed: 0,collision_id,id
2,3534839,aff3547e8a39ef2c07ad433655ca4d61
94085,3631813,aff3547e8a39ef2c07ad433655ca4d61
94285,3632703,aff3547e8a39ef2c07ad433655ca4d61
100490,3638615,aff3547e8a39ef2c07ad433655ca4d61
109310,3648160,aff3547e8a39ef2c07ad433655ca4d61


Because some crashes matched multiple short segments, we will assign each crash to its nearest short segment (measured to the segment centroid).

In [16]:
# collapse the matches to one row per collision id, holding the list of
# sharedstreet ids that the crash matched
df_crashes_short_segment = (
    df_crashes_short_segment
    .groupby('collision_id')['id']
    .apply(list)
    .reset_index()
)

In [17]:
# add a 'geometry' column that contains the coordinates of each crash
# (taken from gdf_crashes, not from the reprojected gdf_crashes_2263)
df_crashes_short_segment = df_crashes_short_segment.merge(gdf_crashes[['collision_id','geometry']], on='collision_id', how='left')

In [18]:
# preview: one row per crash with its candidate short segment ids and crash point
df_crashes_short_segment.head()

Unnamed: 0,collision_id,id,geometry
0,3530528,[e95045cba4602b73447ed579683d85ae],POINT (-73.96137 40.68047)
1,3530535,[3aa91910e5f2525ab515c94f695c57b4],POINT (-73.96822 40.79978)
2,3530542,[67df749077fafca15d9c8b493fdf54d0],POINT (-73.76844 40.68620)
3,3530561,[f8c6d0bea790a5f71cd04a4e1401751d],POINT (-73.78581 40.68900)
4,3530570,[511c9d6a42a91f2abda9eb8ab1f476fd],POINT (-73.75181 40.68240)


In [19]:
def calculate_distance(point1,point2):
    '''
    This function is for calculating a distance between two points
    -----
    Input:
        point1 (Shapely.geometry.Point): a point (x,y)
        point2 (Shapely.geometry.Point): a point (x,y)
    -----
    Output:    
        distance (float): a distance between point1 and point2,
                          expressed in the units of the points' coordinates
    
    '''
    # shapely geometries measure planar distance directly, so there is no need
    # to build a temporary LineString just to read its length
    return point1.distance(point2)

In [20]:
def most_nearest_short_segment(x):
    '''
    Pick the short segment that lies closest to a crash.
    -----
    Input:
        x (pd.Series): one row of df_crashes_short_segment; reads x['geometry']
                       (the crash point) and x['id'] (list of candidate short segment ids)
    -----
    Output:    
        id (string) : an id of the nearest short segment

    NOTE(review): distance is measured from the crash point to each candidate's
    centroid in whatever coordinate units the geometries carry -- confirm the
    crash points and centroids share a CRS that is suitable for comparing distances.
    '''
    crash_point = x['geometry']
    candidate_ids = x['id']

    # a single candidate needs no distance comparison
    if len(candidate_ids) == 1:
        return candidate_ids[0]

    # one row per candidate short segment, joined to its centroid geometry
    candidates = pd.DataFrame(candidate_ids).rename(columns={0:'id'})
    centroids = gdf_short_segment_centroid[['id','geometry']]
    candidates = candidates.merge(centroids, on='id', how='left')

    # distance from the crash point to every candidate centroid
    candidates['distance'] = candidates['geometry'].apply(
        lambda centroid: calculate_distance(crash_point, centroid)
    )

    # smallest distance first; the top row is the nearest short segment
    ranked = candidates.sort_values(by='distance')
    return ranked.iloc[0]['id']

In [21]:
# resolve every crash (row-wise) to the id of its nearest short segment
df_crashes_short_segment['nearest_id'] = df_crashes_short_segment.apply(most_nearest_short_segment, axis=1)

In [22]:
# the candidate id list and the crash point are no longer needed
df_crashes_short_segment = df_crashes_short_segment.drop(columns=['id', 'geometry'])

In [23]:
# use the centroid of the nearest short segment as the crash geometry;
# the centroid table's 'id' key duplicates 'nearest_id', so it is dropped afterwards
centroid_lookup = gdf_short_segment_centroid[['id','geometry']]
df_crashes_short_segment = (
    df_crashes_short_segment
    .merge(centroid_lookup, left_on='nearest_id', right_on='id')
    .drop(columns='id')
)

In [24]:
# add crash characteristics: every crash column except the matched
# segment id and the original crash geometry
crash_attributes = gdf_crashes.drop(columns=['geometry_id', 'geometry'])
df_crashes_short_segment = df_crashes_short_segment.merge(
    crash_attributes,
    on='collision_id',
    how='left',
)

In [25]:
# convert the merged DataFrame into a GeoDataFrame, using the 'geometry' column as its geometry
gdf_crashes_short_segment = gpd.GeoDataFrame(df_crashes_short_segment, geometry='geometry') 

In [26]:
# set the projection system to EPSG:4326
gdf_crashes_short_segment.crs = from_epsg(4326)

  return _prepare_from_string(" ".join(pjargs))


In [51]:
# preview the first three short segment crashes
gdf_crashes_short_segment.head(3)

Unnamed: 0,collision_id,nearest_id,geometry,crash_date,crash_time,borough,zip_code,on_street_name,cross_street_name,off_street_name,...,contributing_factor_vehicle_1,contributing_factor_vehicle_2,contributing_factor_vehicle_3,contributing_factor_vehicle_4,contributing_factor_vehicle_5,vehicle_type_code_1,vehicle_type_code_2,vehicle_type_code_3,vehicle_type_code_4,vehicle_type_code_5
0,3530528,e95045cba4602b73447ed579683d85ae,POINT (-73.96137 40.68047),2016-10-01,03:18:00,brooklyn,11238.0,,,986 atlantic avenue,...,following too closely,unspecified,,,,TAXI,SPORT UTILITY / STATION WAGON,TAXI,SPORT UTILITY / STATION WAGON,
1,3553471,e95045cba4602b73447ed579683d85ae,POINT (-73.96137 40.68047),2016-11-02,05:54:00,,,grand avenue,,,...,unspecified,,,,,PASSENGER VEHICLE,,,,
2,3562630,e95045cba4602b73447ed579683d85ae,POINT (-73.96137 40.68047),2016-11-17,00:00:00,,,atlantic avenue,,,...,following too closely,unspecified,,,,PK,Van,,,


In [27]:
# export the short segment crashes, once as a shapefile and once as GeoJSON
gdf_crashes_short_segment.to_file('../data/cleaned_data/crash_seperated/crash_short_segment/crash_short_segment.shp',  encoding='utf-8')
gdf_crashes_short_segment.to_file('../data/cleaned_data/crash_seperated/crash_short_segment/crash_short_segment.geojson', driver='GeoJSON',  encoding='utf-8')

In [28]:
# extract the collision ids of the crashes that occurred nearby the short segments
list_crashes_short_segment = gdf_crashes_short_segment['collision_id'].tolist()

In [29]:
# keep only the (projected) crashes that were NOT assigned to a short segment
near_short_segment = gdf_crashes_2263['collision_id'].isin(list_crashes_short_segment)
gdf_crashes_long_segment_2263 = gdf_crashes_2263[~near_short_segment]

## 2. Filtering crashes which occurred nearby Sharedstreets nodes

In [30]:
# make sure that you run the 'shst_02_extract_short_segments' notebook to get 'shst_node_filtered.shp'
# import the shapefile of the node points
gdf_shst_node = gpd.read_file('../data/sharedstreets_geometry/node_filtered/shst_node_filtered.shp')

In [31]:
# create a copy of the Sharedstreets nodes that will be projected to epsg2263
gdf_shst_node_2263 = gdf_shst_node.copy()

# change the coordinate system from epsg4326 to epsg2263
# epsg2263 is based on us-ft, so distances (e.g. buffer radii) are expressed in feet.
gdf_shst_node_2263.crs = from_epsg(4326)
gdf_shst_node_2263 = gdf_shst_node_2263.to_crs(epsg=2263)  

In [32]:
# buffer each node by 30ft
# NOTE: this overwrites the geometry column in place, so re-running this cell
# buffers the already-buffered shapes a second time
gdf_shst_node_2263['geometry'] = gdf_shst_node_2263['geometry'].buffer(30)

In [33]:
# using the sjoin function, find the remaining crashes which occurred within a node buffer zone.
# NOTE(review): newer geopandas releases appear to name this argument `predicate`
# instead of `op` -- confirm against the installed version
gdf_crashes_2263_intersection = gpd.sjoin(gdf_crashes_long_segment_2263, gdf_shst_node_2263, op='within')

In [34]:
# drop unnecessary columns: keep only the crash id and the matched node id
# (note: the name gdf_crashes_intersection is reused further down for the final GeoDataFrame)
gdf_crashes_intersection = gdf_crashes_2263_intersection[['collision_id','node_id']]

In [35]:
# collapse the matches to one row per collision id, holding the list of
# node ids that the crash matched
df_crashes_intersection = (
    gdf_crashes_intersection
    .groupby('collision_id')['node_id']
    .apply(list)
    .reset_index()
)

In [36]:
# add a 'geometry' column that contains the coordinates of each crash
# (taken from gdf_crashes, not from the reprojected gdf_crashes_2263)
df_crashes_intersection = df_crashes_intersection.merge(gdf_crashes[['collision_id','geometry']], on='collision_id', how='left')

In [37]:
def most_nearest_node(x):
    '''
    Pick the Sharedstreet node that lies closest to a crash.
    The process mirrors the most_nearest_short_segment function.
    -----
    Input:
        x (pd.Series): one row of df_crashes_intersection; reads x['geometry']
                       (the crash point) and x['node_id'] (list of candidate node ids)
    -----
    Output:    
        id (string) : an id of the nearest Sharedstreet node

    NOTE(review): distance is measured in whatever coordinate units the crash
    point and node geometries carry -- confirm they share a CRS that is
    suitable for comparing distances.
    '''
    crash_point = x['geometry']
    candidate_ids = x['node_id']

    # a single candidate needs no distance comparison
    if len(candidate_ids) == 1:
        return candidate_ids[0]

    # one row per candidate node, joined to the node geometry
    candidates = pd.DataFrame(candidate_ids).rename(columns={0:'node_id'})
    nodes = gdf_shst_node[['node_id','geometry']]
    candidates = candidates.merge(nodes, on='node_id', how='left')

    # distance from the crash point to every candidate node
    candidates['distance'] = candidates['geometry'].apply(
        lambda node_point: calculate_distance(crash_point, node_point)
    )

    # smallest distance first; the top row is the nearest node
    ranked = candidates.sort_values(by='distance')
    return ranked.iloc[0]['node_id']

In [38]:
# resolve every crash (row-wise) to the id of its nearest sharedstreet node
df_crashes_intersection['nearest_id'] = df_crashes_intersection.apply(most_nearest_node, axis=1)

In [39]:
# the candidate node list and the crash point are no longer needed
df_crashes_intersection = df_crashes_intersection.drop(columns=['node_id', 'geometry'])

In [40]:
# use the geometry of the nearest node as the crash geometry;
# the node table's 'node_id' key duplicates 'nearest_id', so it is dropped afterwards
node_lookup = gdf_shst_node[['node_id','geometry']]
df_crashes_intersection = (
    df_crashes_intersection
    .merge(node_lookup, left_on='nearest_id', right_on='node_id')
    .drop(columns='node_id')
)

In [41]:
# add crash characteristics: every crash column except the matched
# segment id and the original crash geometry
crash_attributes = gdf_crashes.drop(columns=['geometry_id', 'geometry'])
df_crashes_intersection = df_crashes_intersection.merge(
    crash_attributes,
    on='collision_id',
    how='left',
)

In [42]:
# rename the nearest_id column to nearest_node_id
df_crashes_intersection = df_crashes_intersection.rename(columns={'nearest_id':'nearest_node_id'})

In [43]:
# convert the merged DataFrame into a GeoDataFrame, using the 'geometry' column as its geometry
gdf_crashes_intersection = gpd.GeoDataFrame(df_crashes_intersection, geometry='geometry') 

In [44]:
# set the projection system to EPSG:4326
gdf_crashes_intersection.crs = from_epsg(4326)

  return _prepare_from_string(" ".join(pjargs))


In [45]:
# export the intersection crashes, once as a shapefile and once as GeoJSON
gdf_crashes_intersection.to_file('../data/cleaned_data/crash_seperated/crash_intersection/crash_intersection.shp', encoding='utf-8')
gdf_crashes_intersection.to_file('../data/cleaned_data/crash_seperated/crash_intersection/crash_intersection.geojson', encoding='utf-8', driver='GeoJSON')

In [46]:
# extract the collision ids of the crashes that occurred nearby the intersections
list_intersection_crash = gdf_crashes_intersection['collision_id'].tolist()

## 3. Filtering crashes which occurred nearby Sharedstreets segments

The segment crashes are crashes that are not classified as 'short segment' crashes or 'intersection' crashes

In [47]:
# segment crashes = every crash not already classified as an
# intersection crash or a short segment crash
is_intersection = gdf_crashes['collision_id'].isin(list_intersection_crash)
is_short_segment = gdf_crashes['collision_id'].isin(list_crashes_short_segment)
gdf_crashes_segment = gdf_crashes[~(is_intersection | is_short_segment)]

In [48]:
# export the segment crashes, once as a shapefile and once as GeoJSON
gdf_crashes_segment.to_file('../data/cleaned_data/crash_seperated/crash_segment/crash_segment.shp',  encoding='utf-8')
gdf_crashes_segment.to_file('../data/cleaned_data/crash_seperated/crash_segment/crash_segment.geojson', driver='GeoJSON',  encoding='utf-8')

## Check the numbers of each dataset, to check duplicated values or errors

This checks that every crash was assigned correctly: the summed size of the three datasets should equal the number of matched crashes.

In [49]:
# total number of rows across the three crash datasets
sum(
    part.shape[0]
    for part in (gdf_crashes_segment, gdf_crashes_intersection, gdf_crashes_short_segment)
)

649881

In [50]:
# (rows, columns) of the matched crash dataset, for comparison with the total above
gdf_crashes.shape

(649881, 28)