In [None]:
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw Motor Vehicle Collisions Crashes data.
# Source: NYC OpenData Portal
# https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95
# Snapshot downloaded on 12/09/2022.
raw_csv_path = "Raw Data/Motor_Vehicle_Collisions_Crashes_Raw_20221209.csv"
df = pd.read_csv(raw_csv_path)

In [None]:
# Preview the first five rows of the raw data
df.head(n=5)

In [None]:
# List every column header of the raw data
df.columns

In [None]:
# Here are the data types for all columns:
df.dtypes

In [None]:
# Does any COLLISION_ID occur more than once?
# Displays True if at least one ID is repeated, False otherwise.
collision_id_counts = df['COLLISION_ID'].value_counts()
(collision_id_counts > 1).any()

In [None]:
# Shape of the raw data as (number of rows, number of columns)
df.shape

In [None]:
# How many cells are empty in each column?
missing_cells = df.isna()
missing_cells.sum()

# Filter out unneeded data

In [None]:
# Work on an independent (deep) copy so the raw dataframe stays untouched
df1 = df.copy(deep=True)

In [None]:
# Location filter, case 1: keep only rows where both LATITUDE and LONGITUDE are filled in
has_latitude = df1['LATITUDE'].notnull()
has_longitude = df1['LONGITUDE'].notnull()
df2 = df1[has_latitude & has_longitude]
print("number of dropped data =", len(df1) - len(df2))
df2.head()

In [None]:
# Location filter, case 2: drop rows where LATITUDE or LONGITUDE is zero
# (a row is kept only when both coordinates are non-zero)
latitude_is_zero = df2['LATITUDE'] == 0
longitude_is_zero = df2['LONGITUDE'] == 0
df3 = df2[~(latitude_is_zero | longitude_is_zero)]
print("number of dropped data =", len(df2) - len(df3))
df3.head()

In [None]:
# Keep only the crashes in which at least one cyclist was injured or killed
has_cyclist_injury = df3['NUMBER OF CYCLIST INJURED'] > 0
has_cyclist_death = df3['NUMBER OF CYCLIST KILLED'] > 0
df4 = df3[has_cyclist_injury | has_cyclist_death]
print("number of dropped data =", len(df3) - len(df4))
df4.head()

In [None]:
# Keep only the crashes dated before 2022/01/01, should get 41088 --> 41002 entries
# df4 was produced by filtering df3, so writing a column into it in place
# triggers pandas' SettingWithCopyWarning. `assign` instead builds a new frame
# whose 'CRASH DATE' column holds datetime.date objects, and df4 is rebound to it.
df4 = df4.assign(**{'CRASH DATE': pd.to_datetime(df4['CRASH DATE']).dt.date})
df5 = df4[(df4['CRASH DATE']<datetime.date(2022,1,1))]

In [None]:
# Report how many rows the 2022 cut-off removed and how many remain
rows_before_cutoff = len(df5)
print("number of dropped data =", len(df4) - rows_before_cutoff)
print("number of current data =", rows_before_cutoff)

In [None]:
# Keep only the crashes dated 2017/01/01 or later, get 23455 entries
first_kept_day = datetime.date(2017,1,1)
df6 = df5[df5['CRASH DATE'] >= first_kept_day]

In [None]:
# Report how many rows the 2017 cut-off removed and how many remain
rows_from_2017 = len(df6)
print("number of dropped data =", len(df5) - rows_from_2017)
print("number of current data =", rows_from_2017)

# Some stats

In [None]:
# Distinct values present in the cyclist injury and fatality count columns of df5
injury_values = df5['NUMBER OF CYCLIST INJURED'].unique()
fatality_values = df5['NUMBER OF CYCLIST KILLED'].unique()
print("values in the injury column =", injury_values)
print("values in the fatality column =", fatality_values)

In [None]:
# Count the raw records, then split them into records with and without a
# filled-in location (df2 holds the rows whose LATITUDE and LONGITUDE are not empty)
num_raw_data = len(df1)
num_data_w_location = len(df2)
num_data_wo_location = num_raw_data - num_data_w_location
# Printed in the order: raw, without location, with location
for record_count in (num_raw_data, num_data_wo_location, num_data_w_location):
    print(record_count)

In [None]:
# Among the records without a location, count those that involve a cyclist
# (injured or killed) and those that do not.
# "Without a location" follows the definition behind num_data_wo_location:
# LATITUDE or LONGITUDE is empty in df1.
# NOTE(review): this used to be df3.shape[0], which is the number of rows that
# DO have a usable location and contradicts the variable names. Confirm that
# the breakdown below is the intended one.
wo_location = df1[df1['LATITUDE'].isnull() | df1['LONGITUDE'].isnull()]
wo_location_cyclist = (wo_location['NUMBER OF CYCLIST INJURED']>0) | (wo_location['NUMBER OF CYCLIST KILLED']>0)
num_data_wo_location_only_cyclist = int(wo_location_cyclist.sum())
num_data_wo_location_not_cyclist = num_data_wo_location - num_data_wo_location_only_cyclist
print(num_raw_data)
print(num_data_wo_location)
print(num_data_w_location)
print(num_data_wo_location_not_cyclist)
print(num_data_wo_location_only_cyclist)

In [None]:
# Breakdown of the cyclist crashes in df4 by outcome. Printed in this order:
#   crashes with a cyclist injured, crashes with no cyclist injured,
#   crashes with a cyclist killed, crashes with no cyclist killed,
#   crashes with a cyclist injured AND a cyclist killed.
injured_count = df4['NUMBER OF CYCLIST INJURED']
killed_count = df4['NUMBER OF CYCLIST KILLED']
outcome_masks = (
    injured_count > 0,
    injured_count == 0,
    killed_count > 0,
    killed_count == 0,
    (injured_count > 0) & (killed_count > 0),
)
for outcome_mask in outcome_masks:
    print(len(df4[outcome_mask]))

# Plot injury and fatality trends

In [None]:
# Years covered by the trend computations: 2015 through 2021 inclusive
years = [*range(2015, 2022)]

In [None]:
# Compute cyclist injury and fatality number for each year.
# NOTE: each number is a count of crash records in which at least one cyclist
# was injured (or killed), not a count of people.
# The crash dates are parsed once up front: the parse does not depend on the
# year being counted, so repeating it inside the loop only cost time.
crash_year = pd.to_datetime(df1['CRASH DATE']).dt.year
cyclist_injured = df1['NUMBER OF CYCLIST INJURED'] > 0
cyclist_killed = df1['NUMBER OF CYCLIST KILLED'] > 0

cyclist_injuries = [int(((crash_year == year) & cyclist_injured).sum()) for year in years]
cyclist_fatalities = [int(((crash_year == year) & cyclist_killed).sum()) for year in years]

print(cyclist_injuries)
print(cyclist_fatalities)

In [None]:
# Compute all crash-related injury and fatality number for each year.
# NOTE: each number is a count of crash records in which at least one person
# was injured (or killed), not a count of people.
# The crash dates are parsed once up front: the parse does not depend on the
# year being counted, so repeating it inside the loop only cost time.
crash_year = pd.to_datetime(df1['CRASH DATE']).dt.year
person_injured = df1['NUMBER OF PERSONS INJURED'] > 0
person_killed = df1['NUMBER OF PERSONS KILLED'] > 0

all_injuries = [int(((crash_year == year) & person_injured).sum()) for year in years]
all_fatalities = [int(((crash_year == year) & person_killed).sum()) for year in years]

print(all_injuries)
print(all_fatalities)

In [None]:
# Per-year share of cyclist crashes among all crashes, for injuries and for fatalities
injury_ratio = []
for cyclist_count, total_count in zip(cyclist_injuries, all_injuries):
    injury_ratio.append(cyclist_count / total_count)

fatality_ratio = []
for cyclist_count, total_count in zip(cyclist_fatalities, all_fatalities):
    fatality_ratio.append(cyclist_count / total_count)

print(injury_ratio)
print(fatality_ratio)

In [None]:
# Cyclist injury trend (the first entry of `years` is left out of the plot)
x = years[1:]
y = cyclist_injuries[1:]

fig, ax = plt.subplots()
ax.plot(x, y, label = "# of cyclist injuries in NYC")

# Axis labels, title and legend
ax.set_xlabel('Year')
ax.set_ylabel('Number of People')
ax.set_title('Cyclist Injury Trend')
ax.legend()

# One tick per year
ax.set_xticks(np.arange(min(x), max(x)+1, 1))

# Write the figure to disk
fig.savefig('cyclist_injury_trend.jpg', dpi=300, bbox_inches="tight")

# Render the figure
plt.show()

In [None]:
# Cyclist fatality trend (the first entry of `years` is left out of the plot)
x = years[1:]
y = cyclist_fatalities[1:]

fig, ax = plt.subplots()
ax.plot(x, y, label = "# of cyclist fatalities in NYC")

# Axis labels, title and legend
ax.set_xlabel('Year')
ax.set_ylabel('Number of People')
ax.set_title('Cyclist Fatality Trend')
ax.legend(loc='upper left')

# One tick per year
ax.set_xticks(np.arange(min(x), max(x)+1, 1))

# Write the figure to disk
fig.savefig('cyclist_fatality_trend.jpg', dpi=300, bbox_inches="tight")

# Render the figure
plt.show()

In [None]:
# Plot cyclist injury ratio trend (the first entry of `years` is left out)
x = years[1:]
# injury_ratio holds plain fractions (cyclist count / all-crash count). The y
# axis is labelled 'Percentage', so scale by 100 to make the plotted values
# match the label.
y = [100 * ratio for ratio in injury_ratio[1:]]

fig, ax = plt.subplots()
ax.plot(x, y, label = "Ratio of cyclist injury to all crash injury")

# Label
ax.set_xlabel('Year')
ax.set_ylabel('Percentage')
ax.set_title('Cyclist Injury Ratio Trend')
ax.legend(loc='upper left')

# Set the ticks: one per year
ax.set_xticks(np.arange(min(x), max(x)+1, 1))

# Save the plot
fig.savefig('cyclist_injury_ratio_trend.jpg', dpi=300, bbox_inches="tight")

# Show the plot
plt.show()

In [None]:
# Plot cyclist fatality ratio trend (the first entry of `years` is left out)
x = years[1:]
# fatality_ratio holds plain fractions (cyclist count / all-crash count). The y
# axis is labelled 'Percentage', so scale by 100 to make the plotted values
# match the label.
y = [100 * ratio for ratio in fatality_ratio[1:]]

fig, ax = plt.subplots()
ax.plot(x, y, label = "Ratio of cyclist fatality to all crash fatality")

# Label
ax.set_xlabel('Year')
ax.set_ylabel('Percentage')
ax.set_title('Cyclist Fatality Ratio Trend')
ax.legend(loc='upper left')

# Set the ticks: one per year
ax.set_xticks(np.arange(min(x), max(x)+1, 1))

# Save the plot
fig.savefig('cyclist_fatality_ratio_trend.jpg', dpi=300, bbox_inches="tight")

# Show the plot
plt.show()

# Export data

In [None]:
# Write the fully filtered dataframe (df6) to a CSV file, leaving out the row index
cleaned_csv_path = 'Motor_Vehicle_Collisions_Crashes_Cleaned_20221222.csv'
df6.to_csv(cleaned_csv_path, index=False)