In [354]:
import pandas as pd
import numpy as np

import json
import requests
from config import google_api_key

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas deprecation / chained-assignment warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [355]:
# Reading CSV file: load the full NYPD motor-vehicle collisions export.
# low_memory=False makes pandas parse the file in a single pass so each
# column's dtype is inferred from the whole column instead of chunk by chunk.
raw_data = pd.read_csv('Resources/NYPD_Motor_Vehicle_Collisions.csv', low_memory=False)

In [356]:
# Randomly selecting a small subset of the dataset (frac=0.0001, i.e. 0.01%)
# for testing purposes.
# random_state pins the draw so that Restart Kernel -> Run All selects the
# same rows every time; without it every run produces a different sample.
test_data = raw_data.sample(frac=0.0001, random_state=42)

In [357]:
# (rows, columns) of the sampled frame
test_data.shape

(152, 29)

In [358]:
# Preview the first rows of the sample
test_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
534053,02/12/2017,4:15,QUEENS,11106.0,40.75855,-73.924866,"(40.75855, -73.924866)",34 AVENUE,34 STREET,,...,Failure to Yield Right-of-Way,,,,3613857,SPORT UTILITY / STATION WAGON,SPORT UTILITY / STATION WAGON,,,
1305406,07/18/2013,12:05,BROOKLYN,11209.0,40.617896,-74.029302,"(40.6178962, -74.0293024)",92 STREET,5 AVENUE,,...,Unspecified,,,,143196,PASSENGER VEHICLE,SPORT UTILITY / STATION WAGON,,,
853693,09/16/2015,9:00,MANHATTAN,10016.0,40.745682,-73.972125,"(40.7456822, -73.9721247)",EAST 37 STREET,1 AVENUE,,...,Other Vehicular,,,,3297366,SPORT UTILITY / STATION WAGON,UNKNOWN,,,
1405925,01/19/2013,13:15,,,,,,EAST 135 STREET,LINCOLN AVENUE,,...,Lost Consciousness,,,,73713,PASSENGER VEHICLE,SPORT UTILITY / STATION WAGON,,,
187272,08/11/2018,22:51,BROOKLYN,11219.0,40.62735,-74.009964,"(40.62735, -74.009964)",,,1014 BAY RIDGE AVENUE,...,Unspecified,,,,3959547,Sedan,Sedan,,,


In [359]:
# Report the sample's dimensions in a sentence; shape is a (rows, columns) tuple
r_c = test_data.shape
print(f'The dataset contains {r_c[0]} rows and {r_c[1]} columns')

The dataset contains 152 rows and 29 columns


In [360]:
# List every column name available in the sample
test_data.columns

Index(['DATE', 'TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE',
       'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME',
       'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
       'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
       'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
       'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
       'CONTRIBUTING FACTOR VEHICLE 1', 'CONTRIBUTING FACTOR VEHICLE 2',
       'CONTRIBUTING FACTOR VEHICLE 3', 'CONTRIBUTING FACTOR VEHICLE 4',
       'CONTRIBUTING FACTOR VEHICLE 5', 'UNIQUE KEY', 'VEHICLE TYPE CODE 1',
       'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4',
       'VEHICLE TYPE CODE 5'],
      dtype='object')

### For our analysis, we are interested in the following columns:

- 'DATE': Create a timeline of collisions
- 'TIME': Identify most dangerous and safest times of day for cyclists
- 'BOROUGH': Identify most dangerous and safest borough for cyclists
- 'ZIP CODE': Identify most dangerous and safest zip code for cyclists
        
**Q. Have any of these parameters changed over time?**
        
- 'LATITUDE', 'LONGITUDE' and 'LOCATION': Visualize data and calculate distance of bike lane segments
 
- 'ON STREET NAME', 'CROSS STREET NAME', 'OFF STREET NAME': Find missing zip codes and borough information

- 'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
- 'NUMBER OF PEDESTRIANS INJURED', 'NUMBER OF PEDESTRIANS KILLED',
- 'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED'
- 'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED'

- 'CONTRIBUTING FACTOR VEHICLE 1': Determine most common cause of collisions (e.g. alcohol, speeding)

- 'UNIQUE KEY': Incident report (unique for each collision)

In [361]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
test_data.describe()

Unnamed: 0,LATITUDE,LONGITUDE,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,UNIQUE KEY
count,128.0,128.0,152.0,152.0,152.0,152.0,152.0,152.0,152.0,152.0,152.0
mean,40.724918,-73.928958,0.184211,0.0,0.039474,0.0,0.013158,0.0,0.131579,0.0,2586023.0
std,0.079048,0.080033,0.437044,0.0,0.195363,0.0,0.114327,0.0,0.393393,0.0,1603292.0
min,40.58096,-74.16427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5946.0
25%,40.6655,-73.981622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,303444.2
50%,40.719019,-73.933546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3417598.0
75%,40.771204,-73.888931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3792520.0
max,40.900574,-73.706889,2.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,4149990.0


In [362]:
# Dropping columns we're not interested in.
# The result is assigned back instead of using inplace=True, and
# errors='ignore' lets the cell be re-run after the columns are already gone
# (the in-place form raised KeyError on a second execution).
# NOTE(review): the column overview in the markdown above lists
# 'OFF STREET NAME' as a column of interest, yet it is dropped here —
# confirm which is intended.
test_data = test_data.drop(
    columns=[
        'OFF STREET NAME',
        'CONTRIBUTING FACTOR VEHICLE 2',
        'CONTRIBUTING FACTOR VEHICLE 3',
        'CONTRIBUTING FACTOR VEHICLE 4',
        'CONTRIBUTING FACTOR VEHICLE 5',
        'VEHICLE TYPE CODE 2',
        'VEHICLE TYPE CODE 3',
        'VEHICLE TYPE CODE 4',
        'VEHICLE TYPE CODE 5',
    ],
    errors='ignore',
)

In [363]:
# Inspect the first rows again to see which columns remain
test_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,UNIQUE KEY,VEHICLE TYPE CODE 1
534053,02/12/2017,4:15,QUEENS,11106.0,40.75855,-73.924866,"(40.75855, -73.924866)",34 AVENUE,34 STREET,0.0,0.0,0,0,0,0,0,0,Unsafe Speed,3613857,SPORT UTILITY / STATION WAGON
1305406,07/18/2013,12:05,BROOKLYN,11209.0,40.617896,-74.029302,"(40.6178962, -74.0293024)",92 STREET,5 AVENUE,0.0,0.0,0,0,0,0,0,0,Unspecified,143196,PASSENGER VEHICLE
853693,09/16/2015,9:00,MANHATTAN,10016.0,40.745682,-73.972125,"(40.7456822, -73.9721247)",EAST 37 STREET,1 AVENUE,0.0,0.0,0,0,0,0,0,0,Other Vehicular,3297366,SPORT UTILITY / STATION WAGON
1405925,01/19/2013,13:15,,,,,,EAST 135 STREET,LINCOLN AVENUE,0.0,0.0,0,0,0,0,0,0,Other Vehicular,73713,PASSENGER VEHICLE
187272,08/11/2018,22:51,BROOKLYN,11219.0,40.62735,-74.009964,"(40.62735, -74.009964)",,,0.0,0.0,0,0,0,0,0,0,Driver Inattention/Distraction,3959547,Sedan


In [364]:
# Strip the surrounding round brackets from the "(lat, lon)" text in LOCATION;
# rows where LOCATION is missing stay missing.
test_data = test_data.assign(
    LOCATION=test_data['LOCATION'].str.strip('()')
)
test_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,UNIQUE KEY,VEHICLE TYPE CODE 1
534053,02/12/2017,4:15,QUEENS,11106.0,40.75855,-73.924866,"40.75855, -73.924866",34 AVENUE,34 STREET,0.0,0.0,0,0,0,0,0,0,Unsafe Speed,3613857,SPORT UTILITY / STATION WAGON
1305406,07/18/2013,12:05,BROOKLYN,11209.0,40.617896,-74.029302,"40.6178962, -74.0293024",92 STREET,5 AVENUE,0.0,0.0,0,0,0,0,0,0,Unspecified,143196,PASSENGER VEHICLE
853693,09/16/2015,9:00,MANHATTAN,10016.0,40.745682,-73.972125,"40.7456822, -73.9721247",EAST 37 STREET,1 AVENUE,0.0,0.0,0,0,0,0,0,0,Other Vehicular,3297366,SPORT UTILITY / STATION WAGON
1405925,01/19/2013,13:15,,,,,,EAST 135 STREET,LINCOLN AVENUE,0.0,0.0,0,0,0,0,0,0,Other Vehicular,73713,PASSENGER VEHICLE
187272,08/11/2018,22:51,BROOKLYN,11219.0,40.62735,-74.009964,"40.62735, -74.009964",,,0.0,0.0,0,0,0,0,0,0,Driver Inattention/Distraction,3959547,Sedan


In [365]:
# Converting 'DATE' column to datetime objects.
# The source stores dates as MM/DD/YYYY (e.g. 07/18/2013); giving the format
# explicitly avoids per-value inference, rules out any day/month ambiguity and
# makes a malformed date raise instead of being guessed at.
test_data['DATE'] = pd.to_datetime(test_data['DATE'], format='%m/%d/%Y')

In [366]:
# Inspect the first rows to check the converted DATE column
test_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,NUMBER OF PERSONS INJURED,NUMBER OF PERSONS KILLED,NUMBER OF PEDESTRIANS INJURED,NUMBER OF PEDESTRIANS KILLED,NUMBER OF CYCLIST INJURED,NUMBER OF CYCLIST KILLED,NUMBER OF MOTORIST INJURED,NUMBER OF MOTORIST KILLED,CONTRIBUTING FACTOR VEHICLE 1,UNIQUE KEY,VEHICLE TYPE CODE 1
534053,2017-02-12,4:15,QUEENS,11106.0,40.75855,-73.924866,"40.75855, -73.924866",34 AVENUE,34 STREET,0.0,0.0,0,0,0,0,0,0,Unsafe Speed,3613857,SPORT UTILITY / STATION WAGON
1305406,2013-07-18,12:05,BROOKLYN,11209.0,40.617896,-74.029302,"40.6178962, -74.0293024",92 STREET,5 AVENUE,0.0,0.0,0,0,0,0,0,0,Unspecified,143196,PASSENGER VEHICLE
853693,2015-09-16,9:00,MANHATTAN,10016.0,40.745682,-73.972125,"40.7456822, -73.9721247",EAST 37 STREET,1 AVENUE,0.0,0.0,0,0,0,0,0,0,Other Vehicular,3297366,SPORT UTILITY / STATION WAGON
1405925,2013-01-19,13:15,,,,,,EAST 135 STREET,LINCOLN AVENUE,0.0,0.0,0,0,0,0,0,0,Other Vehicular,73713,PASSENGER VEHICLE
187272,2018-08-11,22:51,BROOKLYN,11219.0,40.62735,-74.009964,"40.62735, -74.009964",,,0.0,0.0,0,0,0,0,0,0,Driver Inattention/Distraction,3959547,Sedan


In [367]:
# Earliest and latest collision dates present in the sample
min_date, max_date = test_data['DATE'].agg(['min', 'max'])
print(f'Dataset ranges from {min_date} to {max_date}')

Dataset ranges from 2012-07-01 00:00:00 to 2019-06-10 00:00:00


In [368]:
# For missing values in NUMBER OF PERSONS INJURED and NUMBER OF PERSONS KILLED, we will replace with 0.
# The filled frame is assigned back rather than calling
# test_data[col].fillna(0, inplace=True): that form mutates a temporary column
# selection and, under pandas Copy-on-Write, leaves test_data unchanged.
test_data = test_data.fillna({
    'NUMBER OF PERSONS INJURED': 0,
    'NUMBER OF PERSONS KILLED': 0,
})

In [369]:
# Count the missing values remaining in each column
test_data.isna().sum()

DATE                              0
TIME                              0
BOROUGH                          44
ZIP CODE                         44
LATITUDE                         24
LONGITUDE                        24
LOCATION                         24
ON STREET NAME                   33
CROSS STREET NAME                54
NUMBER OF PERSONS INJURED         0
NUMBER OF PERSONS KILLED          0
NUMBER OF PEDESTRIANS INJURED     0
NUMBER OF PEDESTRIANS KILLED      0
NUMBER OF CYCLIST INJURED         0
NUMBER OF CYCLIST KILLED          0
NUMBER OF MOTORIST INJURED        0
NUMBER OF MOTORIST KILLED         0
CONTRIBUTING FACTOR VEHICLE 1     1
UNIQUE KEY                        0
VEHICLE TYPE CODE 1               1
dtype: int64

In [370]:
# Exporting the cleaned sample to CSV; index=False leaves the DataFrame's
# row index out of the written file.
test_data.to_csv('Resources/test_data.csv', index=False)