# 2. Data Augmentation

In [1]:
import numpy as np
import pandas as pd

In [2]:
raw_data = pd.read_csv('Resources/NYPD_Motor_Vehicle_Collisions.csv', low_memory=False)

In [95]:
raw_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,06/14/2019,0:00,BRONX,10461.0,40.836327,-73.827614,"(40.836327, -73.827614)",CROSBY AVENUE,BAISLEY AVENUE,,...,Unspecified,,,,4152765,Station Wagon/Sport Utility Vehicle,Sedan,,,
1,06/14/2019,0:00,BROOKLYN,11207.0,40.685127,-73.90643,"(40.685127, -73.90643)",PILLING STREET,EVERGREEN AVENUE,,...,Unspecified,,,,4152260,Pick-up Truck,Station Wagon/Sport Utility Vehicle,,,
2,06/14/2019,0:00,BROOKLYN,11212.0,40.661655,-73.92795,"(40.661655, -73.92795)",,,950 RUTLAND ROAD,...,,,,,4151711,Sedan,,,,
3,06/14/2019,0:00,QUEENS,11413.0,40.675213,-73.7378,"(40.675213, -73.7378)",MERRICK BOULEVARD,232 STREET,,...,Unspecified,,,,4151632,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
4,06/14/2019,0:00,,,40.88742,-73.89449,"(40.88742, -73.89449)",MAJOR DEEGAN EXPRESSWAY,,,...,Unspecified,,,,4151943,Station Wagon/Sport Utility Vehicle,Sedan,,,


In [96]:
# Filtering for duplicate rows grouped by location
duplicated_rows = pd.concat(g for _, g in raw_data.groupby('LOCATION') if len(g) > 1)

In [97]:
duplicated_rows.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
5549,06/06/2019,7:47,,,0.0,0.0,"(0.0, 0.0)",NARROWS ROAD NORTH,CLOVE ROAD,,...,Unspecified,,,,4146169,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
7427,01/30/2019,14:05,QUEENS,11366.0,0.0,0.0,"(0.0, 0.0)",UNION TURNPIKE,UTOPIA PARKWAY,,...,,,,,4072615,,,,,
8619,05/31/2019,0:00,QUEENS,11412.0,0.0,0.0,"(0.0, 0.0)",FARMERS BOULEVARD,LINDEN BOULEVARD,,...,Unspecified,,,,4142454,Pick-up Truck,Sedan,,,
9207,05/31/2019,4:30,QUEENS,11367.0,0.0,0.0,"(0.0, 0.0)",MELBOURNE AVENUE,150 STREET,,...,,,,,4144366,Station Wagon/Sport Utility Vehicle,,,,
9209,05/31/2019,4:32,BRONX,10452.0,0.0,0.0,"(0.0, 0.0)",EAST 167 STREET,GERARD AVENUE,,...,Unspecified,,,,4146681,Sedan,Sedan,,,


In [98]:
# Rows with location value '(0.0, 0.0)' cannot be used as they include various zip codes and boroughs
# We'll drop them from the duplicates
duplicated_rows_w_locations = duplicated_rows[duplicated_rows['LOCATION'] != '(0.0, 0.0)']

In [99]:
print(duplicated_rows.shape)
print(duplicated_rows_w_locations.shape)

(1223159, 29)
(1222154, 29)


In [100]:
duplicated_rows_w_locations.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
1135872,05/19/2014,17:20,STATEN ISLAND,10307,40.498949,-74.244365,"(40.4989488, -74.2443651)",SWINNERTON STREET,BILLOP AVENUE,,...,Unspecified,,,,341158,PASSENGER VEHICLE,UNKNOWN,,,
1446398,11/05/2012,11:15,STATEN ISLAND,10307,40.498949,-74.244365,"(40.4989488, -74.2443651)",SWINNERTON STREET,BILLOP AVENUE,,...,Unspecified,,,,301815,PASSENGER VEHICLE,UNKNOWN,,,
1157852,04/10/2014,17:20,STATEN ISLAND,10307,40.499842,-74.239917,"(40.499842, -74.2399169)",YETMAN AVENUE,BILLOP AVENUE,,...,Unspecified,,,,319011,PASSENGER VEHICLE,PASSENGER VEHICLE,,,
1317834,06/26/2013,10:10,STATEN ISLAND,10307,40.499842,-74.239917,"(40.499842, -74.2399169)",YETMAN AVENUE,BILLOP AVENUE,,...,Unspecified,,,,302914,PASSENGER VEHICLE,PASSENGER VEHICLE,,,
277629,03/25/2018,17:30,STATEN ISLAND,10307,40.500023,-74.23902,"(40.500023, -74.23902)",ROCKAWAY STREET,BILLOP AVENUE,,...,Unspecified,,,,3869029,PASSENGER VEHICLE,PASSENGER VEHICLE,,,


In [101]:
# Sum of missing zips before fillna
duplicated_rows_w_locations['ZIP CODE'].isna().sum()

277508

In [102]:
# Creating a copy of duplicated_rows_w_locations to augment
aug_duplicated_rows_w_locations = duplicated_rows_w_locations.copy()
aug_duplicated_rows_w_locations.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
1135872,05/19/2014,17:20,STATEN ISLAND,10307,40.498949,-74.244365,"(40.4989488, -74.2443651)",SWINNERTON STREET,BILLOP AVENUE,,...,Unspecified,,,,341158,PASSENGER VEHICLE,UNKNOWN,,,
1446398,11/05/2012,11:15,STATEN ISLAND,10307,40.498949,-74.244365,"(40.4989488, -74.2443651)",SWINNERTON STREET,BILLOP AVENUE,,...,Unspecified,,,,301815,PASSENGER VEHICLE,UNKNOWN,,,
1157852,04/10/2014,17:20,STATEN ISLAND,10307,40.499842,-74.239917,"(40.499842, -74.2399169)",YETMAN AVENUE,BILLOP AVENUE,,...,Unspecified,,,,319011,PASSENGER VEHICLE,PASSENGER VEHICLE,,,
1317834,06/26/2013,10:10,STATEN ISLAND,10307,40.499842,-74.239917,"(40.499842, -74.2399169)",YETMAN AVENUE,BILLOP AVENUE,,...,Unspecified,,,,302914,PASSENGER VEHICLE,PASSENGER VEHICLE,,,
277629,03/25/2018,17:30,STATEN ISLAND,10307,40.500023,-74.23902,"(40.500023, -74.23902)",ROCKAWAY STREET,BILLOP AVENUE,,...,Unspecified,,,,3869029,PASSENGER VEHICLE,PASSENGER VEHICLE,,,


In [103]:
# Filling NA values with previous value
aug_duplicated_rows_w_locations['ZIP CODE'].fillna(method='ffill', inplace=True)

In [104]:
# Sum of missing zips after fillna
aug_duplicated_rows_w_locations['ZIP CODE'].isna().sum()

0

In [105]:
# Sum of missing boroughs before fillna
duplicated_rows_w_locations['BOROUGH'].isna().sum()

277360

In [106]:
aug_duplicated_rows_w_locations['BOROUGH'].fillna(method='ffill', inplace=True)

In [107]:
# Sum of missing boroughs after fillna
aug_duplicated_rows_w_locations['BOROUGH'].isna().sum()

0

In [108]:
aug_duplicated_rows_w_locations.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
1135872,05/19/2014,17:20,STATEN ISLAND,10307,40.498949,-74.244365,"(40.4989488, -74.2443651)",SWINNERTON STREET,BILLOP AVENUE,,...,Unspecified,,,,341158,PASSENGER VEHICLE,UNKNOWN,,,
1446398,11/05/2012,11:15,STATEN ISLAND,10307,40.498949,-74.244365,"(40.4989488, -74.2443651)",SWINNERTON STREET,BILLOP AVENUE,,...,Unspecified,,,,301815,PASSENGER VEHICLE,UNKNOWN,,,
1157852,04/10/2014,17:20,STATEN ISLAND,10307,40.499842,-74.239917,"(40.499842, -74.2399169)",YETMAN AVENUE,BILLOP AVENUE,,...,Unspecified,,,,319011,PASSENGER VEHICLE,PASSENGER VEHICLE,,,
1317834,06/26/2013,10:10,STATEN ISLAND,10307,40.499842,-74.239917,"(40.499842, -74.2399169)",YETMAN AVENUE,BILLOP AVENUE,,...,Unspecified,,,,302914,PASSENGER VEHICLE,PASSENGER VEHICLE,,,
277629,03/25/2018,17:30,STATEN ISLAND,10307,40.500023,-74.23902,"(40.500023, -74.23902)",ROCKAWAY STREET,BILLOP AVENUE,,...,Unspecified,,,,3869029,PASSENGER VEHICLE,PASSENGER VEHICLE,,,


### Using augmented dataframe to supplement zip codes and borough information

In [109]:
# Sum of missing zip in raw data before
missing_zips = raw_data['ZIP CODE'].isna().sum()
missing_borough = raw_data['BOROUGH'].isna().sum()
print(f'Missing zip codes from raw data: {missing_zips}')
print(f'Missing boroughs from raw data: {missing_borough}')

Missing zip codes from raw data: 455394
Missing boroughs from raw data: 455214


In [110]:
augmented_raw_data = raw_data.copy()
augmented_raw_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,06/14/2019,0:00,BRONX,10461.0,40.836327,-73.827614,"(40.836327, -73.827614)",CROSBY AVENUE,BAISLEY AVENUE,,...,Unspecified,,,,4152765,Station Wagon/Sport Utility Vehicle,Sedan,,,
1,06/14/2019,0:00,BROOKLYN,11207.0,40.685127,-73.90643,"(40.685127, -73.90643)",PILLING STREET,EVERGREEN AVENUE,,...,Unspecified,,,,4152260,Pick-up Truck,Station Wagon/Sport Utility Vehicle,,,
2,06/14/2019,0:00,BROOKLYN,11212.0,40.661655,-73.92795,"(40.661655, -73.92795)",,,950 RUTLAND ROAD,...,,,,,4151711,Sedan,,,,
3,06/14/2019,0:00,QUEENS,11413.0,40.675213,-73.7378,"(40.675213, -73.7378)",MERRICK BOULEVARD,232 STREET,,...,Unspecified,,,,4151632,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
4,06/14/2019,0:00,,,40.88742,-73.89449,"(40.88742, -73.89449)",MAJOR DEEGAN EXPRESSWAY,,,...,Unspecified,,,,4151943,Station Wagon/Sport Utility Vehicle,Sedan,,,


In [111]:
augmented_raw_data['ZIP CODE'].fillna(aug_duplicated_rows_w_locations['ZIP CODE'], inplace=True)

In [112]:
augmented_raw_data['BOROUGH'].fillna(aug_duplicated_rows_w_locations['BOROUGH'], inplace=True)

In [113]:
augmented_raw_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,06/14/2019,0:00,BRONX,10461,40.836327,-73.827614,"(40.836327, -73.827614)",CROSBY AVENUE,BAISLEY AVENUE,,...,Unspecified,,,,4152765,Station Wagon/Sport Utility Vehicle,Sedan,,,
1,06/14/2019,0:00,BROOKLYN,11207,40.685127,-73.90643,"(40.685127, -73.90643)",PILLING STREET,EVERGREEN AVENUE,,...,Unspecified,,,,4152260,Pick-up Truck,Station Wagon/Sport Utility Vehicle,,,
2,06/14/2019,0:00,BROOKLYN,11212,40.661655,-73.92795,"(40.661655, -73.92795)",,,950 RUTLAND ROAD,...,,,,,4151711,Sedan,,,,
3,06/14/2019,0:00,QUEENS,11413,40.675213,-73.7378,"(40.675213, -73.7378)",MERRICK BOULEVARD,232 STREET,,...,Unspecified,,,,4151632,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
4,06/14/2019,0:00,BRONX,10466,40.88742,-73.89449,"(40.88742, -73.89449)",MAJOR DEEGAN EXPRESSWAY,,,...,Unspecified,,,,4151943,Station Wagon/Sport Utility Vehicle,Sedan,,,


In [122]:
# Sum of missing zip codes and boroughs in augmented data after
missing_zips_after = augmented_raw_data['ZIP CODE'].isna().sum()
missing_borough_after = augmented_raw_data['BOROUGH'].isna().sum()

missing_zips_before = raw_data['ZIP CODE'].isna().sum()
missing_borough_before = raw_data['BOROUGH'].isna().sum()

print(f'Missing zip codes from augmented raw data: {missing_zips_after}')
print(f'Missing boroughs from augmented raw data: {missing_borough_after}')

print(f'Missing zip codes from raw data: {missing_zips_before}')
print(f'Missing boroughs from raw data: {missing_borough_before}')

Missing zip codes from augmented raw data: 177886
Missing boroughs from augmented raw data: 177854
Missing zip codes from raw data: 455394
Missing boroughs from raw data: 455214


In [121]:
augmented_raw_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,06/14/2019,0:00,BRONX,10461,40.836327,-73.827614,"(40.836327, -73.827614)",CROSBY AVENUE,BAISLEY AVENUE,,...,Unspecified,,,,4152765,Station Wagon/Sport Utility Vehicle,Sedan,,,
1,06/14/2019,0:00,BROOKLYN,11207,40.685127,-73.90643,"(40.685127, -73.90643)",PILLING STREET,EVERGREEN AVENUE,,...,Unspecified,,,,4152260,Pick-up Truck,Station Wagon/Sport Utility Vehicle,,,
2,06/14/2019,0:00,BROOKLYN,11212,40.661655,-73.92795,"(40.661655, -73.92795)",,,950 RUTLAND ROAD,...,,,,,4151711,Sedan,,,,
3,06/14/2019,0:00,QUEENS,11413,40.675213,-73.7378,"(40.675213, -73.7378)",MERRICK BOULEVARD,232 STREET,,...,Unspecified,,,,4151632,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
4,06/14/2019,0:00,BRONX,10466,40.88742,-73.89449,"(40.88742, -73.89449)",MAJOR DEEGAN EXPRESSWAY,,,...,Unspecified,,,,4151943,Station Wagon/Sport Utility Vehicle,Sedan,,,


---
---

In [116]:
duplicated_rows_without_locations = duplicated_rows[duplicated_rows['LOCATION'] == '(0.0, 0.0)']

In [117]:
duplicated_rows_without_locations.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
5549,06/06/2019,7:47,,,0.0,0.0,"(0.0, 0.0)",NARROWS ROAD NORTH,CLOVE ROAD,,...,Unspecified,,,,4146169,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
7427,01/30/2019,14:05,QUEENS,11366.0,0.0,0.0,"(0.0, 0.0)",UNION TURNPIKE,UTOPIA PARKWAY,,...,,,,,4072615,,,,,
8619,05/31/2019,0:00,QUEENS,11412.0,0.0,0.0,"(0.0, 0.0)",FARMERS BOULEVARD,LINDEN BOULEVARD,,...,Unspecified,,,,4142454,Pick-up Truck,Sedan,,,
9207,05/31/2019,4:30,QUEENS,11367.0,0.0,0.0,"(0.0, 0.0)",MELBOURNE AVENUE,150 STREET,,...,,,,,4144366,Station Wagon/Sport Utility Vehicle,,,,
9209,05/31/2019,4:32,BRONX,10452.0,0.0,0.0,"(0.0, 0.0)",EAST 167 STREET,GERARD AVENUE,,...,Unspecified,,,,4146681,Sedan,Sedan,,,


In [118]:
# Replacing locations/lat/lng with NA
duplicated_rows_without_locations["LOCATION"].replace('(0.0, 0.0)', np.nan, inplace=True)
duplicated_rows_without_locations["LATITUDE"].replace(0.0, np.nan, inplace=True)
duplicated_rows_without_locations["LONGITUDE"].replace(0.0, np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [119]:
duplicated_rows_without_locations.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
5549,06/06/2019,7:47,,,,,,NARROWS ROAD NORTH,CLOVE ROAD,,...,Unspecified,,,,4146169,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
7427,01/30/2019,14:05,QUEENS,11366.0,,,,UNION TURNPIKE,UTOPIA PARKWAY,,...,,,,,4072615,,,,,
8619,05/31/2019,0:00,QUEENS,11412.0,,,,FARMERS BOULEVARD,LINDEN BOULEVARD,,...,Unspecified,,,,4142454,Pick-up Truck,Sedan,,,
9207,05/31/2019,4:30,QUEENS,11367.0,,,,MELBOURNE AVENUE,150 STREET,,...,,,,,4144366,Station Wagon/Sport Utility Vehicle,,,,
9209,05/31/2019,4:32,BRONX,10452.0,,,,EAST 167 STREET,GERARD AVENUE,,...,Unspecified,,,,4146681,Sedan,Sedan,,,


In [120]:
duplicated_rows_without_locations["LOCATION"].isna().sum()

1005

In [76]:
# We can use the Google API to augment these values
import requests
from config import google_api_key

In [77]:
# Reverse geolocation based on street address
#url = 'https://maps.googleapis.com/maps/api/geocode/json?'

#addresses = list(zip(duplicated_rows_without_locations['ON STREET NAME'], duplicated_rows_without_locations['CROSS STREET NAME']))

#full_addresses_2 = []
#latitude_2 = []
#longitude_2 = []

#for a in addresses:
    #query_url = f'{url}address={a}&key={google_api_key}'

    #try:
        #resp = requests.get(query_url)
        #data_2 = resp.json()
        #full_data_2.append(data_2)
    #except:
        print('Incorrectly formatted URL') # Accounts for NaN values in 'LOCATION'  
    
    #try:
        #full_addresses_2.append(data_2['results'][0]['formatted_address'])
        #latitude_2.append(data_2['results'][0]['geometry']['location']['lat'])
        #longitude_2.append(data_2['results'][0]['geometry']['location']['lng'])
        
    #except:
        print("No Data found") # Accounts for addresses not found in the Google API
        #full_addresses_2.append('NaN')
        #latitude_2.append('NaN')
        #longitude_2.append('NaN')

No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found
No Data found


In [123]:
duplicated_rows_without_locations['FULL_ADDRESS'] = full_addresses_2
duplicated_rows_without_locations.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,FULL_ADDRESS
5549,06/06/2019,7:47,,,,,,NARROWS ROAD NORTH,CLOVE ROAD,,...,,,,4146169,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,"Clove Rd & Narrows Rd N, Staten Island, NY 103..."
7427,01/30/2019,14:05,QUEENS,11366.0,,,,UNION TURNPIKE,UTOPIA PARKWAY,,...,,,,4072615,,,,,,"Union Tpke & Utopia Pkwy, Queens, NY 11366, USA"
8619,05/31/2019,0:00,QUEENS,11412.0,,,,FARMERS BOULEVARD,LINDEN BOULEVARD,,...,,,,4142454,Pick-up Truck,Sedan,,,,"Linden Blvd/Farmers Blvd, Queens, NY 11412, USA"
9207,05/31/2019,4:30,QUEENS,11367.0,,,,MELBOURNE AVENUE,150 STREET,,...,,,,4144366,Station Wagon/Sport Utility Vehicle,,,,,"Melbourne Ave & 150th St, Queens, NY 11367, USA"
9209,05/31/2019,4:32,BRONX,10452.0,,,,EAST 167 STREET,GERARD AVENUE,,...,,,,4146681,Sedan,Sedan,,,,"50 56 E 167th St, The Bronx, NY 10452, USA"


In [124]:
duplicated_rows_without_locations['LATITUDE'] = latitude_2
duplicated_rows_without_locations.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,FULL_ADDRESS
5549,06/06/2019,7:47,,,40.611,,,NARROWS ROAD NORTH,CLOVE ROAD,,...,,,,4146169,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,"Clove Rd & Narrows Rd N, Staten Island, NY 103..."
7427,01/30/2019,14:05,QUEENS,11366.0,40.7258,,,UNION TURNPIKE,UTOPIA PARKWAY,,...,,,,4072615,,,,,,"Union Tpke & Utopia Pkwy, Queens, NY 11366, USA"
8619,05/31/2019,0:00,QUEENS,11412.0,40.6917,,,FARMERS BOULEVARD,LINDEN BOULEVARD,,...,,,,4142454,Pick-up Truck,Sedan,,,,"Linden Blvd/Farmers Blvd, Queens, NY 11412, USA"
9207,05/31/2019,4:30,QUEENS,11367.0,40.7342,,,MELBOURNE AVENUE,150 STREET,,...,,,,4144366,Station Wagon/Sport Utility Vehicle,,,,,"Melbourne Ave & 150th St, Queens, NY 11367, USA"
9209,05/31/2019,4:32,BRONX,10452.0,40.8353,,,EAST 167 STREET,GERARD AVENUE,,...,,,,4146681,Sedan,Sedan,,,,"50 56 E 167th St, The Bronx, NY 10452, USA"


In [125]:
duplicated_rows_without_locations['LONGITUDE'] = longitude_2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [126]:
duplicated_rows_without_locations['LOCATION'] = list(zip(latitude_2, longitude_2))
duplicated_rows_without_locations.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,FULL_ADDRESS
5549,06/06/2019,7:47,,,40.611,-74.098,"(40.6109718, -74.0980192)",NARROWS ROAD NORTH,CLOVE ROAD,,...,,,,4146169,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,"Clove Rd & Narrows Rd N, Staten Island, NY 103..."
7427,01/30/2019,14:05,QUEENS,11366.0,40.7258,-73.7918,"(40.7258402, -73.7918254)",UNION TURNPIKE,UTOPIA PARKWAY,,...,,,,4072615,,,,,,"Union Tpke & Utopia Pkwy, Queens, NY 11366, USA"
8619,05/31/2019,0:00,QUEENS,11412.0,40.6917,-73.7619,"(40.691689, -73.761932)",FARMERS BOULEVARD,LINDEN BOULEVARD,,...,,,,4142454,Pick-up Truck,Sedan,,,,"Linden Blvd/Farmers Blvd, Queens, NY 11412, USA"
9207,05/31/2019,4:30,QUEENS,11367.0,40.7342,-73.8198,"(40.7341822, -73.8197716)",MELBOURNE AVENUE,150 STREET,,...,,,,4144366,Station Wagon/Sport Utility Vehicle,,,,,"Melbourne Ave & 150th St, Queens, NY 11367, USA"
9209,05/31/2019,4:32,BRONX,10452.0,40.8353,-73.9208,"(40.8352677, -73.92078649999999)",EAST 167 STREET,GERARD AVENUE,,...,,,,4146681,Sedan,Sedan,,,,"50 56 E 167th St, The Bronx, NY 10452, USA"


In [127]:
# Splitting FULL_ADDRESS column to indiviual values
i = duplicated_rows_without_locations.columns.get_loc('FULL_ADDRESS')
df2 = duplicated_rows_without_locations['FULL_ADDRESS'].str.split(",", n=3, expand=True)
temp_zips = pd.concat([duplicated_rows_without_locations.iloc[:, :i], df2, duplicated_rows_without_locations.iloc[:, i+1:]], axis=1)
temp_zips.drop([0, 3], axis=1, inplace=True)
temp_zips.rename(columns={1: 'FOUND BORO'}, inplace=True)
temp_zips.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,FOUND BORO,2
5549,06/06/2019,7:47,,,40.611,-74.098,"(40.6109718, -74.0980192)",NARROWS ROAD NORTH,CLOVE ROAD,,...,,,4146169,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,Staten Island,NY 10304
7427,01/30/2019,14:05,QUEENS,11366.0,40.7258,-73.7918,"(40.7258402, -73.7918254)",UNION TURNPIKE,UTOPIA PARKWAY,,...,,,4072615,,,,,,Queens,NY 11366
8619,05/31/2019,0:00,QUEENS,11412.0,40.6917,-73.7619,"(40.691689, -73.761932)",FARMERS BOULEVARD,LINDEN BOULEVARD,,...,,,4142454,Pick-up Truck,Sedan,,,,Queens,NY 11412
9207,05/31/2019,4:30,QUEENS,11367.0,40.7342,-73.8198,"(40.7341822, -73.8197716)",MELBOURNE AVENUE,150 STREET,,...,,,4144366,Station Wagon/Sport Utility Vehicle,,,,,Queens,NY 11367
9209,05/31/2019,4:32,BRONX,10452.0,40.8353,-73.9208,"(40.8352677, -73.92078649999999)",EAST 167 STREET,GERARD AVENUE,,...,,,4146681,Sedan,Sedan,,,,The Bronx,NY 10452


In [128]:
i = temp_zips.columns.get_loc(2)
df2 = temp_zips[2].str.split(" ", n=2, expand=True)
augmented_zips = pd.concat([temp_zips.iloc[:, :i], df2, temp_zips.iloc[:, i+1:]], axis=1)
augmented_zips.drop([0, 1], axis=1, inplace=True)
augmented_zips.rename(columns={2: 'FOUND ZIP'}, inplace=True)
augmented_zips.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,FOUND BORO,FOUND ZIP
5549,06/06/2019,7:47,,,40.611,-74.098,"(40.6109718, -74.0980192)",NARROWS ROAD NORTH,CLOVE ROAD,,...,,,4146169,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,Staten Island,10304
7427,01/30/2019,14:05,QUEENS,11366.0,40.7258,-73.7918,"(40.7258402, -73.7918254)",UNION TURNPIKE,UTOPIA PARKWAY,,...,,,4072615,,,,,,Queens,11366
8619,05/31/2019,0:00,QUEENS,11412.0,40.6917,-73.7619,"(40.691689, -73.761932)",FARMERS BOULEVARD,LINDEN BOULEVARD,,...,,,4142454,Pick-up Truck,Sedan,,,,Queens,11412
9207,05/31/2019,4:30,QUEENS,11367.0,40.7342,-73.8198,"(40.7341822, -73.8197716)",MELBOURNE AVENUE,150 STREET,,...,,,4144366,Station Wagon/Sport Utility Vehicle,,,,,Queens,11367
9209,05/31/2019,4:32,BRONX,10452.0,40.8353,-73.9208,"(40.8352677, -73.92078649999999)",EAST 167 STREET,GERARD AVENUE,,...,,,4146681,Sedan,Sedan,,,,The Bronx,10452


In [129]:
duplicated_rows_without_locations['ZIP CODE'] = augmented_zips['FOUND ZIP']
duplicated_rows_without_locations['BOROUGH'] = augmented_zips['FOUND BORO']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [130]:
duplicated_rows_without_locations.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5,FULL_ADDRESS
5549,06/06/2019,7:47,Staten Island,10304,40.611,-74.098,"(40.6109718, -74.0980192)",NARROWS ROAD NORTH,CLOVE ROAD,,...,,,,4146169,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,,"Clove Rd & Narrows Rd N, Staten Island, NY 103..."
7427,01/30/2019,14:05,Queens,11366,40.7258,-73.7918,"(40.7258402, -73.7918254)",UNION TURNPIKE,UTOPIA PARKWAY,,...,,,,4072615,,,,,,"Union Tpke & Utopia Pkwy, Queens, NY 11366, USA"
8619,05/31/2019,0:00,Queens,11412,40.6917,-73.7619,"(40.691689, -73.761932)",FARMERS BOULEVARD,LINDEN BOULEVARD,,...,,,,4142454,Pick-up Truck,Sedan,,,,"Linden Blvd/Farmers Blvd, Queens, NY 11412, USA"
9207,05/31/2019,4:30,Queens,11367,40.7342,-73.8198,"(40.7341822, -73.8197716)",MELBOURNE AVENUE,150 STREET,,...,,,,4144366,Station Wagon/Sport Utility Vehicle,,,,,"Melbourne Ave & 150th St, Queens, NY 11367, USA"
9209,05/31/2019,4:32,The Bronx,10452,40.8353,-73.9208,"(40.8352677, -73.92078649999999)",EAST 167 STREET,GERARD AVENUE,,...,,,,4146681,Sedan,Sedan,,,,"50 56 E 167th St, The Bronx, NY 10452, USA"


In [131]:
# Using augmented values from pass_1 to main data using date to match
values_to_update = duplicated_rows_without_locations.loc[duplicated_rows_without_locations['UNIQUE KEY'].isin(augmented_raw_data['UNIQUE KEY'])][['ZIP CODE', 'BOROUGH', 'LATITUDE', 'LONGITUDE', 'LOCATION']]
values_to_update.head()



Unnamed: 0,ZIP CODE,BOROUGH,LATITUDE,LONGITUDE,LOCATION
5549,10304,Staten Island,40.611,-74.098,"(40.6109718, -74.0980192)"
7427,11366,Queens,40.7258,-73.7918,"(40.7258402, -73.7918254)"
8619,11412,Queens,40.6917,-73.7619,"(40.691689, -73.761932)"
9207,11367,Queens,40.7342,-73.8198,"(40.7341822, -73.8197716)"
9209,10452,The Bronx,40.8353,-73.9208,"(40.8352677, -73.92078649999999)"


In [132]:
# Replacing missing values in test data with found zip codes and boroughs in pass 1
more_augmented_raw_data = augmented_raw_data.fillna(values_to_update)


In [133]:
more_augmented_raw_data.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
0,06/14/2019,0:00,BRONX,10461,40.8363,-73.8276,"(40.836327, -73.827614)",CROSBY AVENUE,BAISLEY AVENUE,,...,Unspecified,,,,4152765,Station Wagon/Sport Utility Vehicle,Sedan,,,
1,06/14/2019,0:00,BROOKLYN,11207,40.6851,-73.9064,"(40.685127, -73.90643)",PILLING STREET,EVERGREEN AVENUE,,...,Unspecified,,,,4152260,Pick-up Truck,Station Wagon/Sport Utility Vehicle,,,
2,06/14/2019,0:00,BROOKLYN,11212,40.6617,-73.9279,"(40.661655, -73.92795)",,,950 RUTLAND ROAD,...,,,,,4151711,Sedan,,,,
3,06/14/2019,0:00,QUEENS,11413,40.6752,-73.7378,"(40.675213, -73.7378)",MERRICK BOULEVARD,232 STREET,,...,Unspecified,,,,4151632,Station Wagon/Sport Utility Vehicle,Station Wagon/Sport Utility Vehicle,,,
4,06/14/2019,0:00,BRONX,10466,40.8874,-73.8945,"(40.88742, -73.89449)",MAJOR DEEGAN EXPRESSWAY,,,...,Unspecified,,,,4151943,Station Wagon/Sport Utility Vehicle,Sedan,,,


In [134]:
# Sum of missing zip codes and boroughs in augmented data after
missing_zips_after_2 = more_augmented_raw_data['ZIP CODE'].isna().sum()
missing_borough_after_2 = more_augmented_raw_data['BOROUGH'].isna().sum()

missing_zips_before = raw_data['ZIP CODE'].isna().sum()
missing_borough_before = raw_data['BOROUGH'].isna().sum()

print(f'Missing zip codes from further augmented raw data: {missing_zips_after_2}')
print(f'Missing boroughs from further augmented raw data: {missing_borough_after_2}')

print(f'Missing zip codes from augmented raw data: {missing_zips_after}')
print(f'Missing boroughs from augmented raw data: {missing_borough_after}')

print(f'Missing zip codes from raw data: {missing_zips_before}')
print(f'Missing boroughs from raw data: {missing_borough_before}')

Missing zip codes from further augmented raw data: 177774
Missing boroughs from further augmented raw data: 177667
Missing zip codes from augmented raw data: 177886
Missing boroughs from augmented raw data: 177854
Missing zip codes from raw data: 455394
Missing boroughs from raw data: 455214


In [145]:
# Saving augmented raw data
augmented_raw_data.to_csv('Resources/augmented_raw_data.csv', index=False)

---
---
---


---
---
---

In [137]:
# Analyzing remaining data
missing_zip_codes = more_augmented_raw_data[more_augmented_raw_data['ZIP CODE'].isna()]
missing_zip_codes.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
48,06/14/2019,10:33,,,,,,20 AVENUE,,,...,Unspecified,,,,4151479,Station Wagon/Sport Utility Vehicle,Sedan,,,
57,06/14/2019,10:50,,,,,,WESTCHESTER AVENUE,GRANT CIRCLE,,...,Unspecified,,,,4152639,Sedan,Refrigerated Van,,,
97,02/07/2019,2:20,,,,,,MANHATTAN BR UPPER,,,...,,,,,4076491,Sedan,,,,
114,06/14/2019,12:45,,,,,,12 AVENUE,WEST 39 STREET,,...,Passing or Lane Usage Improper,,,,4151905,Sedan,Sedan,,,
123,06/14/2019,13:00,,,,,,COLLEGE POINT BOULEVARD,WHITESTONE EXPRESSWAY,,...,Unspecified,,,,4151478,Station Wagon/Sport Utility Vehicle,Sedan,,,


In [142]:
# Filtering for records that include street data but no zip code and no location
missing_zip_codes_w_streets = more_augmented_raw_data[
                                (more_augmented_raw_data['ZIP CODE'].isna()) 
                                | (more_augmented_raw_data['LOCATION'].isna()) 
                                & (more_augmented_raw_data['ON STREET NAME'].notna())
                                & (more_augmented_raw_data['CROSS STREET NAME'].notna())
                            ]
missing_zip_codes_w_streets.head()

Unnamed: 0,DATE,TIME,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,LOCATION,ON STREET NAME,CROSS STREET NAME,OFF STREET NAME,...,CONTRIBUTING FACTOR VEHICLE 2,CONTRIBUTING FACTOR VEHICLE 3,CONTRIBUTING FACTOR VEHICLE 4,CONTRIBUTING FACTOR VEHICLE 5,UNIQUE KEY,VEHICLE TYPE CODE 1,VEHICLE TYPE CODE 2,VEHICLE TYPE CODE 3,VEHICLE TYPE CODE 4,VEHICLE TYPE CODE 5
10,06/14/2019,0:14,BROOKLYN,11218.0,,,,PROSPECT EXPRESSWAY,CHURCH AVENUE,,...,Driver Inattention/Distraction,,,,4153006,Station Wagon/Sport Utility Vehicle,Van,,,
48,06/14/2019,10:33,,,,,,20 AVENUE,,,...,Unspecified,,,,4151479,Station Wagon/Sport Utility Vehicle,Sedan,,,
57,06/14/2019,10:50,,,,,,WESTCHESTER AVENUE,GRANT CIRCLE,,...,Unspecified,,,,4152639,Sedan,Refrigerated Van,,,
97,02/07/2019,2:20,,,,,,MANHATTAN BR UPPER,,,...,,,,,4076491,Sedan,,,,
114,06/14/2019,12:45,,,,,,12 AVENUE,WEST 39 STREET,,...,Passing or Lane Usage Improper,,,,4151905,Sedan,Sedan,,,


In [143]:
missing_zip_codes_w_streets.shape

(201366, 29)