#### EDA and Visualization Final Project: Data Prep and Analysis
###### Zachary Barnes and Bing Wang

##### Housekeeping

In [260]:
# Load Python libraries
import os
import numpy as np
import pandas as pd

# Run R code adjacent to Python code
%load_ext rpy2.ipython

# Load ggplot R library
%R library(ggplot2)
%R library(scales)

# Avoid kernel death
os.environ['KMP_DUPLICATE_LIB_OK']='True'

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


##### Read in and organize data (Collisions and Victims datasets, from TIMS)

In [261]:
c = pd.read_csv("Collisions.csv")

# Keep only relevant columns. The explicit .copy() makes c an independent frame,
# so the column assignments below never write into a slice of the raw read
# (avoids pandas' SettingWithCopyWarning).
c = c[['CASE_ID','COLLISION_DATE','COLLISION_TIME','INTERSECTION','COLLISION_SEVERITY',
        'LIGHTING','LOCATION_TYPE','MVIW','TYPE_OF_COLLISION',
        'WEATHER_1','LATITUDE','LONGITUDE','PRIMARY_RD',
        'SECONDARY_RD','POINT_X','POINT_Y']].copy()

# Convert COLLISION_TIME to hour of day by dropping its last two digits
# (presumably the minutes of an HHMM integer). Integer division by 100 does this
# for the whole column at once; values below 100 become hour 0.
# NOTE(review): confirm the data dictionary has no "unknown time" sentinel
# (e.g. a value >= 2400), which would come out as an hour above 23.
c['COLLISION_TIME'] = c['COLLISION_TIME'] // 100

# Add quarter column (the date column is parsed in a single vectorized call)
c['QUARTER'] = pd.to_datetime(c['COLLISION_DATE']).dt.quarter
c.sample(5)

Unnamed: 0,CASE_ID,COLLISION_DATE,COLLISION_TIME,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,QUARTER
2872,8594583,2018-03-02,3,Y,4,C,,B,G,A,,,6TH ST,STEVENSON ST,-122.40967,37.78177,1
1374,8708224,2018-08-17,13,Y,3,A,,B,G,A,,,SUTTER ST,BUCHANAN ST,-122.42998,37.78651,3
10,90735081,2018-05-19,14,N,3,A,,C,C,A,37.76509,-122.40489,US-101 N/B,17TH ST. U/C,-122.404829,37.7651,2
3198,8555554,2018-01-07,11,Y,3,A,,A,H,B,,,9TH AV,IRVING ST,-122.46632,37.76404,1
543,8764300,2018-11-14,20,N,2,C,,C,B,A,,,JEFFERSON ST,LEAVENWORTH ST,-122.418597,37.807861,4


In [262]:
# Victim-level fields used in the rest of the notebook; CASE_ID is the key
# shared with the collisions table
victim_columns = ['CASE_ID', 'VICTIM_DEGREE_OF_INJURY', 'VICTIM_ROLE', 'VICTIM_AGE']

# Read the victims export and narrow it to those fields
v = pd.read_csv('Victims12.csv').loc[:, victim_columns]
v.sample(5)


Unnamed: 0,CASE_ID,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
223,8594866,2,4,49
80,8716537,2,3,37
37,8758252,2,3,27
157,8637035,2,4,30
198,8602972,2,1,18


In [263]:
# Left-join victim records onto collisions by CASE_ID: every collision row is
# kept, and collisions without a matching victim get NaN in the victim columns
cv = c.merge(v, how='left', on='CASE_ID')
cv.sample(5)

Unnamed: 0,CASE_ID,COLLISION_DATE,COLLISION_TIME,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,QUARTER,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
1792,8683720,2018-07-10,14,N,3,A,,C,D,A,,,8TH AV,CLEMENT ST,-122.466597,37.783224,3,,,
2221,8636978,2018-05-28,19,Y,3,B,,C,D,A,,,POST ST,GOUGH ST,-122.42484,37.78619,2,,,
75,90705946,2018-04-05,16,N,4,A,,C,C,B,37.74945,-122.40387,US-101 S/B,CESAR CHAVEZ ST,-122.403799,37.749428,2,,,
2237,8636956,2018-05-09,19,Y,4,B,,B,G,A,,,BEALE ST,HOWARD ST,-122.39428,37.78992,2,,,
1305,8716427,2018-09-14,18,Y,3,A,,B,G,A,,,MISSION ST,8TH ST,-122.41316,37.77746,3,,,


In [264]:
# Paul's helper: cast object-dtype columns to str before handing a frame to R
def fixDFcolsForR(myDF):
    """Cast every object-dtype column of ``myDF`` to ``str``.

    Columns are visited by position, so duplicate column labels are handled.
    Columns of any other dtype are left untouched.

    The frame is modified in place and the same object is returned, so the
    call can be used either for its side effect or for its return value.
    """
    n_cols = myDF.shape[1]
    for pos in range(n_cols):
        column = myDF.iloc[:, pos]
        if column.dtype != 'O':
            # not an object column: nothing to convert
            continue
        myDF.iloc[:, pos] = column.astype(str)
    return myDF

In [274]:
# Cast every object-dtype column of the merged frame to str (see fixDFcolsForR);
# columns of other dtypes are left as they are.
# NOTE(review): presumably done so the frame converts cleanly when passed to R
# through rpy2 -- confirm against the plotting cells.
cv = fixDFcolsForR(cv)
cv.sample(5)

Unnamed: 0,CASE_ID,COLLISION_DATE,COLLISION_TIME,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,QUARTER,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
1021,8754667,2018-10-12,9,N,4,A,,B,G,A,,,GEARY BL,24TH AV,-122.483653,37.78021,4,,,
1472,8698198,2018-08-09,16,Y,4,A,,C,D,A,,,CONCORD ST,BRUNSWICK ST,-122.44216,37.71092,3,,,
3149,8572546,2018-02-28,8,Y,2,A,,G,A,A,,,22ND ST,TENNESSEE ST,-122.38925,37.75789,1,2.0,4.0,25.0
125,90683783,2018-03-07,12,N,4,A,,C,C,A,37.74354,-122.40628,US-101 S/B,FAITH ST O/C,-122.406211,37.743482,1,,,
2714,8597278,2018-03-26,18,Y,4,A,,C,C,A,,,7TH ST,HOWARD ST,-122.4094,37.77798,1,,,


##### Use the Nominatim API to convert latitude/longitude to an OSM ID

In [287]:
import requests
import json

def osmid(lat, lon, timeout=10):
    """Reverse-geocode one coordinate with Nominatim and return its OSM id.

    Parameters
    ----------
    lat, lon : latitude and longitude, sent as the ``lat`` / ``lon`` query
        parameters of the reverse endpoint.
    timeout : seconds to wait for the server before giving up, so a stalled
        connection cannot hang the calling loop indefinitely.

    Returns
    -------
    The ``osm_id`` field of the JSON response, or the string ``'NONE'`` when
    the response carries no such field. ``'NONE'`` is the same placeholder the
    geocoding loop uses for rows it does not look up.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when ``timeout`` is exceeded.
    """
    # NOTE(review): the public Nominatim usage policy asks clients to identify
    # themselves; confirm whether a custom User-Agent header should be sent.
    URL = 'https://nominatim.openstreetmap.org/reverse'
    # Let requests build and escape the query string instead of %-formatting it
    r = requests.get(URL, params={'format': 'json', 'lat': lat, 'lon': lon},
                     timeout=timeout)
    r.raise_for_status()
    return r.json().get('osm_id', 'NONE')


In [289]:
import time

# Build one OSM id per row of cv. Rows missing a coordinate or a victim injury
# code are not sent to the API and get the 'NONE' placeholder instead.
lat_long = cv[['POINT_Y','POINT_X','VICTIM_DEGREE_OF_INJURY']]

# The left merge can repeat a collision's coordinates once per victim, so keep
# each point's answer and only call the API the first time a point is seen.
osm_cache = {}
osm_id = []
for lat, lon, injury in lat_long.itertuples(index=False, name=None):
    if pd.isna(lat) or pd.isna(lon) or pd.isna(injury):
        osm_id.append('NONE')
        continue
    if (lat, lon) not in osm_cache:
        # Pause before every real request to throttle calls to the public API
        time.sleep(2)
        osm_cache[(lat, lon)] = osmid(lat, lon)
    osm_id.append(osm_cache[(lat, lon)])

# Positional Series: entry k holds the result for the k-th row of lat_long
osm_series = pd.Series(osm_id)
osm_series.to_csv('osm.csv')

  del sys.path[0]
