#### EDA and Visualization Final Project: Data Prep and Analysis
###### Zachary Barnes and Bing Wang

##### Housekeeping

In [12]:
# Load Python libraries
import os
import numpy as np
import pandas as pd

# Run R code adjacent to Python code
%load_ext rpy2.ipython

# Load ggplot R library
%R library(ggplot2)
%R library(scales)

# Avoid kernel death: set the KMP_DUPLICATE_LIB_OK flag in the process environment
# NOTE(review): presumably this works around duplicate OpenMP runtimes being loaded — confirm
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


##### Read in and organize data (Collisions and Victims datasets, from TIMS)

In [13]:
# Read the Collisions export and, in the same step, narrow it to the
# columns this analysis uses (column order below is the order kept).
c = pd.read_csv("Collisions.csv")[[
    'CASE_ID', 'COLLISION_DATE', 'INTERSECTION',
    'COLLISION_SEVERITY', 'LIGHTING', 'LOCATION_TYPE',
    'MVIW', 'TYPE_OF_COLLISION', 'WEATHER_1',
    'LATITUDE', 'LONGITUDE',
    'PRIMARY_RD', 'SECONDARY_RD',
    'POINT_X', 'POINT_Y',
]]

# Preview five randomly chosen rows
c.sample(n=5)

Unnamed: 0,CASE_ID,COLLISION_DATE,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y
861,8754857,2018-10-18,Y,3,A,,A,C,A,,,PARK PRESIDIO BL,CROSSOVER DR,,
882,8754836,2018-10-29,Y,4,B,,G,A,A,,,BRANNAN ST,4TH ST,-122.39654,37.77833
3764,90861112,2018-11-01,N,4,A,,C,C,A,37.7691,-122.40971,US-101 S/B,BRYANT ST,-122.409704,37.76913
3364,8552827,2018-01-15,Y,3,C,,B,G,A,,,MARKET ST,7TH ST,-122.41251,37.78048
3410,8549107,2018-01-01,Y,4,C,,C,C,A,,,LINCOLN WY,7TH AV,-122.4643,37.76604


In [14]:
# Read the Victims export and, in the same step, narrow it to the
# columns this analysis uses.
v = pd.read_csv('Victims.csv')[[
    'CASE_ID',
    'VICTIM_DEGREE_OF_INJURY',
    'VICTIM_ROLE',
    'VICTIM_AGE',
]]

# Preview five randomly chosen rows
v.sample(n=5)


Unnamed: 0,CASE_ID,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
1200,8754675,2,1,24
2524,8667490,2,3,54
1207,8754667,4,3,20
5591,90794951,7,1,57
1623,8716452,4,1,26


In [15]:
# Left-join Victims onto Collisions by CASE_ID: every collision row is kept,
# and victim columns are attached wherever a matching CASE_ID exists.
cv = c.merge(v, on='CASE_ID', how='left')

# Preview five randomly chosen rows
cv.sample(n=5)

Unnamed: 0,CASE_ID,COLLISION_DATE,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
2088,8708149,2018-08-17,Y,4,A,,C,D,A,,,DUBOCE AV,ELGIN PARK,-122.42315,37.76983,4,1,27
4320,8594880,2018-03-06,Y,3,C,,A,H,A,,,GOUGH ST,EDDY ST,-122.42412,37.78247,3,1,30
5287,8543894,2018-01-28,Y,4,A,,B,G,A,,,CYRIL MAGNIN ST,EDDY ST,-122.40852,37.7845,4,3,58
397,8777516,2018-12-03,Y,3,C,,C,D,A,,,BROADWAY ST,BATTERY ST,-122.40104,37.7986,3,1,22
542,8777315,2018-12-28,Y,4,C,,C,D,A,,,POST ST,LARKIN ST,-122.41827,37.78704,4,1,25


In [16]:
# Paul's function: cast object-typed columns to plain strings
def fixDFcolsForR(myDF):
    """Cast every object-dtype column of ``myDF`` to ``str``, in place.

    Columns are located and rewritten by position, and columns of any
    other dtype are left untouched. Every value in an object column goes
    through ``astype(str)``, missing-value markers included, so nothing is
    dropped from the frame.
    NOTE(review): a NaN is expected to come out as the text 'nan' -- confirm
    against the installed pandas version.

    Parameters
    ----------
    myDF : pandas.DataFrame
        Frame to convert. It is modified directly.

    Returns
    -------
    pandas.DataFrame
        The same ``myDF`` object that was passed in.
    """
    # Positions of the columns whose dtype compares equal to 'O' (object)
    object_positions = [
        position for position, kind in enumerate(myDF.dtypes) if kind == 'O'
    ]
    for position in object_positions:
        as_text = myDF.iloc[:, position].astype(str)
        myDF.iloc[:, position] = as_text
    return myDF

In [17]:
# Run the merged frame through fixDFcolsForR so its object-typed columns
# are cast to str
cv = fixDFcolsForR(myDF=cv)

# Preview five randomly chosen rows
cv.sample(n=5)

Unnamed: 0,CASE_ID,COLLISION_DATE,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
19,90733518,2018-05-15,N,4,E,,C,C,A,37.76978,-122.40519,I-80 E/B,DIVISION ST,-122.405223,37.769777,7,1,60
3067,8656334,2018-06-16,Y,4,A,,C,D,A,,,S VAN NESS AV,21ST ST,-122.41675,37.75716,0,2,30
559,8777303,2018-12-29,Y,4,C,,C,A,A,,,DIVISADERO ST,GOLDEN GATE AV,-122.43854,37.77873,4,2,24
532,8777317,2018-12-10,Y,3,C,,B,G,A,,,VELASCO AV,RIO VERDE ST,-122.41504,37.70872,3,3,38
2673,8683817,2018-07-06,N,4,C,,B,G,A,,,3RD ST,MISSION ST,-122.401231,37.785634,4,3,25
