#### EDA and Visualization Final Project: Data Prep and Analysis
###### Zachary Barnes and Bing Wang

##### Housekeeping

In [47]:
# Load Python libraries
import os
import numpy as np
import pandas as pd

# Run R code adjacent to Python code
%load_ext rpy2.ipython

# Load ggplot R library
%R library(ggplot2)
%R library(scales)

# Avoid kernel death.
# NOTE(review): presumably this works around the abort raised when more than one
# OpenMP runtime is loaded into the same process (here Python libraries plus R
# via rpy2) -- confirm the flag is still needed in the current environment.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


##### Read in and organize data (Collisions and Victims datasets, from TIMS)

In [86]:
c = pd.read_csv("Collisions.csv")

# Keep only relevant columns. The explicit .copy() makes `c` an independent
# frame, so the column assignment below cannot trigger SettingWithCopyWarning.
c = c[['CASE_ID','COLLISION_DATE','COLLISION_TIME','INTERSECTION','COLLISION_SEVERITY',
        'LIGHTING','LOCATION_TYPE','MVIW','TYPE_OF_COLLISION',
        'WEATHER_1','LATITUDE','LONGITUDE','PRIMARY_RD',
        'SECONDARY_RD','POINT_X','POINT_Y']].copy()

# Convert COLLISION_TIME from an HHMM number (e.g. 1950) to hour of day (19).
# Floor division by 100 gives the same result as slicing the last two digits
# off the string form for integer input (values below 100 map to hour 0), but
# it also stays correct if the column is read as float: string slicing would
# turn "1950.0" into 1950 and fail outright on a missing value, whereas here a
# missing time simply stays NaN.
# NOTE(review): confirm whether the source uses an out-of-range sentinel for
# "time unknown"; such a value would map to an hour greater than 23.
c['COLLISION_TIME'] = pd.to_numeric(c['COLLISION_TIME']) // 100
c.sample(5)

Unnamed: 0,CASE_ID,COLLISION_DATE,COLLISION_TIME,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y
1147,8720169,2018-04-13,18,Y,4,A,,C,C,A,,,GUERRERO ST,16TH ST,-122.4241,37.76478
532,8766608,2018-11-02,16,Y,4,A,,G,D,A,,,MARKET ST,MARKET ST 1133,,
1121,8724540,2018-09-05,12,Y,4,A,,G,-,A,,,MISSION ST,4TH ST,-122.40426,37.78452
1302,8716426,2018-09-22,22,Y,4,C,,C,D,A,,,JONES ST,TURK ST,-122.41241,37.783
1420,8699732,2018-08-30,18,N,3,A,,B,G,A,,,DUBOCE AV,ELGIN PARK,-122.423098,37.769833


In [49]:
# Load the victim records and keep only the join key plus the three
# victim-level attributes used in the analysis.
v = pd.read_csv('Victims.csv').loc[
    :, ['CASE_ID', 'VICTIM_DEGREE_OF_INJURY', 'VICTIM_ROLE', 'VICTIM_AGE']
]
v.sample(5)


Unnamed: 0,CASE_ID,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
1578,8716490,4,5,37
992,8754829,3,2,27
1105,8754760,4,1,24
5722,90828614,7,2,16
605,8758271,4,3,31


In [50]:
# Left-join the victim rows onto the collision rows by CASE_ID, so every
# collision is kept and appears once per matching victim record.
# NOTE(review): the output saved with this cell shows four-digit COLLISION_TIME
# values even though the Collisions cell converts them to hour of day; re-run
# the notebook top to bottom so this frame reflects that conversion.
cv = c.merge(v, on='CASE_ID', how='left')
cv.sample(5)

Unnamed: 0,CASE_ID,COLLISION_DATE,COLLISION_TIME,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
3935,8619224,2018-04-11,1950,Y,4,C,,G,G,C,,,CABRILLO ST,38TH AV,-122.49824,37.77381,4,4,61
3143,8656273,2018-06-21,1153,Y,3,A,,G,-,A,,,OCTAVIA ST,HAIGHT ST,-122.42393,37.77296,3,4,24
3026,8656472,2018-06-24,1841,Y,4,A,,C,C,A,,,FOLSOM ST,17TH ST,-122.41515,37.76369,4,2,27
5290,8543882,2018-01-25,1929,Y,4,C,I,C,A,A,,,KIRKHAM ST,19TH AV,-122.476917,37.759846,4,1,49
2099,8705819,2018-09-05,1504,Y,3,A,,D,B,A,,,10TH ST,SHERIDAN ST,-122.41187,37.77208,3,1,52


In [51]:
# Paul's function: make object-typed columns plain strings before handing a frame to R
def fixDFcolsForR(myDF):
    """Cast every object-dtype column of ``myDF`` to ``str``, in place.

    Columns are addressed by position, so duplicate column labels are handled.
    Columns of any other dtype are left untouched. Missing values are not
    removed; they are stringified along with everything else in the column
    (presumably ending up as the text 'nan' -- verify against the pandas
    version in use).

    Parameters
    ----------
    myDF : pandas.DataFrame
        Frame to modify. It is mutated, not copied.

    Returns
    -------
    pandas.DataFrame
        The same ``myDF`` object, returned for convenient reassignment.
    """
    # Casting one column never changes the dtype of another, so the positions
    # of the object columns can be collected up front.
    object_positions = [pos for pos, dtype in enumerate(myDF.dtypes) if dtype == 'O']
    for pos in object_positions:
        myDF.iloc[:, pos] = myDF.iloc[:, pos].astype(str)
    return myDF

In [52]:
# Cast the object-typed columns of the merged frame to plain strings so the
# frame can be passed to R. Missing values are stringified, not dropped.
cv = cv.pipe(fixDFcolsForR)
cv.sample(5)

Unnamed: 0,CASE_ID,COLLISION_DATE,COLLISION_TIME,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
5819,90866269,2018-11-13,1735,N,4,D,,C,C,A,37.72329,-122.4009,US-101 N/B,PAUL AVE,-122.400902,37.723289,7,1,19
2792,8683332,2018-07-09,1430,N,4,A,,B,G,A,,,6TH ST,HOWARD ST,-122.407394,37.779956,4,3,48
3504,8636928,2018-05-11,1708,Y,4,A,,G,D,A,,,BAY,FILLMORE,-122.43665,37.8027,4,4,50
3233,8650559,2018-07-10,1430,Y,4,A,,C,B,A,,,HARRISON ST,2ND ST,-122.39518,37.78433,4,1,43
4547,8575269,2018-02-03,2020,Y,4,C,,B,D,A,,,EDDY ST,POLK ST,-122.41918,37.78308,4,3,59
