#### EDA and Visualization Final Project: Data Prep and Analysis
###### Zachary Barnes and Bing Wang

##### Housekeeping

In [2]:
# Load Python libraries
import os
import numpy as np
import pandas as pd

# Enable the rpy2 IPython extension so R code can run next to Python code
# through the %R / %%R magics
%load_ext rpy2.ipython

# Load the R plotting libraries (ggplot2 and scales) into the embedded R session
%R library(ggplot2)
%R library(scales)

# Avoid kernel death
# NOTE(review): presumably this works around the abort raised when more than
# one OpenMP runtime is loaded into the same process -- confirm it is still
# needed, and whether it must be set before the libraries above are loaded.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

##### Read in and organize data (Collisions and Victims datasets, from TIMS)

In [3]:
# Read the TIMS Collisions export and keep only the columns used downstream.
# Selecting with an explicit list raises a KeyError if any expected column is
# missing from the file.
c = pd.read_csv("Collisions.csv").loc[:, [
    'CASE_ID',
    'COLLISION_DATE',
    'INTERSECTION',
    'COLLISION_SEVERITY',
    'LIGHTING',
    'LOCATION_TYPE',
    'MVIW',
    'TYPE_OF_COLLISION',
    'WEATHER_1',
    'LATITUDE',
    'LONGITUDE',
    'PRIMARY_RD',
    'SECONDARY_RD',
    'POINT_X',
    'POINT_Y',
]]

# Preview five randomly chosen rows
c.sample(n=5)

Unnamed: 0,CASE_ID,COLLISION_DATE,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y
1626,8688829,2018-07-23,Y,3,C,,-,H,A,,,SACRAMENTO ST,LARKIN ST,-122.41924,37.79171
2121,8649552,2018-06-27,N,3,A,,C,A,A,,,COLUMBUS AV,TAYLOR ST,-122.41488,37.80355
2178,8637038,2018-05-30,Y,4,C,,G,D,A,,,VAN NESS AV,MCALLISTER ST,-122.42033,37.78005
130,90683244,2018-03-05,N,4,A,,C,C,A,37.72496,-122.40151,US-101 NB,N. RD. EDGE OF THE PAUL AVE. U/C,-122.401571,37.724939
479,8777092,2018-12-07,N,4,A,,B,D,A,,,SACRAMENTO ST,FRANKLIN ST,-122.42416,37.79109


In [4]:
# Read the TIMS Victims export and keep only the case key plus the
# injury, role and age fields.
v = pd.read_csv('Victims.csv').loc[:, [
    'CASE_ID',
    'VICTIM_DEGREE_OF_INJURY',
    'VICTIM_ROLE',
    'VICTIM_AGE',
]]

# Preview five randomly chosen rows
v.sample(n=5)


Unnamed: 0,CASE_ID,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
3563,8619257,2,3,49
2174,8685925,4,2,30
3462,8620702,4,1,23
116,8777397,4,2,15
3698,8603104,4,2,41


In [5]:
# Attach victim records to their collision via CASE_ID.
# A left join keeps every collision row; a collision with several victims
# appears once per victim, and one with no victim record gets NaN in the
# victim columns.
cv = pd.merge(left=c, right=v, on='CASE_ID', how='left')

# Preview five randomly chosen rows of the merged table
cv.sample(n=5)

Unnamed: 0,CASE_ID,COLLISION_DATE,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
4526,8587685,2018-04-17,N,4,A,,C,C,A,,,LINCOLN WY,29TH AV,-122.488071,37.764984,4,1,32
767,8776919,2018-12-21,-,4,A,,C,F,A,,,BRYANT ST,2ND ST,-122.39363,37.78308,4,2,35
1672,8742299,2018-10-23,Y,4,A,,C,A,A,,,LAGUNA ST,ELLIS ST,-122.42758,37.78298,0,2,28
982,8758241,2018-11-29,Y,4,A,,C,A,B,,,BAY ST,KEARRY ST,-122.40717,37.80644,0,2,8
1107,8758118,2018-11-06,Y,3,A,,B,G,A,,,LINCOLN BL,41ST AV,-122.50085,37.76441,3,3,68


In [6]:
# Paul's function: make object-typed columns plain strings before handing the
# frame to R
def fixDFcolsForR(myDF):
    """Cast every object-dtype column of ``myDF`` to ``str``, in place.

    Columns whose dtype is not object (e.g. numeric columns) are left
    untouched. Missing values are not dropped; they are converted however
    ``Series.astype(str)`` converts them.

    Parameters
    ----------
    myDF : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``myDF`` object that was passed in.
    """
    # Locate object columns by position so the assignment below goes through
    # .iloc and does not depend on column labels.
    object_positions = [
        pos for pos, col_dtype in enumerate(myDF.dtypes) if col_dtype == 'O'
    ]
    for pos in object_positions:
        myDF.iloc[:, pos] = myDF.iloc[:, pos].astype(str)
    return myDF

In [7]:
# Cast the object-typed columns of the merged table to strings
# (fixDFcolsForR edits the frame in place and returns it)
cv = cv.pipe(fixDFcolsForR)

# Preview five randomly chosen rows after the conversion
cv.sample(n=5)

Unnamed: 0,CASE_ID,COLLISION_DATE,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
2045,8708780,2018-09-04,Y,4,A,,C,B,B,,,OFARRELL,OFARRELL 170,,,4,1,52
3862,8619276,2018-04-11,Y,3,C,,B,G,A,,,25TH ST,ORANGE ALY,-122.41995,37.75055,3,3,69
4123,8601444,2018-03-23,-,3,A,,B,G,A,,,EVANS AV,SELBY ST,-122.39408,37.746181,3,3,37
819,8764196,2018-11-15,Y,4,A,,C,C,A,,,MCALLISTER ST,LARKIN ST,-122.41694,37.7805,0,2,48
427,8777414,2018-12-08,Y,4,C,,C,D,A,,,VAN NESS AV,CHESTNUT ST,-122.42478,37.80226,0,2,25


In [8]:
# Keep only victim rows whose degree-of-injury code is 1 or 2.
# NOTE(review): presumably these are the two most serious injury codes in the
# TIMS codebook -- confirm against the data dictionary.
cv12 = cv.loc[cv['VICTIM_DEGREE_OF_INJURY'].isin([1, 2])]
# Optional export of the subset (left disabled):
# cv12.to_csv('victims12.csv')

In [9]:
# Re-read the Collisions export with all of its columns (the frame `c` only
# holds the reduced column set) and show the first five rows.
d = pd.read_csv(filepath_or_buffer="Collisions.csv")
d.head(n=5)

Unnamed: 0,CASE_ID,ACCIDENT_YEAR,PROC_DATE,JURIS,COLLISION_DATE,COLLISION_TIME,OFFICER_ID,REPORTING_DISTRICT,DAY_OF_WEEK,CHP_SHIFT,...,COUNT_MC_KILLED,COUNT_MC_INJURED,PRIMARY_RAMP,SECONDARY_RAMP,LATITUDE,LONGITUDE,COUNTY,CITY,POINT_X,POINT_Y
0,90736476,2018,2018-05-29,9335,2018-05-18,755,22037,,5,1,...,0,0,-,-,37.73747,-122.40784,SAN FRANCISCO,SAN FRANCISCO,-122.407801,37.737483
1,90736282,2018,2018-05-29,9335,2018-05-17,1215,22007,,4,1,...,0,0,-,-,37.73841,-122.40775,SAN FRANCISCO,SAN FRANCISCO,,
2,90735985,2018,2018-05-29,9335,2018-05-28,1045,18845,,1,1,...,0,0,-,-,37.78522,-122.39317,SAN FRANCISCO,SAN FRANCISCO,-122.39317,37.78522
3,90735917,2018,2018-05-29,9335,2018-05-17,1035,22096,,4,1,...,0,0,-,-,37.72752,-122.40259,SAN FRANCISCO,SAN FRANCISCO,-122.402545,37.727537
4,90735464,2018,2018-05-28,9335,2018-05-22,1723,18688,,2,2,...,0,0,-,-,37.8083,-122.36712,SAN FRANCISCO,SAN FRANCISCO,-122.367035,37.808211


In [11]:
# Look up one case in the filtered subset.
# The mask is built from cv12 itself: a mask taken from another frame only
# works while that frame's index happens to cover cv12's index.
cv12.loc[cv12['CASE_ID'] == 8804185]

Unnamed: 0,CASE_ID,COLLISION_DATE,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
349,8804185,2018-12-29,Y,2,C,,C,A,A,,,PORTOLA DR,CLIPPER ST,-122.44484,37.74694,2,1,32
