#### EDA and Visualization Final Project: Data Prep and Analysis
###### Zachary Barnes and Bing Wang

##### Housekeeping

In [1]:
# Load Python libraries
import os
import numpy as np
import pandas as pd

# Run R code adjacent to Python code: load the rpy2 IPython extension,
# which provides the %R magic used below
%load_ext rpy2.ipython

# Load the ggplot2 and scales R libraries
%R library(ggplot2)
%R library(scales)

# Avoid kernel death
# NOTE(review): presumably works around a duplicate OpenMP runtime being
# loaded in the same process — confirm this setting is still needed
os.environ['KMP_DUPLICATE_LIB_OK']='True'

##### Read in and organize data (Collisions and Victims datasets, from TIMS)

In [6]:
# Read the Collisions file and keep only the relevant columns,
# in the order listed here
c = pd.read_csv("Collisions.csv").loc[:, [
    'CASE_ID', 'INTERSECTION', 'COLLISION_SEVERITY', 'LIGHTING',
    'LOCATION_TYPE', 'MVIW', 'TYPE_OF_COLLISION', 'WEATHER_1',
    'LATITUDE', 'LONGITUDE', 'PRIMARY_RD', 'SECONDARY_RD',
    'POINT_X', 'POINT_Y',
]]

# Preview five randomly chosen rows
c.sample(5)

Unnamed: 0,CASE_ID,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y
1060,8747062,Y,4,A,,C,D,A,,,JUDAH ST,20TH AV,-122.47809,37.76165
1595,8693165,N,4,A,,C,D,A,,,MORAGA AV,16TH AV,-122.474002,37.756213
1709,8685844,Y,4,A,,G,B,A,,,16TH ST,CAPP ST,-122.4187,37.7651
702,8758189,Y,2,A,,A,H,A,,,OCEAN AV,PLYMOUTH AV,-122.45654,37.72398
1220,8716525,Y,3,A,,C,D,A,,,25TH ST,SAN JOSE AV,-122.42143,37.75046


In [8]:
# Read the Victims file and keep only the relevant columns
v = pd.read_csv('Victims.csv')[
    ['CASE_ID', 'VICTIM_DEGREE_OF_INJURY', 'VICTIM_ROLE', 'VICTIM_AGE']
]

# Preview five randomly chosen rows
v.sample(5)


Unnamed: 0,CASE_ID,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
3140,8636951,4,1,72
2884,8651602,4,1,60
5298,90728462,7,1,59
5466,90769585,6,1,29
375,8777116,4,2,25


In [9]:
# Left-join the victims onto the collisions by CASE_ID, so every
# collision row is kept whether or not it has a matching victim row
cv = c.merge(v, on='CASE_ID', how='left')

# Preview five randomly chosen rows of the merged frame
cv.sample(5)

Unnamed: 0,CASE_ID,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
3031,8656472,Y,4,A,,C,C,A,,,FOLSOM ST,17TH ST,-122.41515,37.76369,0,2,11
4574,8574708,Y,4,A,,C,D,A,,,POWELL ST,BEACH ST,-122.41232,37.8077,4,2,14
4976,8555252,Y,4,C,,G,H,A,,,MARKET ST,5TH ST,-122.40806,37.784,0,2,25
2513,8685920,Y,4,C,,C,B,A,,,VAN NESS AV,POST ST,-122.42165,37.78661,0,2,37
1909,8716491,N,4,A,,B,G,A,,,GUERRERO ST,20TH ST,-122.423465,37.758305,0,2,32


In [10]:
# Paul's helper: cast object-dtype columns to strings
# (presumably so the frame can be handed to R via rpy2 — TODO confirm)
def fixDFcolsForR(myDF):
    """Cast every object-dtype column of ``myDF`` to ``str``, in place.

    Columns are visited by position. A column whose dtype compares equal
    to ``'O'`` is overwritten with its ``astype(str)`` version; all other
    columns are left untouched. Missing values in object columns are not
    dropped: they go through the same ``astype(str)`` cast as every other
    value.

    Parameters
    ----------
    myDF : pandas.DataFrame
        Frame to convert. It is modified directly, not copied.

    Returns
    -------
    pandas.DataFrame
        The same ``myDF`` object that was passed in.
    """
    for position in range(len(myDF.columns)):
        column = myDF.iloc[:, position]
        if column.dtype != 'O':
            continue  # only object-dtype columns are converted
        myDF.iloc[:, position] = column.astype(str)
    return myDF

In [13]:
# Cast the object-dtype columns of cv to strings
# (fixDFcolsForR modifies cv in place and returns the same frame)
cv = fixDFcolsForR(cv)
# Preview five randomly chosen rows
cv.sample(5)

Unnamed: 0,CASE_ID,INTERSECTION,COLLISION_SEVERITY,LIGHTING,LOCATION_TYPE,MVIW,TYPE_OF_COLLISION,WEATHER_1,LATITUDE,LONGITUDE,PRIMARY_RD,SECONDARY_RD,POINT_X,POINT_Y,VICTIM_DEGREE_OF_INJURY,VICTIM_ROLE,VICTIM_AGE
0,90736476,N,4,A,,C,C,B,37.73747,-122.40784,US-101 N/B,ALEMANY BLVD U/C,-122.407801,37.737483,7,1,24
1,90736282,N,3,A,,C,C,A,37.73841,-122.40775,I-280 N/B TO US-101 N/B,CORTLAND AVENUE UNDER-CROSSING,,,7,2,27
2,90736282,N,3,A,,C,C,A,37.73841,-122.40775,I-280 N/B TO US-101 N/B,CORTLAND AVENUE UNDER-CROSSING,,,6,1,43
3,90735985,N,4,A,R,C,C,A,37.78522,-122.39317,I-80 E/B FROM ESSEX ST.,ESSEX ST,-122.39317,37.78522,7,1,38
4,90735985,N,4,A,R,C,C,A,37.78522,-122.39317,I-80 E/B FROM ESSEX ST.,ESSEX ST,-122.39317,37.78522,7,2,19


#### Trying out logistic regression

In [2]:
# from: https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
# NOTE(review): pandas and numpy are already imported in the housekeeping
# cell at the top of the notebook; the re-imports here are redundant.
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
# Set the matplotlib font size to 14
# NOTE(review): the seaborn calls below may reset this rc setting —
# confirm the font size actually takes effect
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
# NOTE(review): the second sns.set call looks like it supersedes the
# first one — confirm and keep only one
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [None]:
# make "fatal or not"