# Data

We can build a model based on conditions such as:
1. weather
2. the time the accident happens
3. road condition
4. light condition

to see whether these conditions have an impact on car accidents.

In [56]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import itertools
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker
%matplotlib inline

In [179]:
# Download the collisions CSV (about 70 MB) and save it locally as Data-collisions.csv.
# `-O` writes to that fixed file name, so re-running the cell overwrites the existing file.
!wget -O Data-collisions.csv https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv

--2020-08-22 10:42:59--  https://s3.us.cloud-object-storage.appdomain.cloud/cf-courses-data/CognitiveClass/DP0701EN/version-2/Data-Collisions.csv
Resolving s3.us.cloud-object-storage.appdomain.cloud (s3.us.cloud-object-storage.appdomain.cloud)... 67.228.254.196
Connecting to s3.us.cloud-object-storage.appdomain.cloud (s3.us.cloud-object-storage.appdomain.cloud)|67.228.254.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 73917638 (70M) [text/csv]
Saving to: ‘Data-collisions.csv’


2020-08-22 10:44:35 (753 KB/s) - ‘Data-collisions.csv’ saved [73917638/73917638]



In [184]:
# Load the downloaded collisions file into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='Data-collisions.csv')
df.head(n=5)

Unnamed: 0,SEVERITYCODE,X,Y,OBJECTID,INCKEY,COLDETKEY,REPORTNO,STATUS,ADDRTYPE,INTKEY,...,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
0,2,-122.323148,47.70314,1,1307,1307,3502005,Matched,Intersection,37475.0,...,Wet,Daylight,,,,10,Entering at angle,0,0,N
1,1,-122.347294,47.647172,2,52200,52200,2607959,Matched,Block,,...,Wet,Dark - Street Lights On,,6354039.0,,11,From same direction - both going straight - bo...,0,0,N
2,1,-122.33454,47.607871,3,26700,26700,1482393,Matched,Block,,...,Dry,Daylight,,4323031.0,,32,One parked--one moving,0,0,N
3,1,-122.334803,47.604803,4,1144,1144,3503937,Matched,Block,,...,Dry,Daylight,,,,23,From same direction - all others,0,0,N
4,2,-122.306426,47.545739,5,17700,17700,1807429,Matched,Intersection,34387.0,...,Wet,Daylight,,4028032.0,,10,Entering at angle,0,0,N


In [185]:
# Treat infinities as missing values so they are handled by the fill below.
df.replace([np.inf, -np.inf], np.nan, inplace=True)

# Fill missing condition labels with the placeholder 0; it appears as its own
# category "0" when the columns are grouped later on.
for condition_col in ['WEATHER', 'ROADCOND', 'LIGHTCOND']:
    df[condition_col] = df[condition_col].fillna(0)

# Keep only the three condition columns and the target. An explicit copy makes
# `newdata` an independent frame rather than a slice of `df`, so later edits to
# it do not raise pandas' SettingWithCopyWarning.
newdata = df[['WEATHER', 'ROADCOND', 'LIGHTCOND', 'SEVERITYCODE']].copy()
newdata

Unnamed: 0,WEATHER,ROADCOND,LIGHTCOND,SEVERITYCODE
0,Overcast,Wet,Daylight,2
1,Raining,Wet,Dark - Street Lights On,1
2,Overcast,Dry,Daylight,1
3,Clear,Dry,Daylight,1
4,Raining,Wet,Daylight,2
5,Clear,Dry,Daylight,1
6,Raining,Wet,Daylight,1
7,Clear,Dry,Daylight,2
8,Clear,Dry,Daylight,1
9,Clear,Dry,Daylight,2


# Analysis about conditions might cause car accidents

## Weather Analysis


In [186]:
# Number of accidents per weather category, broken down by severity code.
newdata.groupby(by='WEATHER')['SEVERITYCODE'].value_counts()

WEATHER                   SEVERITYCODE
0                         1                3997
                          2                1084
Blowing Sand/Dirt         1                  41
                          2                  15
Clear                     1               75295
                          2               35840
Fog/Smog/Smoke            1                 382
                          2                 187
Other                     1                 716
                          2                 116
Overcast                  1               18969
                          2                8745
Partly Cloudy             2                   3
                          1                   2
Raining                   1               21969
                          2               11176
Severe Crosswind          1                  18
                          2                   7
Sleet/Hail/Freezing Rain  1                  85
                          2                  28
S

In [187]:
# Share of each severity code within every weather category (each category sums to 1).
newdata.groupby(by='WEATHER')['SEVERITYCODE'].value_counts(normalize=True)

WEATHER                   SEVERITYCODE
0                         1               0.786656
                          2               0.213344
Blowing Sand/Dirt         1               0.732143
                          2               0.267857
Clear                     1               0.677509
                          2               0.322491
Fog/Smog/Smoke            1               0.671353
                          2               0.328647
Other                     1               0.860577
                          2               0.139423
Overcast                  1               0.684456
                          2               0.315544
Partly Cloudy             2               0.600000
                          1               0.400000
Raining                   1               0.662815
                          2               0.337185
Severe Crosswind          1               0.720000
                          2               0.280000
Sleet/Hail/Freezing Rain  1               0

### We can see from the tables above that most car accidents happen on clear, overcast and rainy days. This might be because people do not usually go out in extreme weather. However, looking at the proportion of each severity code, the highest proportion of accidents with severity code 2 occurs on partly cloudy days (60%), which suggests that accidents on partly cloudy days are more likely to be serious than in other weather. Note that this category contains only 5 accidents, so the proportion is not reliable.

## Road Condition Analysis

In [188]:
# Share of each severity code within every road-condition category.
newdata.groupby(by='ROADCOND')['SEVERITYCODE'].value_counts(normalize=True)

ROADCOND        SEVERITYCODE
0               1               0.788508
                2               0.211492
Dry             1               0.678227
                2               0.321773
Ice             1               0.774194
                2               0.225806
Oil             1               0.625000
                2               0.375000
Other           1               0.674242
                2               0.325758
Sand/Mud/Dirt   1               0.693333
                2               0.306667
Snow/Slush      1               0.833665
                2               0.166335
Standing Water  1               0.739130
                2               0.260870
Unknown         1               0.950325
                2               0.049675
Wet             1               0.668134
                2               0.331866
Name: SEVERITYCODE, dtype: float64

### Car accidents are more likely to be serious on oily roads: the proportion of severity code 2 is highest on oily roads (37.5%), followed by wet roads (33.2%), among all of the road conditions.

## Light Condition Analysis

In [189]:
# Share of each severity code within every light-condition category.
newdata.groupby(by='LIGHTCOND')['SEVERITYCODE'].value_counts(normalize=True)

LIGHTCOND                 SEVERITYCODE
0                         1               0.789168
                          2               0.210832
Dark - No Street Lights   1               0.782694
                          2               0.217306
Dark - Street Lights Off  1               0.736447
                          2               0.263553
Dark - Street Lights On   1               0.701589
                          2               0.298411
Dark - Unknown Lighting   1               0.636364
                          2               0.363636
Dawn                      1               0.670663
                          2               0.329337
Daylight                  1               0.668116
                          2               0.331884
Dusk                      1               0.670620
                          2               0.329380
Other                     1               0.778723
                          2               0.221277
Unknown                   1               0

In [190]:
# Number of accidents per light-condition category, broken down by severity code.
newdata.groupby(by='LIGHTCOND')['SEVERITYCODE'].value_counts()

LIGHTCOND                 SEVERITYCODE
0                         1                4080
                          2                1090
Dark - No Street Lights   1                1203
                          2                 334
Dark - Street Lights Off  1                 883
                          2                 316
Dark - Street Lights On   1               34032
                          2               14475
Dark - Unknown Lighting   1                   7
                          2                   4
Dawn                      1                1678
                          2                 824
Daylight                  1               77593
                          2               38544
Dusk                      1                3958
                          2                1944
Other                     1                 183
                          2                  52
Unknown                   1               12868
                          2                 605
N

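### Most recorded accidents happen in daylight (116,137), followed by dark conditions with street lights on (48,507). The proportion of severity code 2 is highest when it is dark with unknown lighting (36.4%), but that category contains only 11 accidents; among the larger categories, daylight, dawn and dusk (about 33%) have a higher proportion of severity code 2 than dark with street lights on (29.8%). Light condition is therefore related to accident severity, but these tables alone do not show that it is safer to drive when it is not dark.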

# Methodology

In [191]:
# Encode WEATHER and ROADCOND as 0/1 flags. Values that are not keys of a mapping
# (for example the 0 placeholder already present in these columns) are left unchanged.
weather_codes = {
    'Blowing Sand/Dirt': 0,
    'Clear': 1,
    'Fog/Smog/Smoke': 0,
    'Other': 1,
    'Overcast': 1,
    'Partly Cloudy': 0,
    'Raining': 0,
    'Severe Crosswind': 0,
    'Sleet/Hail/Freezing Rain': 0,
    'Snowing': 0,
    'Unknown': 1,
}
roadcond_codes = {
    'Wet': 0,
    'Dry': 1,
    'Unknown': 1,
    'Snow/Slush': 0,
    'Ice': 0,
    'Other': 1,
    'Sand/Mud/Dirt': 0,
    'Standing Water': 0,
    'Oil': 0,
}

# Build a new frame with `assign` instead of calling replace(..., inplace=True) on a
# column selection: the in-place form operates on a temporary Series, which raises
# SettingWithCopyWarning and is not guaranteed to write back into `newdata`.
newdata = newdata.assign(
    WEATHER=newdata['WEATHER'].replace(weather_codes),
    ROADCOND=newdata['ROADCOND'].replace(roadcond_codes),
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [192]:
# Encode LIGHTCOND as a 0/1 flag. Values that are not keys of the mapping
# (for example the 0 placeholder already present in the column) are left unchanged.
lightcond_codes = {
    'Dark - No Street Lights': 0,
    'Dark - Street Lights Off': 0,
    'Dark - Street Lights On': 1,
    'Dark - Unknown Lighting': 0,
    'Dawn': 0,
    'Daylight': 1,
    'Dusk': 0,
    'Other': 1,
    'Unknown': 0,
}

# Build a new frame with `assign` instead of calling replace(..., inplace=True) on a
# column selection, which operates on a temporary Series and is not guaranteed to
# write back into `newdata`.
newdata = newdata.assign(LIGHTCOND=newdata['LIGHTCOND'].replace(lightcond_codes))

In [193]:
# Preview the first five rows to confirm the condition columns are now numeric codes.
newdata.head(n=5)

Unnamed: 0,WEATHER,ROADCOND,LIGHTCOND,SEVERITYCODE
0,1,0,1,2
1,0,0,1,1
2,1,1,1,1
3,1,1,1,1
4,0,0,1,2


In [194]:
# Feature matrix: every row, restricted to the three encoded condition columns.
X = newdata.loc[:, ['WEATHER', 'ROADCOND', 'LIGHTCOND']]
X

Unnamed: 0,WEATHER,ROADCOND,LIGHTCOND
0,1,0,1
1,0,0,1
2,1,1,1
3,1,1,1
4,0,0,1
5,1,1,1
6,0,0,1
7,1,1,1
8,1,1,1
9,1,1,1


In [195]:
# Target vector: severity codes as a NumPy array of strings, used as class labels.
y = np.asarray(newdata['SEVERITYCODE']).astype(str)

In [200]:
# Standardise each feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on all rows, before the train/test split below,
# so test rows contribute to the scaling statistics. Consider fitting on the training
# split only.
X = preprocessing.StandardScaler().fit_transform(X)
X

array([[ 0.50774504, -1.59453327,  0.42509083],
       [-1.96949239, -1.59453327,  0.42509083],
       [ 0.50774504,  0.62714276,  0.42509083],
       ...,
       [ 0.50774504,  0.62714276,  0.42509083],
       [ 0.50774504,  0.62714276, -2.35243843],
       [ 0.50774504, -1.59453327,  0.42509083]])

### Metrics

In [207]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Report the shape of each split.
for split_label, split_X, split_y in (
    ('Train set:', X_train, y_train),
    ('Test set:', X_test, y_test),
):
    print(split_label, split_X.shape, split_y.shape)

Train set: (155738, 3) (155738,)
Test set: (38935, 3) (38935,)


### KNN Model

In [214]:
# Fit a 4-nearest-neighbours classifier on the training split.
KNN_model = KNeighborsClassifier(n_neighbors=4)
KNN_model.fit(X_train, y_train)

# Score its predictions on the held-out split.
KNNpredicted = KNN_model.predict(X_test)
KNN_acc = accuracy_score(y_test, KNNpredicted)
KNN_f1 = f1_score(y_test, KNNpredicted, average='weighted')

### Decision Tree

In [219]:
# Fit an entropy-based decision tree limited to depth 4.
# random_state is fixed (same seed as the train/test split) because scikit-learn
# randomly permutes the features at every split, so an unseeded tree is not
# guaranteed to be identical between runs.
Tree_model = DecisionTreeClassifier(criterion="entropy", max_depth=4, random_state=4)
Tree_model.fit(X_train, y_train)

# Score its predictions on the held-out split.
treepredicted = Tree_model.predict(X_test)
Tree_f1 = f1_score(y_test, treepredicted, average='weighted')
Tree_acc = accuracy_score(y_test, treepredicted)

  'precision', 'predicted', average, warn_for)


### Logistic Regression

In [216]:
# Fit a logistic-regression classifier (liblinear solver, C=0.098) on the training split.
LR_model = LogisticRegression(C=0.098, solver='liblinear')
LR_model.fit(X_train, y_train)

# Score its predictions on the held-out split.
LRpredicted = LR_model.predict(X_test)
LR_acc = accuracy_score(y_test, LRpredicted)
LR_f1 = f1_score(y_test, LRpredicted, average='weighted')

  'precision', 'predicted', average, warn_for)


### Results

In [217]:
# Collect the weighted F1-score and accuracy of each model into one comparison table.
table = pd.DataFrame({
    "Algorithm": ["KNN", "Decision Tree", "LogisticRegression"],
    "F1-score": [KNN_f1, Tree_f1, LR_f1],
    "Accuracy": [KNN_acc, Tree_acc, LR_acc],
})
table

Unnamed: 0,Algorithm,F1-score,Accuracy
0,KNN,0.611456,0.649313
1,Decision Tree,0.582206,0.704379
2,LogisticRegression,0.582206,0.704379


#### KNN has the highest F1-score but the lowest accuracy. The Decision Tree and Logistic Regression models produce identical scores.