# Capstone Project

In [1]:
import pandas as pd
import numpy as np

import warnings
# Silence every warning for the rest of the notebook so cell outputs stay clean.
# NOTE(review): a blanket 'ignore' also hides pandas/scikit-learn diagnostics
# (e.g. chained-assignment or dtype warnings); consider filtering only the
# specific categories that are known to be harmless.
warnings.filterwarnings('ignore')

In [2]:
import os

# Location of the collisions CSV. Set the COLLISIONS_CSV environment variable
# to point at a different copy of the file; when it is unset the original
# author's path is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get('COLLISIONS_CSV', 'C:/Users/ACHAL SHAH/Desktop/Data-Collisions.csv')

# Load the raw collision records into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first rows of the raw frame to sanity-check the load.
df.head()

Unnamed: 0,SEVERITYCODE,X,Y,OBJECTID,INCKEY,COLDETKEY,REPORTNO,STATUS,ADDRTYPE,INTKEY,...,ROADCOND,LIGHTCOND,PEDROWNOTGRNT,SDOTCOLNUM,SPEEDING,ST_COLCODE,ST_COLDESC,SEGLANEKEY,CROSSWALKKEY,HITPARKEDCAR
0,2,-122.323148,47.70314,1,1307,1307,3502005,Matched,Intersection,37475.0,...,Wet,Daylight,,,,10,Entering at angle,0,0,N
1,1,-122.347294,47.647172,2,52200,52200,2607959,Matched,Block,,...,Wet,Dark - Street Lights On,,6354039.0,,11,From same direction - both going straight - bo...,0,0,N
2,1,-122.33454,47.607871,3,26700,26700,1482393,Matched,Block,,...,Dry,Daylight,,4323031.0,,32,One parked--one moving,0,0,N
3,1,-122.334803,47.604803,4,1144,1144,3503937,Matched,Block,,...,Dry,Daylight,,,,23,From same direction - all others,0,0,N
4,2,-122.306426,47.545739,5,17700,17700,1807429,Matched,Intersection,34387.0,...,Wet,Daylight,,4028032.0,,10,Entering at angle,0,0,N


In [4]:
# List the dtype pandas inferred for every column of the raw frame.
df.dtypes

SEVERITYCODE        int64
X                 float64
Y                 float64
OBJECTID            int64
INCKEY              int64
COLDETKEY           int64
REPORTNO           object
STATUS             object
ADDRTYPE           object
INTKEY            float64
LOCATION           object
EXCEPTRSNCODE      object
EXCEPTRSNDESC      object
SEVERITYCODE.1      int64
SEVERITYDESC       object
COLLISIONTYPE      object
PERSONCOUNT         int64
PEDCOUNT            int64
PEDCYLCOUNT         int64
VEHCOUNT            int64
INCDATE            object
INCDTTM            object
JUNCTIONTYPE       object
SDOT_COLCODE        int64
SDOT_COLDESC       object
INATTENTIONIND     object
UNDERINFL          object
WEATHER            object
ROADCOND           object
LIGHTCOND          object
PEDROWNOTGRNT      object
SDOTCOLNUM        float64
SPEEDING           object
ST_COLCODE         object
ST_COLDESC         object
SEGLANEKEY          int64
CROSSWALKKEY        int64
HITPARKEDCAR       object
dtype: object

In [5]:
# Build the modelling frame: the target plus the three condition features.
# The astype/assign chain returns a brand-new DataFrame, so the derived
# columns are written to an independent object rather than to a slice of
# `df` (assigning into a slice triggers SettingWithCopyWarning).
condition_cols = ["WEATHER", "ROADCOND", "LIGHTCOND"]

pre_df = (
    df[["SEVERITYCODE"] + condition_cols]
    # Convert object columns to category
    .astype({name: "category" for name in condition_cols})
    # Create new columns for analysis: one integer code per category.
    # NOTE(review): pandas encodes missing values as -1 in `.cat.codes`, so
    # NaN rows are carried forward as their own pseudo-category.
    .assign(
        WEATHER_CAT=lambda frame: frame["WEATHER"].cat.codes,
        ROADCOND_CAT=lambda frame: frame["ROADCOND"].cat.codes,
        LIGHTCOND_CAT=lambda frame: frame["LIGHTCOND"].cat.codes,
    )
)

pre_df.dtypes

SEVERITYCODE        int64
WEATHER          category
ROADCOND         category
LIGHTCOND        category
WEATHER_CAT          int8
ROADCOND_CAT         int8
LIGHTCOND_CAT        int8
dtype: object

In [6]:
# Class balance of the target. The output below shows label 1 outnumbering
# label 2 (136485 vs 58188 rows), which motivates the down-sampling step later.
pre_df["SEVERITYCODE"].value_counts()

1    136485
2     58188
Name: SEVERITYCODE, dtype: int64

In [7]:
# Frequency of each weather category. 'Unknown' and 'Other' appear as
# categories of their own and are kept as-is by the encoding above.
pre_df["WEATHER"].value_counts()

Clear                       111135
Raining                      33145
Overcast                     27714
Unknown                      15091
Snowing                        907
Other                          832
Fog/Smog/Smoke                 569
Sleet/Hail/Freezing Rain       113
Blowing Sand/Dirt               56
Severe Crosswind                25
Partly Cloudy                    5
Name: WEATHER, dtype: int64

In [8]:
# Frequency of each road-condition category.
pre_df["ROADCOND"].value_counts()

Dry               124510
Wet                47474
Unknown            15078
Ice                 1209
Snow/Slush          1004
Other                132
Standing Water       115
Sand/Mud/Dirt         75
Oil                   64
Name: ROADCOND, dtype: int64

In [9]:
# Frequency of each light-condition category.
pre_df["LIGHTCOND"].value_counts()

Daylight                    116137
Dark - Street Lights On      48507
Unknown                      13473
Dusk                          5902
Dawn                          2502
Dark - No Street Lights       1537
Dark - Street Lights Off      1199
Other                          235
Dark - Unknown Lighting         11
Name: LIGHTCOND, dtype: int64

In [10]:
from sklearn.utils import resample

# Split the frame by class: SEVERITYCODE 1 is the majority class and
# SEVERITYCODE 2 the minority class (see the value counts above).
pre_df_maj = pre_df[pre_df.SEVERITYCODE == 1]
pre_df_min = pre_df[pre_df.SEVERITYCODE == 2]

# Down-sample the majority class, without replacement, to the size of the
# minority class. The target size is taken from the data instead of being
# hard-coded, so the cell stays correct if the input file changes.
pre_df_maj_dsample = resample(pre_df_maj,
                              replace=False,
                              n_samples=len(pre_df_min),
                              random_state=123)

# Recombine into a frame with equal class counts.
balanced_df = pd.concat([pre_df_maj_dsample, pre_df_min])

balanced_df.SEVERITYCODE.value_counts()

2    58188
1    58188
Name: SEVERITYCODE, dtype: int64

In [11]:
# Feature matrix: the three integer-coded condition columns as a NumPy array.
feature_cols = ['WEATHER_CAT', 'ROADCOND_CAT', 'LIGHTCOND_CAT']
X = np.asarray(balanced_df[feature_cols])
X[:5]

array([[ 6,  8,  2],
       [ 1,  0,  5],
       [10,  7,  8],
       [ 1,  0,  5],
       [ 1,  0,  5]], dtype=int8)

In [12]:
# Target vector: the severity label of every row in the balanced frame.
y = np.asarray(balanced_df.SEVERITYCODE)
y[:5]

array([1, 1, 1, 1, 1], dtype=int64)

In [13]:
from sklearn import preprocessing

# Standardize every feature column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split performed later, so test rows influence the scaling
# statistics. The inputs are also nominal category codes; consider fitting
# on the training split only and/or one-hot encoding instead.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X[:5]

array([[ 1.15236718,  1.52797946, -1.21648407],
       [-0.67488   , -0.67084969,  0.42978835],
       [ 2.61416492,  1.25312582,  2.07606076],
       [-0.67488   , -0.67084969,  0.42978835],
       [-0.67488   , -0.67084969,  0.42978835]])

## Model and Evaluation

In [15]:
from sklearn.metrics import f1_score, jaccard_score, log_loss
from sklearn.model_selection import train_test_split

# Train and Test Sets: hold out 30% of the balanced rows for evaluation,
# with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

print('Train set rows:', X_train.shape[0])
print('Test set rows:', X_test.shape[0])

Train set rows: 81463
Test set rows: 34913


### K Nearest Neighbours

In [22]:
from sklearn.neighbors import KNeighborsClassifier

# Number of neighbours that vote on each prediction.
k = 14

# Fit the classifier on the training split.
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

# Predict the held-out labels and preview the first few.
knn_y_pred = knn.predict(X_test)
knn_y_pred[:5]

array([2, 2, 1, 1, 2], dtype=int64)

In [23]:
# Jaccard index of the KNN predictions. No `average`/`pos_label` argument is
# passed, so scikit-learn's defaults apply (binary scoring of label 1), whereas
# the F1 score in the next cell is macro-averaged over both classes.
jaccard_score(y_test, knn_y_pred)

0.31110811781609193

In [24]:
# Macro-averaged F1 for KNN: the unweighted mean of the per-class F1 scores.
f1_score(y_test, knn_y_pred, average='macro')

0.5484494712246419

### Decision Tree 

In [33]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree, capped at depth 7 to limit over-fitting.
# NOTE(review): no random_state is passed, so tie-breaking between equally
# good splits may differ between runs; consider fixing a seed.
dt = DecisionTreeClassifier(max_depth=7, criterion="entropy")
dt.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=7)

In [34]:
# Predict the held-out labels with the fitted decision tree.
dt_y_pred = dt.predict(X_test)

In [35]:
# Jaccard index of the decision-tree predictions (scikit-learn defaults:
# binary scoring of label 1, since no `average`/`pos_label` is passed).
jaccard_score(y_test, dt_y_pred)

0.2873687679487783

In [36]:
# Macro-averaged F1 for the decision tree.
f1_score(y_test, dt_y_pred, average='macro')

0.5450597937389444

### Logistic Regression

In [37]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Fit a logistic-regression classifier on the training split
# (C=6 with the liblinear solver).
LR = LogisticRegression(C=6, solver='liblinear').fit(X_train, y_train)

# Hard class predictions feed the Jaccard / F1 / accuracy cells below;
# class probabilities are needed for log loss and are computed exactly once.
LR_y_pred = LR.predict(X_test)
LR_y_prob = LR.predict_proba(X_test)

# Log loss of the predicted probabilities on the test split (lower is better).
log_loss(y_test, LR_y_prob)

0.6849535383198887

In [38]:
# Jaccard index of the logistic-regression predictions (scikit-learn defaults:
# binary scoring of label 1, since no `average`/`pos_label` is passed).
jaccard_score(y_test, LR_y_pred)

0.2720073907879108

In [39]:
# Macro-averaged F1 for logistic regression.
f1_score(y_test, LR_y_pred, average='macro')

0.511602093963383

### Model Accuracy 

In [40]:
from sklearn.metrics import accuracy_score

In [41]:
# Share of test rows the KNN model labels correctly.
knn_accuracy = accuracy_score(y_test, knn_y_pred)
print("KNN Accuracy: ", knn_accuracy)

KNN Accuracy:  0.5605361899578953


In [42]:
# Share of test rows the decision tree labels correctly.
dt_accuracy = accuracy_score(y_test, dt_y_pred)
print("Decision Tree Accuracy: ", dt_accuracy)

Decision Tree Accuracy:  0.5664365709048206


In [43]:
# Share of test rows the logistic-regression model labels correctly.
lr_accuracy = accuracy_score(y_test, LR_y_pred)
print("LR Accuracy: ", lr_accuracy)

LR Accuracy:  0.5260218256809784


### Thank you! 