In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# folium (pinned to 0.5.0, from conda-forge) is installed right before it is imported.
!conda install -c conda-forge folium=0.5.0 --yes
import folium
from folium.plugins import MarkerCluster
import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including any raised by the pandas / seaborn / sklearn calls in later cells.
warnings.filterwarnings("ignore")
# Apply seaborn's default plot styling.
sns.set()

Solving environment: \ 

In [None]:
# Download the collisions dataset into the working directory as data.csv.
# The data types are inspected in the following cell, once the file is loaded.
!wget -O data.csv "https://opendata.arcgis.com/datasets/5b5c745e0f1f48e7a53acec63a0022ab_0.csv"

In [None]:
# Load the downloaded CSV and summarise its columns and dtypes.
# SEVERITYCODE and UNDERINFL are read as strings because later cells match them
# against string codes ('0', '1', '2', '2b', '3', 'N', 'Y'). Without an explicit
# dtype, chunked type inference can leave ints and strings mixed in one column,
# and those lookups would then silently miss rows.
data = pd.read_csv("data.csv",
                   dtype={'SEVERITYCODE': str, 'UNDERINFL': str},
                   low_memory=False)
data.info()

In [None]:
# The X and Y fields hold the longitude and latitude of each collision.
# Plot the first 1000 collisions that have coordinates on a clustered map.
# The map object is not called `map`, so the builtin stays usable in later cells.
collision_map = folium.Map(location=[47.60, -122.33], zoom_start=12)
marker_cluster = MarkerCluster().add_to(collision_map)
# Drop rows missing either coordinate so that no marker gets a NaN position.
locations = data[['Y', 'X']].dropna().head(1000)
locationlist = locations.values.tolist()
for lat_lon in locationlist:
    folium.Marker(lat_lon).add_to(marker_cluster)
collision_map

In [None]:
# WEATHER describes the weather conditions at the time of the collision.
# Show how often each recorded value occurs.
weather_counts = data['WEATHER'].value_counts()
weather_counts.to_frame(name='count')

In [None]:
# ROADCOND describes the condition of the road during the collision.
# Show how often each recorded value occurs.
roadcond_counts = data['ROADCOND'].value_counts()
roadcond_counts.to_frame(name='count')

In [None]:
# LIGHTCOND describes the light conditions during the collision.
# Show how often each recorded value occurs.
lightcond_counts = data['LIGHTCOND'].value_counts()
lightcond_counts.to_frame(name='count')

In [None]:
# The SPEEDING field flags collisions in which speeding was a factor.
# Blank (missing) entries stand for "not speeding", so they are counted too
# (dropna=False) instead of being left out of the table. The column is labelled
# 'count' to match the other frequency tables in this notebook.
data['SPEEDING'].value_counts(dropna=False).to_frame('count')

In [None]:
# SEVERITYCODE holds a code for the severity of the collision; SEVERITYDESC
# holds the matching detailed description. Show the frequency of each code.
severity_counts = data['SEVERITYCODE'].value_counts()
severity_counts.to_frame(name='count')

In [None]:
# UNDERINFL records whether a driver involved was under the influence of drugs
# or alcohol: 0 and N mean "no", 1 and Y mean "yes".
# Show how often each raw value occurs.
underinfl_counts = data['UNDERINFL'].value_counts()
underinfl_counts.to_frame(name='count')

In [None]:
# PERSONCOUNT and VEHCOUNT give the number of people and vehicles involved in a
# collision. Summary statistics for PERSONCOUNT:
person_count = data['PERSONCOUNT']
person_count.describe()

In [None]:
# Summary statistics for VEHCOUNT.
vehicle_count = data['VEHCOUNT']
vehicle_count.describe()

In [None]:
# PEDCOUNT is the number of pedestrians involved in the collision, which helps
# to gauge its severity. Summary statistics:
pedestrian_count = data['PEDCOUNT']
pedestrian_count.describe()

In [None]:
# PEDCYLCOUNT is the number of bicycles involved in the collision, which helps
# to gauge its severity. Summary statistics:
bicycle_count = data['PEDCYLCOUNT']
bicycle_count.describe()

In [None]:
## Data Cleaning and Processing
# Removing unwanted columns, checking for blanks and duplicates.
# Number of missing values in each column of the raw data:
data.isnull().sum()


In [None]:
# Number of rows that exactly repeat an earlier row.
data.duplicated(keep='first').sum()

In [None]:
# Select the relevant fields and drop the rest.
# .copy() makes data_clean an independent frame: the following cells assign to
# and clean its columns, which must not act on (or warn about) a view of `data`.
data_clean = data[['X', 'Y', 'WEATHER', 'ROADCOND', 'LIGHTCOND',
                   'SPEEDING', 'SEVERITYCODE', 'UNDERINFL',
                   'SERIOUSINJURIES', 'FATALITIES', 'INJURIES',
                   'PERSONCOUNT', 'PEDCOUNT', 'PEDCYLCOUNT', 'VEHCOUNT']].copy()
data_clean.info()

In [None]:
# Encode SPEEDING as 1 for 'Y' and 0 for blanks.
# The result is assigned back to the column; the earlier in-place replace on a
# column selection is not guaranteed to write through to data_clean.
# The 0/1 keys keep already-encoded values unchanged, so re-running this cell
# does not turn every row into 0.
speeding_codes = {'Y': 1, 1: 1, 0: 0}
data_clean['SPEEDING'] = (
    data_clean['SPEEDING'].map(speeding_codes).fillna(0).astype(int)
)
data_clean['SPEEDING'].value_counts().to_frame()

In [None]:
# 'Unknown' and 'Other' entries carry no information and are treated as nulls.
# A SEVERITYCODE of 0 means unknown severity and is treated as null as well.
# Both steps assign their result back; the earlier in-place replace on a column
# selection is not guaranteed to write through to data_clean.
data_clean = data_clean.replace(['Unknown', 'Other'], np.nan)
# Match the code both as text and as a number in case the column was parsed numerically.
data_clean['SEVERITYCODE'] = data_clean['SEVERITYCODE'].replace(['0', 0], np.nan)

In [None]:
# Overview of the dataset: the heatmap shows where records have missing values.
missing_mask = data_clean.isnull()
sns.heatmap(missing_mask, cmap='YlGnBu_r')
plt.show()

In [None]:
# Drop every record that has a null value. The result is rebound to data_clean
# instead of mutating in place, so the frame's lineage stays explicit and no
# in-place operation runs on a frame derived from a slice of `data`.
data_clean = data_clean.dropna(axis=0)
# Plotting the null mask again confirms that no blanks are left.
sns.heatmap(data_clean.isnull(), cmap='YlGnBu_r')
plt.show()

In [None]:
# UNDERINFL is recorded inconsistently: N and 0 both mean "no", Y and 1 both
# mean "yes". Normalise it to 0 / 1.
# The numeric keys 0 and 1 cover values that were parsed as numbers and make
# the cell safe to re-run; without them such values would become NaN after the
# null rows have already been dropped.
underinfl_codes = {'N': 0, '0': 0, 0: 0, 'Y': 1, '1': 1, 1: 1}
data_clean['UNDERINFL'] = data_clean['UNDERINFL'].map(underinfl_codes)
data_clean.info()

In [None]:
# Distribution of the cleaned data: number of collisions per weather condition.
# The Series is passed as x= explicitly; passed positionally, newer seaborn
# releases take the first argument as `data`, not as the variable to count.
ax = sns.countplot(x=data_clean['WEATHER'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
plt.show()

In [None]:
# Number of collisions per road condition.
# The Series is passed as x= explicitly; passed positionally, newer seaborn
# releases take the first argument as `data`, not as the variable to count.
ax = sns.countplot(x=data_clean['ROADCOND'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
plt.show()

In [None]:
# Number of collisions per light condition.
# The Series is passed as x= explicitly; passed positionally, newer seaborn
# releases take the first argument as `data`, not as the variable to count.
ax = sns.countplot(x=data_clean['LIGHTCOND'])
ax.set_xticklabels(ax.get_xticklabels(), rotation=45,
                   horizontalalignment='right')
plt.show()

In [None]:
# Number of collisions by whether a driver was under the influence (0 = no, 1 = yes).
# The Series is passed as x= explicitly; passed positionally, newer seaborn
# releases take the first argument as `data`, not as the variable to count.
sns.countplot(x=data_clean['UNDERINFL'])
# Show the figure like the other plotting cells, instead of echoing the Axes repr.
plt.show()

In [None]:
# Scatter plot of the number of vehicles against the number of people involved
# in each collision.
x_field, y_field = 'VEHCOUNT', 'PERSONCOUNT'
ax = plt.scatter(data_clean[x_field], data_clean[y_field])
plt.xlabel(x_field)
plt.ylabel(y_field)
plt.show()

In [None]:
# Scatter plot of the number of vehicles against the number of injuries.
x_field, y_field = 'VEHCOUNT', 'INJURIES'
ax = plt.scatter(data_clean[x_field], data_clean[y_field])
plt.xlabel(x_field)
plt.ylabel(y_field)
plt.show()

In [None]:
# Scatter plot of the number of pedestrians against the number of people involved.
x_field, y_field = 'PEDCOUNT', 'PERSONCOUNT'
ax = plt.scatter(data_clean[x_field], data_clean[y_field])
plt.xlabel(x_field)
plt.ylabel(y_field)
plt.show()

In [None]:
# Correlation heatmap of the cleaned data.
# Only numeric/boolean columns are correlated: data_clean still holds text
# columns here (WEATHER, ROADCOND, LIGHTCOND, SEVERITYCODE), and recent pandas
# versions raise on .corr() when non-numeric columns are present.
numeric_part = data_clean.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_part.corr(), cmap='YlGnBu_r')
plt.show()

In [None]:
# One-hot encode the categorical WEATHER, ROADCOND and LIGHTCOND fields.
# The indicator columns are appended in the order ROADCOND, LIGHTCOND, WEATHER
# after the remaining (non-categorical) columns.
categorical_fields = ['WEATHER', 'ROADCOND', 'LIGHTCOND']
other_columns = data_clean.drop(categorical_fields, axis=1)
road_dummies = pd.get_dummies(data_clean['ROADCOND'])
light_dummies = pd.get_dummies(data_clean['LIGHTCOND'])
weather_dummies = pd.get_dummies(data_clean['WEATHER'])
data_clean = pd.concat(
    [other_columns, road_dummies, light_dummies, weather_dummies], axis=1
)

In [None]:
# Shuffle the rows and renumber the index so the row order of the source file
# does not carry over into modelling. The shuffle is seeded: without a seed the
# train/test split, and every metric after it, would change on each run.
data_clean = data_clean.sample(frac=1, random_state=42).reset_index(drop=True)
data_clean.head(5).T

In [None]:
# Correlation among the features of the encoded dataset.
# SEVERITYCODE is still a text column at this point, so only numeric/boolean
# columns are passed to .corr(); recent pandas versions raise otherwise.
# 'bool' is included because the indicator columns may be boolean.
numeric_part = data_clean.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_part.corr(), cmap='YlGnBu_r')
plt.show()

In [None]:
#
from sklearn import preprocessing
x = data_clean.drop(['SEVERITYCODE'], axis=1)
y = data_clean[['SEVERITYCODE']]
data_clean_scaled = preprocessing.StandardScaler().fit(x).transform(x)
data_clean_scaled[0:3]

In [None]:
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_clean_scaled, y, 
                                                    test_size=0.2, random_state=42)

In [None]:
## Modelling and Evaluation
# Decision Tree Model: entropy criterion, depth limited to 5.
# random_state is fixed so the fitted tree is the same on every run.
from sklearn.tree import DecisionTreeClassifier
dTreeModel = DecisionTreeClassifier(criterion='entropy', max_depth=5,
                                    random_state=42)
dTreeModel.fit(x_train, y_train)
dTreeModel

In [None]:
# Evaluate the decision tree on the held-out test set.
yHat = dTreeModel.predict(x_test)
dtree_report = classification_report(y_test, yHat)
print(dtree_report)

In [None]:
# Random Forest Classifier with 75 trees, evaluated on the held-out test set.
# random_state is fixed so the forest is reproducible. The target is passed as
# a 1-D column, since the estimator expects a 1-D y and otherwise has to
# convert the one-column frame.
from sklearn.ensemble import RandomForestClassifier
rfcModel = RandomForestClassifier(n_estimators=75, random_state=42)
rfcModel.fit(x_train, y_train['SEVERITYCODE'])
yHat = rfcModel.predict(x_test)
print(classification_report(y_test, yHat))

In [None]:
# Logistic regression model (C=0.01), evaluated on the held-out test set.
# The target is passed as a 1-D column, which is the shape the estimator expects.
from sklearn.linear_model import LogisticRegression
logRegModel = LogisticRegression(C=0.01)
logRegModel.fit(x_train, y_train['SEVERITYCODE'])
yHat = logRegModel.predict(x_test)
print(classification_report(y_test, yHat))

In [None]:
# Feed-forward network for the four severity classes.
import tensorflow as tf

model = tf.keras.models.Sequential([
    # x_train is already a 2-D (rows, features) array, so no Flatten layer is
    # needed; the first Dense layer declares the input width itself.
    tf.keras.layers.Dense(32, input_dim=x_train.shape[1], activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(32, activation='relu'),
    # softmax yields one probability distribution over the four mutually
    # exclusive classes, which is what categorical_crossentropy expects.
    tf.keras.layers.Dense(4, activation='softmax')
])

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

In [None]:
# Train the network for num_epochs epochs, holding out 20% of the training rows
# for validation.
num_epochs = 10
# Severity codes mapped to the class indices used by the network's output layer.
severity_to_index = {'1': 0, '2': 1, '2b': 2, '3': 3}
# num_classes is fixed so the one-hot matrix always has one column per class,
# even if the highest class is absent from the training rows.
y_train_onehot = tf.keras.utils.to_categorical(
    y_train['SEVERITYCODE'].map(severity_to_index),
    num_classes=len(severity_to_index),
    dtype='float32'
)
history = model.fit(x_train, y_train_onehot, epochs=num_epochs,
                    batch_size=50, validation_split=0.2)

In [None]:
# Training and validation loss per epoch.
epochs = range(1, num_epochs + 1)
loss_train, loss_validation = (
    history.history[key] for key in ('loss', 'val_loss')
)
for curve, fmt, label in ((loss_train, 'g', 'Training'),
                          (loss_validation, 'b', 'Validation')):
    plt.plot(epochs, curve, fmt, label=label)
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Training and validation accuracy per epoch.
epochs = range(1, num_epochs + 1)
acc_train, acc_validation = (
    history.history[key] for key in ('accuracy', 'val_accuracy')
)
for curve, fmt, label in ((acc_train, 'g', 'Training'),
                          (acc_validation, 'b', 'Validation')):
    plt.plot(epochs, curve, fmt, label=label)
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()


In [None]:
# Evaluate the network on the held-out test set.
# The predicted class is the index of the largest output in each row, taken in
# one vectorised call instead of a Python loop over rows.
severity_to_index = {'1': 0, '2': 1, '2b': 2, '3': 3}
yHat = model.predict(x_test)
yPred = np.argmax(yHat, axis=1)
# The true labels are mapped to the same class indices the network predicts.
print(classification_report(y_test['SEVERITYCODE'].map(severity_to_index), yPred))

In [None]:
# Compare the four models on the held-out test set. The bar heights are test
# accuracies computed from the fitted models, not hard-coded placeholders.
from sklearn.metrics import accuracy_score

severity_to_index = {'1': 0, '2': 1, '2b': 2, '3': 3}
y_true = y_test['SEVERITYCODE']
ann_pred = np.argmax(model.predict(x_test), axis=1)
model_scores = {
    'DTC': accuracy_score(y_true, dTreeModel.predict(x_test)),
    'RFC': accuracy_score(y_true, rfcModel.predict(x_test)),
    'LogReg': accuracy_score(y_true, logRegModel.predict(x_test)),
    # The network predicts class indices, so the labels are mapped to indices too.
    'ANN': accuracy_score(y_true.map(severity_to_index), ann_pred),
}
plt.bar(list(model_scores.keys()), list(model_scores.values()))
plt.ylabel('Test accuracy')
plt.ylim(0, 1)
plt.show()