In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

# Data Preprocessing

In [None]:
data = pd.read_csv('./Traffic_accidents.csv')
data.info()

In [None]:
data.head(3)

In [None]:
data.isnull().sum()

In [None]:
# Boolean map of the frame: a cell is True where the value is exactly 0.
# Row tick labels are suppressed because there is one row per record.
sns.heatmap(data.eq(0), yticklabels=False)

In [None]:
# Columns removed before modelling; everything else is kept as a feature or as the label.
unused_columns = [
    'OBJECTID', 'DATE_', 'TIME', 'SEVERITY_DESCRIPTION', 'CASUALTIES',
    'ACCIDENT_TYPE', 'ACCIDENT_DESCRIPTION', 'X', 'Y', 'RENDER',
]
data = data.drop(columns=unused_columns)

# Recode severity 1 as 2, i.e. fatal collisions are folded into the serious class.
data['SEVERITY'] = data['SEVERITY'].replace({1: 2})
data.head(3)

In [None]:
# Build the binary target directly from SEVERITY instead of one-hot encoding,
# renaming the surviving dummy column and bit-flipping it.
#   slight == 1  -> SEVERITY is 3 (presumably the "slight" code -- confirm against the data dictionary)
#   Severe == 1  -> every other row (after the recode above, the serious/fatal collisions)
# A direct comparison also avoids leaving stray dummy columns in the feature set if
# SEVERITY ever contains a level other than 2 or 3.
slight = (data['SEVERITY'] == 3).astype(int)

# Drop the raw label and append the new target as the last column.
data = data.drop(columns=['SEVERITY']).assign(Severe=1 - slight)
data.head(30)


<h1> Balancing the dataset </h1>

In [None]:
# Number of rows per class, and the size every class is reduced to.
class_counts = data['Severe'].value_counts()
num_samples_minority_class = class_counts.min()

# Undersample class by class: a class that is already at the minimum size is kept whole,
# any larger class is randomly sampled down to that size (fixed seed for reproducibility).
# Working per class also keeps both classes when they happen to be the same size, where
# picking "the" minority/majority label via idxmin/idxmax would select the same class twice.
balanced_parts = [
    class_rows
    if len(class_rows) == num_samples_minority_class
    else class_rows.sample(n=num_samples_minority_class, random_state=42)
    for _, class_rows in data.groupby('Severe')
]

# sort_index() puts the kept rows back in their original index order.
undersampled_data = pd.concat(balanced_parts).sort_index()

data = undersampled_data
data.head(10)

# Performance Evaluation Functions

In [None]:
def plot_dataset(X, y, axes, xlabel=r"$Age$", ylabel=r"$Glucose$"):
    """Scatter-plot the first two feature columns of a binary-labelled dataset.

    Rows labelled 0 are drawn as blue squares and rows labelled 1 as green
    triangles on the current matplotlib axes.

    Parameters
    ----------
    X : array-like
        Feature matrix; only columns 0 and 1 are plotted. Anything
        ``np.asarray`` can convert (ndarray, DataFrame, nested list) is accepted.
    y : array-like
        Labels aligned with the rows of ``X``; values 0 and 1 are plotted.
    axes : sequence
        Passed straight to ``plt.axis`` to fix the view limits.
    xlabel, ylabel : str, optional
        Axis labels. The defaults are the labels this helper always used, kept
        for backward compatibility; pass the real feature names when plotting
        other data.
    """
    # Positional indexing below needs plain arrays, not DataFrames or lists.
    X = np.asarray(X)
    y = np.asarray(y)

    plt.plot(X[:, 0][y == 0], X[:, 1][y == 0], "bs")
    plt.plot(X[:, 0][y == 1], X[:, 1][y == 1], "g^")
    plt.axis(axes)
    plt.grid(True, which='both')
    plt.xlabel(xlabel, fontsize=15)
    plt.ylabel(ylabel, fontsize=15, rotation=90)

In [None]:
# Shared, notebook-wide list of recorded results; filled through add_results().
results = list()


def add_results(result):
    """Record one result entry at the end of the shared ``results`` list."""
    results.extend([result])


def get_results():
    """Print the column header for the recorded entries and return the shared list itself."""
    header = "Model, Test Avg. Accuracy, Test Avg. Precision"
    print(header)
    return results

In [None]:
def evaluate_model(model):
    """Print hold-out and cross-validated metrics for a fitted classifier.

    Reads the notebook-level ``X_train``, ``X_test``, ``y_train`` and ``y_test``.
    Output, in order: confusion matrix, classification report and accuracy of
    ``model.predict(X_test)``, then the mean 10-fold cross-validation accuracy
    and macro precision, each computed on the training split and on the test split.

    NOTE(review): the "Test average ..." figures are produced by running
    cross_val_score on X_test/y_test, not by scoring the predictions of the
    already fitted model -- confirm this is the intended meaning of "test".

    Parameters
    ----------
    model : estimator
        Already fitted; must support ``predict`` and be usable with ``cross_val_score``.
    """

    def mean_cv_score(features, labels, metric):
        # Mean over 10 cross-validation folds for the requested metric, using all cores.
        fold_scores = cross_val_score(model, features, labels, scoring=metric, cv=10, n_jobs=-1)
        return fold_scores.mean()

    # Hold-out metrics from the fitted model's predictions on the test split.
    predictions = model.predict(X_test)
    holdout_sections = (
        ("\nConfusion Matrix:", confusion_matrix(y_test, predictions)),
        ("\nClassification Report:", classification_report(y_test, predictions)),
        ("\nAccuracy Score", accuracy_score(y_test, predictions)),
    )

    # Cross-validated accuracy on the training split (computed before anything is printed).
    train_cv_accuracy = mean_cv_score(X_train, y_train, 'accuracy')

    for heading, value in holdout_sections:
        print(heading)
        print(value)
    print("\n")
    print('\nAverage accuracy: ', train_cv_accuracy)

    # Cross-validated accuracy on the test split.
    print('\nTest average accuracy: ', mean_cv_score(X_test, y_test, 'accuracy'))

    # Cross-validated macro precision on the training split, then on the test split.
    print('\nAverage precision: ', mean_cv_score(X_train, y_train, 'precision_macro'))
    print('\nTest average precision: ', mean_cv_score(X_test, y_test, 'precision_macro'))

<h1> Splitting dataset </h1>

In [None]:
# Target is the binary 'Severe' flag; every other remaining column is a feature.
y = data['Severe']
X = data.drop(columns=['Severe'])

# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

<h1> Support Vector Machine </h1>

In [None]:
# One untuned SVC per kernel, all with the same seed.
linear_svm_model, poly_svm_model, rbf_svm_model = (
    SVC(kernel=kernel_name, random_state=0) for kernel_name in ('linear', 'poly', 'rbf')
)

# Each classifier gets its own StandardScaler so features are standardised before the SVM.
linear_svm_pipeline, poly_svm_pipeline, rbf_svm_pipeline = (
    make_pipeline(StandardScaler(), svm_model)
    for svm_model in (linear_svm_model, poly_svm_model, rbf_svm_model)
)


In [None]:
linear_svm_pipeline.fit(X_train, y_train)
poly_svm_pipeline.fit(X_train, y_train)
rbf_svm_pipeline.fit(X_train, y_train)

In [None]:
print("Linear Support Vector Machine:")
evaluate_model(model=linear_svm_pipeline)

In [None]:
print("Polynomial Support Vector Machine:")
evaluate_model(model=poly_svm_pipeline)

In [None]:
print("Radial Basis Function Support Vector Machine:")
evaluate_model(model=rbf_svm_pipeline)

<h1> Hyperparameter Tuning: </h1>

<h2> Linear Kernel Tuning: </h2>

<p> Narrowing Down on best C hyper parameter </p>

In [None]:
# Coarse search over C on a log scale.
# The search runs over the scaler + SVC pipeline (not the bare SVC) so that C is tuned on
# the same standardised features the evaluated pipeline sees; `svc__` addresses the SVC
# step that make_pipeline names after the lower-cased class.
parameter_grid = {'svc__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(estimator=linear_svm_pipeline, param_grid=parameter_grid, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Parameters", grid_search.best_params_)

In [None]:
# Fine search: 100 evenly spaced C values between 0.1 and 10.
# Tuned through the scaler + SVC pipeline so the chosen C matches the standardised
# features used when the tuned pipeline is evaluated.
parameter_grid = {'svc__C': np.linspace(0.1, 10, num=100, dtype=float)}

grid_search = GridSearchCV(estimator=linear_svm_pipeline, param_grid=parameter_grid, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Parameters", grid_search.best_params_)

The best value found for the linear kernel's C hyperparameter is 3.

<h2> Polynomial kernel tuning: </h2>

In [None]:
# Joint search over C, polynomial degree and gamma (3-fold CV, progress logged via verbose=2).
# gamma and C depend on feature scale, so the search runs over the scaler + SVC pipeline
# rather than the bare SVC; `svc__` routes each parameter to the SVC step.
parameter_grid = {'svc__C': [0.001, 0.01, 0.1, 1, 10, 100],
                  'svc__degree': [1, 2, 3, 4],
                  'svc__gamma': [0.01, 0.1, 1, 10]}
grid_search = GridSearchCV(estimator=poly_svm_pipeline, param_grid=parameter_grid, scoring='accuracy', cv=3, n_jobs=-1,
                           verbose=2)
grid_search.fit(X_train, y_train)
print("Best Parameters", grid_search.best_params_)

The best hyperparameters found for the polynomial kernel are: 
C = 0.01
degree = 2
gamma = 1


<h2> Radial Basis Function kernel tuning: </h2>

In [None]:
# Coarse log-scale search over C and gamma (3-fold CV).
# - The duplicated C=1 entry is removed: it only repeated identical fits.
# - The search runs over the scaler + SVC pipeline so C/gamma are tuned on standardised
#   features, matching how the tuned model is evaluated.
parameter_grid = {'svc__C': [0.01, 0.1, 1, 10, 100, 1000],
                  'svc__gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(estimator=rbf_svm_pipeline, param_grid=parameter_grid, scoring='accuracy', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Parameters", grid_search.best_params_)

In [None]:
# Fine search: 100 C values in [1, 100] x 100 gamma values in [0.01, 1], 3-fold CV
# (10,000 combinations, so this cell is slow).
# Tuned through the scaler + SVC pipeline so the result applies to standardised features.
parameter_grid = {'svc__C': np.linspace(1, 100, num=100, dtype=float),
                  'svc__gamma': np.linspace(0.01, 1, num=100, dtype=float)}
grid_search = GridSearchCV(estimator=rbf_svm_pipeline, param_grid=parameter_grid, scoring='accuracy', cv=3, n_jobs=-1)
grid_search.fit(X_train, y_train)
print("Best Parameters", grid_search.best_params_)

The best hyperparameters found for the Radial Basis Function Kernel are: C = 18.0, gamma = 0.09

<p>Linear Kernel: C = 3</p>
<p>Poly Kernel: C = 0.01, gamma = 1, degree = 2</p>
<p>RBF Kernel:  C = 18.0, gamma = 0.09</p>

<h1> Tuned Support Vector Machine Model Evaluation: </h1>

In [None]:
# Hyperparameters per kernel, as listed in the tuning summary above.
tuned_svm_params = {
    'linear': dict(C=3),
    'poly': dict(C=0.01, gamma=1, degree=2),
    'rbf': dict(C=18.0, gamma=0.09),
}

# Rebuild the three classifiers with their tuned settings (same seed as before) ...
linear_svm_model, poly_svm_model, rbf_svm_model = (
    SVC(kernel=kernel_name, random_state=0, **kernel_params)
    for kernel_name, kernel_params in tuned_svm_params.items()
)

# ... and wrap each one in a fresh scaling pipeline.
linear_svm_pipeline, poly_svm_pipeline, rbf_svm_pipeline = (
    make_pipeline(StandardScaler(), svm_model)
    for svm_model in (linear_svm_model, poly_svm_model, rbf_svm_model)
)

In [None]:
linear_svm_pipeline.fit(X_train, y_train)
poly_svm_pipeline.fit(X_train, y_train)
rbf_svm_pipeline.fit(X_train, y_train)

In [None]:
print("Tuned Linear Support Vector Machine:")
evaluate_model(model=linear_svm_pipeline)

In [None]:
print("Tuned Polynomial Support Vector Machine:")
evaluate_model(model=poly_svm_pipeline)

In [None]:
print("Tuned Radial Basis Function Support Vector Machine:")
evaluate_model(model=rbf_svm_pipeline)

Overall, once tuned, the support vector machine with the radial basis function kernel performed better than the other kernels.
Its average accuracy and test average accuracy were higher than those of any other kernel, and it also had the highest f1-score of all the kernels.

<h1> Ensemble model testing: </h1>

In [None]:
# Baseline random forest: library-default hyperparameters, fixed seed, all cores.
forest_settings = dict(random_state=0, n_jobs=-1)
ensemble_model = RandomForestClassifier(**forest_settings)

# Same scaler + classifier layout as the SVM pipelines; the fitted pipeline is the cell output.
ensemble_pipe_line = make_pipeline(StandardScaler(), ensemble_model)
ensemble_pipe_line.fit(X_train, y_train)

In [None]:
evaluate_model(ensemble_pipe_line)

<h2> Tuning ensemble hyperparameters: </h2> 

In [None]:
# Candidate values for the random forest: forest size, features per split, and two size limits per tree.
parameter_grid = dict(
    n_estimators=[25, 50, 100, 150],
    max_features=['sqrt', 'log2', None],
    max_depth=[3, 6, 9],
    max_leaf_nodes=[3, 6, 9],
)

# NOTE: despite the variable name, this is an exhaustive GridSearchCV (3-fold CV),
# not a randomised search.
random_search = GridSearchCV(ensemble_model, parameter_grid, scoring='accuracy', cv=3, n_jobs=-1)
random_search.fit(X_train, y_train)
print("Best Parameters", random_search.best_params_)

In [None]:
# Tuned random forest settings (presumably the best parameters printed by the grid search
# above -- confirm after re-running it).
tuned_forest_params = dict(n_estimators=25, max_depth=3, max_leaf_nodes=9, max_features='sqrt')
ensemble_model = RandomForestClassifier(random_state=0, n_jobs=-1, **tuned_forest_params)

# Refit inside the same scaler + classifier pipeline; the fitted pipeline is the cell output.
ensemble_pipe_line = make_pipeline(StandardScaler(), ensemble_model)
ensemble_pipe_line.fit(X_train, y_train)

In [None]:
evaluate_model(ensemble_pipe_line)

<h1> Final Results: </h1>

<p> The best model was the ensemble model, which gave a higher average and test average accuracy / precision than the runner-up model, the radial basis function support vector machine. </p>
<p> Ensemble model: average accuracy = 0.619, test average accuracy = 0.609, average precision = 0.624, test average precision = 0.616</p>
<p> Radial Basis Function SVM: average accuracy = 0.618, test average accuracy = 0.597, average precision = 0.624, test average precision = 0.602</p>
<p> The f1-score for the ensemble model was higher overall with class 0 being 0.65 and class 1 being 0.64, compared to the svm which got an f1 score for class 0 of 0.62 and an f1 score for class 1 of 0.65 </p>
<p> Overall, the accuracies are relatively low, probably because the dataset contains few serious and fatal collisions, which forced undersampling of the slight collisions. </p>