In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold,train_test_split,StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.utils import class_weight
from imblearn.over_sampling import RandomOverSampler
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [122]:
# Read & Adjust Data from CSV file

data = pd.read_csv("C:/Users/Mahmoud/Documents/GitHub/Machine-Learning-Classification-Project/Preprocessing/2017_Accidents_UK_Clean _Mahmoud.csv",dtype=float)
x = data.iloc[:, :-1] 
y = data.iloc[:, -1]  

# KNN doesn't support regularization, we cant increase the weights of the data inside the model. We can only adjust the weights
# of the near/far data.
# So instead of weighting the data, we resampled the miniroity classes to achieve balance.
# We also applied pca on scaled data (as it affects pca severily), to decrease its huge dimensions. As, it's very suspecious to have usless/correlated features.

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

priciple_component_analayzer = PCA(n_components=15)  
x_pca = priciple_component_analayzer.fit_transform(x_scaled)

oversampler = RandomOverSampler(random_state=10)
x_resampled, y_resampled = oversampler.fit_resample(x_pca, y)

# added this later, so used naive naming techniques to not miss with the code's structure
x=pd.DataFrame(x_resampled)
y=y_resampled

x_training, x_testing, y_training, y_testing = train_test_split(x, y, test_size=0.15, random_state=10)

x=x_training
y=y_training



In [123]:
# We try different parameters and apply K-folding on them. Our evaluation metrices will be accuracy, precision, and recall.
# We weren't able to apply ROC, as it's a multi-class output, and having 1v2 roc isn't the best metric. 
# We store the results to choose the best one later.

k_values = np.arange(3, 100, 10)                         # Choosing K from a low value to 100. We tried high values (around the square root of the number of samples); however, they had nearly the same results, and they are computationally hungry. 
weight_options = ['uniform', 'distance']                  # Also, the most commong weight/distance formulas
metric_options = ['euclidean', 'manhattan',]
results = []   

# Perform k-fold cross-validation for each parameter combination
for k in k_values:
    for weight in weight_options:
        for metric in metric_options:
            accuracy_scores = []    
            precision_scores = []  
            recall_scores = []      
            
            my_k_fold = StratifiedKFold(n_splits=5, random_state=10, shuffle=True)
            
            for train_index, test_index in my_k_fold.split(x,y):
                X_train, X_cross = x.iloc[train_index], x.iloc[test_index]
                y_train, y_cross = y.iloc[train_index], y.iloc[test_index]

                knn_model = KNeighborsClassifier(n_neighbors=k, weights=weight, metric=metric)
                knn_model.fit(X_train, y_train)
                y_pred = knn_model.predict(X_cross)

                accuracy = accuracy_score(y_cross, y_pred)
                precision = precision_score(y_cross, y_pred, average='macro',zero_division=0)
                recall = recall_score(y_cross, y_pred, average='macro')

                accuracy_scores.append(accuracy)
                precision_scores.append(precision)
                recall_scores.append(recall)
            
            this_result = {  'k': k,
                        'metric': metric,
                        'weight': weight,
                        'accuracy': np.mean(accuracy_scores),
                        'precision': np.mean(precision_scores),
                        'recall': np.mean(recall_scores) }
            print(this_result)
            results.append(this_result)  
          
            


{'k': 3, 'metric': 'euclidean', 'weight': 'uniform', 'accuracy': 0.8412635098910066, 'precision': 0.8524441975472697, 'recall': 0.8412621355883984}
{'k': 3, 'metric': 'manhattan', 'weight': 'uniform', 'accuracy': 0.8410449030773062, 'precision': 0.8522540932821456, 'recall': 0.8410435510925556}
{'k': 3, 'metric': 'euclidean', 'weight': 'distance', 'accuracy': 0.8783556392495913, 'precision': 0.899365635411883, 'recall': 0.8783492884975213}
{'k': 3, 'metric': 'manhattan', 'weight': 'distance', 'accuracy': 0.8785078776342466, 'precision': 0.8993932341863486, 'recall': 0.8785015652031369}
{'k': 13, 'metric': 'euclidean', 'weight': 'uniform', 'accuracy': 0.7294874915693927, 'precision': 0.7197667009750761, 'recall': 0.7295018055771324}
{'k': 13, 'metric': 'manhattan', 'weight': 'uniform', 'accuracy': 0.73122460150548, 'precision': 0.7217479512086866, 'recall': 0.7312387550299474}
{'k': 13, 'metric': 'euclidean', 'weight': 'distance', 'accuracy': 0.8523771002393816, 'precision': 0.879918685

In [124]:
# We favor precision and recall over accuracy ; however, we still take accuracy into consideration. 
# So we take the top 10 accuracy scores passing the average precision and recall values.
# After this we, take the top 5 of them based on precision. Then we take the one having the maximum recall.

precision_values = [result['precision'] for result in results]
all_models_average_precision_scores = np.mean(precision_values)

recall_values = [result['recall'] for result in results]
all_models_average_recall_scores = np.mean(recall_values)

filtered_results = []
for result in results:
    if result['precision'] >= all_models_average_precision_scores and result['recall'] >= all_models_average_recall_scores :
        filtered_results.append(result)
        
top_10_results = sorted(filtered_results, key=lambda x: x['accuracy'], reverse=True)[:10]

precision_values = [result['precision'] for result in top_10_results]
top_10_results_average_precision_scores = np.mean(precision_values)


filtered_results_2 = []
for result in top_10_results:
    if result['precision'] >= top_10_results_average_precision_scores:
        filtered_results_2.append(result)
top_5_results = sorted(filtered_results_2, key=lambda x: x['precision'], reverse=True)[:5]

best_model = max(top_5_results, key=lambda x: x['recall'])

print(best_model)

{'k': 3, 'metric': 'manhattan', 'weight': 'distance', 'accuracy': 0.8785078776342466, 'precision': 0.8993932341863486, 'recall': 0.8785015652031369}


In [125]:
# Create the final model with the best parameters

# On trying different parameters, we picked the most suitable one. The KNN shows promsing results achieving one of the highest scores
# on out evaluation metrices. It could be also improved by some domain knowledge, and even using more supportive models
# (anomaly detection, clustering..etc).

final_knn_model = KNeighborsClassifier(
    n_neighbors=best_model['k'],
    weights=best_model['weight'],
    metric=best_model['metric'] )
final_knn_model.fit(x, y)


y_prediction = final_knn_model.predict(x_testing)

final_accuracy = accuracy_score(y_testing, y_prediction)
final_precision = precision_score(y_testing, y_prediction, average='macro',zero_division=0)
final_recall = recall_score(y_testing, y_prediction, average='macro')
my_confusion_matrix = confusion_matrix(y_testing, y_prediction, labels=[0, 1, 2])

print("Testing Accuracy:", final_accuracy)
print("Testing Precision:", final_precision)
print("Testing Recall:", final_recall)

print("Confusion Matrix:")
print("True Fatal: " , my_confusion_matrix[0,0],  "False Serious: ", my_confusion_matrix[0,1], "False Slight: ",my_confusion_matrix[0,2] )
print("False Fatal: " , my_confusion_matrix[1,0],  "True Serious: ", my_confusion_matrix[1,1], "False Slight: ",my_confusion_matrix[1,2])
print("False Fatal: " , my_confusion_matrix[2,0],  "False Serious: ", my_confusion_matrix[2,1], "True Slight: ",my_confusion_matrix[2,2])


# confusion matrix explanation (applies to more dimensions)
#                  Predicted bird                         Predicted cat
# Actual bird       True bird                               False Cat 
# Actual cat        False Bird                             True Cat  

Testing Accuracy: 0.8962107638197624
Testing Precision: 0.91554023582485
Testing Recall: 0.8962471143727652
Confusion Matrix:
True Fatal:  15082 False Serious:  0 False Slight:  0
False Fatal:  4 True Serious:  14915 False Slight:  131
False Fatal:  269 False Serious:  4288 True Slight:  10518
