# K-Nearest Neighbors for Traffic Sign Classification
In this project, I solved a multi-class image classification problem by implementing a K-Nearest Neighbours (KNN)
classifier. The dataset consists of images from 58 different categories of traffic signs. The main tasks were
preprocessing the data, balancing the dataset through augmentation, and using GridSearchCV to maximise model
performance.

# Import Libraries
Libraries Used:
cv2: For reading, resizing, and augmenting images.
NumPy: For numerical array processing.
Matplotlib: For class distribution visualisation.
sklearn: For hyperparameter tuning, model evaluation, and training.

In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler


# Augmentation
Data augmentation ensures a minimum number of samples for every class. To improve the dataset, I used several kinds
of transformations (horizontal flip, 90-degree rotations, Gaussian blur, and Gaussian noise):


In [None]:

# Augment Image Function
def augment_image(image):
    """Return a randomly augmented copy of ``image``.

    One of five transformations is picked uniformly at random: horizontal
    flip, 90-degree clockwise rotation, 90-degree counter-clockwise rotation,
    5x5 Gaussian blur, or additive Gaussian noise (mean 0, sigma 5).

    Args:
        image: Image array as produced by OpenCV. The noise branch assumes
            8-bit pixel values in the range 0-255.

    Returns:
        The transformed image as a new array.
    """

    def add_gaussian_noise(x):
        # Add the noise in floating point and clip before converting back to
        # uint8. Casting the signed noise straight to uint8 wraps negative
        # values around (e.g. -3 -> 253) and uint8 addition overflows, which
        # turns mild noise into salt-and-pepper artefacts.
        noisy = x.astype(np.float32) + np.random.normal(0, 5, x.shape)
        return np.clip(np.rint(noisy), 0, 255).astype(np.uint8)

    transformations = [
        lambda x: cv2.flip(x, 1),
        lambda x: cv2.rotate(x, cv2.ROTATE_90_CLOCKWISE),
        lambda x: cv2.rotate(x, cv2.ROTATE_90_COUNTERCLOCKWISE),
        lambda x: cv2.GaussianBlur(x, (5, 5), 0),
        add_gaussian_noise,
    ]
    return random.choice(transformations)(image)

To improve the dataset, I created an augment_image function that applies one randomly chosen transformation
(horizontal flip, 90-degree rotation, Gaussian blur, or additive Gaussian noise) to an image.

#  Load and Augment the Dataset

In [None]:

# paths
data_path = '/Users/fatima..../Documents/GitHub/Intro-AI-Coursework/data/traffic_Data/DATA'

# Data Augmentation
images_augmented, class_ids_augmented = [], []
min_samples = 100  # every class that has at least one image is padded up to this size

for class_id in range(58):  # Iterate through all 58 classes
    directory = os.path.join(data_path, str(class_id))
    if not os.path.isdir(directory):
        continue

    # Keep this class's images in their own list so that augmentation can
    # only ever draw from pictures that really belong to the class.
    class_images = []
    # Sorted so that the sample order does not depend on the filesystem.
    for img_file in sorted(os.listdir(directory)):
        image = cv2.imread(os.path.join(directory, img_file))
        if image is None:
            continue  # cv2.imread returns None for files it cannot decode

        image = cv2.resize(image, (32, 32))
        class_images.append(cv2.cvtColor(image, cv2.COLOR_BGR2GRAY))

    if not class_images:
        # Nothing to augment from; say so instead of silently skipping or
        # reusing an image left over from a previous class.
        print(f"Warning: no readable images for class {class_id}; class skipped.")
        continue

    images_augmented.extend(img.flatten() for img in class_images)
    class_ids_augmented.extend([class_id] * len(class_images))

    # Augment if the class has fewer than min_samples. Each synthetic sample
    # starts from a randomly chosen original of the same class, rather than
    # always from the last file that happened to be read.
    for _ in range(min_samples - len(class_images)):
        source_image = random.choice(class_images)
        images_augmented.append(augment_image(source_image).flatten())
        class_ids_augmented.append(class_id)

# Verify Augmented Dataset Distribution ---
class_distribution = Counter(class_ids_augmented)
print("Class distribution after augmentation:", class_distribution)

Every image is resized to 32x32 pixels and converted to greyscale.
If a class contains fewer than 100 samples, augmented copies are added until it reaches 100, which balances the dataset.

# Class Distribution

In [None]:

# Bar chart: number of samples held by each class after augmentation
class_labels = class_distribution.keys()
sample_counts = class_distribution.values()

_, ax = plt.subplots(figsize=(12, 6))
ax.bar(class_labels, sample_counts)
ax.set_xlabel('Class ID')
ax.set_ylabel('Number of Samples')
ax.set_title('Balanced Class Distribution After Augmentation')
plt.show()

# Random Baseline Accuracy
To provide a performance baseline for multi-class classification:

In [None]:

# Multi-class Random Baseline Accuracy
# Expected accuracy of guessing labels at random in proportion to the class
# frequencies: the sum over classes of (class probability squared).
total_samples = sum(class_distribution.values())
class_probabilities = [count / total_samples for count in class_distribution.values()]
num_classes = len(class_distribution)  # Total number of classes
random_baseline_accuracy = sum(p ** 2 for p in class_probabilities)

print("\n--- Multi-class Baseline Accuracy ---")
print(f"Number of Classes: {num_classes}")
print(f"Multi-class Random Baseline Accuracy: {random_baseline_accuracy:.3f}")

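In multi-class classification, the random baseline accuracy is the expected accuracy of a classifier that guesses labels at random in proportion to the class frequencies; it equals the sum of the squared class probabilities.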

# Preprocess Data
To improve KNN performance, standardise the dataset:

In [None]:

# Prepare Data for Baseline Model
# Label vector and feature matrix built from the augmented sample lists
y = np.array(class_ids_augmented)
X = np.array(images_augmented)

# Scale every feature to zero mean and unit variance.
# NOTE(review): the scaler statistics here are computed from the full
# dataset, i.e. before any train/test split.
scaler = StandardScaler()
scaler.fit(X)
X_standardized = scaler.transform(X)

# Train-Test Split
Split the dataset into 50/50 for training and testing:

In [None]:

# Split Data into Training and Test Sets (50/50 split)
# The raw features are split first; with the same random_state this yields
# the same partition of samples as splitting the pre-scaled matrix would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# Fit the scaler on the training half only and reuse its statistics for the
# test half. Fitting on the full dataset would leak test-set means/variances
# into training and make the reported test accuracy optimistic.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Train the Baseline KNN Model

In [None]:
# Baseline KNN Model: k = 5 with scikit-learn's remaining defaults
print("\n--- Baseline KNN Model Performance ---")
baseline_knn = KNeighborsClassifier(n_neighbors=5)
baseline_knn.fit(X_train, y_train)

# Score the baseline on the held-out half of the data
y_pred = baseline_knn.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)
print(f"Baseline KNN Test Accuracy (50/50 split): {baseline_accuracy:.3f}")
print(
    "Classification Report:",
    classification_report(y_test, y_pred, zero_division=0),
    sep="\n",
)
print("Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Hyperparameter Tuning with GridSearchCV
To find the optimal KNN parameters:

In [None]:
# Hyperparameter Tuning with GridSearchCV
print("\n--- Hyperparameter Tuning Using GridSearchCV ---")

# 4 neighbour counts x 2 weighting schemes x 2 distance metrics = 16 candidates
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# 5-fold cross-validation on the training set, scored by accuracy,
# using all available CPU cores (n_jobs=-1)
grid_search = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

optimized_knn = grid_search.best_estimator_
print(f"Best Hyperparameters: {grid_search.best_params_}")

# Evaluate Optimized KNN

In [None]:
# Score the tuned model on the same held-out test set as the baseline
y_pred_optimized = optimized_knn.predict(X_test)
optimized_accuracy = accuracy_score(y_test, y_pred_optimized)
print(f"Optimized KNN Test Accuracy: {optimized_accuracy:.3f}")
print(
    "Classification Report (Optimized):",
    classification_report(y_test, y_pred_optimized, zero_division=0),
    sep="\n",
)

# Compare Results

In [None]:
# Compare Results: random guess vs. baseline KNN vs. tuned KNN
gain_over_knn = optimized_accuracy - baseline_accuracy
gain_over_random = optimized_accuracy - random_baseline_accuracy

print("\n--- Comparison of Baseline and Optimized Model ---")
print(f"Random Baseline Accuracy: {random_baseline_accuracy:.3f}")
print(f"Baseline KNN Accuracy: {baseline_accuracy:.3f}")
print(f"Optimized KNN Accuracy: {optimized_accuracy:.3f}")
print(f"Improvement over Baseline KNN: {gain_over_knn:.3f}")
print(f"Improvement over Random Baseline: {gain_over_random:.3f}")


The KNN results have been improved by data standardisation and augmentation.
The model's performance was greatly enhanced by hyperparameter tuning.