<a href="https://colab.research.google.com/github/willjhliang/traffic-sign-recognition/blob/main/models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download dataset from github repo
# Only the repository's `data/` directory is kept; the rest of the clone is
# deleted afterwards.
# NOTE(review): `sample_data` is presumably the folder Colab pre-populates —
# on any other runtime this first `rm` just reports a missing directory.
!rm -r sample_data
!git clone https://github.com/willjhliang/traffic-sign-recognition.git
# Move the dataset next to the notebook so later cells can use `data/...` paths
!mv traffic-sign-recognition/data .
!rm -r traffic-sign-recognition

In [None]:
import os
from copy import deepcopy
import itertools
import numpy as np
import pandas as pd
from PIL import Image
from matplotlib import pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold

In [None]:
# Class indices run from 0 to K - 1 (cells below iterate over range(K)).
K = 58  # Number of classes

# Single seed for any randomized step, kept here so runs are reproducible.
random_seed = 19104

# Data Exploration

In [None]:
def load_data(datapath, size=(32, 32)):
    """Load every image in a directory, grouped by class index.

    The class of an image is parsed from the first three characters of its
    filename (e.g. ``005_xyz.png`` -> class 5).

    Args:
        datapath: Directory containing the image files.
        size: ``(width, height)`` every image is resized to. Defaults to
            ``(32, 32)``.

    Returns:
        dict mapping each class index in ``range(K)`` to a list of numpy
        arrays, one per image of that class (empty list if the class has no
        images in ``datapath``).

    Raises:
        ValueError: If a filename does not start with a three-character integer.
        KeyError: If a parsed class index is outside ``range(K)``.
    """
    data = {k: [] for k in range(K)}
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and anything downstream that depends on it) reproducible.
    for fname in sorted(os.listdir(datapath)):
        label = int(fname[:3])
        # Context manager guarantees the underlying file handle is released.
        with Image.open(os.path.join(datapath, fname)) as img:
            data[label].append(np.array(img.resize(size)))
    return data

In [None]:
# Label table shipped with the dataset.
# NOTE(review): presumably maps class index to a sign name — confirm against the CSV.
labels = pd.read_csv("data/labels.csv")

# Each split is a dict {class index -> list of image arrays} (see load_data).
train_data = load_data('data/images/train')
test_data = load_data('data/images/test')

In [None]:
# Preview grid: the first training image of every class, 10 classes per row.
n_cols = 10
# Ceiling division so the grid always has room for all K classes
# (6 rows for K = 58) instead of relying on a hard-coded 6 x 10 layout.
n_rows = (K + n_cols - 1) // n_cols
# squeeze=False keeps `axs` two-dimensional even if there is a single row.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(1.5 * n_cols, 2.5 * n_rows), squeeze=False)
for k, ax in enumerate(axs.flat):  # row-major order, one axis per class index
    ax.axis('off')
    # Axes past the last class, and classes without any training image, stay blank.
    if k < K and len(train_data[k]) > 0:
        ax.imshow(train_data[k][0])


In [None]:
# Number of training images per class, to make class imbalance visible.
class_counts = [len(train_data[k]) for k in range(K)]
img_dist = plt.bar(list(range(K)), class_counts)
plt.xlabel("Class index")
plt.ylabel("Number of training images")
plt.title("Training images per class")
plt.show()

In [None]:
def prepare_data(data):
    """Convert a ``{class index: [image arrays]}`` dict into model-ready arrays.

    Classes are visited in ascending index order and images keep their order
    within each class, so samples come out grouped by class.

    Args:
        data: dict mapping a class index to a list of equally shaped image
            arrays (the structure returned by ``load_data``).

    Returns:
        Tuple ``(X, X_flattened, y)``:
            X: array of all images stacked along a new first axis.
            X_flattened: 2-D array with one row per image, each row being the
                image's values in C order; independent of ``X`` in memory.
            y: 1-D array with the class index of every image.
        All three are empty 1-D arrays when ``data`` contains no images.
    """
    X = []
    y = []
    # Iterate over the classes actually present in `data` rather than a global
    # class count, so partial or differently sized dicts work too.
    for label in sorted(data):
        for img in data[label]:
            X.append(img)
            y.append(label)

    X = np.array(X)
    y = np.array(y)
    if len(X) == 0:
        # reshape(0, -1) is ambiguous for an empty array; keep an empty 1-D result.
        X_flattened = np.array([])
    else:
        # A single reshape replaces deep-copying the list and flattening image
        # by image; copy() keeps X_flattened from aliasing X.
        X_flattened = X.reshape(len(X), -1).copy()
    return X, X_flattened, y

In [None]:
# Array form of both splits: stacked images, one flattened feature row per
# image, and the class index of each image.
X_train, X_train_flattened, y_train = prepare_data(train_data)
X_test, X_test_flattened, y_test = prepare_data(test_data)

# Models

## Baseline KNN

Train a baseline K-Nearest Neighbors model to classify traffic sign images. Use 10-fold cross-validation to choose the best number of neighbors, k.

In [None]:
# Select the number of neighbors by 10-fold cross-validation on the training set.
# The samples are ordered by class, so the folds must be shuffled: otherwise a
# validation fold holds classes that are under-represented or absent in the
# matching training folds. A fixed random_state keeps the folds identical for
# every candidate k and across runs.
n_folds = 10
kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_seed)
best_k = -1
best_acc = 0
val_accuracies = []  # mean validation accuracy, one entry per candidate k

for k_neighbors in range(1, 30, 2):
    # Candidate values are the odd numbers 1, 3, ..., 29
    fold_accuracies = []  # reset once per candidate, not once per fold

    for train_index, val_index in kf.split(X_train_flattened):  # Iterate through all folds
        # Split data into training data and validation data
        X_train_fold, X_val_fold = X_train_flattened[train_index], X_train_flattened[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

        # Train KNN model and record its accuracy on the held-out fold
        knn_model = KNeighborsClassifier(n_neighbors=k_neighbors)
        knn_model.fit(X_train_fold, y_train_fold)
        fold_accuracies.append(knn_model.score(X_val_fold, y_val_fold))

    # Average accuracy over all folds for this k value
    avg_acc = sum(fold_accuracies) / len(fold_accuracies)
    val_accuracies.append(avg_acc)
    if avg_acc > best_acc:
        best_acc = avg_acc
        best_k = k_neighbors
    

In [None]:
# Validation accuracy for each candidate k (same grid as the search: odd k from 1 to 29).
k_values = list(range(1, 30, 2))
plt.plot(k_values, val_accuracies)
plt.xlabel("Number of neighbors (k)")
plt.ylabel("Validation accuracy")
plt.title("KNN cross-validation accuracy")
plt.show()
print("Best k: ", best_k)

In [None]:
# Refit KNN on the full training set with the selected k, then report
# accuracy on the held-out test set (shown as the cell output).
model = KNeighborsClassifier(n_neighbors=best_k).fit(X_train_flattened, y_train)
model.score(X_test_flattened, y_test)