<a href="https://colab.research.google.com/github/wongzw/IS4242_Group_4/blob/4-knn-model/knn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import joblib
import cv2
import os
import re
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score
from google.colab import drive
from tqdm import tqdm
from PIL import Image

!pip install mediapipe

import tensorflow as tf
import mediapipe as mp


from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mediapipe
  Downloading mediapipe-0.9.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.6/33.6 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mediapipe
Successfully installed mediapipe-0.9.2.1
Mounted at /content/drive


### Attempt without Mediapipe

Load and convert into a 2d arrary image

In [None]:
folder_name = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/data" # Update with Folder Details

In [None]:
image_paths = os.listdir(folder_name)
img_list = []
label_list = []
txt = 'abcdefghiklmnopqrstuvwxy'

for i in tqdm(range(len(os.listdir(folder_name)))):
  filename = os.listdir(folder_name)[i]
  if filename.endswith(('.jpg','.png')):
      label_name = re.split(r'[-_]', filename)[0].lower()
      if label_name not in txt:
        continue
      label_list.append(label_name)
      img = Image.open(os.path.join(folder_name, filename))
      img = img.resize((28, 28), Image.ANTIALIAS)
      img = img.convert('L')
      img_array = np.array(img)
      
      img_list.append(img_array)


100%|██████████| 1221/1221 [02:48<00:00,  7.24it/s]


#### Train-Test Split

*Called in case where X and Y already pickled*

In [None]:
output_dir = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/"
X = joblib.load(output_dir+'X_knn.pkl')
y = joblib.load(output_dir+'y_knn.pkl')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

In [None]:
X = np.array(img_list)
y = np.array(label_list)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

In [None]:
output_dir = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/"

joblib.dump(X, output_dir+'X_knn.pkl')
joblib.dump(y, output_dir+'y_knn.pkl')

['/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/y_knn.pkl']

#### Data Augmentation on Training Data

In [None]:
import albumentations as A

transform = A.Compose([
    A.RandomBrightnessContrast(),
    A.Affine(scale=[0.8,1.2],translate_percent=0.05, shear=0.2, keep_ratio=True, p=0.5),
    A.Rotate(limit=10)
])

In [None]:
augmented_X = []
augmented_y = []
for i in range(len(X_train)):
  for j in (range(10)):
      transformed = transform(image=X_train[i])
      augmented_X.append(transformed['image'])
      augmented_y.append(y_train[i])

augmented_X = np.array(augmented_X)
augmented_y = np.array(augmented_y)
X_train = np.concatenate([X_train,augmented_X])
y_train = np.concatenate([y_train,augmented_y])

output_dir = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/"
joblib.dump(augmented_X, output_dir+'knn_augmented_X.pkl')
joblib.dump(augmented_y, output_dir+'knn_augmented_y.pkl')

['/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/knn_augmented_y.pkl']

In [None]:
print(X_train.shape)
print(y_train.shape)

(7183, 28, 28)
(7183,)


Data Preprocessing

In [None]:
X_train = X_train/255.0
X_test = X_test/255.0

In [None]:
X_train = X_train.reshape(X_train.shape[0], 784)
X_test = X_test.reshape(X_test.shape[0], 784)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7183, 784)
(164, 784)
(7183,)
(164,)


In [None]:
output_dir = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/"
joblib.dump(X_train, output_dir+'knn_final_X_train.pkl')
joblib.dump(X_test, output_dir+'knn_final_X_test.pkl')
joblib.dump(y_train, output_dir+'knn_final_y_train.pkl')
joblib.dump(y_test, output_dir+'knn_final_y_test.pkl')

['/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/knn_final_y_test.pkl']

#### Model Training and Testing

In [None]:
k = 3 # Choose the number of neighbors
classifier = KNeighborsClassifier(n_neighbors=k)
classifier.fit(X_train, y_train)

joblib.dump(classifier, output_dir+'knn_model.joblib')

['/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/knn_model.joblib']

In [None]:
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

classifiction_report = classification_report(y_test, y_pred)
print("Classification Report:", classifiction_report)

Accuracy: 0.09146341463414634
Classification Report:               precision    recall  f1-score   support

           a       0.08      0.14      0.11         7
           b       0.00      0.00      0.00         6
           c       0.00      0.00      0.00         6
           d       0.00      0.00      0.00         6
           e       0.17      0.14      0.15         7
           f       0.00      0.00      0.00         6
           g       0.00      0.00      0.00         6
           h       0.00      0.00      0.00         7
           i       0.00      0.00      0.00         6
           k       0.33      0.17      0.22         6
           l       0.00      0.00      0.00         6
           m       0.00      0.00      0.00         6
           n       0.00      0.00      0.00         7
           o       0.00      0.00      0.00         6
           p       0.00      0.00      0.00         6
           q       0.00      0.00      0.00         6
           r       0.00     

In [None]:
y_score = classifier.predict_proba(X_test)
micro_roc_auc_roc = roc_auc_score(
    y_test,
    y_score,
    multi_class="ovr",
    average="micro",
)

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_roc}")


macro_roc_auc_ovr_tuned = roc_auc_score(
    y_test,
    y_score,
    multi_class="ovr",
    average="macro",
)

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{macro_roc_auc_ovr_tuned}")

Micro-averaged One-vs-Rest ROC AUC score:
0.5198130318392262
Macro-averaged One-vs-Rest ROC AUC score:
0.5146816105401604


#### Hyperparameter Tuning

In [45]:
import multiprocessing
grid_params = {'n_neighbors': [3, 5, 7],
               'weights': ['uniform', 'distance'],
               'algorithm':['kd_tree', 'auto'],
               'p': [1, 2],
               'metric' : ['minkowski','euclidean','manhattan']}

gs = GridSearchCV(KNeighborsClassifier(), grid_params,scoring='accuracy', verbose = 3, cv=3, n_jobs = -1)

In [None]:
g_res = gs.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [None]:
print(g_res.best_score_)
print(g_res.best_params_)

0.3423476343375256
{'algorithm': 'kd_tree', 'metric': 'minkowski', 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


Evaluating Tunned Model

In [None]:
knn_tuned = KNeighborsClassifier(n_neighbors = g_res.best_params_['n_neighbors'], weights = g_res.best_params_['weights'],algorithm = g_res.best_params_['algorithm'],metric = g_res.best_params_['metric'],p = g_res.best_params_['p'])
knn_tuned.fit(X_train, y_train)

In [None]:
joblib.dump(knn_tuned, output_dir+'knn_model_tuned.joblib')

['/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/knn_model_tuned.joblib']

In [None]:
y_pred_tuned = knn_tuned.predict(X_test)
print(classification_report(y_test, y_pred_tuned))
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_tuned)}")

              precision    recall  f1-score   support

           a       0.12      0.14      0.13         7
           b       0.00      0.00      0.00         6
           c       0.00      0.00      0.00         6
           d       0.11      0.17      0.13         6
           e       0.00      0.00      0.00         7
           f       0.00      0.00      0.00         6
           g       0.00      0.00      0.00         6
           h       0.00      0.00      0.00         7
           i       0.00      0.00      0.00         6
           k       0.12      0.17      0.14         6
           l       0.00      0.00      0.00         6
           m       0.00      0.00      0.00         6
           n       0.00      0.00      0.00         7
           o       0.00      0.00      0.00         6
           p       0.00      0.00      0.00         6
           q       0.00      0.00      0.00         6
           r       0.00      0.00      0.00         8
           s       0.21    

In [None]:
y_score_tuned = knn_tuned.predict_proba(X_test)
micro_roc_auc_ovr_tuned = roc_auc_score(
    y_test,
    y_score_tuned,
    multi_class="ovr",
    average="micro",
)

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr_tuned:.2f}")


macro_roc_auc_ovr_tuned = roc_auc_score(
    y_test,
    y_score_tuned,
    multi_class="ovr",
    average="macro",
)

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{macro_roc_auc_ovr_tuned:.2f}")

Micro-averaged One-vs-Rest ROC AUC score:
0.53
Macro-averaged One-vs-Rest ROC AUC score:
0.52
