<a href="https://colab.research.google.com/github/wongzw/IS4242_Group_4/blob/4-knn-model/knn_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [89]:
import pickle
import joblib
import cv2
import os
import re
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import roc_auc_score, f1_score
from google.colab import drive
from tqdm import tqdm
from PIL import Image

!pip install mediapipe

import tensorflow as tf
import mediapipe as mp


from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Mounted at /content/drive


### Attempt without Mediapipe

Load and convert into a 2d arrary image

In [None]:
folder_name = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/data" # Update with Folder Details

In [None]:
image_paths = os.listdir(folder_name)
img_list = []
label_list = []
txt = 'abcdefghiklmnopqrstuvwxy'

for i in tqdm(range(len(os.listdir(folder_name)))):
  filename = os.listdir(folder_name)[i]
  if filename.endswith(('.jpg','.png')):
      label_name = re.split(r'[-_]', filename)[0].lower()
      if label_name not in txt:
        continue
      label_list.append(label_name)
      img = Image.open(os.path.join(folder_name, filename))
      img = img.resize((28, 28), Image.ANTIALIAS)
      img = img.convert('L')
      img_array = np.array(img)
      
      img_list.append(img_array)


100%|██████████| 1221/1221 [02:48<00:00,  7.24it/s]


#### Train-Test Split

*Called in case where X and Y already pickled*

In [None]:
output_dir = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/"
X = joblib.load(output_dir+'X_knn.pkl')
y = joblib.load(output_dir+'y_knn.pkl')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

In [None]:
X = np.array(img_list)
y = np.array(label_list)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

In [None]:
output_dir = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/"

joblib.dump(X, output_dir+'X_knn.pkl')
joblib.dump(y, output_dir+'y_knn.pkl')

['/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/y_knn.pkl']

#### Data Augmentation on Training Data

In [None]:
import albumentations as A

transform = A.Compose([
    A.RandomBrightnessContrast(),
    A.Affine(scale=[0.8,1.2],translate_percent=0.05, shear=0.2, keep_ratio=True, p=0.5),
    A.Rotate(limit=10)
])

In [None]:
augmented_X = []
augmented_y = []
for i in range(len(X_train)):
  for j in (range(10)):
      transformed = transform(image=X_train[i])
      augmented_X.append(transformed['image'])
      augmented_y.append(y_train[i])

augmented_X = np.array(augmented_X)
augmented_y = np.array(augmented_y)
X_train = np.concatenate([X_train,augmented_X])
y_train = np.concatenate([y_train,augmented_y])

output_dir = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/"
joblib.dump(augmented_X, output_dir+'knn_augmented_X.pkl')
joblib.dump(augmented_y, output_dir+'knn_augmented_y.pkl')

['/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/knn_augmented_y.pkl']

In [None]:
print(X_train.shape)
print(y_train.shape)

(7183, 28, 28)
(7183,)


Data Preprocessing

In [None]:
X_train = X_train/255.0
X_test = X_test/255.0

In [None]:
X_train = X_train.reshape(X_train.shape[0], 784)
X_test = X_test.reshape(X_test.shape[0], 784)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7183, 784)
(164, 784)
(7183,)
(164,)


In [None]:
output_dir = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/"
joblib.dump(X_train, output_dir+'knn_final_X_train.pkl')
joblib.dump(X_test, output_dir+'knn_final_X_test.pkl')
joblib.dump(y_train, output_dir+'knn_final_y_train.pkl')
joblib.dump(y_test, output_dir+'knn_final_y_test.pkl')

['/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/knn_final_y_test.pkl']

#### Model Training and Testing

In [None]:
k = 3 # Choose the number of neighbors
classifier = KNeighborsClassifier(n_neighbors=k)
classifier.fit(X_train, y_train)

joblib.dump(classifier, output_dir+'knn_model.joblib')

['/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/knn_model.joblib']

In [None]:
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

classifiction_report = classification_report(y_test, y_pred)
print("Classification Report:", classifiction_report)

Accuracy: 0.09146341463414634
Classification Report:               precision    recall  f1-score   support

           a       0.08      0.14      0.11         7
           b       0.00      0.00      0.00         6
           c       0.00      0.00      0.00         6
           d       0.00      0.00      0.00         6
           e       0.17      0.14      0.15         7
           f       0.00      0.00      0.00         6
           g       0.00      0.00      0.00         6
           h       0.00      0.00      0.00         7
           i       0.00      0.00      0.00         6
           k       0.33      0.17      0.22         6
           l       0.00      0.00      0.00         6
           m       0.00      0.00      0.00         6
           n       0.00      0.00      0.00         7
           o       0.00      0.00      0.00         6
           p       0.00      0.00      0.00         6
           q       0.00      0.00      0.00         6
           r       0.00     

In [None]:
y_score = classifier.predict_proba(X_test)
micro_roc_auc_roc = roc_auc_score(
    y_test,
    y_score,
    multi_class="ovr",
    average="micro",
)

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_roc}")


macro_roc_auc_ovr_tuned = roc_auc_score(
    y_test,
    y_score,
    multi_class="ovr",
    average="macro",
)

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{macro_roc_auc_ovr_tuned}")

Micro-averaged One-vs-Rest ROC AUC score:
0.5198130318392262
Macro-averaged One-vs-Rest ROC AUC score:
0.5146816105401604


#### Hyperparameter Tuning

In [45]:
import multiprocessing
grid_params = {'n_neighbors': [3, 5, 7],
               'weights': ['uniform', 'distance'],
               'algorithm':['kd_tree', 'auto'],
               'p': [1, 2],
               'metric' : ['minkowski','euclidean','manhattan']}

gs = GridSearchCV(KNeighborsClassifier(), grid_params,scoring='accuracy', verbose = 3, cv=3, n_jobs = -1)

In [None]:
g_res = gs.fit(X_train, y_train)

Fitting 3 folds for each of 72 candidates, totalling 216 fits


In [None]:
print(g_res.best_score_)
print(g_res.best_params_)

0.3423476343375256
{'algorithm': 'kd_tree', 'metric': 'minkowski', 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


Evaluating Tunned Model

In [None]:
knn_tuned = KNeighborsClassifier(n_neighbors = g_res.best_params_['n_neighbors'], weights = g_res.best_params_['weights'],algorithm = g_res.best_params_['algorithm'],metric = g_res.best_params_['metric'],p = g_res.best_params_['p'])
knn_tuned.fit(X_train, y_train)

In [None]:
joblib.dump(knn_tuned, output_dir+'knn_model_tuned.joblib')

['/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/knn_model_tuned.joblib']

In [None]:
y_pred_tuned = knn_tuned.predict(X_test)
print(classification_report(y_test, y_pred_tuned))
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_tuned)}")

              precision    recall  f1-score   support

           a       0.12      0.14      0.13         7
           b       0.00      0.00      0.00         6
           c       0.00      0.00      0.00         6
           d       0.11      0.17      0.13         6
           e       0.00      0.00      0.00         7
           f       0.00      0.00      0.00         6
           g       0.00      0.00      0.00         6
           h       0.00      0.00      0.00         7
           i       0.00      0.00      0.00         6
           k       0.12      0.17      0.14         6
           l       0.00      0.00      0.00         6
           m       0.00      0.00      0.00         6
           n       0.00      0.00      0.00         7
           o       0.00      0.00      0.00         6
           p       0.00      0.00      0.00         6
           q       0.00      0.00      0.00         6
           r       0.00      0.00      0.00         8
           s       0.21    

In [None]:
y_score_tuned = knn_tuned.predict_proba(X_test)
micro_roc_auc_ovr_tuned = roc_auc_score(
    y_test,
    y_score_tuned,
    multi_class="ovr",
    average="micro",
)

print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr_tuned:.2f}")


macro_roc_auc_ovr_tuned = roc_auc_score(
    y_test,
    y_score_tuned,
    multi_class="ovr",
    average="macro",
)

print(f"Macro-averaged One-vs-Rest ROC AUC score:\n{macro_roc_auc_ovr_tuned:.2f}")

Micro-averaged One-vs-Rest ROC AUC score:
0.53
Macro-averaged One-vs-Rest ROC AUC score:
0.52


#### Cross Validation

In [78]:
output_dir = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/"
X = joblib.load(output_dir+'X_knn.pkl')
y = joblib.load(output_dir+'y_knn.pkl')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

In [90]:
kfold = StratifiedKFold(n_splits=5, shuffle = True,  random_state = 42)

In [92]:
accuracy_scores = []
auc_scores_micro = []
f1_scores = []
classifier_cv = KNeighborsClassifier(n_neighbors=3)
count = 0

for train_idx, test_idx in kfold.split(X_train, y_train):
    X_train_cv, y_train_cv = X_train[train_idx], y_train[train_idx]
    X_test_cv, y_test_cv = X_train[test_idx], y_train[test_idx]
    print(f"Fold {count+1}/{5}:")
    
    augmented_X = []
    augmented_y = []
    for i in range(len(X_train_cv)):
        for j in (range(10)):
            transformed = transform(image=X_train_cv[i])
            augmented_X.append(transformed['image'])
            augmented_y.append(y_train_cv[i])

    augmented_X = np.array(augmented_X)
    augmented_y = np.array(augmented_y)
    X_train_cv = np.concatenate([X_train_cv,augmented_X])
    y_train_cv = np.concatenate([y_train_cv,augmented_y])

    X_train_cv = X_train_cv/255.0
    X_test_cv = X_test_cv/255.0

    X_train_cv = X_train_cv.reshape(X_train_cv.shape[0], 784)
    X_test_cv = X_test_cv.reshape(X_test_cv.shape[0], 784)

    classifier_cv.fit(X_train_cv, y_train_cv)

    y_pred = classifier_cv.predict(X_test_cv)
    accuracy_scores.append(accuracy_score(y_test_cv, y_pred))
    y_score = classifier.predict_proba(X_test_cv)

    auc_scores_micro.append(roc_auc_score(y_test_cv,y_score, multi_class='ovr', average='micro'))
    f1_scores.append(f1_score(y_test_cv, y_pred, average='weighted'))
    print("ROC AUC score:", auc_scores_micro[-1])
    print(f"Validation accuracy: {accuracy_scores[-1]}")
    print(f"Validation F1: {f1_scores[-1]}")
    count += 1

mean_accuracy = np.mean(accuracy_scores)
mean_f1 = np.mean(f1_scores)
mean_roc_micro = np.mean(auc_scores_micro)
print(f"\nMean cross-validation accuracy: {mean_accuracy:.3f}")
print(f"\nMean cross-validation F1: {mean_f1:.3f}")
print(f"\nMean cross-validation AUC-ROC (micro): {mean_roc_micro:.3f}")


Fold 1/5:
ROC AUC score: 0.9376923914943641
Validation accuracy: 0.08396946564885496
Validation F1: 0.08470833163244479
Fold 2/5:
ROC AUC score: 0.925275460282795
Validation accuracy: 0.0916030534351145
Validation F1: 0.09375628113840957
Fold 3/5:
ROC AUC score: 0.9532939957385679
Validation accuracy: 0.11450381679389313
Validation F1: 0.10935273378021469
Fold 4/5:
ROC AUC score: 0.9290378183689221
Validation accuracy: 0.15384615384615385
Validation F1: 0.147824174317387
Fold 5/5:
ROC AUC score: 0.9211036789297659
Validation accuracy: 0.1
Validation F1: 0.09147415179089387

Mean cross-validation accuracy: 0.109

Mean cross-validation F1: 0.105

Mean cross-validation AUC-ROC (micro): 0.933


In [94]:
X_train = X_train/255.0
X_test = X_test/255.0

X_train = X_train.reshape(X_train.shape[0], 784)
X_test = X_test.reshape(X_test.shape[0], 784)


In [95]:
classifier_cv.fit(X_train, y_train)

In [97]:
y_pred_classifier_cv = classifier_cv.predict(X_test)

In [98]:
print(classification_report(y_test, y_pred_classifier_cv))
print(f"Accuracy Score: {accuracy_score(y_test, y_pred_classifier_cv)}")

y_pred_classifier_cv_score = cross_val_predict(classifier_cv, X_test, y_test, cv=kfold, method='predict_proba')
auc_scores = roc_auc_score(y_test, y_pred_classifier_cv_score, multi_class='ovr')

print("AUC scores: ", auc_scores)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           a       0.11      0.29      0.16         7
           b       0.00      0.00      0.00         6
           c       0.00      0.00      0.00         6
           d       0.14      0.17      0.15         6
           e       0.13      0.29      0.18         7
           f       0.00      0.00      0.00         6
           g       0.07      0.17      0.10         6
           h       0.00      0.00      0.00         7
           i       0.00      0.00      0.00         6
           k       0.00      0.00      0.00         6
           l       0.00      0.00      0.00         6
           m       0.08      0.17      0.11         6
           n       0.00      0.00      0.00         7
           o       0.00      0.00      0.00         6
           p       0.00      0.00      0.00         6
           q       0.00      0.00      0.00         6
           r       0.00      0.00      0.00         8
           s       0.00    

#### Hyperparameter Tuning - Cross Validation

In [99]:
import multiprocessing
grid_params = {'n_neighbors': [3, 5, 7],
               'weights': ['uniform', 'distance'],
               'algorithm':['kd_tree', 'auto'],
               'p': [1, 2],
               'metric' : ['minkowski','euclidean','manhattan']}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gs_cv = GridSearchCV(KNeighborsClassifier(), grid_params,scoring='accuracy', verbose = 3, cv=kf, n_jobs = -1)

In [100]:
g_cv_res = gs_cv.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [101]:
print(g_cv_res.best_score_)
print(g_cv_res.best_params_)

0.1164063417498532
{'algorithm': 'kd_tree', 'metric': 'minkowski', 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}


Evaluating Tunned Model

In [103]:
output_dir = f"/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/"
X = joblib.load(output_dir+'X_knn.pkl')
y = joblib.load(output_dir+'y_knn.pkl')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

In [104]:
accuracy_scores = []
auc_scores_micro = []
f1_scores = []
classifier_cv_tuned = KNeighborsClassifier(n_neighbors = g_cv_res.best_params_['n_neighbors'], weights = g_cv_res.best_params_['weights'],algorithm = g_cv_res.best_params_['algorithm'],metric = g_cv_res.best_params_['metric'],p = g_cv_res.best_params_['p'])
count = 0

for train_idx, test_idx in kfold.split(X_train, y_train):
    X_train_cv, y_train_cv = X_train[train_idx], y_train[train_idx]
    X_test_cv, y_test_cv = X_train[test_idx], y_train[test_idx]
    print(f"Fold {count+1}/{5}:")
    
    augmented_X = []
    augmented_y = []
    for i in range(len(X_train_cv)):
        for j in (range(10)):
            transformed = transform(image=X_train_cv[i])
            augmented_X.append(transformed['image'])
            augmented_y.append(y_train_cv[i])

    augmented_X = np.array(augmented_X)
    augmented_y = np.array(augmented_y)
    X_train_cv = np.concatenate([X_train_cv,augmented_X])
    y_train_cv = np.concatenate([y_train_cv,augmented_y])

    X_train_cv = X_train_cv/255.0
    X_test_cv = X_test_cv/255.0

    X_train_cv = X_train_cv.reshape(X_train_cv.shape[0], 784)
    X_test_cv = X_test_cv.reshape(X_test_cv.shape[0], 784)

    classifier_cv_tuned.fit(X_train_cv, y_train_cv)

    y_pred = classifier_cv_tuned.predict(X_test_cv)
    accuracy_scores.append(accuracy_score(y_test_cv, y_pred))
    y_score = classifier.predict_proba(X_test_cv)

    auc_scores_micro.append(roc_auc_score(y_test_cv,y_score, multi_class='ovr', average='micro'))
    f1_scores.append(f1_score(y_test_cv, y_pred, average='weighted'))
    print("ROC AUC score:", auc_scores_micro[-1])
    print(f"Validation accuracy: {accuracy_scores[-1]}")
    print(f"Validation F1: {f1_scores[-1]}")
    count += 1

mean_accuracy = np.mean(accuracy_scores)
mean_f1 = np.mean(f1_scores)
mean_roc_micro = np.mean(auc_scores_micro)
print(f"\nMean cross-validation accuracy: {mean_accuracy:.3f}")
print(f"\nMean cross-validation F1: {mean_f1:.3f}")
print(f"\nMean cross-validation AUC-ROC (micro): {mean_roc_micro:.3f}")


Fold 1/5:
ROC AUC score: 0.9376923914943641
Validation accuracy: 0.0916030534351145
Validation F1: 0.0869713358038459
Fold 2/5:
ROC AUC score: 0.925275460282795
Validation accuracy: 0.0916030534351145
Validation F1: 0.08428923269515996
Fold 3/5:
ROC AUC score: 0.9532939957385679
Validation accuracy: 0.11450381679389313
Validation F1: 0.11051798074698838
Fold 4/5:
ROC AUC score: 0.9290378183689221
Validation accuracy: 0.15384615384615385
Validation F1: 0.144904529859281
Fold 5/5:
ROC AUC score: 0.9211036789297659
Validation accuracy: 0.1
Validation F1: 0.08432938856015779

Mean cross-validation accuracy: 0.110

Mean cross-validation F1: 0.102

Mean cross-validation AUC-ROC (micro): 0.933


In [105]:
joblib.dump(classifier_cv_tuned, output_dir+'knn_model_cv_tuned.joblib')

['/content/drive/MyDrive/4. NUS/Studies/Y4S2/IS4242/IS4242/dump/knn_model_cv_tuned.joblib']