## Import packages

In [1]:
from joblib import dump, load
import os
from PIL import Image
from tqdm import tqdm
import re
import numpy as np
import pandas as pd
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix, roc_auc_score, accuracy_score
from sklearn.svm import SVC

In [3]:
import albumentations as A

transform = A.Compose([
    A.RandomBrightnessContrast(),
    A.Affine(scale=[0.8,1.2],translate_percent=0.05, shear=0.2, keep_ratio=True, p=0.5),
    A.Rotate(limit=10)
])

## Data Preprocessing

#### Read image and convert it to 28x28 matrix:

In [5]:
X = load('X_224.pkl')
y = load('y_224.pkl')

In [6]:
len(X)

817

#### Train test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
print(X_train.shape)
print(X_test.shape)

(653, 224, 224)
(164, 224, 224)


In [9]:
print(y_train.shape)
print(y_test.shape)

(653, 24)
(164, 24)


#### DONT DO data augmentation on model without mediapipe - will take too long to train model
Augmentation only on train data

In [10]:
# augmented_X = []
# augmented_y = []
# for i in range(len(X_train)):
#     for j in (range(10)):
#         transformed = transform(image=X_train[i])
#         augmented_X.append(transformed['image'])
#         augmented_y.append(y_train[i])

# augmented_X = np.array(augmented_X)
# augmented_y = np.array(augmented_y)
# X_train = np.concatenate([X_train,augmented_X])
# y_train = np.concatenate([y_train,augmented_y])

# # output_dir = '/content/drive/MyDrive/IS4242/dump/'
# # joblib.dump(augmented_X, output_dir+'augmented_X.pkl')
# # joblib.dump(augmented_y, output_dir+'augmented_y.pkl')

In [11]:
# print(X_train.shape)
# print(y_train.shape)

In [12]:
# print(y_train)

## Without mediapipe

In [13]:
#scaling to improve performance - only for without mediapipe training
X_train = X_train/255
X_test = X_test/255

#### SVM - linear


In [14]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(653, 224, 224)
(164, 224, 224)
(653, 24)
(164, 24)


In [15]:
# linear model

model_linear = SVC(kernel='linear', probability=True)
model_linear.fit(X_train.reshape(X_train.shape[0], -1), np.argmax(y_train, axis=1))
# predict
y_pred = model_linear.predict(X_test.reshape((X_test.shape[0],-1)))

##### Evaluation

In [16]:
# confusion matrix and accuracy

# accuracy
print("accuracy:", accuracy_score(y_true=np.argmax(y_test, axis=1), y_pred=y_pred), "\n")

# cm
print(confusion_matrix(y_true=np.argmax(y_test, axis=1), y_pred=y_pred))

accuracy: 0.15853658536585366 

[[0 2 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0]
 [0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0]
 [2 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0]
 [0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 2 0 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 1 2 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 2 2 0 0 0 1 2 0 0 1 0 0 0 0 2 0 0 0 0 0 0 0]
 [1 0 1 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 2 1 0 0 1 0]
 [0 0 1 0 0 0 0 1 1 0 2 1 1 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 2 1 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 1 0 2 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0]
 [0 0 0 0 2 0 0 1 0 0 0 0 0 2 0 0 0 0 1 0 0 0 1 0]
 [0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 2 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1]
 [0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 4 1 0 0 0 0 0 0]
 [1 1 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 2 0 0 0 0 0 1]
 [0 3 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0]

In [17]:
print(classification_report(np.argmax(y_test, axis=1), y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.08      0.50      0.14         2
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         5
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         8
           7       0.00      0.00      0.00         6
           8       0.20      0.20      0.20        10
           9       0.17      0.08      0.11        12
          10       0.50      0.25      0.33         8
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00         8
          13       0.17      0.29      0.21         7
          14       0.17      0.14      0.15         7
          15       0.00      0.00      0.00         4
          16       0.36      0.50      0.42         8
          17       0.40    

In [18]:
#get AUROC scores
y_pred_scores = model_linear.predict_proba(X_test.reshape((X_test.shape[0],-1)))
roc_auc_micro = roc_auc_score(y_test, y_pred_scores, multi_class='ovr', average='micro')
roc_auc_macro = roc_auc_score(y_test, y_pred_scores, multi_class='ovr', average='macro')
print("ROC AUC score (micro average)", roc_auc_micro)
print("ROC AUC score (macro average)", roc_auc_macro)

ROC AUC score (micro average) 0.5224455551819569
ROC AUC score (macro average) 0.5864831480154968


#### SVM - rbf non linear 

In [20]:
# non-linear model
# using rbf kernel, C=1, default value of gamma
non_linear = SVC(kernel='rbf', probability=True)
non_linear.fit(X_train.reshape((X_train.shape[0],-1)), np.argmax(y_train, axis=1))

# predict
y_pred = non_linear.predict(X_test.reshape((X_test.shape[0],-1)))

##### Evaluation

In [21]:
# confusion matrix and accuracy

# accuracy
print("accuracy:", accuracy_score(y_true=np.argmax(y_test, axis=1), y_pred=y_pred), "\n")

# cm
print(confusion_matrix(y_true=np.argmax(y_test, axis=1), y_pred=y_pred))

accuracy: 0.03048780487804878 

[[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 3]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1]
 [0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 2]
 [0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 1 1 0 0 0 0 3 0]
 [0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 3 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 2]
 [0 1 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 1 0 0 0 0 2 2]
 [0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 5]
 [0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 2 0 0 2 2]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 2 1 0 1 2]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 3 0]
 [0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 0 4 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 5 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 2]
 [0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 1 0 1 3]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 4 3]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 2 1 0 1 3]

In [22]:
print(classification_report(np.argmax(y_test, axis=1), y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         5
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         8
           7       0.00      0.00      0.00         6
           8       0.00      0.00      0.00        10
           9       0.00      0.00      0.00        12
          10       0.00      0.00      0.00         8
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00         8
          13       0.00      0.00      0.00         7
          14       0.00      0.00      0.00         7
          15       0.00      0.00      0.00         4
          16       0.00      0.00      0.00         8
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
#get AUROC scores
y_pred_scores = non_linear.predict_proba(X_test.reshape((X_test.shape[0],-1)))
roc_auc_micro = roc_auc_score(y_test, y_pred_scores, multi_class='ovr', average='micro')
roc_auc_macro = roc_auc_score(y_test, y_pred_scores, multi_class='ovr', average='macro')
print("ROC AUC score (micro average)", roc_auc_micro)
print("ROC AUC score (macro average)", roc_auc_macro)

ROC AUC score (micro average) 0.5441264904430593
ROC AUC score (macro average) 0.5666830257598786


#### SVM - GridSearchCV to get the best parameters

In [24]:
#add gridsearchcv

param_grid = {'C': [0.1, 1, 100, 1000], 
              'gamma': [1, 0.01, 0.0001]}
grid = GridSearchCV(SVC(kernel='linear', probability=True), param_grid, n_jobs=-1)
grid.fit(X_train.reshape((X_train.shape[0],-1)), np.argmax(y_train, axis=1))
# predict
y_pred = grid.predict(X_test.reshape((X_test.shape[0],-1)))

In [25]:
print(grid.best_params_)

{'C': 0.1, 'gamma': 1}


##### Evaluation

In [26]:
# confusion matrix and accuracy

# accuracy
print("accuracy:", accuracy_score(y_true=np.argmax(y_test, axis=1), y_pred=y_pred), "\n")

# cm
print(confusion_matrix(y_true=np.argmax(y_test, axis=1), y_pred=y_pred))

accuracy: 0.15853658536585366 

[[0 2 0 0 0 0 1 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0]
 [0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0]
 [2 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0]
 [0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 2 0 1 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 1 2 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 2 2 0 0 0 1 2 0 0 1 0 0 0 0 2 0 0 0 0 0 0 0]
 [1 0 1 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 2 1 0 0 1 0]
 [0 0 1 0 0 0 0 1 1 0 2 1 1 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0 2 1 0 0 0 0 0 0 0 0 0 0]
 [1 1 0 0 0 0 1 0 2 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0]
 [0 0 0 0 2 0 0 1 0 0 0 0 0 2 0 0 0 0 1 0 0 0 1 0]
 [0 0 0 0 0 1 0 1 0 0 0 1 0 0 1 1 0 0 2 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1]
 [0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 4 1 0 0 0 0 0 0]
 [1 1 0 0 1 0 0 0 0 0 0 0 1 1 0 1 0 2 0 0 0 0 0 1]
 [0 3 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 1 0 0 1 0]

In [27]:
print(classification_report(np.argmax(y_test, axis=1), y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.08      0.50      0.14         2
           2       0.00      0.00      0.00         6
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00         5
           5       0.00      0.00      0.00         4
           6       0.00      0.00      0.00         8
           7       0.00      0.00      0.00         6
           8       0.20      0.20      0.20        10
           9       0.17      0.08      0.11        12
          10       0.50      0.25      0.33         8
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00         8
          13       0.17      0.29      0.21         7
          14       0.17      0.14      0.15         7
          15       0.00      0.00      0.00         4
          16       0.36      0.50      0.42         8
          17       0.40    

In [28]:
#get AUROC scores

y_pred_scores = grid.predict_proba(X_test.reshape((X_test.shape[0],-1)))
roc_auc_micro = roc_auc_score(y_test, y_pred_scores, multi_class='ovr', average='micro')
roc_auc_macro = roc_auc_score(y_test, y_pred_scores, multi_class='ovr', average='macro')
print("ROC AUC score (micro average)", roc_auc_micro)
print("ROC AUC score (macro average)", roc_auc_macro)

ROC AUC score (micro average) 0.5220980006724776
ROC AUC score (macro average) 0.5802250367784576
