## Best CNN Model for ASL Recognition (28 classes)
- 5 Convolutional Layers, 3 Pooling Layers, 1 Linear Layer, 1 Output Layer
- ReLU Activation Functions
- Batch normalization on all layers, dropout (p=0.4) on linear layers only
- Training batch size of 64
- Learning rate of 0.001
- Convolutional Kernel: 5x5
- Pooling kernel: 2x2

In [1]:
# Using Python 3.11.4
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchsummary
import os
import cv2
import sklearn.metrics
from sklearn.model_selection import train_test_split
import random
import gc
import datetime
import sys
import zipfile

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Load in raw dataset, run pre-processing and convert image data to tensors.
- Resize each image to 128x128 from 200x200
- Adjust brightness and contrast randomly, convert all images to grayscale
- Create horizontal flip of each image (1 new image)
- Apply two random rotations for original image and two random rotations for flipped image (4 new images)
- Total: 1 original image -> 6 processed images. So the original dataset size of 84000 turns to 504000

In [None]:
#Data Pre-Processing -> Apply random contrast, saturation.  Flip horizontally once, then do 2 random rotations between -45 and 45 to flip and original (1 image becomes 6 images)
base_path = r"C:\Users\twinj\OneDrive\CAP5610_Project"

#unzip the images and prepare for data processing
zip_path = os.path.join(base_path, "asl_alphabet_train.zip") #a zip files containing the 28 classes as folders of each letter
extract_folder = os.path.join(base_path, "extracted_imgs")
#create new folder for extracted files
os.makedirs(extract_folder, exist_ok=True)
#extract the files from the zipped folder to this new folder
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)
#path to new directory
images_path = os.path.join(extract_folder, "asl_alphabet_train")
image_folders = os.listdir(images_path)
#print(image_folders)

# Pre-allocate image_data to ensure memory is reserved
SKIP_VAL = 1
NUM_CLASSES= 28
image_data= np.empty(((3000//SKIP_VAL)*NUM_CLASSES*6, 128, 128, 1), dtype=np.float16)
print("RAM used to store image data: " + str(sys.getsizeof(image_data) // (1024*1024)) + " MB")
labels = []
index = 0  # This will keep track of where you are in the array

for i in range(len(image_folders)):  # only use every five images from raw dataset
    gc.collect()
    #print("Now loading in images from folder: " + str(image_folders[i]) + ", time: " + str(datetime.datetime.now().hour) + ":" + str(datetime.datetime.now().minute))
    
    # open up a folder corresponding to the hand gesture
    letter_folder_path = os.path.join(images_path, image_folders[i])
    folder_imgs = os.listdir(letter_folder_path)

    # open up each image in the subfolder and save data/labels
    for img in range(0, len(folder_imgs), SKIP_VAL):
        # read raw image
        image_path = os.path.join(letter_folder_path, folder_imgs[img])
        temp_data = cv2.imread(image_path)

        # resize to 128x128
        mod_img = cv2.resize(temp_data, (128, 128), cv2.INTER_LINEAR)
        
        # apply brightness and contrast, convert to grayscale
        brightness = random.randint(-50, 50)
        contrast = random.uniform(0.8, 1.2)
        mod_img = mod_img.astype(np.float32)  # convert to float32 for operations
        mod_img = cv2.addWeighted(mod_img, contrast, np.zeros(mod_img.shape, mod_img.dtype), 0, brightness)
        mod_img = cv2.cvtColor(mod_img, cv2.COLOR_BGR2GRAY)
        mod_img = mod_img[..., np.newaxis]  # adds a channel dimension
        
        # Assign the modified image to the pre-allocated array
        image_data[index] = mod_img.astype(np.float16)
        labels.append(np.float16(i)) #number corresponding to image folder currently open
        index += 1  # Increment the index for the next image
        
        # apply and save horizontal flip
        mod_img_flip = cv2.flip(mod_img, 1)
        mod_img_flip = mod_img_flip[..., np.newaxis]
        image_data[index] = mod_img_flip.astype(np.float16)
        labels.append(np.float16(i))
        index += 1
        
        # apply some random rotation (2 images for original and 2 for flipped) and save
        pos_rot = random.randint(5, 30)
        neg_rot = random.randint(-30, -5)
        (h, w) = mod_img.shape[:2]
        (ctrX, ctrY) = (h // 2, w // 2)
        pos_rot_mat = cv2.getRotationMatrix2D((ctrX, ctrY), pos_rot, 1.0)
        neg_rot_mat = cv2.getRotationMatrix2D((ctrX, ctrY), neg_rot, 1.0)
        
        orig_pos_rot_img = cv2.warpAffine(mod_img, pos_rot_mat, (w, h))
        orig_neg_rot_img = cv2.warpAffine(mod_img, neg_rot_mat, (w, h))
        flip_pos_rot_img = cv2.warpAffine(mod_img_flip, pos_rot_mat, (w, h))
        flip_neg_rot_img = cv2.warpAffine(mod_img_flip, neg_rot_mat, (w, h))
        
        # Assign the rotated images to the pre-allocated array
        orig_pos_rot_img = orig_pos_rot_img[..., np.newaxis]
        image_data[index] = orig_pos_rot_img.astype(np.float16)
        labels.append(np.float16(i))
        index += 1
        
        orig_neg_rot_img = orig_neg_rot_img[..., np.newaxis]
        image_data[index] = orig_neg_rot_img.astype(np.float16)
        labels.append(np.float16(i))
        index += 1
        
        flip_pos_rot_img = flip_pos_rot_img[..., np.newaxis]
        image_data[index] = flip_pos_rot_img.astype(np.float16)
        labels.append(np.float16(i))
        index += 1
        
        flip_neg_rot_img = flip_neg_rot_img[..., np.newaxis]
        image_data[index] = flip_neg_rot_img.astype(np.float16)
        labels.append(np.float16(i))
        index += 1
        
        del temp_data, mod_img, mod_img_flip, orig_neg_rot_img, orig_pos_rot_img, flip_neg_rot_img, flip_pos_rot_img

print(f"Length of image_data array: {len(image_data)}")
print(f"Length of labels list: {len(labels)}")
print("28 folders times 1500 images times 6 augmentations in each folder equals: " + str(NUM_CLASSES * 3000//SKIP_VAL * 6))


In [None]:
# Generate one image to ensure processing looks correct 
plt.imshow(image_data[2000].squeeze(), cmap='gray')
plt.title(f"Label: {labels[2000]}")
plt.show()

In [None]:
#ensure image data and labels are all float16
for i in range(len(image_data)):
    if image_data[i].dtype != "float16":
        print(image_data[i].dtype, i)

for i in range(len(labels)):
    if labels[i].dtype != "float16":
        print(labels[i].dtype, i)

#PyTorch requires color channels to be swapped for the image data
image_data = np.transpose(image_data, (0, 3, 1, 2))
print(image_data.shape)

#check the number of each label that was imported to make sure it is even
unique, counts = np.unique(labels, return_counts=True)
dict(zip(unique, counts))

In [None]:
#convert image_data and labels to tensors
image_data_tens = []

for i in image_data:
  t = torch.from_numpy(i)
  image_data_tens.append(t)

#Loop to transform numpy data to tensor
label_tens = []

for i in labels:
  t = torch.from_numpy(np.array(i))
  label_tens.append(t)

print(type(label_tens[1]))
print(type(image_data_tens[0]))

del image_data #can delete this now that we have tensors, but need to keep labels for reference in calculating accuracy

## Prepare to Run the Model

Setup training and test splits

In [160]:
#Split dataset into training and test sets
x_Train, x_Test, y_Train, y_Test = train_test_split(image_data_tens, label_tens, random_state=42, shuffle=True, test_size=0.2)
unique, counts = np.unique(y_Train, return_counts=True)
print("train data contains: " + str(dict(zip(unique, counts))))
unique, counts = np.unique(y_Test, return_counts=True)
print("test data contains: "+ str(dict(zip(unique, counts))))
print("Train set samples: "+ str(len(y_Train)))
print("Test set samples: "+ str(len(y_Test)))

train_data = list(zip(x_Train, y_Train))
test_data = list(zip(x_Test, y_Test))

train data contains: {0.0: 726, 1.0: 733, 2.0: 710, 3.0: 700, 4.0: 709, 5.0: 730, 6.0: 723, 7.0: 725, 8.0: 720, 9.0: 699, 10.0: 717, 11.0: 723, 12.0: 701, 13.0: 719, 14.0: 737, 15.0: 748, 16.0: 710, 17.0: 736, 18.0: 702, 19.0: 709, 20.0: 727, 21.0: 719, 22.0: 738, 23.0: 718, 24.0: 719, 25.0: 732, 26.0: 726, 27.0: 704}
test data contains: {0.0: 174, 1.0: 167, 2.0: 190, 3.0: 200, 4.0: 191, 5.0: 170, 6.0: 177, 7.0: 175, 8.0: 180, 9.0: 201, 10.0: 183, 11.0: 177, 12.0: 199, 13.0: 181, 14.0: 163, 15.0: 152, 16.0: 190, 17.0: 164, 18.0: 198, 19.0: 191, 20.0: 173, 21.0: 181, 22.0: 162, 23.0: 182, 24.0: 181, 25.0: 168, 26.0: 174, 27.0: 196}
Train set samples: 20160
Test set samples: 5040


In [161]:
del image_data_tens, label_tens

Model Hyperparameters

In [5]:
n_epochs = 50
train_batch_size = 64 
test_batch_size = 128
learning_rate = 0.001
conv_kernel_size = 5
pool_kernel_size = 2

Load data into DataLoader objects for training/test

In [None]:
# Create a sampler for train and test data
train_sampler = torch.randperm(len(train_data))
test_sampler = torch.randperm(len(test_data))

# Convert the output of torch.randperm to a CPU tensor (needs to be CPU for this)
train_sampler = train_sampler.cpu()
test_sampler = test_sampler.cpu()

train_loader = torch.utils.data.DataLoader(
    train_data, batch_size = train_batch_size, shuffle = False, sampler=train_sampler, num_workers = 24, pin_memory=True)

test_loader = torch.utils.data.DataLoader(
    test_data, batch_size = test_batch_size, shuffle=False, sampler=test_sampler, num_workers = 24, pin_memory=True)

Setup the model layers

In [None]:
class CNN(nn.Module):

  #__init__: Construct the layers in the model
  def __init__(self):
    super(CNN, self).__init__()

    # Convolutional layers
    self.conv1 = nn.Conv2d(1, 8, conv_kernel_size, padding=2, stride=1)
    self.conv2 = nn.Conv2d(8, 16, conv_kernel_size, padding=2, stride=1)
    self.conv3 = nn.Conv2d(16, 32, conv_kernel_size, padding=2, stride=1)
    self.conv4 = nn.Conv2d(32, 64, conv_kernel_size, padding=2, stride=1)
    self.conv5 = nn.Conv2d(64, 128, conv_kernel_size, padding=2, stride=1)

    # Pooling layer
    self.pool = nn.MaxPool2d(pool_kernel_size)

    # Calculate the size of the flattened tensor after the convolution and pooling layers
    self.flattened_size = 128 * (128 // (pool_kernel_size ** 3)) * (128 // (pool_kernel_size ** 3)) 

    # Fully connected layers
    self.fc1 = nn.Linear(self.flattened_size, 64)
    self.out = nn.Linear(64, 28)

    # Activation function layer
    self.relu = nn.ReLU()

    # Flatten layer
    self.flatten = nn.Flatten()

    # Dropout layers
    self.fc_drop = nn.Dropout(p=0.4)

    # Batch normalization layers (need unique ones for each size of layer)
    self.conv1_bn = nn.BatchNorm2d(8, affine=True)  # affine = True makes it learnable
    self.conv2_bn = nn.BatchNorm2d(16, affine=True)
    self.conv3_bn = nn.BatchNorm2d(32, affine=True)
    self.conv4_bn = nn.BatchNorm2d(64, affine=True)
    self.conv5_bn = nn.BatchNorm2d(128, affine=True)
    self.fc1_bn = nn.BatchNorm1d(64, affine=True)

  # forward: setup the layer order (forward pass)
  def forward(self, x):

    # First conv layer
    x = self.conv1(x)
    x = self.conv1_bn(x)
    x = self.relu(x)

    # Second conv & pooling layer
    x = self.conv2(x)
    x = self.conv2_bn(x)
    x = self.relu(x)
    x = self.pool(x)

    # Third conv layer
    x = self.conv3(x)
    x = self.conv3_bn(x)
    x = self.relu(x)

    # Fourth conv layer and pool
    x = self.conv4(x)
    x = self.conv4_bn(x)
    x = self.relu(x)
    x = self.pool(x)

    # FouFifthrth conv layer and pool
    x = self.conv5(x)
    x = self.conv5_bn(x)
    x = self.relu(x)
    x = self.pool(x)
    
    # Flatten input and feed to fully connected layers
    x = self.flatten(x)
    x = self.fc1(x)
    x = self.fc1_bn(x)
    x = self.fc_drop(x)
    x = self.relu(x)
    x = self.out(x)
    output = x

    # nn.CrossEntropyLoss() expects raw logits, so the softmax is included in the loss function.
    return output


In [None]:
#Instantiate a CNN model consructed above
cnn_model = CNN()
cnn_model.to(device) #move model to CUDA

#Print CNN model to view layers (only layer structure seen, not in forward defined format)
print(cnn_model)

#View summary of model (with full forward defined structure) and see all parameters
torchsummary.summary(cnn_model, input_size=(1, 128, 128))
#check model is assigned to cuda
print(next(cnn_model.parameters()).device)

Train the model, evaluate on test set and save metrics

In [None]:
#Define the loss function and optimizer
#should try an adjustable loss function eventually
cnn_model = cnn_model.to(device)
loss_f = nn.CrossEntropyLoss()
optimizer = optim.Adam(cnn_model.parameters(), lr = learning_rate)

In [173]:
def getClassAccuracy(preds, labels):
    preds = np.array(preds)
    labels = np.array(labels)
    classes = np.unique(np.array(labels, dtype=int)) #make keys ints 
    class_accuracies = {}

    for c in classes:
        indicies = np.where(labels == c)[0]  #Get indices where label == c
        correct = np.sum(preds[indicies] == labels[indicies])
        total = len(indicies)
        class_accuracies[c] = (correct / total) if total > 0 else 0.0  # Avoid division by zero

    return class_accuracies

def averageAccuracy(preds, labels):
    preds = np.array(preds)
    labels = np.array(labels)
    
    correct = np.sum(preds[:] == labels[:])
    total = len(preds)
    accuracy = (correct / total) if total > 0 else 0.0
    return accuracy

def classF1Score(preds, labels):
    score_array = sklearn.metrics.f1_score(labels, preds, average=None) 
    ave_f1_score = np.mean(score_array)  
    class_max = np.argmax(score_array)  
    class_min = np.argmin(score_array)  
    best_class_f1_score = np.max(score_array)
    worst_class_f1_score = np.min(score_array)
    
    return ave_f1_score, class_max, best_class_f1_score, class_min, worst_class_f1_score, score_array

In [None]:
test_loss = []
test_accuracy = []
test_f1_score = []
max_acc_class = []
best_class_acc = []
min_acc_class = []
worst_class_acc = []
max_f1_class = []
best_class_f1 = []
min_f1_class = []
worst_class_f1 = []
last_class_f1_score_array = []
last_class_accuracy_array = []

def test(model):
    #put the model into eval mode to disable grads and learning
    model.eval()
    with torch.no_grad():
        n_correct = 0
        n_total_samples = len(test_loader.dataset)
        all_predictions = []
        all_labels = []
        #Loop for testing image predictions vs labels
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device).long()  # Convert labels to long (int64) for CrossEntropyLoss
            outputs = model(images.float())
            probs = torch.softmax(outputs, dim=1)  # Convert logits to probabilities
            preds = torch.argmax(probs, dim=1)  # Get predicted class labels
            loss = loss_f(outputs, labels.long())
            n_correct += (preds == labels).sum().item()
            all_predictions.append(preds.tolist())
            all_labels.append(labels.tolist())
        
        #flatten lists to be just one list so we can get accuracy metrics
        all_predictions = [item for sublist in all_predictions for item in sublist]
        all_labels = [item for sublist in all_labels for item in sublist]
        #calculate and store metrics
        loss = loss_f(outputs, labels)
        test_loss.append(loss.item())
        class_accuracies = getClassAccuracy(np.array(all_predictions), np.array(all_labels))
        last_class_accuracy_array.append(class_accuracies)
        best_class = max(class_accuracies, key=class_accuracies.get)
        worst_class = min(class_accuracies, key=class_accuracies.get)
        max_acc_class.append(best_class)
        best_class_acc.append(class_accuracies[best_class])
        min_acc_class.append(worst_class)
        worst_class_acc.append(class_accuracies[worst_class])
        ave_accuracy = averageAccuracy(np.array(all_predictions), np.array(all_labels))
        test_accuracy.append(ave_accuracy)
        ave_f1_score, f1_class_max, best_class_f1_score, f1_class_min, worst_class_f1_score, score_array = classF1Score(np.array(all_predictions), np.array(all_labels))
        last_class_f1_score_array.append(score_array)
        test_f1_score.append(ave_f1_score)
        max_f1_class.append(f1_class_max)
        best_class_f1.append(best_class_f1_score)
        min_f1_class.append(f1_class_min)
        worst_class_f1.append(worst_class_f1_score)
        print(f"Test loss: {test_loss[-1]}") 
        print(f"Test average accuracy: {test_accuracy[-1]:.4f}, Max accuracy class: {max_acc_class[-1]} with {best_class_acc[-1]:.2f}, Min accuracy class: {min_acc_class[-1]} with {worst_class_acc[-1]:.2f}")
        print(f"Test average F1 score: {test_f1_score[-1]:.4f}, Max f1 class: {max_f1_class[-1]} with {best_class_f1[-1]:.2f}, Min f1 class: {min_f1_class[-1]} with {worst_class_f1[-1]:.2f}")
    #put the model back in train mode
    model.train()


Train Model

In [None]:
n_total_steps = len(train_loader)
training_loss = []
training_accuracy = []

# Loop iterates over the number of epochs specified
for epoch in range(n_epochs):
  cnn_model.train() 

  # Loop iterates over the training loader
  for i, (images, labels) in enumerate(train_loader):
    images = images.to(device)
    labels = labels.to(device).long()
    
    # Forward pass and loss calculation
    outputs = cnn_model(images.float())
    probs = torch.softmax(outputs, dim=1)  # Convert output logits to probabilities
    preds = torch.argmax(probs, dim=1)  # Get predicted class labels
    loss = loss_f(outputs, labels)

    # Backward path and optimize
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the loss every batch size updates to visualize training
    if (i + 1) % n_total_steps == 0:
      print(f'Epoch [{epoch + 1}/{n_epochs}], Step [{i + 1}/{n_total_steps}], Loss: {loss.item():.4f}, Time: {datetime.datetime.now().hour}:{datetime.datetime.now().minute}')

  training_loss.append(loss.item())
  train_acc = averageAccuracy(preds.cpu().numpy(), labels.cpu().numpy())
  training_accuracy.append(train_acc)
  test(cnn_model)  # run the test dataset each epoch iteration


Plot the metrics for evaluation

In [None]:
print("===============================================")
print("TEST EVALUATION RESULTS")
print("===============================================")
print("Final Training Loss: " + str(training_loss[-1])) 
print("Overall Test Loss: " + str(test_loss[-1]))
print("Overall Test Accuracy: " + str(test_accuracy[-1]*100) + "%")
print("Macro F1 Score: " + str(test_f1_score[-1]))
print("===============================================")
print("CLASS PERFORMANCE BREAKDOWN")
print("===============================================")
print(f"Best performing class (accuracy): Class {max_acc_class[-1]} with {best_class_acc[-1]*100:2f}% accuracy")
print(f"Worst performing class (accuracy): Class {(min_acc_class[-1])} with {(worst_class_acc[-1]*100):2f}% accuracy")
print(f"Best performing class (F1): Class {(max_f1_class[-1])} with {(best_class_acc[-1]):2f} F1 score")
print(f"Worst performing class (F1): Class {(min_f1_class[-1])} with {(worst_class_acc[-1]):2f} F1 score")
print("===============================================")

class_accuracy_data = list(last_class_accuracy_array[-1].values())
class_f1_data = last_class_f1_score_array[-1]

#get all labels from the dataset so I can pull unique values later
all_labels = []
for _, labels in train_loader:  
    all_labels.extend(labels.numpy())  
all_labels = np.array(all_labels)

#make a loss plot
plt.figure()
plt.plot(test_loss, label="Test Loss")
plt.plot(training_loss, label="Training Loss")
plt.title("Training vs Test Loss over Epochs")
plt.legend()
plt.show()

#make an accuracy plot
plt.figure()
plt.plot(test_accuracy, label="Test Accuracy")
plt.plot(training_accuracy, label="Training Accuracy")
plt.title("Model Accuracy Over Epochs")
plt.legend()
plt.show()

#Make a class performance plot
plt.figure()
x = np.arange(len(np.unique(all_labels))) * 5 # X locations for the groups
width = 1.5  # Width of the bars
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x - width/2, class_accuracy_data, width, label='Accuracy')
ax.bar(x + width/2, class_f1_data, width, label='F1 Score')
# Labels, title, and legend
ax.set_xticks(x)
ax.set_xticklabels(np.unique(all_labels).astype(int))
ax.set_xlabel('Class')
ax.set_ylabel('Performance')
ax.set_title('Per-Class Performance (Final Epoch)')
ax.legend()
plt.show()