# Gesture Recognition
In this group project, you are going to build a 3D Conv model that will be able to predict the 5 gestures correctly. Please import the following libraries to get started.

In [23]:
import numpy as np
import os
import time
import pandas as pd
import datetime
import time
import warnings
warnings.filterwarnings("ignore")

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Dense, GRU, Flatten, TimeDistributed, BatchNormalization, Activation, Dropout, Conv3D, MaxPooling3D, Conv2D, MaxPooling2D, SimpleRNN, LSTM
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, Callback
from tensorflow.keras.optimizers import Adam

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Reshape
from tensorflow.keras.regularizers import l2

from skimage.transform import resize  # Use skimage for image resizing
import imageio
from imageio import imread  # Use imageio for reading images
from PIL import Image
import cv2

We set the random seed so that the results don't vary drastically.

In [2]:
np.random.seed(30)
import random as rn
rn.seed(30)
from keras import backend as K
import tensorflow as tf
tf.random.set_seed(30)

In this block, you read the folder names for training and validation. You also set the `batch_size` here. Note that you set the batch size in such a way that you are able to use the GPU in full capacity. You keep increasing the batch size until the machine throws an error.

In [3]:
train_doc = np.random.permutation(open('C:\\Users\\Vinay Joshi\\Documents\\PGD AI\\5. Gesture Recognition Project\\Project_data\\train.csv').readlines())
val_doc = np.random.permutation(open('C:\\Users\\Vinay Joshi\\Documents\\PGD AI\\5. Gesture Recognition Project\\Project_data\\val.csv').readlines())

train_path = 'C:\\Users\\Vinay Joshi\\Documents\\PGD AI\\5. Gesture Recognition Project\\Project_data\\train'
val_path = 'C:\\Users\\Vinay Joshi\\Documents\\PGD AI\\5. Gesture Recognition Project\\Project_data\\val'

# Define the model directory
model_dir = 'C:\\Users\\Vinay Joshi\\Documents\\PGD AI\\5. Gesture Recognition Project\\Bestmodels'

# Define filepaths for all models
model_conv3D_filepath = model_dir + '\\' + 'BestModelConv3D.keras'
model_2Drnn_filepath = model_dir + '\\' + 'BestModelConv2D_RNN.keras'
model_2Dlstm_filepath = model_dir + '\\' + 'BestModelConv2D_LSTM.keras'
model_2Dgru_filepath = model_dir + '\\' + 'BestModelConv2D_GRU.keras'
model_3Drnn_filepath = model_dir + '\\' + 'BestModelConv3D_RNN.keras'
model_3Dlstm_filepath = model_dir + '\\' + 'BestModelConv3D_LSTM.keras'
model_3Dgru_filepath = model_dir + '\\' + 'BestModelConv3D_GRU.keras'

batch_size = 32
num_epochs = 30 # choose the number of epochs

# Defining input dimensions for the model
num_frames = 16  # x, number of frames
img_height = 64  # y, image height
img_width = 64   # z, image width

## Generator
This is one of the most important part of the code. The overall structure of the generator has been given. In the generator, you are going to preprocess the images as you have images of 2 different dimensions as well as create a batch of video frames. You have to experiment with `img_idx`, `y`,`z` and normalization such that you get high accuracy.

In [4]:

def generator(source_path, folder_list, batch_size, x, y, z):
    print( 'Source path = ', source_path, '; batch size =', batch_size)
    #img_idx = #create a list of image numbers you want to use for a particular video
  
    img_idx = np.linspace(0, 29, x, dtype=int)  # Select 10 frames from 30

    while True:
        t = np.random.permutation(folder_list)
        num_batches = len(folder_list) // batch_size # calculate the number of batches
        for batch in range(num_batches): # we iterate over the number of batches
            batch_data = np.zeros((batch_size,x,y,z,3)) # x is the number of images you use for each video, (y,z) is the final size of the input images and 3 is the number of channels RGB
            batch_labels = np.zeros((batch_size,5)) # batch_labels is the one hot representation of the output
            for folder in range(batch_size): # iterate over the batch_size
                imgs = os.listdir(source_path+'/'+ t[folder + (batch*batch_size)].split(';')[0]) # read all the images in the folder
                for idx,item in enumerate(img_idx): #  Iterate iver the frames/images of a folder to read them in
                    image = imread(source_path+'/'+ t[folder + (batch*batch_size)].strip().split(';')[0]+'/'+imgs[item]).astype(np.float32)
                    
                    #crop the images and resize them. Note that the images are of 2 different shape 
                    #and the conv3D will throw error if the inputs in a batch have different shapes

                    # Crop the image
                    h, w = image.shape[:2]
                    min_dim = min(h, w)
                    crop_img = image[(h//2 - min_dim//2):(h//2 + min_dim//2), (w//2 - min_dim//2):(w//2 + min_dim//2)]
                    
                    # Resize the image
                    resized_image = resize(crop_img, (y, z), anti_aliasing=True)
                    
                    # Normalize the image
                    resized_image /= 255.0                    
                    
                    batch_data[folder,idx,:,:,0] = resized_image[:, :, 0] #normalise and feed in the image
                    batch_data[folder,idx,:,:,1] = resized_image[:, :, 1] #normalise and feed in the image
                    batch_data[folder,idx,:,:,2] = resized_image[:, :, 2] #normalise and feed in the image
                    
                batch_labels[folder, int(t[folder + (batch*batch_size)].strip().split(';')[2])] = 1
            yield batch_data, batch_labels #you yield the batch_data and the batch_labels, remember what does yield do

        
        # write the code for the remaining data points which are left after full batches
        if len(folder_list) % batch_size != 0:
            batch_data = np.zeros((len(folder_list) % batch_size, len(img_idx), y, z, 3))
            batch_labels = np.zeros((len(folder_list) % batch_size, 5))
            
            for folder in range(len(folder_list) % batch_size):
                imgs = os.listdir(source_path + '/' + t[folder + (num_batches * batch_size)].split(';')[0])
                
                for idx, item in enumerate(img_idx):
                    image = imread(source_path + '/' + t[folder + (num_batches * batch_size)].strip().split(';')[0] + '/' + imgs[item]).astype(np.float32)
                    
                    # Crop the image
                    h, w = image.shape[:2]
                    min_dim = min(h, w)
                    crop_img = image[(h//2 - min_dim//2):(h//2 + min_dim//2), (w//2 - min_dim//2):(w//2 + min_dim//2)]
                    
                    # Resize the image
                    resized_image = resize(crop_img, (y, z), anti_aliasing=True)
                    
                    # Normalize the image
                    resized_image /= 255.0
                    
                    # Store in batch_data
                    batch_data[folder, idx, :, :, 0] = resized_image[:, :, 0]
                    batch_data[folder, idx, :, :, 1] = resized_image[:, :, 1]
                    batch_data[folder, idx, :, :, 2] = resized_image[:, :, 2]
                
                # One-hot encode labels
                batch_labels[folder, int(t[folder + (num_batches * batch_size)].strip().split(';')[2])] = 1
            
            yield batch_data, batch_labels

Note here that a video is represented above in the generator as (number of images, height, width, number of channels). Take this into consideration while creating the model architecture.

In [5]:
curr_dt_time = datetime.datetime.now()

num_train_sequences = len(train_doc)
print('# training sequences =', num_train_sequences)
num_val_sequences = len(val_doc)
print('# validation sequences =', num_val_sequences)
print ('# epochs =', num_epochs)

# training sequences = 663
# validation sequences = 100
# epochs = 30


Let us create the `train_generator` and the `val_generator` which will be used in `.fit_generator`.

In [6]:
train_generator = generator(train_path, train_doc, batch_size, x=num_frames, y=img_height, z=img_width)
val_generator = generator(val_path, val_doc, batch_size, x=num_frames, y=img_height, z=img_width)

In [7]:
# Create the model directory if it doesn't exist
if not os.path.exists(model_dir):
    os.mkdir(model_dir)

The `steps_per_epoch` and `validation_steps` are used by `fit_generator` to decide the number of next() calls it need to make.

In [8]:
if (num_train_sequences%batch_size) == 0:
    steps_per_epoch = int(num_train_sequences/batch_size)
else:
    steps_per_epoch = (num_train_sequences//batch_size) + 1

if (num_val_sequences%batch_size) == 0:
    validation_steps = int(num_val_sequences/batch_size)
else:
    validation_steps = (num_val_sequences//batch_size) + 1

# Model
Here you make the model using different functionalities that Keras provides. Remember to use `Conv3D` and `MaxPooling3D` and not `Conv2D` and `Maxpooling2D` for a 3D convolution model. You would want to use `TimeDistributed` while building a Conv2D + RNN model. Also remember that the last layer is the softmax. Design the network in such a way that the model is able to give good accuracy on the least number of parameters so that it can fit in the memory of the webcam.

## Model 1
A basic Conv3D model with three 3D convolutional layers (32, 64, 128 filters) followed by fully connected layers. MaxPooling3D is applied after each convolution to reduce spatial dimensions.

In [9]:
model_conv3D = Sequential()

# 1st Conv3D layer
model_conv3D.add(Conv3D(filters=32, kernel_size=(3, 3, 3), 
                 input_shape=(num_frames, img_height, img_width, 3),  # Using variables
                 padding='same', activation='relu'))
model_conv3D.add(MaxPooling3D(pool_size=(2, 2, 2)))

# 2nd Conv3D layer
model_conv3D.add(Conv3D(filters=64, kernel_size=(3, 3, 3), padding='same', activation='relu'))
model_conv3D.add(MaxPooling3D(pool_size=(2, 2, 2)))

# 3rd Conv3D layer
model_conv3D.add(Conv3D(filters=128, kernel_size=(3, 3, 3), padding='same', activation='relu'))
model_conv3D.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Flatten the output for the Dense layer
model_conv3D.add(Flatten())

# Fully connected layer
model_conv3D.add(Dense(512, activation='relu'))
model_conv3D.add(Dropout(0.5))

# Output layer (Softmax for classification)
model_conv3D.add(Dense(5, activation='softmax'))  # 5 output classes

# Compile the model
model_conv3D.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint = ModelCheckpoint(model_conv3D_filepath, monitor='val_categorical_accuracy', verbose=1, save_best_only=True, 
                             save_weights_only=False, mode='max')  # Save as .keras

# Define ReduceLROnPlateau callback to reduce learning rate when validation loss plateaus
LR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_lr=1e-6)

# Add EarlyStopping to monitor val_categorical_accuracy
early_stopping = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# Add all callbacks to the list
callbacks_list = [checkpoint, LR, early_stopping]

# Train the model
model_conv3D.fit(train_generator, steps_per_epoch=steps_per_epoch, epochs=num_epochs, verbose=1, 
                 callbacks=callbacks_list, validation_data=val_generator, 
                 validation_steps=validation_steps, class_weight=None, initial_epoch=0)


Source path =  C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Project_data\train ; batch size = 32
Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - categorical_accuracy: 0.1721 - loss: 1.9838Source path =  C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Project_data\val ; batch size = 32

Epoch 1: val_categorical_accuracy improved from -inf to 0.23000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 4s/step - categorical_accuracy: 0.1731 - loss: 1.9734 - val_categorical_accuracy: 0.2300 - val_loss: 1.5611 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - categorical_accuracy: 0.2994 - loss: 1.5510
Epoch 2: val_categorical_accuracy improved from 0.23000 to 0.44000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 3s/step - categorical_accuracy: 0.9868 - loss: 0.0391 - val_categorical_accuracy: 0.9100 - val_loss: 0.4670 - learning_rate: 2.0000e-04
Epoch 19/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - categorical_accuracy: 0.9894 - loss: 0.0265
Epoch 19: val_categorical_accuracy did not improve from 0.91000

Epoch 19: ReduceLROnPlateau reducing learning rate to 4.0000001899898055e-05.
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 3s/step - categorical_accuracy: 0.9895 - loss: 0.0264 - val_categorical_accuracy: 0.9000 - val_loss: 0.4526 - learning_rate: 2.0000e-04
Epoch 20/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - categorical_accuracy: 0.9935 - loss: 0.0322
Epoch 20: val_categorical_accuracy did not improve from 0.91000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 3s/step - categorical_accuracy: 0.9935 - loss: 0.0319 - val_catego

<keras.src.callbacks.history.History at 0x203eae096c0>

## Model 2
This model uses TimeDistributed Conv2D layers (32, 64 filters) for spatial feature extraction followed by a SimpleRNN with 64 units to handle temporal sequences. MaxPooling2D is applied after each Conv2D layer.

In [10]:
# Conv2D + RNN Model
model_rnn = Sequential()

# TimeDistributed Conv2D layers
model_rnn.add(TimeDistributed(Conv2D(32, (3, 3), activation='relu', padding='same'), 
                              input_shape=(num_frames, img_height, img_width, 3)))
model_rnn.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
model_rnn.add(TimeDistributed(Conv2D(64, (3, 3), activation='relu', padding='same')))
model_rnn.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
model_rnn.add(TimeDistributed(Flatten()))  # Flatten before passing to RNN

# SimpleRNN layer
model_rnn.add(SimpleRNN(64, return_sequences=False))

# Dense layers
model_rnn.add(Dense(512, activation='relu'))
model_rnn.add(Dropout(0.5))
model_rnn.add(Dense(5, activation='softmax'))  # Output layer

# Compile the model
model_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint_rnn = ModelCheckpoint(model_2Drnn_filepath, monitor='val_categorical_accuracy', verbose=1, 
                                 save_best_only=True, save_weights_only=False, mode='max')

# Define ReduceLROnPlateau callback to reduce learning rate when validation loss plateaus
LR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_lr=1e-6)

# Add EarlyStopping to monitor val_categorical_accuracy
early_stopping_rnn = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# Add all callbacks to the list
callbacks_rnn_list = [checkpoint_rnn, LR, early_stopping_rnn]

# Train the model with EarlyStopping
model_rnn.fit(train_generator, steps_per_epoch=steps_per_epoch, epochs=num_epochs, verbose=1, 
              callbacks=callbacks_rnn_list, validation_data=val_generator, validation_steps=validation_steps)


Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - categorical_accuracy: 0.2079 - loss: 1.6718
Epoch 1: val_categorical_accuracy improved from -inf to 0.22000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv2D_RNN.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 5s/step - categorical_accuracy: 0.2080 - loss: 1.6726 - val_categorical_accuracy: 0.2200 - val_loss: 1.6153 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - categorical_accuracy: 0.2234 - loss: 1.6650
Epoch 2: val_categorical_accuracy did not improve from 0.22000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 4s/step - categorical_accuracy: 0.2229 - loss: 1.6644 - val_categorical_accuracy: 0.2200 - val_loss: 1.5773 - learning_rate: 0.0010
Epoch 3/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - categorical_a

<keras.src.callbacks.history.History at 0x20392287fd0>

## Model 3
Combines TimeDistributed Conv2D layers (32, 64 filters) for spatial features with an LSTM layer (64 units) for temporal dependencies. MaxPooling2D is used after each Conv2D layer, and the output is flattened before passing to the LSTM.

In [11]:
# Conv2D + LSTM Model
model_lstm = Sequential()

# TimeDistributed Conv2D layers
model_lstm.add(TimeDistributed(Conv2D(32, (3, 3), activation='relu', padding='same'), 
                               input_shape=(num_frames, img_height, img_width, 3)))
model_lstm.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
model_lstm.add(TimeDistributed(Conv2D(64, (3, 3), activation='relu', padding='same')))
model_lstm.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
model_lstm.add(TimeDistributed(Flatten()))  # Flatten before passing to LSTM

# LSTM layer
model_lstm.add(LSTM(64, return_sequences=False))

# Dense layers
model_lstm.add(Dense(512, activation='relu'))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(5, activation='softmax'))  # Output layer

# Compile the model
model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint_lstm = ModelCheckpoint(model_2Dlstm_filepath, monitor='val_categorical_accuracy', verbose=1, 
                                  save_best_only=True, save_weights_only=False, mode='max')

# Define ReduceLROnPlateau callback to reduce learning rate when validation loss plateaus
LR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_lr=1e-6)

# Add EarlyStopping to monitor val_categorical_accuracy
early_stopping_lstm = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# Add all callbacks to the list
callbacks_lstm_list = [checkpoint_lstm, LR, early_stopping_lstm]

# Train the model with EarlyStopping
model_lstm.fit(train_generator, steps_per_epoch=steps_per_epoch, epochs=num_epochs, verbose=1, 
               callbacks=callbacks_lstm_list, validation_data=val_generator, validation_steps=validation_steps)


Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - categorical_accuracy: 0.2010 - loss: 1.6405
Epoch 1: val_categorical_accuracy improved from -inf to 0.27000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv2D_LSTM.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 4s/step - categorical_accuracy: 0.2023 - loss: 1.6399 - val_categorical_accuracy: 0.2700 - val_loss: 1.5677 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - categorical_accuracy: 0.3125 - loss: 1.5425
Epoch 2: val_categorical_accuracy improved from 0.27000 to 0.69000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv2D_LSTM.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 4s/step - categorical_accuracy: 0.3146 - loss: 1.5406 - val_categorical_accuracy: 0.6900 - val_loss: 

<keras.src.callbacks.history.History at 0x203b867e8f0>

## Model 4
This architecture uses TimeDistributed Conv2D layers (32, 64 filters) followed by a GRU layer with 64 units. MaxPooling2D is applied after each Conv2D layer, and the output is flattened before the GRU layer.

In [12]:
# Conv2D + GRU Model
model_gru = Sequential()

# TimeDistributed Conv2D layers
model_gru.add(TimeDistributed(Conv2D(32, (3, 3), activation='relu', padding='same'), 
                              input_shape=(num_frames, img_height, img_width, 3)))
model_gru.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
model_gru.add(TimeDistributed(Conv2D(64, (3, 3), activation='relu', padding='same')))
model_gru.add(TimeDistributed(MaxPooling2D(pool_size=(2, 2))))
model_gru.add(TimeDistributed(Flatten()))  # Flatten before passing to GRU

# GRU layer
model_gru.add(GRU(64, return_sequences=False))

# Dense layers
model_gru.add(Dense(512, activation='relu'))
model_gru.add(Dropout(0.5))
model_gru.add(Dense(5, activation='softmax'))  # Output layer

# Compile the model
model_gru.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint_gru = ModelCheckpoint(model_2Dgru_filepath, monitor='val_categorical_accuracy', verbose=1, 
                                 save_best_only=True, save_weights_only=False, mode='max')

# Define ReduceLROnPlateau callback to reduce learning rate when validation loss plateaus
LR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_lr=1e-6)

# Add EarlyStopping to monitor val_categorical_accuracy
early_stopping_gru = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# Add all callbacks to the list
callbacks_gru_list = [checkpoint_gru, LR, early_stopping_gru]

# Train the model with EarlyStopping
model_gru.fit(train_generator, steps_per_epoch=steps_per_epoch, epochs=num_epochs, verbose=1, 
              callbacks=callbacks_gru_list, validation_data=val_generator, validation_steps=validation_steps)


Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - categorical_accuracy: 0.2085 - loss: 1.6191
Epoch 1: val_categorical_accuracy improved from -inf to 0.28000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv2D_GRU.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 4s/step - categorical_accuracy: 0.2096 - loss: 1.6181 - val_categorical_accuracy: 0.2800 - val_loss: 1.4887 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - categorical_accuracy: 0.4015 - loss: 1.4391
Epoch 2: val_categorical_accuracy improved from 0.28000 to 0.57000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv2D_GRU.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 4s/step - categorical_accuracy: 0.4040 - loss: 1.4356 - val_categorical_accuracy: 0.5700 - val_loss: 1.

<keras.src.callbacks.history.History at 0x203ca63fbb0>

## Model 5
A Conv3D model with two 3D convolutional layers (32, 64 filters) followed by a SimpleRNN with 64 units. MaxPooling3D reduces spatial dimensions after each convolution, and the output is flattened before the RNN.

In [13]:
# Conv3D + SimpleRNN Model
model_conv3D_rnn = Sequential()

# Conv3D layers
model_conv3D_rnn.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same', 
                            input_shape=(num_frames, img_height, img_width, 3)))
model_conv3D_rnn.add(MaxPooling3D(pool_size=(2, 2, 2)))
model_conv3D_rnn.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same'))
model_conv3D_rnn.add(MaxPooling3D(pool_size=(2, 2, 2)))

# TimeDistributed(Flatten()) to keep the time dimension intact
model_conv3D_rnn.add(TimeDistributed(Flatten()))  # Flatten the spatial dimensions but keep the time axis

# SimpleRNN layer
model_conv3D_rnn.add(SimpleRNN(64, return_sequences=False))

# Fully connected layers
model_conv3D_rnn.add(Dense(512, activation='relu'))
model_conv3D_rnn.add(Dropout(0.5))
model_conv3D_rnn.add(Dense(5, activation='softmax'))  # Output layer

# Compile the model
model_conv3D_rnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint_rnn = ModelCheckpoint(model_3Drnn_filepath, monitor='val_categorical_accuracy', verbose=1, 
                                 save_best_only=True, save_weights_only=False, mode='max')

# Define ReduceLROnPlateau callback to reduce learning rate when validation loss plateaus
LR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_lr=1e-6)

# Add EarlyStopping to monitor val_categorical_accuracy
early_stopping_rnn = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# Add all callbacks to the list
callbacks_rnn_list = [checkpoint_rnn, LR, early_stopping_rnn]

# Train the model with EarlyStopping
model_conv3D_rnn.fit(train_generator, 
                     steps_per_epoch=steps_per_epoch, 
                     epochs=num_epochs, 
                     verbose=1, 
                     callbacks=callbacks_rnn_list,  # Using ModelCheckpoint, ReduceLROnPlateau, and EarlyStopping callbacks
                     validation_data=val_generator, 
                     validation_steps=validation_steps)


Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - categorical_accuracy: 0.2172 - loss: 1.7549
Epoch 1: val_categorical_accuracy improved from -inf to 0.23000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_RNN.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 3s/step - categorical_accuracy: 0.2161 - loss: 1.7537 - val_categorical_accuracy: 0.2300 - val_loss: 1.5981 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - categorical_accuracy: 0.2350 - loss: 1.5975
Epoch 2: val_categorical_accuracy improved from 0.23000 to 0.43000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_RNN.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 3s/step - categorical_accuracy: 0.2373 - loss: 1.5947 - val_categorical_accuracy: 0.4300 - val_loss: 1.

<keras.src.callbacks.history.History at 0x205ef61e740>

## Model 6
Combines Conv3D layers (32, 64 filters) for spatial-temporal feature extraction with an LSTM layer (64 units). MaxPooling3D is applied after each convolution, followed by a TimeDistributed Flatten layer and the LSTM.

In [14]:
# Conv3D + LSTM Model
model_conv3D_lstm = Sequential()

# Conv3D layers
model_conv3D_lstm.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same', 
                             input_shape=(num_frames, img_height, img_width, 3)))
model_conv3D_lstm.add(MaxPooling3D(pool_size=(2, 2, 2)))
model_conv3D_lstm.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same'))
model_conv3D_lstm.add(MaxPooling3D(pool_size=(2, 2, 2)))

# TimeDistributed(Flatten()) to keep the time dimension intact
model_conv3D_lstm.add(TimeDistributed(Flatten()))  # Flatten the spatial dimensions but keep the time axis

# LSTM layer
model_conv3D_lstm.add(LSTM(64, return_sequences=False))

# Fully connected layers
model_conv3D_lstm.add(Dense(512, activation='relu'))
model_conv3D_lstm.add(Dropout(0.5))
model_conv3D_lstm.add(Dense(5, activation='softmax'))  # Output layer

# Compile the model
model_conv3D_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint_lstm = ModelCheckpoint(model_3Dlstm_filepath, monitor='val_categorical_accuracy', verbose=1, 
                                  save_best_only=True, save_weights_only=False, mode='max')

# Define ReduceLROnPlateau callback to reduce learning rate when validation loss plateaus
LR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_lr=1e-6)

# Add EarlyStopping to monitor val_categorical_accuracy
early_stopping_lstm = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# Add all callbacks to the list
callbacks_lstm_list = [checkpoint_lstm, LR, early_stopping_lstm]

# Train the model with EarlyStopping
model_conv3D_lstm.fit(train_generator, 
                      steps_per_epoch=steps_per_epoch, 
                      epochs=num_epochs, 
                      verbose=1, 
                      callbacks=callbacks_lstm_list, 
                      validation_data=val_generator, 
                      validation_steps=validation_steps)


Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - categorical_accuracy: 0.2163 - loss: 1.6203
Epoch 1: val_categorical_accuracy improved from -inf to 0.28000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_LSTM.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 3s/step - categorical_accuracy: 0.2181 - loss: 1.6191 - val_categorical_accuracy: 0.2800 - val_loss: 1.4614 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - categorical_accuracy: 0.3392 - loss: 1.4849
Epoch 2: val_categorical_accuracy improved from 0.28000 to 0.49000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_LSTM.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 3s/step - categorical_accuracy: 0.3433 - loss: 1.4805 - val_categorical_accuracy: 0.4900 - val_loss: 

<keras.src.callbacks.history.History at 0x205ed316e30>

## Model 7
Similar to Conv3D + LSTM, this model uses Conv3D layers (32, 64 filters) but replaces LSTM with a GRU layer (64 units). MaxPooling3D is applied after each convolution, followed by a TimeDistributed Flatten layer and the GRU.

In [15]:
# Conv3D + GRU Model
model_conv3D_gru = Sequential()

# Conv3D layers
model_conv3D_gru.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same', 
                            input_shape=(num_frames, img_height, img_width, 3)))
model_conv3D_gru.add(MaxPooling3D(pool_size=(2, 2, 2)))
model_conv3D_gru.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same'))
model_conv3D_gru.add(MaxPooling3D(pool_size=(2, 2, 2)))

# TimeDistributed(Flatten()) to keep the time dimension intact
model_conv3D_gru.add(TimeDistributed(Flatten()))  # Flatten the spatial dimensions but keep the time axis

# GRU layer (expects 3D input: (batch_size, timesteps, features))
model_conv3D_gru.add(GRU(64, return_sequences=False))

# Fully connected layers
model_conv3D_gru.add(Dense(512, activation='relu'))
model_conv3D_gru.add(Dropout(0.5))
model_conv3D_gru.add(Dense(5, activation='softmax'))  # Output layer

# Compile the model
model_conv3D_gru.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint_gru = ModelCheckpoint(model_3Dgru_filepath, monitor='val_categorical_accuracy', verbose=1, 
                                 save_best_only=True, save_weights_only=False, mode='max')

# Define ReduceLROnPlateau callback to reduce learning rate when validation loss plateaus
LR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_lr=1e-6)

# Add EarlyStopping to monitor val_categorical_accuracy
early_stopping_gru = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# Add all callbacks to the list
callbacks_gru_list = [checkpoint_gru, LR, early_stopping_gru]

# Train the model with EarlyStopping
model_conv3D_gru.fit(train_generator, 
                     steps_per_epoch=steps_per_epoch, 
                     epochs=num_epochs, 
                     verbose=1, 
                     callbacks=callbacks_gru_list, 
                     validation_data=val_generator, 
                     validation_steps=validation_steps)


Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - categorical_accuracy: 0.1967 - loss: 1.6712
Epoch 1: val_categorical_accuracy improved from -inf to 0.31000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_GRU.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 3s/step - categorical_accuracy: 0.1975 - loss: 1.6701 - val_categorical_accuracy: 0.3100 - val_loss: 1.5180 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - categorical_accuracy: 0.3314 - loss: 1.4977
Epoch 2: val_categorical_accuracy improved from 0.31000 to 0.47000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_GRU.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 3s/step - categorical_accuracy: 0.3341 - loss: 1.4936 - val_categorical_accuracy: 0.4700 - val_loss: 1.

<keras.src.callbacks.history.History at 0x205ec7cebf0>

### Let's evaluate the models we've built till now on test data and compare thier soze, time to test and test accuracy

In [16]:
# Dictionary to store the model names, accuracies, model sizes, and inference times
model_accuracy_summary = {
    'Model': [],
    'Train Acc': [],
    'Test Acc': [],
    'Model Size (MB)': [],
    'Test Time (sec)': []
}

# Calculate the number of test sequences and steps
num_test_sequences = len(val_doc)
if (num_test_sequences % batch_size) == 0:
    test_steps = int(num_test_sequences / batch_size)
else:
    test_steps = (num_test_sequences // batch_size) + 1
    
# Function to evaluate the model and measure inference time on test data
def evaluate_model(model_filepath, model_name):
    # Load the model
    model = load_model(model_filepath)

    # Evaluate on training data
    train_loss, train_accuracy = model.evaluate(train_generator, steps=steps_per_epoch, verbose=1)

    # Measure inference time on test data
    start_time = time.time()
    test_loss, test_accuracy = model.evaluate(val_generator, steps=test_steps, verbose=1)
    end_time = time.time()

    # Calculate inference time
    inference_time = end_time - start_time

    # Calculate the model size in MB (formatted without decimals)
    model_size = int(os.path.getsize(model_filepath) / (1024 * 1024))  # Convert from bytes to MB

    # Append the results to the summary dictionary
    model_accuracy_summary['Model'].append(model_name)
    model_accuracy_summary['Train Acc'].append(train_accuracy)
    model_accuracy_summary['Test Acc'].append(test_accuracy)
    model_accuracy_summary['Model Size (MB)'].append(model_size)
    model_accuracy_summary['Test Time (sec)'].append(f"{inference_time:.1f}")  # Format to 1 digit

    # Print the results for the current model
    print(f"Model: {model_name}")
    print(f"Train Accuracy: {train_accuracy}")
    print(f"Test Accuracy: {test_accuracy}")
    print(f"Model Size: {model_size} MB")  # No digits after the decimal point
    print(f"Test Inference Time: {inference_time:.1f} sec")  # 1 digit after decimal point
    print("="*50)


# Evaluate the Conv3D model
evaluate_model(model_conv3D_filepath, 'Conv3D')

# Evaluate the Conv2D + RNN (SimpleRNN) model
evaluate_model(model_2Drnn_filepath, 'Conv2D + RNN')

# Evaluate the Conv2D + LSTM model
evaluate_model(model_2Dlstm_filepath, 'Conv2D + LSTM')

# Evaluate the Conv2D + GRU model
evaluate_model(model_2Dgru_filepath, 'Conv2D + GRU')

# Evaluate the Conv3D + RNN (SimpleRNN) model
evaluate_model(model_3Drnn_filepath, 'Conv3D + RNN')

# Evaluate the Conv3D + LSTM model
evaluate_model(model_3Dlstm_filepath, 'Conv3D + LSTM')

# Evaluate the Conv3D + GRU model
evaluate_model(model_3Dgru_filepath, 'Conv3D + GRU')

# Print the summary in a table format using pandas
summary_df = pd.DataFrame(model_accuracy_summary)
print("\nSummary of Model Performance:\n")
print(summary_df)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 2s/step - categorical_accuracy: 1.0000 - loss: 0.0090
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - categorical_accuracy: 0.9379 - loss: 0.3880  
Model: Conv3D
Train Accuracy: 1.0
Test Accuracy: 0.9100000262260437
Model Size: 99 MB
Test Inference Time: 10.2 sec
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 3s/step - categorical_accuracy: 0.9923 - loss: 0.0654
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - categorical_accuracy: 0.7563 - loss: 0.7149 
Model: Conv2D + RNN
Train Accuracy: 0.9894419312477112
Test Accuracy: 0.75
Model Size: 12 MB
Test Inference Time: 13.1 sec
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 3s/step - categorical_accuracy: 1.0000 - loss: 0.0028
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1s/step - categorical_accuracy: 0.7592 - loss: 0.9054  
Model: Conv2D + LSTM
Train Accuracy: 1.0
Test Accur

After running the first 7 models, I evaluated their performance based on three primary factors: test accuracy, model size, and inference time. The goal was to select models that could offer a good balance between these metrics, keeping in mind the constraints of deploying the model in a real-time environment like a webcam, where time to test and model size are crucial considerations.

Out of the initial 7 models, the Conv3D + RNN, Conv3D + LSTM, and Conv3D + GRU models stood out as candidates for further experimentation. These models offered higher accuracy compared to their 2D counterparts and demonstrated better handling of spatiotemporal data. They also featured relatively small sizes and inference times, making them suitable for real-time use. We selected these three models because:

They provided a good combination of accuracy and efficiency.
They leveraged 3D convolutions, which are better suited for video data.
The RNN, LSTM, and GRU units handled temporal dependencies effectively.

In [39]:
#Defines the path for saving the models
model_rnn_extra_filepath = model_dir + '\\' + 'BestModelConv3D_RNN_Extra.keras'
model_lstm_extra_filepath = model_dir + '\\' + 'BestModelConv3D_LSTM_Extra.keras'
model_gru_extra_filepath = model_dir + '\\' + 'BestModelConv3D_GRU_Extra.keras'
model_rnn_optimized1_filepath = model_dir + '\\' + 'BestModelConv3D_RNN_Optimized2.keras'
model_rnn_optimized2_filepath = model_dir + '\\' + 'BestModelConv3D_RNN_Optimized1.keras'

In [25]:
batch_size = 32

# Defining input dimensions for the model
num_frames = 16  # x, number of frames

#Increase the image size
img_height = 128  # y, image height
img_width = 128   # z, image width

In [40]:
#Generate the train and validation generators againto accomodate for the increase in the input image size
train_generator = generator(source_path=train_path, folder_list=train_doc, batch_size=batch_size, x=num_frames, y=img_height, z=img_width)
val_generator = generator(source_path=val_path, folder_list=val_doc, batch_size=batch_size, x=num_frames, y=img_height, z=img_width)


## Model 8
This model adds an additional Conv3D layer (64 filters) to the Conv3D + RNN architecture. The RNN layer has 128 units, and MaxPooling3D is applied after each convolution.

In [27]:
# Conv3D + RNN Model with an additional Conv3D layer
model_conv3D_rnn_extra = Sequential()

# Conv3D layers
model_conv3D_rnn_extra.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same', 
                                  input_shape=(num_frames, img_height, img_width, 3)))
model_conv3D_rnn_extra.add(MaxPooling3D(pool_size=(2, 2, 2)))
model_conv3D_rnn_extra.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same'))
model_conv3D_rnn_extra.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Additional Conv3D layer
model_conv3D_rnn_extra.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same'))  # Extra layer
model_conv3D_rnn_extra.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Reshape for RNN layer
model_conv3D_rnn_extra.add(Reshape((-1, 16 * 16 * 64)))  # Adjust to new pooling result

# RNN layer
model_conv3D_rnn_extra.add(SimpleRNN(128, return_sequences=False))

# Fully connected layers
model_conv3D_rnn_extra.add(Dense(512, activation='relu'))
model_conv3D_rnn_extra.add(Dropout(0.5))
model_conv3D_rnn_extra.add(Dense(5, activation='softmax'))

# Compile the model
model_conv3D_rnn_extra.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint_rnn_extra = ModelCheckpoint(model_rnn_extra_filepath, monitor='val_categorical_accuracy', verbose=1, 
                                       save_best_only=True, save_weights_only=False, mode='max')

# EarlyStopping callback to stop training when val_categorical_accuracy plateaus
early_stopping_rnn_extra = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# ReduceLROnPlateau callback to reduce learning rate when validation loss plateaus
LR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_lr=1e-6)

# Add callbacks to the list
callbacks_rnn_extra_list = [checkpoint_rnn_extra, early_stopping_rnn_extra, LR]

# Train the model
model_conv3D_rnn_extra.fit(train_generator, 
                           steps_per_epoch=steps_per_epoch, 
                           epochs=num_epochs, 
                           verbose=1, 
                           callbacks=callbacks_rnn_extra_list, 
                           validation_data=val_generator, 
                           validation_steps=validation_steps)


Source path =  C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Project_data\train ; batch size = 32
Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - categorical_accuracy: 0.2293 - loss: 1.6375Source path =  C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Project_data\val ; batch size = 32

Epoch 1: val_categorical_accuracy improved from -inf to 0.24000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_RNN_Extra.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 6s/step - categorical_accuracy: 0.2307 - loss: 1.6357 - val_categorical_accuracy: 0.2400 - val_loss: 1.4376 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - categorical_accuracy: 0.4419 - loss: 1.3390
Epoch 2: val_categorical_accuracy improved from 0.24000 to 0.63000, saving model to C:\Users\Vinay Joshi\Docume

<keras.src.callbacks.history.History at 0x205ed330c10>

## Model 9
Adds an extra Conv3D layer (64 filters) to the Conv3D + LSTM architecture. The LSTM layer has 128 units, and MaxPooling3D is applied after each convolution.

In [28]:
# Conv3D + LSTM Model with an additional Conv3D layer
model_conv3D_lstm_extra = Sequential()

# Conv3D layers
model_conv3D_lstm_extra.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same', 
                                   input_shape=(num_frames, img_height, img_width, 3)))
model_conv3D_lstm_extra.add(MaxPooling3D(pool_size=(2, 2, 2)))
model_conv3D_lstm_extra.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same'))
model_conv3D_lstm_extra.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Additional Conv3D layer
model_conv3D_lstm_extra.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same'))  # Extra layer
model_conv3D_lstm_extra.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Reshape for LSTM layer
model_conv3D_lstm_extra.add(Reshape((-1, 16 * 16 * 64)))  # Adjust to new pooling result

# LSTM layer
model_conv3D_lstm_extra.add(LSTM(128, return_sequences=False))

# Fully connected layers
model_conv3D_lstm_extra.add(Dense(512, activation='relu'))
model_conv3D_lstm_extra.add(Dropout(0.5))
model_conv3D_lstm_extra.add(Dense(5, activation='softmax'))

# Compile the model
model_conv3D_lstm_extra.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint_lstm_extra = ModelCheckpoint(model_lstm_extra_filepath, monitor='val_categorical_accuracy', verbose=1, 
                                        save_best_only=True, save_weights_only=False, mode='max')

# EarlyStopping callback to stop training when val_categorical_accuracy plateaus
early_stopping_lstm_extra = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# ReduceLROnPlateau callback to reduce learning rate when validation loss plateaus
LR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_lr=1e-6)

# Add callbacks to the list
callbacks_lstm_extra_list = [checkpoint_lstm_extra, early_stopping_lstm_extra, LR]

# Train the model
model_conv3D_lstm_extra.fit(train_generator, 
                            steps_per_epoch=steps_per_epoch, 
                            epochs=num_epochs, 
                            verbose=1, 
                            callbacks=callbacks_lstm_extra_list, 
                            validation_data=val_generator, 
                            validation_steps=validation_steps)


Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - categorical_accuracy: 0.1935 - loss: 1.6228
Epoch 1: val_categorical_accuracy improved from -inf to 0.17000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_LSTM_Extra.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 6s/step - categorical_accuracy: 0.1942 - loss: 1.6223 - val_categorical_accuracy: 0.1700 - val_loss: 1.5801 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - categorical_accuracy: 0.2871 - loss: 1.5326
Epoch 2: val_categorical_accuracy improved from 0.17000 to 0.43000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_LSTM_Extra.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 6s/step - categorical_accuracy: 0.2895 - loss: 1.5293 - val_categorical_accuracy: 0.430

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 6s/step - categorical_accuracy: 1.0000 - loss: 0.0022 - val_categorical_accuracy: 0.8500 - val_loss: 0.7222 - learning_rate: 4.0000e-05
Epoch 19/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - categorical_accuracy: 1.0000 - loss: 0.0019
Epoch 19: val_categorical_accuracy did not improve from 0.88000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 6s/step - categorical_accuracy: 1.0000 - loss: 0.0019 - val_categorical_accuracy: 0.8500 - val_loss: 0.6915 - learning_rate: 4.0000e-05
Epoch 20/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - categorical_accuracy: 1.0000 - loss: 0.0020
Epoch 20: val_categorical_accuracy improved from 0.88000 to 0.91000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_LSTM_Extra.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 6s/step 

<keras.src.callbacks.history.History at 0x20603d54a00>

## Model 10
Extends the Conv3D + GRU model by adding another Conv3D layer (64 filters). The GRU layer has 128 units, and MaxPooling3D is applied after each convolution.

In [29]:
# Conv3D + GRU Model with an additional Conv3D layer
model_conv3D_gru_extra = Sequential()

# Conv3D layers
model_conv3D_gru_extra.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same', 
                                  input_shape=(num_frames, img_height, img_width, 3)))
model_conv3D_gru_extra.add(MaxPooling3D(pool_size=(2, 2, 2)))
model_conv3D_gru_extra.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same'))
model_conv3D_gru_extra.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Additional Conv3D layer
model_conv3D_gru_extra.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same'))  # Extra layer
model_conv3D_gru_extra.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Reshape for GRU layer
model_conv3D_gru_extra.add(Reshape((-1, 16 * 16 * 64)))  # Adjust to new pooling result

# GRU layer
model_conv3D_gru_extra.add(GRU(128, return_sequences=False))

# Fully connected layers
model_conv3D_gru_extra.add(Dense(512, activation='relu'))
model_conv3D_gru_extra.add(Dropout(0.5))
model_conv3D_gru_extra.add(Dense(5, activation='softmax'))

# Compile the model
model_conv3D_gru_extra.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# ModelCheckpoint callback to save the best model based on validation accuracy
checkpoint_gru_extra = ModelCheckpoint(model_gru_extra_filepath, monitor='val_categorical_accuracy', verbose=1, 
                                       save_best_only=True, save_weights_only=False, mode='max')

# EarlyStopping callback to stop training when val_categorical_accuracy plateaus
early_stopping_gru_extra = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# ReduceLROnPlateau callback to reduce learning rate when validation loss plateaus
LR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_lr=1e-6)

# Add callbacks to the list
callbacks_gru_extra_list = [checkpoint_gru_extra, early_stopping_gru_extra, LR]

# Train the model
model_conv3D_gru_extra.fit(train_generator, 
                           steps_per_epoch=steps_per_epoch, 
                           epochs=num_epochs, 
                           verbose=1, 
                           callbacks=callbacks_gru_extra_list, 
                           validation_data=val_generator, 
                           validation_steps=validation_steps)


Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - categorical_accuracy: 0.1991 - loss: 1.7352
Epoch 1: val_categorical_accuracy improved from -inf to 0.21000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_GRU_Extra.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 6s/step - categorical_accuracy: 0.1993 - loss: 1.7316 - val_categorical_accuracy: 0.2100 - val_loss: 1.5508 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - categorical_accuracy: 0.2701 - loss: 1.4911
Epoch 2: val_categorical_accuracy improved from 0.21000 to 0.39000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_GRU_Extra.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 6s/step - categorical_accuracy: 0.2727 - loss: 1.4887 - val_categorical_accuracy: 0.3900 

<keras.src.callbacks.history.History at 0x206037f1a50>

In [30]:
# Evaluate Conv3D + RNN Model with extra layer
evaluate_model(model_rnn_extra_filepath, 'Conv3D + RNN (Extra Layer)')

# Evaluate Conv3D + LSTM Model with extra layer
evaluate_model(model_lstm_extra_filepath, 'Conv3D + LSTM (Extra Layer)')

# Evaluate Conv3D + GRU Model with extra layer
evaluate_model(model_gru_extra_filepath, 'Conv3D + GRU (Extra Layer)')

# Print the summary in a table format using pandas
performance_summary_df = pd.DataFrame(model_accuracy_summary)
print("\nPerformance Summary of Conv3D Models with Extra Layers:\n")
print(performance_summary_df)



[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 3s/step - categorical_accuracy: 1.0000 - loss: 0.0035
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3s/step - categorical_accuracy: 0.9366 - loss: 0.3673 
Model: Conv3D + RNN (Extra Layer)
Train Accuracy: 1.0
Test Accuracy: 0.9200000166893005
Model Size: 26 MB
Test Inference Time: 13.1 sec
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 4s/step - categorical_accuracy: 1.0000 - loss: 0.0010
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - categorical_accuracy: 0.8886 - loss: 0.5859 
Model: Conv3D + LSTM (Extra Layer)
Train Accuracy: 1.0
Test Accuracy: 0.8700000047683716
Model Size: 99 MB
Test Inference Time: 16.5 sec
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 3s/step - categorical_accuracy: 0.9315 - loss: 0.2109
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - categorical_accuracy: 0.7922 - loss: 0.4607  
Model: Conv3D + GR

In the second phase of experimentation, I added an additional Conv3D layer to the Conv3D + RNN, Conv3D + LSTM, and Conv3D + GRU models (models 8, 9, and 10). The objective was to improve the accuracy without significantly increasing the model size or inference time.

Among these, the Conv3D + RNN (Extra Layer) model (Model 8) emerged as the best candidate. Although the Conv3D + LSTM (Extra Layer) and Conv3D + GRU (Extra Layer) models performed well in terms of accuracy, they led to a considerable increase in model size and inference time, making them less suitable for deployment in a resource-constrained environment like a webcam.

Conv3D + RNN (Extra Layer) provided a reasonable accuracy improvement with minimal increase in model size and inference time. This tradeoff made it the best choice for further optimization and real-time deployment, as it balanced performance with practical constraints.

## Model 11
An optimized Conv3D + RNN model with three Conv3D layers (32, 64, 32 filters) using L2 regularization and 128 SimpleRNN units. MaxPooling3D is applied after each convolution, with reduced filter sizes for efficiency.

In [32]:
# Optimized Conv3D + RNN Model with Dropout and L2 Regularization
model_conv3D_rnn_optimized = Sequential()

# Conv3D layers with reduced filters and L2 regularization
model_conv3D_rnn_optimized.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same', 
                                      kernel_regularizer=l2(0.001), 
                                      input_shape=(num_frames, img_height, img_width, 3)))
model_conv3D_rnn_optimized.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Second Conv3D layer
model_conv3D_rnn_optimized.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same',
                                      kernel_regularizer=l2(0.001)))
model_conv3D_rnn_optimized.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Additional Conv3D layer with reduced filters
model_conv3D_rnn_optimized.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same',
                                      kernel_regularizer=l2(0.001)))  # Reduced to 32 filters
model_conv3D_rnn_optimized.add(MaxPooling3D(pool_size=(2, 2, 2)))

# Reshape for RNN layer
model_conv3D_rnn_optimized.add(Reshape((-1, 16 * 16 * 32)))

# RNN layer with L2 regularization
model_conv3D_rnn_optimized.add(SimpleRNN(128, return_sequences=False, kernel_regularizer=l2(0.001)))

# Fully connected layers with Dropout and L2 regularization
model_conv3D_rnn_optimized.add(Dense(256, activation='relu', kernel_regularizer=l2(0.001)))
model_conv3D_rnn_optimized.add(Dropout(0.5))
model_conv3D_rnn_optimized.add(Dense(5, activation='softmax'))

# Compile the model
model_conv3D_rnn_optimized.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# Checkpoint for saving the best model
checkpoint_rnn_optimized = ModelCheckpoint(model_rnn_optimized1_filepath, monitor='val_categorical_accuracy', verbose=1, 
                                           save_best_only=True, save_weights_only=False, mode='max')

# EarlyStopping callback to stop training when val_categorical_accuracy plateaus
early_stopping_rnn_optimized = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# ReduceLROnPlateau callback to reduce learning rate when validation loss plateaus
LR = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=4, verbose=1, mode='auto', min_lr=1e-6)

# Train the model with early stopping
model_conv3D_rnn_optimized.fit(train_generator, 
                               steps_per_epoch=steps_per_epoch, 
                               epochs=num_epochs, 
                               verbose=1, 
                               callbacks=[checkpoint_rnn_optimized, early_stopping_rnn_optimized, LR], 
                               validation_data=val_generator, 
                               validation_steps=validation_steps)


Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - categorical_accuracy: 0.2512 - loss: 2.1867
Epoch 1: val_categorical_accuracy improved from -inf to 0.41000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_RNN_Optimized2.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 6s/step - categorical_accuracy: 0.2507 - loss: 2.1822 - val_categorical_accuracy: 0.4100 - val_loss: 1.8632 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - categorical_accuracy: 0.4228 - loss: 1.7958
Epoch 2: val_categorical_accuracy improved from 0.41000 to 0.46000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_RNN_Optimized2.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m121s[0m 6s/step - categorical_accuracy: 0.4240 - loss: 1.7921 - val_categorical_accurac

<keras.src.callbacks.history.History at 0x20630fecd00>

In [33]:
# Evaluate the optimized Conv3D + RNN Model
evaluate_model(model_rnn_optimized1_filepath, 'Conv3D + RNN (Optimized)1')

# Print the updated performance summary in a table format using pandas
performance_summary_df = pd.DataFrame(model_accuracy_summary)
print("\nUpdated Performance Summary with Conv3D + RNN (Optimized)1 Model:\n")
print(performance_summary_df)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 4s/step - categorical_accuracy: 0.9984 - loss: 0.2790
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3s/step - categorical_accuracy: 0.9047 - loss: 0.5970 
Model: Conv3D + RNN (Optimized)1
Train Accuracy: 0.9969834089279175
Test Accuracy: 0.8899999856948853
Model Size: 13 MB
Test Inference Time: 13.6 sec

Updated Performance Summary with Conv3D + RNN (Optimized)1 Model:

                          Model  Train Acc  Test Acc  Model Size (MB)  \
0                        Conv3D   1.000000      0.91               99   
1                  Conv2D + RNN   0.989442      0.75               12   
2                 Conv2D + LSTM   1.000000      0.75               48   
3                  Conv2D + GRU   0.996983      0.84               36   
4                  Conv3D + RNN   0.984917      0.82               13   
5                 Conv3D + LSTM   1.000000      0.88               49   
6                  Conv3D + GRU   1

## Model 12
A further optimized Conv3D + RNN model with two Conv3D layers (32, 64 filters) and 128 SimpleRNN units. Increased dropout (0.4) is added to reduce overfitting, and L2 regularization is applied.

In [34]:
# Define callbacks for learning rate reduction and early stopping
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, verbose=1, min_lr=1e-6)

early_stopping = EarlyStopping(monitor='val_categorical_accuracy', patience=5, verbose=1, restore_best_weights=True)

# Optimized Conv3D + RNN Model with Dropout and L2 Regularization
model_conv3D_rnn_optimized = Sequential()

# Conv3D layers with L2 regularization and increased dropout
model_conv3D_rnn_optimized.add(Conv3D(32, (3, 3, 3), activation='relu', padding='same', 
                                      kernel_regularizer=l2(0.001), 
                                      input_shape=(num_frames, img_height, img_width, 3)))
model_conv3D_rnn_optimized.add(MaxPooling3D(pool_size=(2, 2, 2)))
model_conv3D_rnn_optimized.add(Dropout(0.4))  # Increased dropout to prevent overfitting

# Second Conv3D layer
model_conv3D_rnn_optimized.add(Conv3D(64, (3, 3, 3), activation='relu', padding='same', kernel_regularizer=l2(0.001)))
model_conv3D_rnn_optimized.add(MaxPooling3D(pool_size=(2, 2, 2)))
model_conv3D_rnn_optimized.add(Dropout(0.4))

# Reshape for RNN layer
model_conv3D_rnn_optimized.add(Reshape((-1, 16 * 16 * 64)))

# RNN layer with L2 regularization
model_conv3D_rnn_optimized.add(SimpleRNN(128, return_sequences=False, kernel_regularizer=l2(0.001)))

# Fully connected layers with Dropout and L2 regularization
model_conv3D_rnn_optimized.add(Dense(256, activation='relu', kernel_regularizer=l2(0.001)))
model_conv3D_rnn_optimized.add(Dropout(0.5))
model_conv3D_rnn_optimized.add(Dense(5, activation='softmax'))

# Compile the model
model_conv3D_rnn_optimized.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

# Checkpoint to save the best model
checkpoint_rnn_optimized = ModelCheckpoint(model_rnn_optimized2_filepath, monitor='val_categorical_accuracy', verbose=1, 
                                           save_best_only=True, save_weights_only=False, mode='max')

# Train the model with additional callbacks for reducing learning rate and early stopping
model_conv3D_rnn_optimized.fit(train_generator, 
                               steps_per_epoch=steps_per_epoch, 
                               epochs=num_epochs, 
                               verbose=1, 
                               callbacks=[checkpoint_rnn_optimized, reduce_lr, early_stopping], 
                               validation_data=val_generator, 
                               validation_steps=validation_steps)


Epoch 1/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - categorical_accuracy: 0.2554 - loss: 2.2734
Epoch 1: val_categorical_accuracy improved from -inf to 0.19000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_RNN_Optimized1.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 6s/step - categorical_accuracy: 0.2546 - loss: 2.2720 - val_categorical_accuracy: 0.1900 - val_loss: 2.1025 - learning_rate: 0.0010
Epoch 2/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - categorical_accuracy: 0.1784 - loss: 2.1454
Epoch 2: val_categorical_accuracy improved from 0.19000 to 0.21000, saving model to C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_RNN_Optimized1.keras
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 6s/step - categorical_accuracy: 0.1797 - loss: 2.1444 - val_categorical_accurac

Epoch 19/30
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - categorical_accuracy: 1.0000 - loss: 0.3749
Epoch 19: val_categorical_accuracy did not improve from 0.84000
[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 6s/step - categorical_accuracy: 1.0000 - loss: 0.3747 - val_categorical_accuracy: 0.8200 - val_loss: 0.9882 - learning_rate: 2.0000e-04
Epoch 19: early stopping
Restoring model weights from the end of the best epoch: 14.


<keras.src.callbacks.history.History at 0x2065f4330a0>

In [35]:
# Evaluate the optimized Conv3D + RNN Model
evaluate_model(model_rnn_optimized2_filepath, 'Conv3D + RNN (Optimized)2')

# Print the updated performance summary in a table format using pandas
performance_summary_df = pd.DataFrame(model_accuracy_summary)
print("\nUpdated Performance Summary with Conv3D + RNN (Optimized)2 Model:\n")
print(performance_summary_df)

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 3s/step - categorical_accuracy: 0.9952 - loss: 0.4133
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2s/step - categorical_accuracy: 0.8434 - loss: 1.0602  
Model: Conv3D + RNN (Optimized)2
Train Accuracy: 0.9954751133918762
Test Accuracy: 0.8299999833106995
Model Size: 25 MB
Test Inference Time: 15.8 sec

Updated Performance Summary with Conv3D + RNN (Optimized)2 Model:

                          Model  Train Acc  Test Acc  Model Size (MB)  \
0                        Conv3D   1.000000      0.91               99   
1                  Conv2D + RNN   0.989442      0.75               12   
2                 Conv2D + LSTM   1.000000      0.75               48   
3                  Conv2D + GRU   0.996983      0.84               36   
4                  Conv3D + RNN   0.984917      0.82               13   
5                 Conv3D + LSTM   1.000000      0.88               49   
6                  Conv3D + GRU   

In [36]:
# Load the best model in .keras format (model no. 8)
best_model_rnn_extra_keras = load_model(model_rnn_extra_filepath)

# Save it as .h5 format
best_model_rnn_extra_keras.save(model_rnn_extra_filepath + '.h5')

print(f"Model saved in .h5 format at {model_rnn_extra_filepath + '.h5'}")



Model saved in .h5 format at C:\Users\Vinay Joshi\Documents\PGD AI\5. Gesture Recognition Project\Bestmodels\BestModelConv3D_RNN_Extra.keras.h5


In [37]:
# Load the saved model in .h5 format
best_model_rnn_extra_h5 = load_model(model_rnn_extra_filepath + '.h5')

# Evaluate on test data (validation generator used as a substitute for test data here)
test_loss, test_accuracy = best_model_rnn_extra_h5.evaluate(val_generator, steps=validation_steps, verbose=1)

# Print the test results
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")



[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3s/step - categorical_accuracy: 0.9149 - loss: 0.4913
Test Loss: 0.45276182889938354
Test Accuracy: 0.9200000166893005
