In [None]:
import numpy as np 
import pandas as pd 
import os
import json
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import Sequential, Model 
from tensorflow.keras.layers import Dense, LSTM, Input, Masking, Activation, Dropout, Concatenate
from tensorflow.keras.layers import Flatten, MultiHeadAttention, LayerNormalization
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.optimizers.experimental import AdamW
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from keras.utils import to_categorical
from keras.regularizers import l1, l2

In [None]:
## Run mode of the notebook; the cells below are gated on this value.
## It is reassigned further down ('inference_testing', 'submission') as the notebook progresses.
mode = 'training'
# mode = 'submission'
# mode = 'inference_testing'

In [None]:
## Load train.csv; it supplies the `path` and `sign` columns used by the cells below
TRAIN_CSV = '/kaggle/input/asl-signs/train.csv'

train_df = pd.read_csv(TRAIN_CSV)
train_df.head()

In [None]:
## Load the sign -> prediction-index map.
## The context manager closes the file handle as soon as the JSON has been parsed
## (the previous bare open() left it open for the lifetime of the kernel).

with open('/kaggle/input/asl-signs/sign_to_prediction_index_map.json') as json_file:
    sign_label = json.load(json_file)  ## dict: sign name -> integer class index
n_signs = len(sign_label)
n_signs  ## number of classes

In [None]:
## Inspect the landmark parquet file of a single example (first row of train_df)

first_example_path = train_df['path'][0]
pq_path = os.path.join('/kaggle/input/asl-signs', first_example_path)
pq_df = pd.read_parquet(pq_path)
pq_df

In [None]:
## Number of landmark rows per landmark type within one frame (frame 20)
frame_20_rows = pq_df.loc[pq_df['frame'] == 20]
frame_20_rows.groupby('type')['landmark_index'].count()

Order is face, left_hand, pose, right_hand

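Within each type, `landmark_index` restarts from 0. To select specific landmarks from the flattened 543-row frame, the per-type index must therefore be offset by the start of that type's block. The ranges below are half-open (the end index is excluded):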

0 - 468 --> face landmarks

468 - 489 --> left hand landmarks

489 - 522 --> pose landmarks

522 - 543 --> right hand landmarks

In [None]:
del pq_df  ## drop the exploratory dataframe; no visible cell below references it

As seen, most videos have between 1 and 300 frames; beyond 300 frames the counts are very low (roughly one video per frame count). The frame-count distribution peaks at around 50 frames.
Hence, `max_frames` is set to 30 (`MAX_LENGTH` in the code below).

### Understanding landmarks in MediaPipe Holistic data

In [None]:
pip install mediapipe

In [None]:
import mediapipe as mp

## Collect every face-mesh landmark index that appears in a lip connection.
## Each FACEMESH_LIPS element is an index pair; both ends are added to the set.
lips_indices = {
    landmark
    for connection in mp.solutions.face_mesh_connections.FACEMESH_LIPS
    for landmark in (connection[0], connection[1])
}

In [None]:
## np.array() applied directly to a set produces a 0-d object array that merely wraps the set;
## sorting into a list first gives a real 1-D integer array to inspect.
np.array(sorted(lips_indices))

In [None]:
## Flat row indices of the two hands inside the 543-row frame layout
left_hand_indices = list(range(468, 489))
right_hand_indices = list(range(522, 543))
np.array(left_hand_indices), np.array(right_hand_indices)

Of the face landmarks (indices 0 - 468), only the lip outline indices (`FACEMESH_LIPS`) are kept. Both hands' landmarks are kept in full. Pose landmarks are not used.

In [None]:
## Selected landmark rows, in this order: lips, then left hand, then right hand
reqd_keypoints = [*lips_indices, *left_hand_indices, *right_hand_indices]

In [None]:
ROWS_PER_FRAME = 543    ## number of landmark rows in every frame (all landmark types together)
MAX_LENGTH = 30  ## frames kept per video: create_dataset front-pads shorter clips and keeps the last 30 frames of longer ones
N_KEYPTS = len(reqd_keypoints)  ## 82
N_KEYPTS

## Creating Dataset for training and validation

In [None]:
## Same loader the competition uses at evaluation time; reused here for training
def load_relevant_data_subset(pq_path):
    """Read the x/y/z columns of one landmark parquet file as a float32 array.

    The flat table is reshaped to (n_frames, ROWS_PER_FRAME, 3), where n_frames
    is the table's row count divided by ROWS_PER_FRAME.
    """
    coord_columns = ['x', 'y', 'z']
    table = pd.read_parquet(pq_path, columns=coord_columns)
    frame_count = int(len(table) / ROWS_PER_FRAME)
    frames = table.values.reshape(frame_count, ROWS_PER_FRAME, len(coord_columns))
    return frames.astype(np.float32)

In [None]:
def create_dataset(file_path_list, labels_list):
    """Build fixed-length model inputs and integer labels from landmark files.

    Each file is loaded with load_relevant_data_subset, reduced to the rows in
    reqd_keypoints, has its NaNs replaced by 0 and is then forced to exactly
    MAX_LENGTH frames: shorter clips are zero-padded at the front, longer clips
    keep only their last MAX_LENGTH frames. The x, y and z coordinates are laid
    out as three consecutive groups of N_KEYPTS columns.

    Args:
        file_path_list: parquet paths relative to '/kaggle/input/asl-signs'.
        labels_list: sign names aligned with file_path_list; each one must be
            a key of sign_label.

    Returns:
        (x_data, y_data): x_data is float32 with shape
        (len(file_path_list), MAX_LENGTH, N_KEYPTS*3); y_data is an int64
        vector holding the class index of every sample.
    """
    n_samples = len(file_path_list)

    # Allocate the final float32 buffer directly. The previous float64 np.empty
    # buffer followed by an astype(float32) copy needed about three times the
    # memory of the final array. Zero-initialising also provides the front padding.
    x_data = np.zeros(shape=(n_samples, MAX_LENGTH, N_KEYPTS*3), dtype=np.float32)
    # np.int was removed in NumPy 1.24 (AttributeError); use an explicit int64.
    y_data = np.empty(n_samples, dtype=np.int64)

    for sample_idx in tqdm(range(n_samples)):
        pq_path = os.path.join('/kaggle/input/asl-signs', file_path_list[sample_idx])
        video_keypoints = load_relevant_data_subset(pq_path)  ## (n_frames, 543, 3)
        video_keypoints = video_keypoints[:, reqd_keypoints]   ## keep only the required keypoints (copy)
        video_keypoints[np.isnan(video_keypoints)] = 0        ## missing landmarks become 0

        # Keep at most the last MAX_LENGTH frames (a shorter clip is returned whole).
        video_keypoints = video_keypoints[-MAX_LENGTH:]
        n_frames = video_keypoints.shape[0]

        # (n_frames, N_KEYPTS, 3) -> (n_frames, N_KEYPTS*3): all x, then all y, then all z
        flat_frames = np.concatenate([video_keypoints[..., axis] for axis in range(3)], -1)

        # Write into the tail of the sample's slot so that any padding sits at the front.
        x_data[sample_idx, MAX_LENGTH - n_frames:] = flat_frames

        y_data[sample_idx] = sign_label[labels_list[sample_idx]]

    return x_data, y_data   ## every sample has exactly MAX_LENGTH frames

All inputs to the model must have the same shape, but the videos have different numbers of frames. Each video is therefore zero-padded (or truncated) to the same number of frames, so that all arrays have equal length.

In [None]:
## splitting data for training and validation: the full train_df is used, with 5% held out for validation
if mode == 'training':
    train_data, val_data = train_test_split(train_df, test_size=0.05, random_state=21)

In [None]:
# Build the fixed-length training arrays from the training split
if mode == 'training':
    train_paths = train_data['path'].tolist()
    train_signs = train_data['sign'].tolist()
    x_train, y_train = create_dataset(train_paths, train_signs)

In [None]:
# Build the fixed-length validation arrays from the validation split
if mode == 'training':
    val_paths = val_data['path'].tolist()
    val_signs = val_data['sign'].tolist()
    x_val, y_val = create_dataset(val_paths, val_signs)

In [None]:
## sanity check of the array shapes (these arrays only exist when the cells above ran in 'training' mode)
x_train.shape, y_train.shape, x_val.shape, y_val.shape

## Building Model and training

In [None]:
# Define model input shape: a variable number of frames, each with N_KEYPTS*3 features
input_shape = (None, N_KEYPTS*3)

# Define input layer
inputs = Input(shape=input_shape)

# Define the attention layer; `inputs` is passed as both query and value (self-attention)
transformer_layer = MultiHeadAttention(num_heads=8, key_dim=64, dropout=0.3)
transformer_output = transformer_layer(inputs, inputs)

# Residual connection (input + attention output) followed by layer normalization
transformer_output = LayerNormalization()(inputs + transformer_output)

# First LSTM layer; return_sequences=True passes the whole sequence on to the next LSTM
lstm_output1 = LSTM(128, return_sequences=True)(transformer_output)
dropout_output1 = Dropout(0.3)(lstm_output1)  ## dropout layer

# Second LSTM layer; returns only its final output
lstm_output2 = LSTM(128)(dropout_output1)
dropout_output2 = Dropout(0.3)(lstm_output2)  ## dropout layer

## Adding dense layers
## NOTE(review): no activation argument is passed to these two Dense layers - confirm this is intended
dense_output = Dense(64)(dropout_output2)
dense_output2 = Dense(32)(dense_output)

# Define output layer: softmax over the n_signs classes
output = Dense(n_signs, activation='softmax')(dense_output2)

# Create model
transformer_lstm_model = Model(inputs=inputs, outputs=output)

In [None]:
## Compile with Adam (learning rate 0.001) and sparse categorical cross-entropy on the integer labels

transformer_lstm_model.compile(
    optimizer=Adam(0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy', 'sparse_top_k_categorical_accuracy'],
)

In [None]:
## model summary: print the layers of the compiled model
transformer_lstm_model.summary()

In [None]:
# Create the folder that receives the checkpointed model weights.
# exist_ok=True makes the cell safe to re-run and replaces the manual
# "check, then create" branch (whose True-branch was a bare `None` statement).

folder_path = '/kaggle/working/Transformer_LSTM/'
os.makedirs(folder_path, exist_ok=True)

In [None]:
# Callbacks for training, all monitoring validation accuracy:
# early stopping, checkpointing of the best weights, and learning-rate reduction

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=10,
    restore_best_weights=True,
)

best_weights_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    filepath='/kaggle/working/Transformer_LSTM/weights_epoch_{epoch:02d}.hdf5',
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
)

reduce_lr_on_plateau = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.2,
    patience=5,
)

checkpoint_list = [early_stopping, best_weights_checkpoint, reduce_lr_on_plateau]

In [None]:
EPOCHS = 20  ## NOTE: EPOCHS is reassigned to 150 in the training cell further down, so 20 is not the value fit() receives
BATCH_SIZE = 256
STEPS_PER_EPOCH = len(train_data['path'])//BATCH_SIZE  ## number of full training batches (floor division ignores a partial last batch)
VAL_STEPS = len(val_data['path'])//BATCH_SIZE  ## number of full validation batches (floor division ignores a partial last batch)

In [None]:
## display the computed step counts
STEPS_PER_EPOCH, VAL_STEPS

In [None]:
EPOCHS = 150

if mode == 'training':
    # steps_per_epoch / validation_steps are deliberately not passed: with in-memory
    # arrays Keras derives them from the data and batch_size and includes the final
    # partial batch. The previous floor-divided validation_steps stopped evaluation
    # before that last batch, so the tail of the validation set never contributed to
    # val_accuracy, which is the metric all three callbacks monitor.
    history = transformer_lstm_model.fit(
        x_train, y_train,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        validation_data=(x_val, y_val),
        callbacks=checkpoint_list,
    )

In [None]:
## Training curves: accuracy on the left panel, loss on the right panel
if mode == "training":

    fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    panels = zip(ax, ('accuracy', 'loss'), ('Accuracy', 'Loss'))
    for panel, metric, y_label in panels:
        # validation curve first, then the training curve (same draw order as before)
        panel.plot(history.history['val_' + metric], label='val_' + metric)
        panel.plot(history.history[metric], label=metric)
        panel.set_xlabel('Epoch')
        panel.set_ylabel(y_label)
        panel.legend()

    plt.show()

In [None]:
## Release the large training/validation arrays once training is finished
if mode == "training":
    del x_train, y_train, x_val, y_val

## Converting model to format as required by competition

#### Below code referenced from JESSE VAN DER LINDEN notebook for Google - Isolated Sign Language Recognition competition 

In [None]:
## At inference time the model receives one video as an array of shape (n_frames, 543, 3),
## whereas the trained model takes input of shape (batch, frames, N_KEYPTS*3).
## This wrapper therefore adds an input layer plus the training-time preprocessing
## in front of the trained model.

def model_for_submission(model):
    """Wrap a trained model so that it accepts raw (n_frames, ROWS_PER_FRAME, 3) landmarks.

    The wrapper repeats the preprocessing of create_dataset inside the graph:
    keypoint selection, NaN -> 0, x/y/z flattening and forcing the clip to
    exactly MAX_LENGTH frames (zero-padded at the front, or cut to the last
    MAX_LENGTH frames). Previously the clip was passed on at its full length,
    although training only ever used MAX_LENGTH-frame inputs.

    Args:
        model: the trained Keras model taking (batch, frames, N_KEYPTS*3).

    Returns:
        A compiled Keras model whose input is named "inputs" and whose output
        is named "outputs".
    """
    ## the Keras batch axis of this input plays the role of the frame axis
    input_layer = Input(shape=(ROWS_PER_FRAME, 3), name="inputs")  ## added input layer

    ## keeping only required keypoints
    processed_input = tf.gather(input_layer, reqd_keypoints, axis=1)

    ## if data has nan replacing that with 0
    processed_input = tf.where(tf.math.is_nan(processed_input), tf.zeros_like(processed_input), processed_input)

    ## flatten x, y, z data -> (n_frames, N_KEYPTS*3)
    processed_input = tf.concat([processed_input[...,i] for i in range(3)], -1)

    ## force exactly MAX_LENGTH frames, as in create_dataset: prepend MAX_LENGTH zero
    ## frames, then keep the last MAX_LENGTH rows. Short clips end up front-padded,
    ## long clips keep only their final MAX_LENGTH frames.
    processed_input = tf.pad(processed_input, [[MAX_LENGTH, 0], [0, 0]])
    processed_input = processed_input[-MAX_LENGTH:]

    ## changing shape of array to (1, MAX_LENGTH, N_KEYPTS*3)
    processed_input = tf.expand_dims(processed_input,0)

    ## calling trained model
    trained_model = model(processed_input)

    ## adding final layer: an identity activation whose only job is to name the output "outputs"
    output_layer = Activation('linear', name='outputs')(trained_model)

    ## getting model
    final_model = Model(inputs=input_layer, outputs=output_layer)

    ## compiling model
    final_model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return final_model

In [None]:
mode = 'inference_testing'  ## switch modes: the cells below build and export the submission model

In [None]:
## wrap the trained model with the inference-time preprocessing and show its structure
if mode=='inference_testing':
    submission_model = model_for_submission(transformer_lstm_model)
    submission_model.summary(expand_nested=True)

In [None]:
## Convert the submission model to TFLite and write the result to the working directory

if mode=='inference_testing':
    tflite_output_path = '/kaggle/working/transformer_lstm_model.tflite'

    converter = tf.lite.TFLiteConverter.from_keras_model(submission_model)
    tflite_model = converter.convert()

    with open(tflite_output_path, 'wb') as f:
        f.write(tflite_model)

## Submitting the tflite file

In [None]:
mode = 'submission'  ## switch modes: the next cell packages the TFLite file

In [None]:
## package the TFLite model into submission.zip
## NOTE(review): the relative paths assume the notebook's working directory is /kaggle/working - confirm
if mode=='submission':
    !zip submission.zip transformer_lstm_model.tflite

### Submission Code

In [None]:
mode='inference_testing'  ## switch back: the cells below run a local inference check of the TFLite model

In [None]:
!pip install tflite-runtime==2.9.1

In [None]:
## Build a small local test set in the form the competition supplies it:
## one raw (n_frames, 543, 3) array per video.
if mode=='inference_testing':
    ## Sample directly from the held-out validation split. The previous code drew
    ## positions from range(len(val_data)) but used them to index train_df, so the
    ## "test" clips were simply the first rows of train_df - mostly training data.
    ## Sampling is without replacement and seeded, so the check is reproducible.
    test_sample = val_data.sample(n=min(100, len(val_data)), random_state=21)

    ## Clips have different frame counts, so they are kept in a plain list:
    ## np.array() on ragged arrays raises on NumPy >= 1.24.
    test_videos = [
        load_relevant_data_subset(os.path.join('/kaggle/input/asl-signs', path))
        for path in test_sample['path']
    ]
    test_labels = list(test_sample['sign'])

In [None]:
def number_to_sign(number):
    """Return the sign name whose index in sign_label equals number.

    The first matching key in dictionary order is returned; the result is
    None when no entry has that index.
    """
    matches = (sign for sign, index in sign_label.items() if index == number)
    return next(matches, None)

In [None]:
import time
## using the same inference code as given in the competition
import tflite_runtime.interpreter as tflite

if mode=='inference_testing':
    interpreter = tflite.Interpreter('/kaggle/working/transformer_lstm_model.tflite')

    ## fail early, with a readable message, when the exported model lacks the expected signature
    found_signatures = list(interpreter.get_signature_list().keys())
    if "serving_default" not in found_signatures:
        raise RuntimeError(f'Required input signature not found; available signatures: {found_signatures}')

    prediction_fn = interpreter.get_signature_runner("serving_default")

    all_time = []
    correct_count = 0
    for i in range(len(test_videos)):
        ## perf_counter is a monotonic, high-resolution clock meant for measuring short durations
        start_time = time.perf_counter()
        output = prediction_fn(inputs=test_videos[i])
        end_time = time.perf_counter()
        inference_time = end_time - start_time
        all_time.append(inference_time)
        pred_n = np.argmax(output["outputs"])
        predicted_sign = number_to_sign(pred_n)
        ## the true label comes from test_labels (aligned with test_videos); previously the
        ## two labels were swapped and the "true" value was read from an unrelated train_df row
        print(f"True: {test_labels[i]} \t Predicted: {predicted_sign}")
        if predicted_sign == test_labels[i]:
            correct_count += 1

    ## the summary lives inside the mode guard: correct_count only exists in this mode
    n_videos = len(test_videos)
    print(f'The number of signs predicted correctly were {correct_count} out of total {n_videos}.')
    print()
    if n_videos > 0:
        print(f'The Accuracy is {(correct_count/n_videos):.2%}.')

In [None]:
## Report the mean per-video latency recorded above and the size of the exported TFLite file
mean_inference_time = np.mean(all_time)
print(f'Average inference time: {mean_inference_time:.4f}')

model_size = os.path.getsize('/kaggle/working/transformer_lstm_model.tflite')
size_in_mb = model_size/(1024**2)
print(f'model size: {size_in_mb:.3f} MB')