[Sign language competition:](https://www.kaggle.com/competitions/asl-signs)
* `Landmark Data`: A set of labeled landmark data extracted from raw videos using the MediaPipe Holistic Solution. This dataset is used to train machine learning models for isolated American Sign Language recognition in the competition.
  * A landmark is one of a set of points on an object (here: face, hands, and body pose) that are used to determine its shape, orientation, and location in space.
* `TensorFlow Lite`: A lightweight and cross-platform framework for deploying machine learning models on mobile and embedded devices. It enables on-device machine learning inference with low latency and a small binary size.

Reference


* https://www.kaggle.com/code/dschettler8845/gislr-learn-eda-baseline#introduction
  * ASL is a visual-gestural language, meaning that it uses facial expressions, body language, and hand movements to convey meaning.
  * ISOLATED SIGN LANGUAGE RECOGNITION: the process of recognizing sign language gestures performed by a person in isolation, without considering the context or the surrounding gestures.


# 1 Setup

## 1.1 Import package

In [None]:
! pip install pandarallel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandarallel
  Downloading pandarallel-1.6.4.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill>=0.3.1
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.6.4-py3-none-any.whl size=16677 sha256=a9ad59a6dc79f1f4d140499af8fcf64ac5757f6ed7da168eecd75197d60f3e50
  Stored in directory: /root/.cache/pip/wheels/41/01/29/deaa71fe596f8d857e57c4fb388db8861e23e6ed0b03204dcb
Successfully built pandarallel
Installing collected packages: dill, pandarallel
Successfully installed dill-0.3.6 pandarallel-1.6.4


In [None]:
! pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from glob import glob
import os

import json
import pandas as pd
import numpy as np
from pandarallel import pandarallel
from tqdm.notebook import tqdm; tqdm.pandas();

import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GroupShuffleSplit 

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

import tensorflow as tf

In [None]:
# configuration

seed = 2023
train_test_ratio = 0.2

## 1.2 Load data from Kaggle

In [None]:
# import kaggle data
'''
step 1: download token from kaggle
step 2: click "join" competition
step 3: upload kaggle.json inside file
'''

! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 /root/.kaggle/kaggle.json

In [None]:
# download the file (takes time)
! kaggle competitions download -c asl-signs

Downloading asl-signs.zip to /content
100% 37.4G/37.4G [05:07<00:00, 192MB/s]
100% 37.4G/37.4G [05:07<00:00, 130MB/s]


In [None]:
# unzip selected training files
%%capture

! unzip asl-signs.zip train_landmark_files/16069/*.parquet
! unzip asl-signs.zip train_landmark_files/18796/*.parquet
! unzip asl-signs.zip train_landmark_files/2044/*.parquet
! unzip asl-signs.zip train_landmark_files/22343/*.parquet
! unzip asl-signs.zip train_landmark_files/25571/*.parquet

In [None]:
! unzip asl-signs.zip sign_to_prediction_index_map.json

Archive:  asl-signs.zip
  inflating: sign_to_prediction_index_map.json  


In [None]:
! unzip asl-signs.zip train.csv

Archive:  asl-signs.zip
  inflating: train.csv               


## 1.3 Load data files
There are 3 input files:
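* `train.csv`: one row per recorded sign sequence (`path` to its parquet file, `participant_id`, `sequence_id`, and the `sign` label)
* `*.parquet`: one file per sequence, with one row per frame and landmark (`frame`, `type`, `landmark_index`, and the `x`, `y`, `z` coordinates)
* `sign_to_prediction_index_map.json`: maps each sign (vocab) to its integer index (for model training)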

In [None]:
# 1.3.1 load train.csv
# `error_bad_lines=False` is deprecated (pandas >= 1.3) and removed in pandas 2.0;
# `on_bad_lines='skip'` is the documented replacement with the same effect.
train_df = pd.read_csv('train.csv', on_bad_lines='skip')
print('Original shape: ', train_df.shape)

# keep path to selected files
file_list = ['train_landmark_files/16069', 'train_landmark_files/18796', 'train_landmark_files/2044', 'train_landmark_files/22343', 'train_landmark_files/25571']
# Trailing '/' so that a folder such as '.../2044' cannot also match a longer id like '.../20441'.
folder_pattern = '|'.join(f'{folder}/' for folder in file_list)
# .copy(): make the subset an independent frame, so columns added to it later
# (e.g. `sign_ord`) do not trigger pandas' SettingWithCopyWarning.
train_df = train_df[train_df['path'].str.contains(folder_pattern)].copy()
print('Subset shape', train_df.shape)



  train_df = pd.read_csv('train.csv',error_bad_lines=False)


Original shape:  (94477, 4)
Subset shape (21702, 4)


In [27]:
# 1.3.2 load sign name to index map json
def read_json_file(file_path):
    """
    Parse the JSON document stored at `file_path` and return the resulting Python object.
    """
    # The context manager closes the file as soon as parsing is done
    with open(file_path, 'r') as handle:
        return json.load(handle)
        

# Parse the sign -> prediction-index file once and derive both lookup directions from it.
_sign_to_index = read_json_file("sign_to_prediction_index_map.json")

# encoder: sign name (matched case-insensitively) -> prediction index
s2p_map = {k.lower():v for k,v in _sign_to_index.items()}

def encoder(x):
    """Return the prediction index for sign name `x` (case-insensitive), or None if `x` is not in the map."""
    return s2p_map.get(x.lower())

# decoder: prediction index -> sign name, spelled exactly as in the JSON file
p2s_map = {v:k for k,v in _sign_to_index.items()}

def decoder(x):
    """Return the sign name for prediction index `x`, or None if `x` is not in the map."""
    return p2s_map.get(x)

In [28]:
# apply encoder
train_df['sign_ord'] = train_df['sign'].apply(encoder)

# 2 EDA

## 2.1 train_df EDA

In [None]:
train_df

Unnamed: 0,path,participant_id,sequence_id,sign
2,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud
3,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird
12,train_landmark_files/22343/1000638205.parquet,22343,1000638205,puzzle
20,train_landmark_files/22343/1001223069.parquet,22343,1001223069,not
25,train_landmark_files/18796/1001373962.parquet,18796,1001373962,have
...,...,...,...,...
94442,train_landmark_files/2044/998713046.parquet,2044,998713046,fine
94450,train_landmark_files/22343/999026579.parquet,22343,999026579,face
94463,train_landmark_files/22343/999330838.parquet,22343,999330838,pretty
94471,train_landmark_files/25571/999740509.parquet,25571,999740509,scissors


In [25]:
# 2.1.1 TARGET: `sign` column
display(train_df["sign"].describe().to_frame().T)

# Signs ordered from most to least frequent; plotly expects a plain list here
sign_order = train_df["sign"].value_counts().index.tolist()

fig = px.histogram(train_df
                   , y="sign"
                   , color="sign"
                   , orientation="h"
                   , height=5000
                   , labels={"sign":"<b>Sign (label)</b>", "count":"<b>Total Row Count</b>"}
                   , title="<b>Row Counts by Sign (label)</b>"
                   , category_orders={"sign": sign_order}
)
# Horizontal bars: the y axis lists the signs, the x axis carries the row counts
fig.update_yaxes(title_text="<b>Sign (label)</b>")
fig.update_xaxes(title_text="<b>Total Row Count</b>")
fig.update_layout(showlegend=False)
fig.show()

Unnamed: 0,count,unique,top,freq
sign,21702,250,duck,105


## 2.2 Parquet EDA

### 2.2.1. Full dataset EDA
* build `extended_train_df`: `train_df` extended with per-sequence metadata extracted from the parquet files

In [None]:
# 2.2.1 body type missing value

# build meta feature
def get_seq_meta(row):
    """
    Add per-sequence metadata, read from the row's parquet file, to one `train_df` row.

    `row` must provide a 'path' entry; the landmark table is loaded through
    `get_sign_df` with the y axis inverted. The row is extended in place with the
    frame range, the row/NaN counts per body part and the coordinate extremes,
    and then returned.
    """
    landmarks = get_sign_df(row['path'], invert_y=True)

    # rows per body part, and how many of those rows have a missing x value
    rows_per_type = landmarks['type'].value_counts(dropna=False).to_dict()
    missing_x_per_type = landmarks.groupby("type")["x"].apply(lambda col: col.isna().sum())

    # frame range covered by the sequence
    frames = landmarks['frame']
    row['start_frame'] = frames.min()
    row['end_frame'] = frames.max()
    row['total_frames'] = frames.nunique()

    # counts per body part (a body part missing from the file raises KeyError)
    for body_part in ("face", "pose", "left_hand", "right_hand"):
        row[f'{body_part}_count'] = rows_per_type[body_part]
        row[f'{body_part}_nan_count'] = missing_x_per_type[body_part]

    # smallest / largest coordinate on each axis
    for axis in 'xyz':
        values = landmarks[axis]
        row[f'{axis}_min'] = values.min()
        row[f'{axis}_max'] = values.max()

    return row

In [None]:
# Build the per-sequence metadata table once and reuse the cached CSV afterwards,
# so that `extended_train_df` exists after "Restart & Run All" without recomputing
# the expensive pass over all parquet files.
EXTENDED_TRAIN_CSV = 'extended_train_df.csv'

if os.path.exists(EXTENDED_TRAIN_CSV):
    extended_train_df = pd.read_csv(EXTENDED_TRAIN_CSV)
else:
    # get_seq_meta() calls get_sign_df(), which this notebook defines further down
    # (section "Sample parquet EDA"); that cell has to be run before this branch.
    pandarallel.initialize(progress_bar=True)
    extended_train_df = train_df.parallel_apply(get_seq_meta, axis=1)
    extended_train_df.to_csv(EXTENDED_TRAIN_CSV)

In [None]:
# %cd /content/drive/MyDrive/NUS_MSBA/BT5153/group project
# extended_train_df = pd.read_csv('extended_train_df.csv')

/content/drive/MyDrive/NUS_MSBA/BT5153/group project


In [None]:
extended_train_df

Unnamed: 0.1,Unnamed: 0,path,participant_id,sequence_id,sign,start_frame,end_frame,total_frames,face_count,face_nan_count,...,left_hand_count,left_hand_nan_count,right_hand_count,right_hand_nan_count,x_min,x_max,y_min,y_max,z_min,z_max
0,2,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud,103,207,105,49140,0,...,2205,1617,2205,2205,-0.042923,1.197836,-2.591290,-0.248094,-2.838325,1.587503
1,3,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird,17,28,12,5616,0,...,252,252,252,0,-0.129268,1.156573,-2.294936,-0.310272,-3.018237,2.196296
2,12,train_landmark_files/22343/1000638205.parquet,22343,1000638205,puzzle,13,31,19,8892,0,...,399,168,399,399,-0.239978,0.986260,-2.137023,-0.218943,-3.229496,1.410132
3,20,train_landmark_files/22343/1001223069.parquet,22343,1001223069,not,25,42,18,8424,0,...,378,252,378,378,-0.178844,1.025033,-2.393715,-0.200144,-2.494985,1.714197
4,25,train_landmark_files/18796/1001373962.parquet,18796,1001373962,have,24,29,6,2808,0,...,126,126,126,21,-0.069743,0.948946,-2.336824,-0.260548,-3.078757,1.713525
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21697,94442,train_landmark_files/2044/998713046.parquet,2044,998713046,fine,20,44,25,11700,0,...,525,525,525,126,-0.037728,1.044223,-2.452121,-0.266636,-3.486691,2.342996
21698,94450,train_landmark_files/22343/999026579.parquet,22343,999026579,face,20,51,32,14976,0,...,672,126,672,672,-0.036662,1.065053,-2.470449,-0.244516,-2.409825,1.835042
21699,94463,train_landmark_files/22343/999330838.parquet,22343,999330838,pretty,65,254,190,88920,0,...,3990,1827,3990,3990,-0.213252,1.141706,-2.253999,-0.217906,-2.910883,1.890073
21700,94471,train_landmark_files/25571/999740509.parquet,25571,999740509,scissors,41,199,159,74412,0,...,3339,3339,3339,2625,-0.334650,1.004044,-2.466989,-0.270005,-3.387433,2.359299


In [None]:
# Compute, per body part, which share of the expected landmark rows is present / NaN
# (values are fractions of total_frames * keypoints, not multiplied by 100)
type_kp_map = dict(face=468, left_hand=21, pose=33, right_hand=21)

for body_part, n_keypoints in tqdm(type_kp_map.items(), desc="Computing percentages"):
    # rows a sequence would have for this body part if every frame held every keypoint
    expected_rows = extended_train_df["total_frames"] * n_keypoints
    extended_train_df[f"{body_part}_appears_pct"] = extended_train_df[f"{body_part}_count"] / expected_rows
    extended_train_df[f"{body_part}_nan_pct"] = extended_train_df[f"{body_part}_nan_count"] / expected_rows

# Plot the frequency (log scale) of the number of samples in the dataset that have a given percentage of NaN values v.s. percentage of data points that are NaN
def title_map_fn(ann):
    """
    Replace a facet annotation's text with a readable title.

    The first character of `ann.text` is dropped and the remainder is looked up
    as a column name; names without an entry set the text to None.
    """
    body_part_labels = (
        ('face_nan_pct', 'Face'),
        ('left_hand_nan_pct', 'Left Hand'),
        ('pose_nan_pct', 'Pose'),
        ('right_hand_nan_pct', 'Right Hand'),
    )
    facet_titles = {
        column: f'<b>Percentage Of <i>{label}</i> Data Points That Are NaN</b>'
        for column, label in body_part_labels
    }
    column_name = ann.text[1:]
    ann.text = facet_titles.get(column_name)
    
# One facet per body part, two facets per row, log-scaled frequency axis
nan_pct_columns = ["face_nan_pct", "left_hand_nan_pct", "pose_nan_pct", "right_hand_nan_pct"]
axis_labels = {
    'variable': '',
    'count': '<b>Frequency (LOG)</b>',
    'value': "<b>Percentage of Points That Are NaN</b>",
}

fig = px.histogram(
    extended_train_df,
    nan_pct_columns,
    height=750,
    labels=axis_labels,
    log_y=True,
    facet_col='variable',
    nbins=20,
    opacity=0.75,
    facet_col_wrap=2,
    facet_col_spacing=0.05,
)
# y title only on the first facet column
fig.update_yaxes(title_text='<b>Frequency (LOG)</b>', col=1)
# swap the generated facet captions for readable titles
fig.for_each_annotation(title_map_fn)
fig.update_layout(showlegend=False)
fig.show()

### 2.2.2 Sample parquet EDA

In [None]:
# read a parquet file from a selected row in train.csv
def get_sign_df(pq_path, invert_y=True):
    """
    Load the landmark table of one sequence from the parquet file at `pq_path`.

    With `invert_y` (the default) the sign of the `y` column is flipped before
    the frame is returned; the original note attributes this to the video being
    flipped.
    """
    landmarks = pd.read_parquet(pq_path)

    # nothing to adjust when the caller wants the raw y values
    if not invert_y:
        return landmarks

    landmarks["y"] = landmarks["y"] * -1
    return landmarks

'''
Data structure:
- frame: each picture in the video
- row_id: each tracing point on body parts
- x, y, z: coordinate -- ignore z because mediaPipe model is not fully trained to predict depth
'''

selected_row = 100
demo_sign_df = get_sign_df(train_df.iloc[selected_row]["path"])
demo_sign_df

Unnamed: 0,frame,row_id,type,landmark_index,x,y,z
0,22,22-face-0,face,0,0.482337,-0.440157,-0.041068
1,22,22-face-1,face,1,0.486704,-0.407177,-0.057901
2,22,22-face-2,face,2,0.484998,-0.420724,-0.035808
3,22,22-face-3,face,3,0.470767,-0.380886,-0.037523
4,22,22-face-4,face,4,0.485586,-0.397479,-0.059235
...,...,...,...,...,...,...,...
8683,37,37-right_hand-16,right_hand,16,,,
8684,37,37-right_hand-17,right_hand,17,,,
8685,37,37-right_hand-18,right_hand,18,,,
8686,37,37-right_hand-19,right_hand,19,,,


In [None]:
# EDA on selected demo parque
print("Unique body parts: ", demo_sign_df['type'].unique())
demo_graph = demo_sign_df[['frame', 'type', 'x']].groupby(by=['frame', 'type']).count().reset_index()
demo_graph

Unique body parts:  ['face' 'left_hand' 'pose' 'right_hand']


Unnamed: 0,frame,type,x
0,22,face,468
1,22,left_hand,21
2,22,pose,33
3,22,right_hand,0
4,23,face,468
...,...,...,...
59,36,right_hand,0
60,37,face,468
61,37,left_hand,21
62,37,pose,33


In [None]:
# graph
fig = px.line(demo_graph, x='frame', y='x', color='type', hover_data=['frame', 'type', 'x'])

# Add a title to the plot
fig.update_layout(title='Number of keypoints (X) by frame and type')

fig.show()

# Written as comments rather than a trailing string literal, which the notebook
# would echo as raw cell output.
#
# For each part of the body ('type') we have the following keypoint counts:
# * `Right Hand` --> 21 Keypoints
# * `Left Hand` --> 21 Keypoints (some can be missing, indicating that it is not a dominant hand)
# * `Pose` --> 33 Keypoints
# * `Face` --> 468 Keypoints

"\nFor each part of the body ('type') we have the following keypoint counts:\n* `Right Hand` --> 21 Keypoints\n* `Left Hand` --> 21 Keypoints (some can be missing, indicating that it is not a dominant hand)\n* `Pose` --> 33 Keypoints\n* `Face` --> 468 Keypoints\n"

# 3 Baseline Model

## 3.1 Feature processing
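* Baseline model: the 21 landmarks of the dominant hand only (left or right, whichever has more non-NaN values; a dominant right hand is mirrored onto the left)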

In [None]:
# Configuration

# Landmark indices in original data (543 landmarks per frame)
LEFT_HAND_IDXS0 = np.arange(468,489)
RIGHT_HAND_IDXS0 = np.arange(522,543)

# Concatenated landmark indices
HAND_IDXS0 = np.concatenate((LEFT_HAND_IDXS0, RIGHT_HAND_IDXS0), axis=0)

# Landmark indices in processed data.
# The baseline keeps only the dominant hand, so a processed frame has 21 columns
# and the kept hand always sits at positions 0..20, whichever side was dominant.
# (np.argwhere on the raw index values returned these positions only because
# every value is non-zero, and it gave HAND_IDXS 42 entries for 21 columns.)
LEFT_HAND_IDXS = np.arange(LEFT_HAND_IDXS0.size)
RIGHT_HAND_IDXS = np.arange(RIGHT_HAND_IDXS0.size)
HAND_IDXS = np.arange(LEFT_HAND_IDXS0.size)

In [None]:
# define a function to load data
'''
output: n_frames, keypoints, [x,y]
'''

KEYPOINTS_PER_FRAME = 543

def load_relevant_data_subset(pq_path, data_columns=('x', 'y')):
    """
    Load the coordinate columns of one landmark parquet file as a float32 array.

    Parameters
    ----------
    pq_path : path of the parquet file to read.
    data_columns : coordinate columns to load; defaults to x and y
        (the original note calls z irrelevant for the baseline).

    Returns
    -------
    np.ndarray of dtype float32 and shape
    (n_frames, KEYPOINTS_PER_FRAME, len(data_columns)).

    Raises
    ------
    ValueError
        If the number of rows is not a multiple of KEYPOINTS_PER_FRAME.
    """
    columns = list(data_columns)
    data = pd.read_parquet(pq_path, columns=columns)
    # integer arithmetic instead of int(len / n), and an explicit message instead
    # of the generic reshape error when a file holds an incomplete frame
    n_frames, leftover_rows = divmod(len(data), KEYPOINTS_PER_FRAME)
    if leftover_rows:
        raise ValueError(
            f'{pq_path}: {len(data)} rows is not a multiple of {KEYPOINTS_PER_FRAME} keypoints per frame'
        )
    data = data.values.reshape(n_frames, KEYPOINTS_PER_FRAME, len(columns))
    return data.astype(np.float32)

In [None]:
N_ROWS = 543 # landmarks per frame in the raw data
N_DIMS = 2 # coordinates kept per landmark
DIM_NAMES = ['x', 'y']
INPUT_SIZE = 64 # TBC - number of frames every sequence is padded / pooled to, to prevent video to be too long
N_COLS = 21 # landmarks kept per frame: the 21 landmarks of the dominant hand

# Tensorflow layer to process data in TFLite
# input: (n_frames, number of keypoints, [x,y] coordinates) -- the first axis is the frame axis of ONE sequence, not a batch
# output1: (input_size, dominant side keypoints, [x,y] coordinates)
# output2: (input_size) -- frame index of every output step, -1 where the sequence was padded
class PreprocessLayer(tf.keras.layers.Layer):
    """
    Reduce one raw landmark sequence to a fixed-length sequence of dominant-hand landmarks.

    `call` takes a float32 tensor of shape [n_frames, N_ROWS, N_DIMS] and returns
    a pair:
      * data of shape [INPUT_SIZE, N_COLS, N_DIMS] with NaN replaced by 0
      * the (float32) frame indices belonging to each output step; padded steps
        of short sequences are marked with -1

    NOTE(review): INPUT_SIZE, N_COLS, N_DIMS and the *_IDXS0 arrays are notebook
    globals. N_ROWS/N_DIMS in the input signature are read when this class body
    is executed; the others are read inside `call`. A later section of this
    notebook reassigns N_COLS and N_DIMS, so running cells out of order can
    change what this layer sees.
    """
    def __init__(self):
        super(PreprocessLayer, self).__init__()

        # tbc for model *******************************************************
        # Per-landmark offsets, one row for x and one for y; transposed below to
        # [landmark, coordinate]. `call` uses them to mirror a dominant right hand.
        normalisation_correction = tf.constant([
                    # x offset of 0.50 for every hand landmark (non-zero => coordinate gets mirrored)
                    [0.50] * len(LEFT_HAND_IDXS),
                    # Y coordinates stay intact
                    [0] * len(LEFT_HAND_IDXS),
                ],
                dtype=tf.float32,
            )
        self.normalisation_correction = tf.transpose(normalisation_correction, [1,0])
        # *********************************************************************

    def pad_edge(self, t, repeats, side):
        """
        Pad `t` along axis 0 by repeating its first ('LEFT') or last ('RIGHT') entry `repeats` times.

        Any other `side` value falls through and returns None.
        """
        if side == 'LEFT':
            return tf.concat((tf.repeat(t[:1], repeats=repeats, axis=0), t), axis=0)
        elif side == 'RIGHT':
            return tf.concat((t, tf.repeat(t[-1:], repeats=repeats, axis=0)), axis=0)

    @tf.function(
        input_signature=(tf.TensorSpec(shape=[None,N_ROWS,N_DIMS], dtype=tf.float32),),
    )
    def call(self, data0):
        """
        Select the dominant hand, drop frames without it and resize to INPUT_SIZE frames.

        data0: float32 tensor [n_frames, N_ROWS, N_DIMS].
        Returns (data, non_empty_frames_idxs) as described on the class.
        """
        # Number of Frames in Video
        N_FRAMES0 = tf.shape(data0)[0]

        # Find dominant hand by comparing the number of non-NaN values of each hand
        # over the whole sequence; ties go to the left hand
        left_hand_sum = tf.math.reduce_sum(tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1))
        right_hand_sum = tf.math.reduce_sum(tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1))
        left_dominant = left_hand_sum >= right_hand_sum

        # Count non NaN Hand values in each frame for the dominant hand
        if left_dominant:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                    tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1),
                    axis=[1, 2],
                )
        else:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                    tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1),
                    axis=[1, 2],
                )

        # Find frames indices with coordinates of dominant hand
        non_empty_frames_idxs = tf.where(frames_hands_non_nan_sum > 0)
        non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=1)

        # Filter out frames w/o dominant hand
        data = tf.gather(data0, non_empty_frames_idxs, axis=0)

        # Cast Indices in float32 to be compatible with Tensorflow Lite
        non_empty_frames_idxs = tf.cast(non_empty_frames_idxs, tf.float32)

        # Normalize to start with 0
        non_empty_frames_idxs -= tf.reduce_min(non_empty_frames_idxs)

        # Number of Frames in Filtered Video
        N_FRAMES = tf.shape(data)[0]

        # Gather Relevant Landmark Columns: TBC ********************************
        if left_dominant:
            data = tf.gather(data, LEFT_HAND_IDXS0, axis=1)
        else:
            data = tf.gather(data, RIGHT_HAND_IDXS0, axis=1)
            # Mirror: where the correction c is non-zero the value becomes
            # c - (v - c), i.e. 1 - x for c = 0.5; where c is 0 (y) it is unchanged
            data = (
                    self.normalisation_correction + (
                        (data - self.normalisation_correction) * tf.where(self.normalisation_correction != 0, -1.0, 1.0))
                )
        # **********************************************************************

        # Video fits in INPUT_SIZE
        if N_FRAMES < INPUT_SIZE:
            # Pad With -1 to indicate padding
            non_empty_frames_idxs = tf.pad(non_empty_frames_idxs, [[0, INPUT_SIZE-N_FRAMES]], constant_values=-1)
            # Pad Data With Zeros
            data = tf.pad(data, [[0, INPUT_SIZE-N_FRAMES], [0,0], [0,0]], constant_values=0)
            # Fill NaN Values With 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            return data, non_empty_frames_idxs

        # Video needs to be downsampled to INPUT_SIZE
        else:
            # Repeat
            # NOTE(review): the repeat factor is derived from N_FRAMES0 (frame count
            # before filtering), not N_FRAMES -- confirm this is intended
            if N_FRAMES < INPUT_SIZE**2:
                repeats = tf.math.floordiv(INPUT_SIZE * INPUT_SIZE, N_FRAMES0)
                data = tf.repeat(data, repeats=repeats, axis=0)
                non_empty_frames_idxs = tf.repeat(non_empty_frames_idxs, repeats=repeats, axis=0)

            # Pad To Multiple Of Input Size
            # pool_size = ceil(len(data) / INPUT_SIZE)
            pool_size = tf.math.floordiv(len(data), INPUT_SIZE)
            if tf.math.mod(len(data), INPUT_SIZE) > 0:
                pool_size += 1

            if pool_size == 1:
                pad_size = (pool_size * INPUT_SIZE) - len(data)
            else:
                pad_size = (pool_size * INPUT_SIZE) % len(data)

            # Pad Start/End with Start/End value
            # (half of pad_size plus half of INPUT_SIZE per side; an odd pad_size puts the extra frame on the right)
            pad_left = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            pad_right = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            if tf.math.mod(pad_size, 2) > 0:
                pad_right += 1

            # Pad By Concatenating Left/Right Edge Values
            data = self.pad_edge(data, pad_left, 'LEFT')
            data = self.pad_edge(data, pad_right, 'RIGHT')

            # Pad Non Empty Frame Indices
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_left, 'LEFT')
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_right, 'RIGHT')

            # Reshape to Mean Pool: INPUT_SIZE groups of consecutive frames
            data = tf.reshape(data, [INPUT_SIZE, -1, N_COLS, N_DIMS])
            non_empty_frames_idxs = tf.reshape(non_empty_frames_idxs, [INPUT_SIZE, -1])

            # Mean Pool (NaN entries are ignored by nanmean)
            data = tf.experimental.numpy.nanmean(data, axis=1)
            non_empty_frames_idxs = tf.experimental.numpy.nanmean(non_empty_frames_idxs, axis=1)

            # Fill NaN Values With 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)

            return data, non_empty_frames_idxs
    
preprocess_layer = PreprocessLayer()

In [None]:
# load & process data
def get_data(file_path):
    """
    Load one landmark parquet file and pass it through `preprocess_layer`.

    Returns the layer's output unchanged: the pair
    (processed landmarks, frame indices).
    """
    raw_landmarks = load_relevant_data_subset(file_path)
    return preprocess_layer(raw_landmarks)

In [None]:
# input-output shape check
# First row of the selected subset (index label 2 in the table shown in section 2.1).
# Dedicated names are used on purpose: the global `path` is the output folder
# that preprocess_data() writes to, so this cell must not overwrite it.
sample_path = train_df['path'].iloc[0]
sample_raw = load_relevant_data_subset(sample_path)
# run the layer once and unpack both outputs
sample_frames, sample_frame_idxs = preprocess_layer(sample_raw)
print(f'input shape: {sample_raw.shape}')
print(f'output shape1: {sample_frames.shape}')
print(f'output shape2: {sample_frame_idxs.shape}')

input shape: (105, 543, 2)
output shape1: (64, 21, 2)
output shape2: (64,)


## 3.2 Create dataset

In [None]:
# define a function to pre-process data
N_SAMPLES = train_df.shape[0] # choose all samples
# NOTE(review): a Google Drive location; no cell in this notebook mounts Drive -- confirm it is mounted before running
path = "/content/drive/MyDrive/NUS_MSBA/BT5153/group project/baseline model inputs/"

def preprocess_data():
    """
    Preprocess every sequence of `train_df`, split by participant and save the arrays.

    Reads the notebook globals `train_df`, `N_SAMPLES`, `INPUT_SIZE`, `N_COLS`,
    `N_DIMS`, `seed` and `path` at call time. Writes X / y / NON_EMPTY_FRAME_IDXS
    for the train and validation split as .npy files into `path` and prints the
    split statistics. Returns nothing.

    Raises
    ------
    FileNotFoundError
        If the output folder `path` does not exist.
    """
    # Fail fast: np.save would otherwise report a missing folder only after
    # every parquet file has been processed.
    if not os.path.isdir(path):
        raise FileNotFoundError(f'Output folder does not exist: {path}')

    # Create arrays to save data
    X = np.zeros([N_SAMPLES, INPUT_SIZE, N_COLS, N_DIMS], dtype=np.float32)
    y = np.zeros([N_SAMPLES], dtype=np.int32)
    NON_EMPTY_FRAME_IDXS = np.full([N_SAMPLES, INPUT_SIZE], -1, dtype=np.float32)

    # Fill X/y
    for row_idx, (file_path, sign_ord) in enumerate(tqdm(train_df[['path', 'sign_ord']].values)):

        data, non_empty_frame_idxs = get_data(file_path)
        X[row_idx] = data # processed layer
        y[row_idx] = sign_ord
        NON_EMPTY_FRAME_IDXS[row_idx] = non_empty_frame_idxs

    # Train/validation split by participant, so no participant appears on both sides.
    # Only the first of the generated splits is used.
    splitter = GroupShuffleSplit(test_size=0.10, n_splits=2, random_state=seed)
    PARTICIPANT_IDS = train_df['participant_id'].values
    train_idxs, val_idxs = next(splitter.split(X, y, groups=PARTICIPANT_IDS))

    # Save Train
    X_train = X[train_idxs]
    NON_EMPTY_FRAME_IDXS_TRAIN = NON_EMPTY_FRAME_IDXS[train_idxs]
    y_train = y[train_idxs]
    np.save(f'{path}X_train.npy', X_train)
    np.save(f'{path}y_train.npy', y_train)
    np.save(f'{path}NON_EMPTY_FRAME_IDXS_TRAIN.npy', NON_EMPTY_FRAME_IDXS_TRAIN)

    # Save Validation
    X_val = X[val_idxs]
    NON_EMPTY_FRAME_IDXS_VAL = NON_EMPTY_FRAME_IDXS[val_idxs]
    y_val = y[val_idxs]
    np.save(f'{path}X_val.npy', X_val)
    np.save(f'{path}y_val.npy', y_val)
    np.save(f'{path}NON_EMPTY_FRAME_IDXS_VAL.npy', NON_EMPTY_FRAME_IDXS_VAL)

    # Split Statistics
    print(f'Patient ID Intersection Train/Val: {set(PARTICIPANT_IDS[train_idxs]).intersection(PARTICIPANT_IDS[val_idxs])}')
    print(f'X_train shape: {X_train.shape}, X_val shape: {X_val.shape}')
    print(f'y_train shape: {y_train.shape}, y_val shape: {y_val.shape}')

In [None]:
preprocess_data()

  0%|          | 0/21702 [00:00<?, ?it/s]

Patient ID Intersection Train/Val: set()
X_train shape: (17025, 64, 21, 2), X_val shape: (4677, 64, 21, 2)
y_train shape: (17025,), y_val shape: (4677,)


# 4 Full Model

## 4.1 Feature processing
* Pose model: hand + pose + lip

In [None]:
# Configuration

# Lip landmarks (indices into the face landmarks of the original data)
LIPS_IDXS0 = np.array([
        61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
        291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
        78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
        95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
    ])

# Landmark indices in original data
LEFT_HAND_IDXS0 = np.arange(468,489)
RIGHT_HAND_IDXS0 = np.arange(522,543)
LEFT_POSE_IDXS0 = np.array([502, 504, 506, 508, 510])
RIGHT_POSE_IDXS0 = np.array([503, 505, 507, 509, 511])

# Concatenated landmark indices
HAND_IDXS0 = np.concatenate((LEFT_HAND_IDXS0, RIGHT_HAND_IDXS0), axis=0)
LANDMARK_IDXS_LEFT_DOMINANT0 = np.concatenate((LIPS_IDXS0, LEFT_HAND_IDXS0, LEFT_POSE_IDXS0))
LANDMARK_IDXS_RIGHT_DOMINANT0 = np.concatenate((LIPS_IDXS0, RIGHT_HAND_IDXS0, RIGHT_POSE_IDXS0))

# Landmark indices in processed data, i.e. positions inside the 66 kept columns
# (lips 0..39, dominant hand 40..60, pose 61..65).
# The hand positions are looked up in the processed layout like the lips and pose are;
# np.argwhere on the raw index values returned 0..20 / 0..41, which point at lip columns.
LIPS_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LIPS_IDXS0)).squeeze()
LEFT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_HAND_IDXS0)).squeeze()
RIGHT_HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_RIGHT_DOMINANT0, RIGHT_HAND_IDXS0)).squeeze()
HAND_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, HAND_IDXS0)).squeeze()
POSE_IDXS = np.argwhere(np.isin(LANDMARK_IDXS_LEFT_DOMINANT0, LEFT_POSE_IDXS0)).squeeze()

print(LANDMARK_IDXS_LEFT_DOMINANT0.size)

66


In [None]:
N_COLS = 66 # landmarks kept per frame: 40 lips + 21 dominant hand + 5 pose
N_ROWS = 543 # landmarks per frame in the raw data
DIM_NAMES = ['x', 'y', 'z']
N_DIMS = 3 # coordinates kept per landmark
INPUT_SIZE = 64 # number of frames every sequence is padded / pooled to

# Tensorflow layer to process data in TFLite
# input: (n_frames, number of keypoints, [x,y,z] coordinates) -- the first axis is the frame axis of ONE sequence, not a batch
# output1: (input_size, dominant side keypoints, [x,y,z] coordinates)
# output2: (input_size) -- frame index of every output step, -1 where the sequence was padded
class PreprocessLayer_pose(tf.keras.layers.Layer):
    """
    Reduce one raw landmark sequence to a fixed-length sequence of lips, dominant-hand and pose landmarks.

    `call` takes a float32 tensor of shape [n_frames, N_ROWS, N_DIMS] and returns
    a pair:
      * data of shape [INPUT_SIZE, N_COLS, N_DIMS] with NaN replaced by 0
      * the (float32) frame indices belonging to each output step; padded steps
        of short sequences are marked with -1

    NOTE(review): INPUT_SIZE, N_COLS, N_DIMS and the landmark index arrays are
    notebook globals shared with the baseline section; N_ROWS/N_DIMS in the input
    signature are read when this class body is executed, the others inside `call`.
    """
    def __init__(self):
        super(PreprocessLayer_pose, self).__init__()

        # tbc for model *******************************************************
        # Per-landmark offsets, one row per coordinate (x, y, z); transposed below
        # to [landmark, coordinate]. `call` uses them to mirror a dominant right side.
        # NOTE(review): lips get an x offset of 0 and are therefore left unmirrored,
        # while hand and pose x values are mirrored -- confirm this is intended.
        normalisation_correction = tf.constant([
                    # x offset: 0 for the lips, 0.50 for hand and pose landmarks (non-zero => coordinate gets mirrored)
                    [0] * len(LIPS_IDXS) + [0.50] * len(LEFT_HAND_IDXS) + [0.50] * len(POSE_IDXS),
                    # Y coordinates stay intact
                    [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
                    # Z coordinates stay intact
                    [0] * len(LANDMARK_IDXS_LEFT_DOMINANT0),
                ],
                dtype=tf.float32,
            )
        self.normalisation_correction = tf.transpose(normalisation_correction, [1,0])
        # *********************************************************************

    def pad_edge(self, t, repeats, side):
        """
        Pad `t` along axis 0 by repeating its first ('LEFT') or last ('RIGHT') entry `repeats` times.

        Any other `side` value falls through and returns None.
        """
        if side == 'LEFT':
            return tf.concat((tf.repeat(t[:1], repeats=repeats, axis=0), t), axis=0)
        elif side == 'RIGHT':
            return tf.concat((t, tf.repeat(t[-1:], repeats=repeats, axis=0)), axis=0)

    @tf.function(
        input_signature=(tf.TensorSpec(shape=[None,N_ROWS,N_DIMS], dtype=tf.float32),),
    )
    def call(self, data0):
        """
        Select the dominant side, drop frames without the dominant hand and resize to INPUT_SIZE frames.

        data0: float32 tensor [n_frames, N_ROWS, N_DIMS].
        Returns (data, non_empty_frames_idxs) as described on the class.
        """
        # Number of Frames in Video
        N_FRAMES0 = tf.shape(data0)[0]

        # Find dominant hand by comparing the number of non-NaN values of each hand
        # over the whole sequence; ties go to the left hand
        left_hand_sum = tf.math.reduce_sum(tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1))
        right_hand_sum = tf.math.reduce_sum(tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1))
        left_dominant = left_hand_sum >= right_hand_sum

        # Count non NaN Hand values in each frame for the dominant hand
        if left_dominant:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                    tf.where(tf.math.is_nan(tf.gather(data0, LEFT_HAND_IDXS0, axis=1)), 0, 1),
                    axis=[1, 2],
                )
        else:
            frames_hands_non_nan_sum = tf.math.reduce_sum(
                    tf.where(tf.math.is_nan(tf.gather(data0, RIGHT_HAND_IDXS0, axis=1)), 0, 1),
                    axis=[1, 2],
                )

        # Find frames indices with coordinates of dominant hand
        non_empty_frames_idxs = tf.where(frames_hands_non_nan_sum > 0)
        non_empty_frames_idxs = tf.squeeze(non_empty_frames_idxs, axis=1)

        # Filter out frames w/o dominant hand
        data = tf.gather(data0, non_empty_frames_idxs, axis=0)

        # Cast Indices in float32 to be compatible with Tensorflow Lite
        non_empty_frames_idxs = tf.cast(non_empty_frames_idxs, tf.float32)

        # Normalize to start with 0
        non_empty_frames_idxs -= tf.reduce_min(non_empty_frames_idxs)

        # Number of Frames in Filtered Video
        N_FRAMES = tf.shape(data)[0]

        # Gather Relevant Landmark Columns: TBC ********************************
        if left_dominant:
            data = tf.gather(data, LANDMARK_IDXS_LEFT_DOMINANT0, axis=1)
        else:
            data = tf.gather(data, LANDMARK_IDXS_RIGHT_DOMINANT0, axis=1)
            # Mirror: where the correction c is non-zero the value becomes
            # c - (v - c), i.e. 1 - x for c = 0.5; where c is 0 it is unchanged
            data = (
                    self.normalisation_correction + (
                        (data - self.normalisation_correction) * tf.where(self.normalisation_correction != 0, -1.0, 1.0))
                )
        # **********************************************************************

        # Video fits in INPUT_SIZE
        if N_FRAMES < INPUT_SIZE:
            # Pad With -1 to indicate padding
            non_empty_frames_idxs = tf.pad(non_empty_frames_idxs, [[0, INPUT_SIZE-N_FRAMES]], constant_values=-1)
            # Pad Data With Zeros
            data = tf.pad(data, [[0, INPUT_SIZE-N_FRAMES], [0,0], [0,0]], constant_values=0)
            # Fill NaN Values With 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)
            return data, non_empty_frames_idxs

        # Video needs to be downsampled to INPUT_SIZE
        else:
            # Repeat
            # NOTE(review): the repeat factor is derived from N_FRAMES0 (frame count
            # before filtering), not N_FRAMES -- confirm this is intended
            if N_FRAMES < INPUT_SIZE**2:
                repeats = tf.math.floordiv(INPUT_SIZE * INPUT_SIZE, N_FRAMES0)
                data = tf.repeat(data, repeats=repeats, axis=0)
                non_empty_frames_idxs = tf.repeat(non_empty_frames_idxs, repeats=repeats, axis=0)

            # Pad To Multiple Of Input Size
            # pool_size = ceil(len(data) / INPUT_SIZE)
            pool_size = tf.math.floordiv(len(data), INPUT_SIZE)
            if tf.math.mod(len(data), INPUT_SIZE) > 0:
                pool_size += 1

            if pool_size == 1:
                pad_size = (pool_size * INPUT_SIZE) - len(data)
            else:
                pad_size = (pool_size * INPUT_SIZE) % len(data)

            # Pad Start/End with Start/End value
            # (half of pad_size plus half of INPUT_SIZE per side; an odd pad_size puts the extra frame on the right)
            pad_left = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            pad_right = tf.math.floordiv(pad_size, 2) + tf.math.floordiv(INPUT_SIZE, 2)
            if tf.math.mod(pad_size, 2) > 0:
                pad_right += 1

            # Pad By Concatenating Left/Right Edge Values
            data = self.pad_edge(data, pad_left, 'LEFT')
            data = self.pad_edge(data, pad_right, 'RIGHT')

            # Pad Non Empty Frame Indices
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_left, 'LEFT')
            non_empty_frames_idxs = self.pad_edge(non_empty_frames_idxs, pad_right, 'RIGHT')

            # Reshape to Mean Pool: INPUT_SIZE groups of consecutive frames
            data = tf.reshape(data, [INPUT_SIZE, -1, N_COLS, N_DIMS])
            non_empty_frames_idxs = tf.reshape(non_empty_frames_idxs, [INPUT_SIZE, -1])

            # Mean Pool (NaN entries are ignored by nanmean)
            data = tf.experimental.numpy.nanmean(data, axis=1)
            non_empty_frames_idxs = tf.experimental.numpy.nanmean(non_empty_frames_idxs, axis=1)

            # Fill NaN Values With 0
            data = tf.where(tf.math.is_nan(data), 0.0, data)

            return data, non_empty_frames_idxs
    
preprocess_layer_pose = PreprocessLayer_pose()

In [None]:
# load & process data
def get_data_pose(file_path):
    """
    Load one landmark parquet file and pass it through `preprocess_layer_pose`.

    `load_relevant_data_subset` is resolved when this function is called, so the
    definition that is current at that moment is the one used.
    Returns the layer's output unchanged: the pair
    (processed landmarks, frame indices).
    """
    raw_landmarks = load_relevant_data_subset(file_path)
    return preprocess_layer_pose(raw_landmarks)

In [None]:
KEYPOINTS_PER_FRAME = 543
# NOTE: this definition replaces the x/y loader from the baseline section. Baseline
# cells re-run after this point need load_relevant_data_subset(p, data_columns=['x', 'y']).
def load_relevant_data_subset(pq_path, data_columns=('x', 'y', 'z')):
    """
    Load the coordinate columns of one landmark parquet file as a float32 array.

    Parameters
    ----------
    pq_path : path of the parquet file to read.
    data_columns : coordinate columns to load; defaults to x, y and z.

    Returns
    -------
    np.ndarray of dtype float32 and shape
    (n_frames, KEYPOINTS_PER_FRAME, len(data_columns)).

    Raises
    ------
    ValueError
        If the number of rows is not a multiple of KEYPOINTS_PER_FRAME.
    """
    columns = list(data_columns)
    data = pd.read_parquet(pq_path, columns=columns)
    # integer arithmetic instead of int(len / n), and an explicit message instead
    # of the generic reshape error when a file holds an incomplete frame
    n_frames, leftover_rows = divmod(len(data), KEYPOINTS_PER_FRAME)
    if leftover_rows:
        raise ValueError(
            f'{pq_path}: {len(data)} rows is not a multiple of {KEYPOINTS_PER_FRAME} keypoints per frame'
        )
    data = data.values.reshape(n_frames, KEYPOINTS_PER_FRAME, len(columns))
    return data.astype(np.float32)

In [None]:
# input-output shape check
# 45 additional key points compared with the baseline: 66 - 21 = 40 lips + 5 pose
# First row of the selected subset (index label 2 in the table shown in section 2.1).
# Dedicated names are used on purpose: the global `path` is the output folder
# that the preprocess_data*() functions write to, so this cell must not overwrite it.
sample_path = train_df['path'].iloc[0]
sample_raw = load_relevant_data_subset(sample_path)
# run the layer once and unpack both outputs
sample_frames, sample_frame_idxs = preprocess_layer_pose(sample_raw)
print(f'input shape: {sample_raw.shape}')
print(f'output shape1: {sample_frames.shape}')
print(f'output shape2: {sample_frame_idxs.shape}')

input shape: (105, 543, 3)
output shape1: (64, 66, 3)
output shape2: (64,)


## 4.2 Create dataset

In [None]:
# define a function to pre-process data
N_SAMPLES = train_df.shape[0] # choose all samples
# NOTE(review): a Google Drive location; no cell in this notebook mounts Drive -- confirm it is mounted before running
path = "/content/drive/MyDrive/NUS_MSBA/BT5153/group project/pose model inputs/"

def preprocess_data_pose():
    """
    Preprocess every sequence of `train_df` with the lips/hand/pose layer, split by participant and save the arrays.

    Reads the notebook globals `train_df`, `N_SAMPLES`, `INPUT_SIZE`, `N_COLS`,
    `N_DIMS`, `seed` and `path` at call time. Writes X / y / NON_EMPTY_FRAME_IDXS
    for the train and validation split as *_pose.npy files into `path` and prints
    the split statistics. Returns nothing.

    Raises
    ------
    FileNotFoundError
        If the output folder `path` does not exist.
    """
    # Fail fast: np.save would otherwise report a missing folder only after
    # every parquet file has been processed.
    if not os.path.isdir(path):
        raise FileNotFoundError(f'Output folder does not exist: {path}')

    # Create arrays to save data
    X = np.zeros([N_SAMPLES, INPUT_SIZE, N_COLS, N_DIMS], dtype=np.float32)
    y = np.zeros([N_SAMPLES], dtype=np.int32)
    NON_EMPTY_FRAME_IDXS = np.full([N_SAMPLES, INPUT_SIZE], -1, dtype=np.float32)

    # Fill X/y
    for row_idx, (file_path, sign_ord) in enumerate(tqdm(train_df[['path', 'sign_ord']].values)):

        data, non_empty_frame_idxs = get_data_pose(file_path) # TBC
        X[row_idx] = data # processed layer
        y[row_idx] = sign_ord
        NON_EMPTY_FRAME_IDXS[row_idx] = non_empty_frame_idxs

    # Train/validation split by participant, so no participant appears on both sides.
    # Only the first of the generated splits is used.
    splitter = GroupShuffleSplit(test_size=0.10, n_splits=2, random_state=seed)
    PARTICIPANT_IDS = train_df['participant_id'].values
    train_idxs, val_idxs = next(splitter.split(X, y, groups=PARTICIPANT_IDS))

    # Save Train
    X_train = X[train_idxs]
    NON_EMPTY_FRAME_IDXS_TRAIN = NON_EMPTY_FRAME_IDXS[train_idxs]
    y_train = y[train_idxs]
    np.save(f'{path}X_train_pose.npy', X_train)
    np.save(f'{path}y_train_pose.npy', y_train)
    np.save(f'{path}NON_EMPTY_FRAME_IDXS_TRAIN_pose.npy', NON_EMPTY_FRAME_IDXS_TRAIN)

    # Save Validation
    X_val = X[val_idxs]
    NON_EMPTY_FRAME_IDXS_VAL = NON_EMPTY_FRAME_IDXS[val_idxs]
    y_val = y[val_idxs]
    np.save(f'{path}X_val_pose.npy', X_val)
    np.save(f'{path}y_val_pose.npy', y_val)
    np.save(f'{path}NON_EMPTY_FRAME_IDXS_VAL_pose.npy', NON_EMPTY_FRAME_IDXS_VAL)

    # Split Statistics
    print(f'Patient ID Intersection Train/Val: {set(PARTICIPANT_IDS[train_idxs]).intersection(PARTICIPANT_IDS[val_idxs])}')
    print(f'X_train shape: {X_train.shape}, X_val shape: {X_val.shape}')
    print(f'y_train shape: {y_train.shape}, y_val shape: {y_val.shape}')

In [29]:
preprocess_data_pose()

  0%|          | 0/21702 [00:00<?, ?it/s]

Patient ID Intersection Train/Val: set()
X_train shape: (17025, 64, 66, 3), X_val shape: (4677, 64, 66, 3)
y_train shape: (17025,), y_val shape: (4677,)
