# TensorFlow - Loading data - Adjusting the layer to our needs

Reference notebook: https://www.kaggle.com/code/roberthatch/gislr-feature-data-on-the-shoulders/notebook

## Import libraries

In [7]:
%pip install tqdm
%pip install tflite-runtime
import os

import json
from tqdm import tqdm
import numpy as np
import pandas as pd
import random

import tensorflow as tf
import tflite_runtime.interpreter as tflite

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
[31mERROR: Could not find a version that satisfies the requirement tflite-runtime (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for tflite-runtime[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'tflite_runtime'

## Setup

In [None]:
#file paths for Kaggle
LANDMARK_FILES_DIR = "/kaggle/input/asl-signs/train_landmark_files"
TRAIN_FILE = "/kaggle/input/asl-signs/train.csv"
OUTPUT = ""
LABEL_MAP_FILE = "/kaggle/input/asl-signs/sign_to_prediction_index_map.json"

#for local notebook, adjust file paths here if required
#LANDMARK_FILES_DIR = "../data/asl-signs/"
#TRAIN_FILE = "../data/asl-signs/train.csv"
#OUTPUT = "../data/" #path to save x and y files
#LABEL_MAP_FILE = "../data/asl-signs/sign_to_prediction_index_map.json"

#read the sign -> prediction index mapping; the context manager closes the
#file handle again instead of leaving it open for the lifetime of the kernel
with open(LABEL_MAP_FILE, "r") as label_map_file:
    label_map = json.load(label_map_file)

## Configuration

In [None]:
#--- dataset size ---------------------------------------------------------
#switch on to limit the dataset for a quick test run
QUICK_TEST = False
QUICK_LIMIT = 1000

#--- sequence length ------------------------------------------------------
#target length used when padding or cutting sequences
#(22 is the median length of all sequences)
LENGTH = 22

#sequences shorter than MIN_LENGTH or longer than MAX_LENGTH will be dropped
#(the maximum of 92 was derived from the interquartile range)
MIN_LENGTH = 10
MAX_LENGTH = 92

#--- output layout --------------------------------------------------------
#True: final data is flattened; False: data stays 3 dimensional
FLATTEN = False

#initialization of the numpy array (True=Zeros, False=empty values)
ARRAY = False

#--- padding --------------------------------------------------------------
#padding mode:
#  1 = padding at start&end
#  2 = padding at end
#  3 = no padding (Note: gives an error due to different lengths, work in progress)
#  4 = copy first/last frame
#  5 = copy last frame
PADDING = 2
#fill value, only required for mode 1 and 2; enter tf.constant(float('nan')) for NaN
CONSTANT_VALUE = 0

#--- coordinates ----------------------------------------------------------
#True: the z coordinate will be dropped
DROP_Z = True

#--- train / test split ---------------------------------------------------
#True: the csv file will be filtered
CSV_FILTER = True
#number of participants for the test set (5 participants account for ca 23% of dataset)
TEST_COUNT = 5
#which dataset to generate (True = Train dataset; False = Test dataset)
#only works if CSV_FILTER is activated
TRAIN = False

#filenames for x and y
#use for train dataset:
#feature_data = 'X_train' #x data
#feature_labels = 'y_train' #y data

#use for test dataset:
feature_data = 'X_test' #x data
feature_labels = 'y_test' #y data

RANDOM_STATE = 42

#--- landmarks ------------------------------------------------------------
#index ranges for each landmark type
#dont change these landmarks
FACE = [*range(0, 468)]
LEFT_HAND = [*range(468, 489)]
POSE = [*range(489, 522)]
POSE_UPPER = [*range(489, 510)]
RIGHT_HAND = [*range(522, 543)]
LIPS = [
    61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
    291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
    78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
    95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
]

#landmark groups that will be merged
averaging_sets = [FACE]

#all landmarks selected for preprocessing
#change landmarks you want to use here:
point_landmarks = LEFT_HAND + POSE_UPPER + RIGHT_HAND + LIPS

#total number of landmarks used: selected points plus one entry per merged group
LANDMARKS = len(point_landmarks) + len(averaging_sets)
print(f'Total count of used landmarks: {LANDMARKS}')

#input shape for the model: 2 values per landmark without z, 3 with z
INPUT_SHAPE = (LENGTH, LANDMARKS * (2 if DROP_Z else 3))


Total count of used landmarks: 104


### Helper Functions

In [None]:
ROWS_PER_FRAME = 543
def load_relevant_data_subset(pq_path):
    """Read the x/y/z columns of a parquet file as a (frames, rows, 3) array.

    Args:
        pq_path: path of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        ``np.float32`` array of shape (n_frames, ROWS_PER_FRAME, 3), where
        n_frames is the number of table rows divided by ROWS_PER_FRAME.
    """
    coordinate_columns = ['x', 'y', 'z']
    #only the coordinate columns are read from the file
    frame_table = pd.read_parquet(pq_path, columns=coordinate_columns)
    #every frame occupies ROWS_PER_FRAME consecutive rows of the table
    frame_count = int(len(frame_table) / ROWS_PER_FRAME)
    target_shape = (frame_count, ROWS_PER_FRAME, len(coordinate_columns))
    landmarks = frame_table.values.reshape(*target_shape)
    return landmarks.astype(np.float32)

In [None]:
def tf_nan_mean(x, axis=0):
    """Mean of tensor ``x`` along ``axis`` that ignores NaN entries.

    NaN entries contribute 0 to the sum and 0 to the element count. A slice
    that contains only NaN therefore evaluates to 0 / 0.
    """
    nan_mask = tf.math.is_nan(x)
    zeros = tf.zeros_like(x)
    #sum of the non-NaN values
    total = tf.reduce_sum(tf.where(nan_mask, zeros, x), axis=axis)
    #number of non-NaN values
    count = tf.reduce_sum(tf.where(nan_mask, zeros, tf.ones_like(x)), axis=axis)
    return total / count

def tf_nan_std(x, axis=0):
    """Standard deviation of tensor ``x`` along ``axis`` that ignores NaN entries.

    The mean is computed with ``keepdims=True`` so that it broadcasts against
    ``x`` for every ``axis`` value. Subtracting a mean whose reduced axis has
    been removed only lines up correctly when the leading axis is reduced.

    Args:
        x: floating point tensor, may contain NaN.
        axis: axis (or axes) to reduce, as accepted by ``tf.reduce_sum``.

    Returns:
        Tensor with ``axis`` reduced, holding the square root of the
        NaN-ignoring mean of the squared deviations.
    """
    nan_mask = tf.math.is_nan(x)
    valid_count = tf.reduce_sum(
        tf.cast(tf.logical_not(nan_mask), x.dtype), axis=axis, keepdims=True
    )
    valid_sum = tf.reduce_sum(
        tf.where(nan_mask, tf.zeros_like(x), x), axis=axis, keepdims=True
    )
    #NaN entries of x stay NaN in d and are skipped again by tf_nan_mean below
    d = x - valid_sum / valid_count
    return tf.math.sqrt(tf_nan_mean(d * d, axis=axis))

#this function is only required if mean and std will be calculated for specific segments of the data
def flatten_means_and_stds(x, axis=0):
    """Return the NaN-ignoring means and stds of ``x`` as one flat row.

    Both statistics are always reduced over axis 0; the ``axis`` argument is
    accepted but not used. The result has shape (1, INPUT_SHAPE[1]*2) and
    every non-finite entry is replaced by zero.
    """
    #mean first, std second, joined along the first axis
    stats = [tf_nan_mean(x, axis=0), tf_nan_std(x, axis=0)]
    row = tf.reshape(tf.concat(stats, axis=0), (1, INPUT_SHAPE[1]*2))
    #zero out everything that is not a finite number
    finite_mask = tf.math.is_finite(row)
    return tf.where(finite_mask, row, tf.zeros_like(row))

## TensorFlow Feature Preprocessing Layer

In [None]:
#generating preprocessing layer that will be added to final model
class FeatureGen(tf.keras.layers.Layer):
    """Custom TensorFlow layer that turns raw landmark frames into model input.

    Steps performed by ``call``:
      1. optionally drop the z coordinate (DROP_Z),
      2. build one merged landmark per entry of ``averaging_sets`` and append
         the landmarks listed in ``point_landmarks``,
      3. cut / pad the sequence to LENGTH frames according to PADDING,
      4. linearly interpolate short NaN gaps of the flattened data,
      5. replace the remaining NaN values with zeros,
      6. reshape to (1, rows*features) if FLATTEN else (1, rows, features).

    Only TensorFlow ops are used, so the layer can be traced inside a
    ``tf.function``. NumPy / pandas calls cannot consume the symbolic tensors
    that exist during tracing.
    """

    def __init__(self):
        #initializes layer
        super().__init__()

    @staticmethod
    def _interpolate_short_gaps(values, limit=2):
        """Linearly interpolate NaN runs of the flattened tensor.

        Intended to mirror
        ``pd.DataFrame(flat).interpolate(method='linear', limit=limit,
        limit_direction='both')``: a NaN is filled when a valid sample lies at
        most ``limit`` positions before or after it. Gaps between two valid
        samples are interpolated linearly; leading / trailing NaN take the
        value of the nearest valid sample. Rounding may differ slightly from
        pandas.

        Args:
            values: floating point tensor of any shape.
            limit: maximum distance to a valid sample for a NaN to be filled.

        Returns:
            Tensor with the shape and dtype of ``values``.
        """
        flat = tf.reshape(values, [-1])
        is_missing = tf.math.is_nan(flat)
        is_valid = tf.logical_not(is_missing)
        positions = tf.range(tf.size(flat), dtype=tf.int32)

        #positions of all valid samples, in ascending order
        valid_positions = tf.cast(tf.where(is_valid)[:, 0], tf.int32)
        valid_count = tf.size(valid_positions)
        #sentinel entry keeps the gathers below in range when a neighbour is missing
        valid_positions = tf.concat([valid_positions, tf.zeros([1], tf.int32)], axis=0)

        #number of valid samples before position i == rank of the first valid
        #sample at or after i; the previous rank is the valid sample before i
        right_rank = tf.cumsum(tf.cast(is_valid, tf.int32), exclusive=True)
        left_rank = right_rank - 1
        has_right = right_rank < valid_count
        has_left = left_rank >= 0

        right_pos = tf.gather(valid_positions, right_rank)
        left_pos = tf.gather(valid_positions, tf.maximum(left_rank, 0))
        right_val = tf.gather(flat, right_pos)
        left_val = tf.gather(flat, left_pos)

        #linear interpolation between both neighbours (guard against a zero span)
        span = tf.cast(tf.maximum(right_pos - left_pos, 1), flat.dtype)
        weight = tf.cast(positions - left_pos, flat.dtype) / span
        between = left_val + weight * (right_val - left_val)
        estimate = tf.where(
            tf.logical_and(has_left, has_right),
            between,
            tf.where(has_left, left_val, right_val),
        )

        #only NaN close enough to a valid sample on either side are filled
        near_left = tf.logical_and(has_left, positions - left_pos <= limit)
        near_right = tf.logical_and(has_right, right_pos - positions <= limit)
        fill = tf.logical_and(is_missing, tf.logical_or(near_left, near_right))

        return tf.reshape(tf.where(fill, estimate, flat), tf.shape(values))

    def call(self, x_in):
        """Convert a (frames, landmarks, coords) tensor into model input."""
        #drop z coordinates if required
        if DROP_Z:
            x_in = x_in[:, :, 0:2]

        #generates list with mean values for landmarks that will be merged
        # NOTE(review): each av_set is consumed as a (start, length) pair. With
        # averaging_sets = [FACE], where FACE is the index list 0..467, the
        # slice evaluates to 0:1, so only landmark 0 enters the "mean".
        # Left unchanged on purpose so the features stay identical to the ones
        # produced so far; confirm the intended format before changing it.
        x_list = [tf.expand_dims(tf_nan_mean(x_in[:, av_set[0]:av_set[0]+av_set[1], :], axis=1), axis=1) for av_set in averaging_sets]
        #extracts specific columns from input x_in defined by landmarks
        x_list.append(tf.gather(x_in, point_landmarks, axis=1))
        #concatenates the two tensors from above along axis 1/columns
        x = tf.concat(x_list, 1)

        #truncate excess rows; slicing is a no-op for sequences up to LENGTH rows,
        #which avoids a Python branch on a tensor value
        x = x[:LENGTH, :, :]
        #rows still missing to reach LENGTH (0 for full-length sequences)
        pad_rows = LENGTH - tf.shape(x)[0]
        #an odd amount puts the extra row at the front
        pad_front = (pad_rows + 1) // 2
        pad_back = pad_rows // 2

        if PADDING == 4: #copy first/last frame
            padding_front = tf.repeat(x[0:1], pad_front, axis=0)
            padding_back = tf.repeat(x[-1:], pad_back, axis=0)
            x = tf.concat([padding_front, x, padding_back], axis=0)
        elif PADDING == 5: #copy last frame
            padding_back = tf.repeat(x[-1:], pad_rows, axis=0)
            x = tf.concat([x, padding_back], axis=0)
        elif PADDING in (1, 2, 3):
            if PADDING == 1: #padding at start and end
                paddings = [[pad_front, pad_back], [0, 0], [0, 0]]
            elif PADDING == 2: #padding only at the end of sequence
                paddings = [[0, pad_rows], [0, 0], [0, 0]]
            else: #no padding
                paddings = [[0, 0], [0, 0], [0, 0]]
            x = tf.pad(x, paddings, mode='CONSTANT', constant_values=CONSTANT_VALUE)
        else:
            raise ValueError(f"Unsupported PADDING mode: {PADDING}")

        current_rows = tf.shape(x)[0]

        #interpolate short runs of missing values (on the flattened data)
        x = self._interpolate_short_gaps(x, limit=2)
        #fill missing values with zeros
        x = tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)

        #reshape data to 2D or 3D array
        if FLATTEN:
            x = tf.reshape(x, (1, current_rows*INPUT_SHAPE[1]))
        else:
            x = tf.reshape(x, (1, current_rows, INPUT_SHAPE[1]))

        return x

#define converter using generated layer
feature_converter = FeatureGen()

In [None]:
# Load the Keras model from its .h5 file; it is handed to TFLiteModel further down.
model = tf.keras.models.load_model('/kaggle/input/model/LSTM_model_5.h5')

In [None]:
class TFLiteModel(tf.keras.Model):
    """Wraps a Keras model together with the FeatureGen preprocessing layer.

    ``call`` takes one float32 tensor of shape (frames, 543, 3), runs it
    through FeatureGen and the wrapped model, and returns the first row of
    the model output in a dictionary under the key ``'outputs'``.
    """

    def __init__(self, model):
        super().__init__()
        self.prep_inputs = FeatureGen()
        self.model = model

    @tf.function(input_signature=[tf.TensorSpec(shape=[None, 543, 3], dtype=tf.float32, name='inputs')])
    def call(self, inputs):
        x = self.prep_inputs(tf.cast(inputs, dtype=tf.float32))
        # FeatureGen already returns a leading dimension of size 1, so adding
        # another one unconditionally makes the tensor one rank larger than a
        # model whose input rank equals the FeatureGen output rank can accept.
        # Add the extra axis only when the model input has a higher rank, or
        # when that rank cannot be determined (previous behaviour).
        model_input_shape = getattr(self.model, "input_shape", None)
        rank_known = isinstance(model_input_shape, tuple) and x.shape.rank is not None
        if not rank_known or x.shape.rank < len(model_input_shape):
            x = tf.expand_dims(x, axis=0)
        outputs = self.model(x)[0, :]
        # Return a dictionary with the output tensor
        return {'outputs': outputs}

In [None]:
# Combine the FeatureGen preprocessing and the loaded model into one Keras model.
tflite_keras_model = TFLiteModel(model)

In [None]:
# Convert the wrapped Keras model to a TFLite flatbuffer and save it.
# Without this conversion step `tflite_model` is undefined and the write
# below fails with a NameError.
model_path = "model.tflite"
keras_model_converter = tf.lite.TFLiteConverter.from_keras_model(tflite_keras_model)
tflite_model = keras_model_converter.convert()
with open(model_path, "wb") as f:
    f.write(tflite_model)

In [None]:
# Shell command: pack the saved model file into submission.zip ($model_path is expanded by IPython).
!zip submission.zip $model_path