In [1]:
import os 
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_ENABLE_AUTO_GC'] = '1'

import gc
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pathlib
import json
import shutil

# from itertools import cycle, chain
from collections import namedtuple


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
data_path = '/kaggle/input/asl-fingerspelling'
metadata_file = data_path + '/train.csv'
sequence_path = data_path + '/train_landmarks'
supplemental_path = data_path + '/supplemental_landmarks'
supplemental_meta = data_path + '/supplemental_metadata.csv'

output_path = '/kaggle/working'
train_tfr_path = output_path + '/train_tfr'
supplemental_tfr_path = output_path + '/supplemental_tfr'

In [3]:
def make_path(pth):
    if not os.path.isdir(pth):
        os.mkdir(pth)
    else:
        shutil.rmtree(pth)
        os.mkdir(pth)

make_path(train_tfr_path)
make_path(supplemental_tfr_path)

In [4]:
lh_cols = [f'{i}_left_hand_{j}' for i in ('x', 'y') for j in range(21)]
rh_cols = [f'{i}_right_hand_{j}' for i in ('x', 'y') for j in range(21)]
pose_cols = [f'{i}_pose_{j}' for i in ('x', 'y') for j in range(33)]

selected_cols = lh_cols + rh_cols + pose_cols

In [5]:
metadata = pd.read_csv(metadata_file)
metadata = metadata[['path', 'file_id', 'sequence_id', 'phrase']]

In [6]:
with open(data_path + "/character_to_prediction_index.json", 'r') as f:
    character_map = json.load(f)
    character_map["<pad>"] = 59
rev_character_map = {j:i for i,j in character_map.items()}

In [7]:
def convert_to_tfrecord(metadata_file_path, output_path, max_files=4):
    metadata = pd.read_csv(metadata_file_path)
    metadata = metadata[['path', 'file_id', 'sequence_id', 'phrase']]
    metadata = metadata.sample(frac=1).reset_index(drop=True)
    
    for i, p in enumerate(metadata.path.unique()):
        subset = metadata[metadata.path == p]
        sequences = pd.read_parquet(data_path + '/' + p, columns=selected_cols).fillna(0)

        out_filename = output_path + '/' + str(subset.file_id.iloc[0]) + '.tfrecord'
        print(out_filename)

        with tf.io.TFRecordWriter(out_filename) as file_writer:
            for s in subset.sequence_id.unique():

                # handle troublesome short / nonexistent sequences
                seq = sequences[sequences.index == s]
                if len(seq) < 20:
                    continue

                meta_row = subset[subset.sequence_id == s]

                seq_features = tf.train.FeatureLists(feature_list={col: tf.train.FeatureList(feature=[tf.train.Feature(
                    float_list=tf.train.FloatList(value=seq[col]))]) for col in selected_cols})

                phrase_feature = tf.train.Features(feature={"phrase": tf.train.Feature(int64_list=tf.train.Int64List(
                    value=[character_map[c] for c in meta_row.phrase.iloc[0]]))})

                example = tf.train.SequenceExample(context=phrase_feature,
                                                   feature_lists=seq_features).SerializeToString()

                file_writer.write(example)
        
        if i >= max_files:
            return



In [8]:
convert_to_tfrecord(metadata_file, train_tfr_path, max_files = 100)
# convert_to_tfrecord(supplemental_meta, supplemental_tfr_path, max_files = 100)

/kaggle/working/train_tfr/939623093.tfrecord
/kaggle/working/train_tfr/1865557033.tfrecord
/kaggle/working/train_tfr/234418913.tfrecord
/kaggle/working/train_tfr/614661748.tfrecord
/kaggle/working/train_tfr/1905462118.tfrecord
/kaggle/working/train_tfr/527708222.tfrecord
/kaggle/working/train_tfr/1320204318.tfrecord
/kaggle/working/train_tfr/169560558.tfrecord
/kaggle/working/train_tfr/349393104.tfrecord
/kaggle/working/train_tfr/5414471.tfrecord
/kaggle/working/train_tfr/128822441.tfrecord
/kaggle/working/train_tfr/638508439.tfrecord
/kaggle/working/train_tfr/1906357076.tfrecord
/kaggle/working/train_tfr/1134756332.tfrecord
/kaggle/working/train_tfr/388576474.tfrecord
/kaggle/working/train_tfr/1662742697.tfrecord
/kaggle/working/train_tfr/1019715464.tfrecord
/kaggle/working/train_tfr/175396851.tfrecord
/kaggle/working/train_tfr/1021040628.tfrecord
/kaggle/working/train_tfr/532011803.tfrecord
/kaggle/working/train_tfr/1405046009.tfrecord
/kaggle/working/train_tfr/1664666588.tfrecord
/k

In [9]:
trainfiles = os.listdir(train_tfr_path)
trainfiles = [f'{train_tfr_path}/{e}' for e in trainfiles]

# supp_files = os.listdir(supplemental_tfr_path)
# supp_files = [f'{supplemental_tfr_path}/{e}' for e in supp_files]
# mixed_train_files = list(chain(*zip(cycle(trainfiles), supp_train_files)))

print(f"Number of main training files: {len(trainfiles)}")
# print(f"Number of supp training files: {len(supp_files)}")

Number of main training files: 68


In [10]:
def parse_fn(serialized):
    # Define the features we want to extract
    
    # Parse the serialized record using the feature dictionary
    parsed = tf.io.parse_sequence_example(serialized, 
                                          context_features={'phrase': tf.io.VarLenFeature(tf.int64)},
                                          sequence_features={col: tf.io.VarLenFeature(tf.float32) for col in selected_cols})
    
    sequence = ([tf.sparse.to_dense(parsed[1][f]) for f in selected_cols])
    sequence = tf.transpose(tf.squeeze(sequence))
    
    phrase = tf.sparse.to_dense(parsed[0]['phrase'])
    phrase = tf.cast(phrase, tf.int32)
    
    return sequence, phrase

In [11]:
batch_size = 32
padding_values=(59.0, 59)
padded_shapes=([None, 150], [None])

def create_dataset(files):
    ds = tf.data.TFRecordDataset(files)
    ds = (
        ds
        .map(parse_fn, num_parallel_calls=(tf.data.AUTOTUNE))
        .shuffle(batch_size * 100)
        .padded_batch(batch_size, padding_values=padding_values, padded_shapes=padded_shapes)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )
    
    return ds

In [12]:
class Slicer(layers.Layer):
    # @tf.function(input_signature=[tf.TensorSpec(shape=[None, None, len(selected_cols)],
    #                                             dtype=tf.float32)])
    def __init__(self):
        super().__init__(name = "slicer")
    
    def call(self, x):
        lh, rh, pose = tf.split(x, [42, 42, 66], axis=-1)

        hand_shaper = layers.Reshape((-1, 2, 21))
        lh = hand_shaper(lh)
        lh = tf.transpose(lh, [0, 1, 3, 2])

        rh = hand_shaper(rh)
        rh = tf.transpose(rh, [0, 1, 3, 2])

        pose_shaper = layers.Reshape((-1, 2, 33))
        pose = pose_shaper(pose)
        pose = tf.transpose(pose, [0, 1, 3, 2])
        
        return (lh, rh, pose)

In [13]:
def CTCLoss(y_true, y_pred):
    # Compute the training-time loss value
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss

In [14]:
def CTCDecoder():
    def decoder(y_pred):
        
        input_shape = tf.keras.backend.shape(y_pred)
        input_length = tf.cast(input_shape[1], tf.int64)

        x = y_pred
        x = tf.squeeze(tf.math.argmax(x, axis=2))
        x = tf.cast(x, tf.int64)
        
        j0 = tf.constant(0, dtype=tf.int64)

        inp0 = tf.cast(x, tf.int64)
        pc0 = tf.constant(1000, dtype=tf.int64)
        
        out0 = tf.ones(shape=[1], dtype=tf.int64) * tf.constant(59, shape=[1], dtype=tf.int64)
        
        j_pc_inp_out_init = [j0, pc0, inp0, out0] 
        
        cond = lambda j, pc, inp, out: j < input_length
        
        body = lambda j, pc, inp, out: tf.cond(tf.equal(inp[j],pc), 
                                               lambda: (j + 1, inp[j], inp, out), 
                                               lambda: (j + 1, inp[j], inp, tf.concat([out, tf.expand_dims(inp[j], 0)], 0)))

        x = tf.while_loop(cond, body, j_pc_inp_out_init, 
                          shape_invariants=[j0.get_shape(), tf.TensorShape(None), 
                                            inp0.get_shape(), tf.TensorShape([None])])[3]

        x = x[x<59]

        x = tf.one_hot(x, 59)
        
        return x

    return tf.keras.layers.Lambda(decoder, name='outputs')

In [15]:
def PreProcessor():
    @tf.function(input_signature=[tf.TensorSpec(shape=[None, len(selected_cols)],
                                                dtype=tf.float32, name='inputs')])
    def preprocessor(x):
        
        x = tf.cast(x, tf.float32)

        x = x[None]
        
        x = tf.cond(tf.shape(x)[1] < 2, 
            lambda: tf.zeros((1, 2, len(selected_cols))), 
            lambda: tf.identity(x))

        x = x[0]
        
        x = tf.where(tf.math.is_nan(x), tf.zeros_like(x), x)

        x = x[None]
        
        return x
    
    return tf.keras.layers.Lambda(preprocessor, name='preprocessor')

In [16]:
class SpatialConv2D(layers.Layer):
    def __init__(self, filters, kernel_size, 
                 input_shape, name= None):
        
        super().__init__(name = name)

        pad_width = kernel_size[0]
        pad_height = kernel_size[1]

        self.padding = layers.ZeroPadding2D(padding=(pad_width, pad_height), 
                                            data_format='channels_last')

        self.conv = layers.Conv2D(filters = filters,
                                  kernel_size = kernel_size,
                                  input_shape = input_shape,
                                  strides=1,
                                  padding = "same",
                                  activation = "relu")

        self.maxpool = layers.MaxPooling2D(pool_size = (1, kernel_size[1]), 
                                            data_format = 'channels_last', strides=1)

        self.filters = filters

        self.out_shaper = layers.Reshape((-1, self.filters))

    
    def call(self, x):
        # x = self.padding(x)
        
        x = self.conv(x)

        x = self.maxpool(x)

        x = self.out_shaper(x)

        return x


In [17]:
def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128):

    input_train = layers.Input((None, input_dim), name="inputs_train")
    input_prod = layers.Input((input_dim), name="inputs")
    
    preprocessor = PreProcessor()
    x = input_train
    x_p = preprocessor(input_prod)
    
    slicer = Slicer()
    x_l, x_r, x_o = slicer(x)
    x_lp, x_rp, x_op = slicer(x_p)

    conv_l_1 = SpatialConv2D(filters=256, 
                           kernel_size=[1, 21],
                           input_shape=(None, None, 21, 2),
                           name = "left_hand_conv_1")
    x_l_c1 = conv_l_1(x_l)
    x_lp_c1 = conv_l_1(x_lp)

    conv_r_1 = SpatialConv2D(filters=256, 
                           kernel_size=[1, 21],
                           input_shape=(None, None, 21, 2),
                           name= "right_hand_conv_1")
    x_r_c1 = conv_r_1(x_r)
    x_rp_c1 = conv_r_1(x_rp)

    conv_p_1 = SpatialConv2D(filters=128, 
                           kernel_size=[1, 33],
                           input_shape=(None, None, 33, 2),
                           name = "pose_conv_1")
    x_o_c1 = conv_p_1(x_o)
    x_op_c1 = conv_p_1(x_op)

    # Wide convolutions

    conv_l_2 = SpatialConv2D(filters=64, 
                           kernel_size=[7, 21],
                           input_shape=(None, None, 21, 2),
                           name = "left_hand_conv_2")
    x_l_c2 = conv_l_2(x_l)
    x_lp_c2 = conv_l_2(x_lp)
    
    conv_r_2 = SpatialConv2D(filters=64, 
                           kernel_size=[7, 21],
                           input_shape=(None, None, 21, 2),
                           name= "right_hand_conv_2")
    x_r_c2 = conv_r_2(x_r)
    x_rp_c2 = conv_r_2(x_rp)

    conv_p_2 = SpatialConv2D(filters=32, 
                           kernel_size=[7, 33],
                           input_shape=(None, None, 33, 2),
                           name = "pose_conv_2")
    x_o_c2 = conv_p_2(x_o)
    x_op_c2 = conv_p_2(x_op)

    
    # Even Wider convolutions

    conv_l_3 = SpatialConv2D(filters=32, 
                           kernel_size=[21, 21],
                           input_shape=(None, None, 21, 2),
                           name = "left_hand_conv_3")
    x_l_c3 = conv_l_3(x_l)
    x_lp_c3 = conv_l_3(x_lp)
    
    conv_r_3 = SpatialConv2D(filters=32, 
                           kernel_size=[21, 21],
                           input_shape=(None, None, 21, 2),
                           name= "right_hand_conv_3")
    x_r_c3 = conv_r_3(x_r)
    x_rp_c3 = conv_r_3(x_rp)

    conv_p_3 = SpatialConv2D(filters=16, 
                           kernel_size=[21, 33],
                           input_shape=(None, None, 33, 2),
                           name = "pose_conv_3")
    x_o_c3 = conv_p_3(x_o)
    x_op_c3 = conv_p_3(x_op)


    # Concatenate convolutional outputs
    
    lh_cat = layers.Concatenate(name="lh_cat")
    x_l = lh_cat([x_l_c1, x_l_c2, x_l_c3])
    x_lp = lh_cat([x_lp_c1, x_lp_c2, x_lp_c3])

    rh_cat = layers.Concatenate(name="rh_cat")
    x_r = rh_cat([x_r_c1, x_r_c2, x_r_c3])
    x_rp = rh_cat([x_rp_c1, x_rp_c2, x_rp_c3])

    pose_cat = layers.Concatenate(name="pose_cat")
    x_o = pose_cat([x_o_c1, x_o_c2, x_o_c3])
    x_op = pose_cat([x_op_c1, x_op_c2, x_op_c3])

    maxhand = layers.Maximum(name="maxhand")
    x_h = maxhand([x_l, x_r])
    x_hp = maxhand([x_lp, x_rp])

    cat2 = layers.Concatenate(name="cat2")
    x = cat2([x_h, x_o])
    x_p = cat2([x_hp, x_op])
    
    n1 = layers.LayerNormalization(name="ln1")
    x = n1(x)
    x_p = n1(x_p)

    maxpool = layers.MaxPooling1D(pool_size=2,
                                  name="maxpool")
    x = maxpool(x)
    x_p = maxpool(x_p)

    rnn_outputs = []
    rnn_outputs_p = []
    
    for i in range(1, rnn_layers + 1):
        recurrent = layers.LSTM(
            units=rnn_units,
            activation="tanh",
            use_bias=True,
            return_sequences=True,
            name=f"rnn_{i}",
        )

        rnn_out = recurrent(x)
        rnn_out_p = recurrent(x_p)
        
        do_1 = layers.Dropout(rate=0.1)
        rnn_out = do_1(rnn_out)
        rnn_out_p = do_1(rnn_out_p)

        rnn_outputs.append(rnn_out)
        rnn_outputs_p.append(rnn_out_p)

    cat3 = layers.Concatenate(name="cat3")
    x = cat3(rnn_outputs)
    x_p = cat3(rnn_outputs_p)
    
    # Dense layer
    d_1 = layers.Dense(units=(rnn_units * rnn_layers * 2), name="dense_1")
    x = d_1(x)
    x_p = d_1(x_p)
    
    relu_d = layers.ReLU(name="dense_1_relu")
    x = relu_d(x)
    x_p = relu_d(x_p)
    
    do_2 = layers.Dropout(rate=0.1)
    x = do_2(x)
    x_p = do_2(x_p)
    
    # Classification layer
    output = layers.Dense(units=output_dim + 1, activation="softmax")
    x_o = output(x)
    x_op = output(x_p)
    
    # Model
    model = keras.Model(inputs=input_train, outputs=x_o, name="DeepASL")
    prod_model = keras.Model(inputs=input_prod, outputs=CTCDecoder()(x_op), name="DeepASL_p")
    # Optimizer
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.001,
        decay_steps=1500,
        decay_rate=0.98,
        staircase=True
    )
    opt = keras.optimizers.Adam(learning_rate=lr_schedule)
    # Compile the model and return
    model.compile(optimizer=opt, loss=CTCLoss)
    return model, prod_model


# Get the model
model, prod_model = build_model(
    input_dim=150,
    output_dim=len(character_map) - 1,
    rnn_units=640,
    rnn_layers=2,
)

prod_model.summary(line_length=110)


Model: "DeepASL_p"
______________________________________________________________________________________________________________
 Layer (type)                       Output Shape            Param #      Connected to                         
 inputs (InputLayer)                [(None, 150)]           0            []                                   
                                                                                                              
 preprocessor (Lambda)              (1, None, 150)          0            ['inputs[0][0]']                     
                                                                                                              
 slicer (Slicer)                    multiple                0            ['preprocessor[0][0]']               
                                                                                                              
 left_hand_conv_1 (SpatialConv2D)   multiple                11008        ['slicer[1][0]']    

In [18]:
# plot_model(prod_model, show_layer_names=True, to_file='model.png')
# Image('model.png')

In [19]:
def train_round(trainfiles, epochs=7, start_at=0):
    
    train_set = create_dataset(trainfiles)
    
    model.fit(
        train_set,
        epochs=epochs,
        initial_epoch=start_at,
        verbose=2
    )
    
    del train_set

    keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()
    gc.collect()

    return


In [20]:
train_round(trainfiles, 56)

print("First round down")

Epoch 1/56
2022/2022 - 989s - loss: 64.1687 - 989s/epoch - 489ms/step
Epoch 2/56
2022/2022 - 514s - loss: 48.7952 - 514s/epoch - 254ms/step
Epoch 3/56
2022/2022 - 500s - loss: 33.1633 - 500s/epoch - 247ms/step
Epoch 4/56
2022/2022 - 495s - loss: 29.1556 - 495s/epoch - 245ms/step
Epoch 5/56
2022/2022 - 495s - loss: 27.0992 - 495s/epoch - 245ms/step
Epoch 6/56
2022/2022 - 497s - loss: 25.5732 - 497s/epoch - 246ms/step
Epoch 7/56
2022/2022 - 493s - loss: 24.2440 - 493s/epoch - 244ms/step
Epoch 8/56
2022/2022 - 492s - loss: 23.0834 - 492s/epoch - 243ms/step
Epoch 9/56
2022/2022 - 495s - loss: 21.9295 - 495s/epoch - 245ms/step
Epoch 10/56
2022/2022 - 491s - loss: 20.8267 - 491s/epoch - 243ms/step
Epoch 11/56
2022/2022 - 494s - loss: 19.7570 - 494s/epoch - 245ms/step
Epoch 12/56
2022/2022 - 493s - loss: 18.6902 - 493s/epoch - 244ms/step
Epoch 13/56
2022/2022 - 491s - loss: 17.6883 - 491s/epoch - 243ms/step
Epoch 14/56
2022/2022 - 494s - loss: 16.7761 - 494s/epoch - 244ms/step
Epoch 15/56
202

In [21]:
export_dir = output_path + '/exports'

make_path(export_dir)

In [22]:
# prod_model.save(export_dir)

In [23]:
converter = tf.lite.TFLiteConverter.from_keras_model(prod_model)

converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, 
                                       tf.lite.OpsSet.SELECT_TF_OPS]

tflite_model = converter.convert()

In [24]:
tflite_file = f'model.tflite'

with open(tflite_file, 'wb') as f:
    f.write(tflite_model)

In [25]:
interpreter = tf.lite.Interpreter(model_content=tflite_model)
signatures = interpreter.get_signature_list()
print(signatures)

{'serving_default': {'inputs': ['inputs'], 'outputs': ['outputs']}}


In [26]:
infargs = {"selected_columns" : selected_cols}
infargs_file = 'inference_args.json'

with open(infargs_file, "w") as json_file:
    json.dump(infargs, json_file)

In [27]:
print(tflite_file)
print(infargs_file)

model.tflite
inference_args.json


In [28]:
!zip submission.zip 'model.tflite' 'inference_args.json'

  adding: model.tflite (deflated 7%)
  adding: inference_args.json (deflated 83%)
