### [ pytorch transformer solution ]

local validation CV:

fold-2 by : https://www.kaggle.com/code/clemchris/asl-sign-detection-pytorch-lightning  
stratification by participant id (i.e. the train and validation participant ids do not overlap)

- time_taken =  ~30 msec per video (or 25 min for all 40_000 hidden test video)
- cross entropy loss = 1.9538569450378418
- topk[0] = 0.5876061120543293
- topk[1] = 0.7024900962082626
- topk[2] = 0.755461233729485
- topk[3] = 0.7844934917940012
- topk[4] = 0.8033955857385399

LB = 0.62

setting :
embed_dim = 512  
length    = 60  
num_head  = 4  
num_block = 1  

it seems that the transformer solution easily runs out of memory.  
need to investigate more on optimal setting.  

updates:

- for experiment results, refer to:
https://www.kaggle.com/competitions/asl-signs/discussion/391265  

- it is better to use keras input_net (shape normalisation) for reasons explained here
https://www.kaggle.com/competitions/asl-signs/discussion/390935#2176801  
https://www.kaggle.com/competitions/asl-signs/discussion/393655#2177124  

- add features like distance between points in same frame, velocity, etc


![https://i.ibb.co/XVxP67c/Selection-999-1440.png](https://i.ibb.co/XVxP67c/Selection-999-1440.png)



In [1]:
#pytorch model

import torch
import torch.nn.functional as F
import torch.nn as nn

#num_landmark = 543
# Number of frames kept per video: pre_process() truncates to this many frames
# and Net sizes its positional-embedding table with it.
max_length = 80
# Default number of output classes for the classifier heads.
num_class  = 250
# Landmarks kept per frame (40 lip + 21 left-hand + 21 right-hand points).
num_point  = 82  # LIP, LHAND, RHAND

def pack_seq(
    seq,
):
    """Zero-pad a list of variable-length sequences into one batch.

    Args:
        seq: non-empty list of tensors, each indexed as
            (frames, landmarks, 3). Only the first element is inspected for
            the landmark count and the target device.

    Returns:
        x: float tensor (batch, max_frames, landmarks * 3), zero padded.
        x_mask: bool tensor (batch, max_frames); True marks padding frames.
    """
    device = seq[0].device
    num_landmark = seq[0].shape[1]
    lengths = [len(item) for item in seq]
    batch_size, longest = len(seq), max(lengths)

    x = torch.zeros((batch_size, longest, num_landmark, 3), device=device)
    for row, (item, n) in enumerate(zip(seq, lengths)):
        x[row, :n] = item[:n]

    # A frame is padding when its index is at or beyond the sequence length.
    frame_index = torch.arange(longest, device=device).unsqueeze(0)
    x_mask = frame_index >= torch.tensor(lengths, device=device).unsqueeze(1)

    x = x.reshape(batch_size, -1, num_landmark * 3)
    return x, x_mask


class FeedForward(nn.Module):
    """Two-layer MLP: embed_dim -> hidden_dim -> embed_dim with a ReLU between."""

    def __init__(self, embed_dim, hidden_dim):
        super().__init__()
        expand = nn.Linear(embed_dim, hidden_dim)
        activation = nn.ReLU(inplace=True)
        project = nn.Linear(hidden_dim, embed_dim)
        # Kept as an unnamed Sequential called `mlp` so the parameter names
        # stay `mlp.0.*` / `mlp.2.*`.
        self.mlp = nn.Sequential(expand, activation, project)

    def forward(self, x):
        out = self.mlp(x)
        return out


#https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html
class MultiHeadAttention(nn.Module):
    """Self-attention wrapper: query, key and value are all the same input."""

    def __init__(self,
            embed_dim,
            num_head,
            batch_first,
        ):
        super().__init__()
        mha_options = dict(
            num_heads=num_head,
            bias=True,
            add_bias_kv=False,
            kdim=None,
            vdim=None,
            dropout=0.0,
            batch_first=batch_first,
        )
        self.mha = nn.MultiheadAttention(embed_dim, **mha_options)

    def forward(self, x, x_mask):
        # x_mask is forwarded as key_padding_mask; the attention weights
        # returned alongside the output are discarded.
        attended = self.mha(x, x, x, key_padding_mask=x_mask)
        return attended[0]


def positional_encoding(length, embed_dim):
    """Build a sinusoidal positional-encoding table.

    The first ``embed_dim // 2`` columns hold the sines and the remaining
    columns the cosines (concatenated, not interleaved). Column ``i`` of each
    half uses the angular frequency ``1 / 10000 ** (i / (embed_dim // 2))``.

    Args:
        length: number of positions (rows) to generate.
        embed_dim: embedding width. For an odd value the table has
            ``2 * (embed_dim // 2)`` columns, i.e. one fewer than requested.

    Returns:
        float32 torch tensor of shape (length, 2 * (embed_dim // 2)).
    """
    # Imported here because this file only imports numpy at module level much
    # further down (submission cell); without it, calling this function from
    # the cell that defines it raises NameError.
    import numpy as np

    half_dim = embed_dim // 2

    position = np.arange(length)[:, np.newaxis]                 # (length, 1)
    exponent = np.arange(half_dim)[np.newaxis, :] / half_dim    # (1, half_dim)

    inv_freq = 1 / (10000 ** exponent)                          # (1, half_dim)
    angle = position * inv_freq                                 # (length, half_dim)

    pos_embed = np.concatenate(
        [np.sin(angle), np.cos(angle)],
        axis=-1
    )
    return torch.from_numpy(pos_embed).float()

class TransformerBlock(nn.Module):
    """Pre-norm transformer encoder block (self-attention + feed-forward).

    Args:
        embed_dim: width of the residual stream.
        num_head: number of attention heads.
        out_dim: hidden width of the feed-forward MLP.
        batch_first: forwarded to the attention module.
    """

    def __init__(self,
        embed_dim,
        num_head,
        out_dim,
        batch_first=True,
    ):
        super().__init__()
        self.attn  = MultiHeadAttention(embed_dim, num_head,batch_first)
        self.ffn   = FeedForward(embed_dim, out_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        # norm2 is applied to the residual stream, whose width is embed_dim.
        # Sizing it with out_dim (the MLP hidden width) only worked when the
        # two happened to be equal.
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x, x_mask=None):
        """Apply attention and MLP, each with a residual connection.

        x_mask is passed to the attention module as the key padding mask.
        """
        x = x + self.attn((self.norm1(x)), x_mask)
        x = x + self.ffn((self.norm2(x)))
        return x

class Net(nn.Module):
    """Transformer classifier over per-frame landmark features.

    Each frame is linearly embedded, a positional embedding is added, a CLS
    token is prepended, and the encoder output at the CLS position is
    classified.

    Args:
        num_class: number of output classes.
        num_block: number of transformer blocks in the encoder.
        embed_dim: embedding width (also used as the MLP hidden width).
        num_head: number of attention heads.
    """

    def __init__(self, num_class=num_class, num_block=1, embed_dim=1024, num_head=8):
        super().__init__()
        # Which entries forward() puts in its output dict.
        self.output_type = ['inference', 'loss']

        pos_embed = positional_encoding(max_length, embed_dim)
        # Learnable, initialised from the sinusoidal table
        # (a fixed table would be a register_buffer instead).
        self.pos_embed = nn.Parameter(pos_embed)

        self.cls_embed = nn.Parameter(torch.zeros((1, embed_dim)))
        self.x_embed = nn.Sequential(
            nn.Linear(num_point * 3, embed_dim, bias=False),
        )

        self.encoder = nn.ModuleList([
            TransformerBlock(
                embed_dim,
                num_head,
                embed_dim,
            ) for _ in range(num_block)
        ])
        self.logit = nn.Linear(embed_dim, num_class)

    def forward(self, batch):
        """Run the model on a batch dict.

        Args:
            batch: dict with 'xyz' (list of per-video tensors accepted by
                pack_seq) and, when 'loss' is in ``self.output_type``,
                'label' (class targets for cross entropy).

        Returns:
            dict with 'label_loss' (cross entropy) and/or 'sign' (softmax
            probabilities), depending on ``self.output_type``.
        """
        x, x_mask = pack_seq(batch['xyz'])
        B, L, _ = x.shape
        x = self.x_embed(x)
        # Sequences longer than the positional table cannot be broadcast here;
        # inputs are expected to be truncated beforehand.
        x = x + self.pos_embed[:L].unsqueeze(0)

        # Prepend the CLS token; expand avoids an extra copy before cat.
        cls_token = self.cls_embed.unsqueeze(0).expand(B, -1, -1)
        x = torch.cat([cls_token, x], 1)
        # The CLS position is never padding; allocate its mask column with
        # the same dtype/device as x_mask.
        cls_mask = x_mask.new_zeros((B, 1))
        x_mask = torch.cat([cls_mask, x_mask], 1)

        #x = F.dropout(x,p=0.25,training=self.training)
        for block in self.encoder:
            x = block(x, x_mask)

        cls = x[:, 0]
        cls = F.dropout(cls, p=0.4, training=self.training)
        logit = self.logit(cls)

        output = {}
        if 'loss' in self.output_type:
            output['label_loss'] = F.cross_entropy(logit, batch['label'])

        if 'inference' in self.output_type:
            output['sign'] = torch.softmax(logit, -1)

        return output


def pre_process(xyz):
    """Normalise one video's landmarks and keep only lip and hand points.

    The mean and std are taken over every non-NaN value of the whole tensor
    (boolean-mask indexing flattens it), so the normalisation is by a single
    scalar mean and a single scalar std. NaNs are then replaced by 0 and the
    result is truncated to ``max_length`` frames.
    """
    # normalisation to common mean
    valid = xyz[torch.isnan(xyz).logical_not()]
    xyz = xyz - valid.mean(0, keepdims=True)
    valid = xyz[torch.isnan(xyz).logical_not()]
    xyz = xyz / valid.std(0, keepdims=True)

    parts = [xyz[:, index] for index in (LIP, LHAND, RHAND)]
    xyz = torch.cat(parts, 1)  # (none, 82, 3)

    xyz = xyz[:max_length]
    return xyz.masked_fill(torch.isnan(xyz), 0)





In [2]:
#pytorch model for tflite conversion

# simplify for one video input
# NOTE: this rebinds the module-level max_length (80 in the training cell
# above); InputNet and SingleNet below read this value.
max_length = 96  # reduce this if you get an out-of-memory error

class InputNet(nn.Module):
    """Parameter-free preprocessing module for a single video.

    Performs the same steps as the training-time ``pre_process``: scalar
    mean/std normalisation over non-NaN values, lip/hand landmark selection,
    NaN -> 0, and truncation to ``max_length`` frames.
    """
    def __init__(self, ):
        super().__init__()
        # Captures the module-level max_length at construction time.
        self.max_length = max_length 
  
    def forward(self, xyz):
        """Return the normalised lip/hand landmarks of ``xyz``.

        Assumes xyz is (frames, 543, 3), which the 522:543 slice and the
        exported input signature suggest - TODO confirm against callers.
        """
        # Boolean-mask indexing flattens the tensor, so mean/std are single
        # scalars over all non-NaN coordinates.
        xyz = xyz - xyz[~torch.isnan(xyz)].mean(0,keepdim=True) # normalisation to common mean
        xyz = xyz / xyz[~torch.isnan(xyz)].std(0, keepdim=True)

        # Landmark indices of the 40 lip points that are kept.
        LIP = [
            61, 185, 40, 39, 37, 0, 267, 269, 270, 409,
            291, 146, 91, 181, 84, 17, 314, 405, 321, 375,
            78, 191, 80, 81, 82, 13, 312, 311, 310, 415,
            95, 88, 178, 87, 14, 317, 402, 318, 324, 308,
        ]
        #LHAND = np.arange(468, 489).tolist()
        #RHAND = np.arange(522, 543).tolist()

        # Hands are taken with plain slices (21 landmarks each).
        lip = xyz[:, LIP]
        lhand = xyz[:, 468:489]
        rhand = xyz[:, 522:543]
        xyz = torch.cat([  # (none, 82, 3)
            lip,
            lhand,
            rhand,
        ], 1)
        # Missing landmarks (NaN) become 0 after normalisation.
        xyz[torch.isnan(xyz)] = 0
        # Keep at most max_length frames.
        x = xyz[:self.max_length]
        return x


#overwrite the model used in training ....

# export-friendly attention: explicit matmuls, CLS query only
class MultiHeadAttention(nn.Module):
    """Single-sequence self-attention that only computes the first (CLS) row.

    Reuses the parameters of an ``nn.MultiheadAttention`` (same state-dict
    keys as the training module) but spells the attention out with plain
    linear/matmul/softmax ops and no mask. All sizes are read from the
    wrapped module, so any ``embed_dim`` / ``num_head`` combination that
    ``nn.MultiheadAttention`` accepts works.
    """

    def __init__(self,
            embed_dim,
            num_head,
            batch_first,
        ):
        super().__init__()
        self.mha = nn.MultiheadAttention(
            embed_dim,
            num_heads=num_head,
            bias=True,
            add_bias_kv=False,
            kdim=None,
            vdim=None,
            dropout=0.0,
            batch_first=batch_first,
        )

    #https://github.com/pytorch/text/blob/60907bf3394a97eb45056a237ca0d647a6e03216/torchtext/modules/multiheadattention.py#L5
    def forward(self, x):
        """Attend from the first token of ``x`` to every token.

        Args:
            x: unbatched sequence of shape (L, embed_dim).

        Returns:
            Tensor of shape (1, embed_dim): the attention output for x[0],
            equal to ``self.mha(x, x, x)[0][:1]``.
        """
        mha = self.mha
        embed_dim = mha.embed_dim
        num_head = mha.num_heads
        head_dim = mha.head_dim

        # in_proj_weight / in_proj_bias stack the q, k and v projections
        # along dim 0, each block being embed_dim rows.
        weight, bias = mha.in_proj_weight, mha.in_proj_bias
        q = F.linear(x[:1], weight[:embed_dim], bias[:embed_dim])  # since we need only cls
        k = F.linear(x, weight[embed_dim:2 * embed_dim], bias[embed_dim:2 * embed_dim])
        v = F.linear(x, weight[2 * embed_dim:], bias[2 * embed_dim:])

        q = q.reshape(-1, num_head, head_dim).permute(1, 0, 2)  # (H, 1, D)
        k = k.reshape(-1, num_head, head_dim).permute(1, 2, 0)  # (H, D, L)
        v = v.reshape(-1, num_head, head_dim).permute(1, 0, 2)  # (H, L, D)

        dot  = torch.matmul(q, k) * (1 / head_dim ** 0.5)       # (H, 1, L)
        attn = F.softmax(dot, -1)                               # (H, 1, L)
        out  = torch.matmul(attn, v)                            # (H, 1, D)
        out  = out.permute(1, 0, 2).reshape(-1, embed_dim)      # (1, embed_dim)
        out  = F.linear(out, mha.out_proj.weight, mha.out_proj.bias)
        return out

# remove mask
class TransformerBlock(nn.Module):
    """Mask-free pre-norm transformer block that keeps only the first token.

    Args:
        embed_dim: width of the residual stream.
        num_head: number of attention heads.
        out_dim: hidden width of the feed-forward MLP.
        batch_first: forwarded to the attention module.
    """

    def __init__(self,
        embed_dim,
        num_head,
        out_dim,
        batch_first=True,
    ):
        super().__init__()
        self.attn  = MultiHeadAttention(embed_dim, num_head,batch_first)
        self.ffn   = FeedForward(embed_dim, out_dim)
        self.norm1 = nn.LayerNorm(embed_dim)
        # norm2 is applied to the residual stream, whose width is embed_dim.
        # Sizing it with out_dim (the MLP hidden width) only worked when the
        # two happened to be equal.
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        """Return the block output for the first row of ``x`` only.

        The attention module returns the result for the first token, so the
        residual is taken against ``x[:1]`` and later rows are dropped.
        """
        x = x[:1] + self.attn((self.norm1(x)))
        x = x + self.ffn((self.norm2(x)))
        return x

class SingleNet(nn.Module):
    """Single-video variant of the classifier, intended for ONNX/TFLite export.

    Takes one already-preprocessed video (no batch dimension, no mask) and
    returns raw logits; unlike ``Net.forward`` no softmax or loss is computed.
    """

    def __init__(self, num_class=num_class):
        super().__init__()
        # Fixed architecture settings for the exported model.
        self.num_block = 1
        self.embed_dim = 1024
        self.num_head  = 8
        self.max_length = max_length
        self.num_point = num_point

        # Positional table is a learnable parameter initialised from the
        # sinusoidal encoding.
        pos_embed = positional_encoding(max_length, self.embed_dim)
        self.pos_embed = nn.Parameter(pos_embed)

        self.cls_embed = nn.Parameter(torch.zeros((1, self.embed_dim)))
        self.x_embed = nn.Sequential(
            nn.Linear(num_point * 3, self.embed_dim, bias=False),
        )

        self.encoder = nn.ModuleList([
            TransformerBlock(
                self.embed_dim,
                self.num_head,
                self.embed_dim,
                batch_first=False
            ) for i in range(self.num_block)
        ])
        self.logit = nn.Linear(self.embed_dim, num_class)

    def forward(self, xyz):
        """Classify one video.

        Args:
            xyz: tensor whose first axis is frames; the remaining axes are
                flattened to ``num_point * 3`` features per frame.
                NOTE(review): presumably at most ``max_length`` frames,
                otherwise the positional slice is shorter than the input.

        Returns:
            Logits of shape (1, num_class).
        """
        L = xyz.shape[0]
        # Flatten (landmark, coordinate) into one feature vector per frame.
        x_embed = self.x_embed(xyz.flatten(1)) 
        x = x_embed[:L] + self.pos_embed[:L]
        # Prepend the CLS token along the sequence axis.
        x = torch.cat([
            self.cls_embed,
            x
        ],0)
        #x = x.unsqueeze(1)

        # Only the first encoder block is applied (the loop was removed for
        # export), so num_block > 1 would not take effect here.
        #for block in self.encoder: x = block(x) #remove tflite loop
        x = self.encoder[0](x)
        cls = x[[0]]
        logit = self.logit(cls)
        return logit
    

In [3]:
#pytorch to onnx to tflite
# Disabled (`if 0`) conversion pipeline: PyTorch -> ONNX -> TF SavedModel -> TFLite.
# NOTE(review): fold_dir, input_net, single_net, num_landmark, onnx, onnxsim,
# prepare and tf are not defined in this file as shown - they must be provided
# before enabling this cell.
if 0:
    
    name='transformer-pool-2b' 
    # Intermediate and final artefact paths.
    input_onnx_file   = f'{fold_dir}/{name}.input.onnx'
    single_onnx_file  = f'{fold_dir}/{name}.single.onnx' 
    input_tf_file    = f'{fold_dir}/input_tf'
    single_tf_file   = f'{fold_dir}/single_tf'
    tf_file     = f'{fold_dir}/tf'
    tflite_file = f'{fold_dir}/{name}-{max_length}.tflite'

    def run_convert_onnx(): 
        # Step 1: export the preprocessing net and the classifier to ONNX,
        # each with a dynamic first ('length') axis on its input.
        if 1:
            torch.onnx.export(
                input_net,
                #torch.jit.script(input_net),
                #torch.jit.trace(input_net, torch.zeros(100,num_landmark,3)),          # model being run 
                torch.zeros((100,num_landmark,3)), # model input (or a tuple for multiple inputs)
                input_onnx_file,             # where to save the model (can be a file or file-like object)
                export_params = True,        # store the trained parameter weights inside the model file
                opset_version = 12,          # the ONNX version to export the model to
                do_constant_folding=True,    # whether to execute constant folding for optimization 
                input_names =  ['inputs'],    # the model's input names
                output_names = ['outputs'],   # the model's output names
                dynamic_axes={
                    'inputs': {0: 'length'},
                    #'output': {0: 'length'},
                },
                #verbose = True,
            )
            torch.onnx.export(
                single_net,         
                #torch.jit.script(single_net),
                #torch.jit.trace(single_net, torch.zeros(max_length,82,3)),           

                torch.zeros((max_length,82,3)), 
                single_onnx_file,             
                export_params = True,         
                opset_version = 12, 
                do_constant_folding=True,      
                input_names =  ['inputs'],     
                output_names = ['outputs'],  
                dynamic_axes={
                    'inputs': {0: 'length'},
                },
                #verbose = True,
            )
            print('torch.onnx.export() passed !!')

        # Step 2: validate and simplify both ONNX files in place.
        if 1:
            for f in [input_onnx_file, single_onnx_file]:
                if f is None: continue
                model = onnx.load(f)
                onnx.checker.check_model(model)
                # NOTE: the `check` flag returned by simplify() is not inspected.
                model_simple, check = onnxsim.simplify(model)
                onnx.save(model_simple, f)
            print('onnx simplify() passed !!')


    def run_convert_tflite():
        # Step 3: convert each ONNX graph to a TF SavedModel.
        if 1:
            tf_rep = prepare(onnx.load(input_onnx_file))
            tf_rep.export_graph(input_tf_file) 
            tf_rep = prepare(onnx.load(single_onnx_file))
            tf_rep.export_graph(single_tf_file) 
            print('tf_rep.export_graph() passed !!')

        # Step 4: chain preprocessing + classifier in one tf.Module and save
        # it with a 'serving_default' signature taking a [None, 543, 3] input.
        if 1:
            class TFModel(tf.Module):
                def __init__(self):
                    super(TFModel, self).__init__()
                    self.input  = tf.saved_model.load(input_tf_file)
                    self.single = tf.saved_model.load(single_tf_file)
                    self.input.trainable = False
                    self.single.trainable = False

                @tf.function(input_signature=[
                    tf.TensorSpec(shape=[None, 543, 3], dtype=tf.float32, name='inputs')
                ])
                def call(self, input):
                    y = {}
                    x = self.input(**{'inputs': input})['outputs']
                    # [0] drops the leading axis of the classifier output.
                    y['outputs'] = self.single(**{'inputs': x})['outputs'][0]
                    return y

            tfmodel = TFModel()
            tf.saved_model.save(tfmodel, tf_file, signatures={'serving_default': tfmodel.call})
            print('tf.saved_model() passed !!')

        # Step 5: convert the combined SavedModel to a .tflite flatbuffer.
        if 1:
            converter = tf.lite.TFLiteConverter.from_saved_model(tf_file)
            # converter.target_spec.supported_ops = [
            #     tf.lite.OpsSet.TFLITE_BUILTINS,  # enable TensorFlow Lite ops.
            #     tf.lite.OpsSet.SELECT_TF_OPS  # enable TensorFlow ops.
            # ]
            # converter.optimizations = [tf.lite.Optimize.DEFAULT]
            #converter.allow_custom_ops = True
            #converter.experimental_new_converter = True 
            tf_lite_model = converter.convert()
            with open(tflite_file, 'wb') as f:
                f.write(tf_lite_model)
            print('tflite convert() passed !!')
 
    run_convert_onnx()
    run_convert_tflite()
    

In [4]:
#submission
#tflite_file = '/kaggle/input/asl-demo/transformer-pool-2b.tflite'   #max_length =180
#tflite_file = '/kaggle/input/asl-demo/transformer-pool-2b-96.tflite' #max_length =96 
#tflite_file = '/kaggle/input/asl-demo/transformer-pool-2c-512-80-fixed-int8.tflite'

#tflite_file = '/kaggle/input/asl-demo/transfomer-60-256-lip-hand-my-part-3a-int8.tflite'
#tflite_file = '/kaggle/input/asl-demo/run10-fold1-swa-transfomer-60-512-lip-hand-crop-center-00a-int8.tflite'
#tflite_file = '/kaggle/input/asl-demo/run15.tflite'
# TFLite model that is copied to model.tflite and zipped for submission.
tflite_file = '/kaggle/input/asl-demo/run20-aug3-xyz2.tflite'




# 'debug'  : additionally install/import tflite_runtime and run local validation
# 'submit' : skip validation and only package the model
mode = 'submit' #debug #submit



import pandas as pd
import numpy as np
import os
import shutil
from datetime import datetime
from timeit import default_timer as timer


if mode in ['debug']:  
    try:
        import tflite_runtime
    except:
        !pip install tflite-runtime

    import tflite_runtime.interpreter as tflite   
    import tflite_runtime
    print(tflite_runtime.__version__)
    #'2.11.0'
    
    #import tensorflow as tf
    #print(tf.__version__)
    # 2.11.0

print('import ok')
'''
Your model must also require less than 40 MB in memory and 
perform inference with less than 100 milliseconds of latency per video. 
Expect to see approximately 40,000 videos in the test set. 
We allow an additional 10 minute buffer for loading the data and miscellaneous overhead.

'''
def time_to_str(t, mode='min'):
    """Format a duration given in seconds.

    Args:
        t: duration in seconds; truncated to an integer first.
        mode: 'min' for an "hr / min" string, 'sec' for a "min / sec" string.

    Raises:
        NotImplementedError: for any other mode.
    """
    if mode not in ('min', 'sec'):
        raise NotImplementedError

    whole_seconds = int(t)
    if mode == 'sec':
        minutes, seconds = divmod(whole_seconds, 60)
        return '%2d min %02d sec' % (minutes, seconds)

    # 'min' mode: work in (fractional) minutes; %d truncates on formatting.
    hours, minutes = divmod(whole_seconds / 60, 60)
    return '%2d hr %02d min' % (hours, minutes)

        
ROWS_PER_FRAME = 543
def load_relevant_data_subset(pq_path):
    """Read the x/y/z columns of a parquet file as a float32 array.

    The rows are regrouped into frames of ROWS_PER_FRAME rows each, giving
    an array of shape (n_frames, ROWS_PER_FRAME, 3).
    """
    columns = ['x', 'y', 'z']
    table = pd.read_parquet(pq_path, columns=columns)
    frame_count = int(len(table) / ROWS_PER_FRAME)
    target_shape = (frame_count, ROWS_PER_FRAME, len(columns))
    landmarks = table.values.reshape(*target_shape)
    return landmarks.astype(np.float32)

# Local validation: run the TFLite model on (up to) the first 4000 fold-2
# videos and report timing plus top-1..top-5 accuracy.
if mode in ['debug']:

    interpreter = tflite.Interpreter(tflite_file)
    prediction_fn = interpreter.get_signature_runner('serving_default')

    valid_df = pd.read_csv('/kaggle/input/asl-demo/train_prepared.csv')
    valid_df = valid_df[valid_df.fold==2].reset_index(drop=True)
    valid_df = valid_df[:4_000]
    valid_num = len(valid_df)
    valid = {
        'sign':[],
    }

    start_timer = timer()
    for t, d in valid_df.iterrows():

        pq_file = f'/kaggle/input/asl-signs/{d.path}'
        #print(pq_file)
        xyz = load_relevant_data_subset(pq_file)

        output = prediction_fn(inputs=xyz)
        p = output['outputs'].reshape(-1)

        valid['sign'].append(p)

        #---
        # progress line every 100 videos
        if t%100==0:
            elapsed = timer() - start_timer
            print('\r %8d / %d  %s'%(t,valid_num,time_to_str(elapsed,'sec')),end='',flush=True)

    # Measure the total after the loop: taking it from the last progress
    # update under-reported the time (it missed up to 99 videos) and left the
    # name undefined when there was nothing to iterate.
    time_taken = timer() - start_timer
    print('\n')


    truth = valid_df.label.values
    sign  = np.stack(valid['sign'])
    # Classes sorted by descending score; column k is the (k+1)-th guess.
    predict = np.argsort(-sign, -1)
    correct = predict==truth.reshape(valid_num,1)
    # Cumulative hits along the ranking give top-1..top-5 accuracy.
    topk = correct.cumsum(-1).mean(0)[:5]


    print(f'time_taken = {time_to_str(time_taken,"sec")}')
    print(f'time_taken for LB = {time_taken*1000/valid_num:05f} msec\n')
    for i in range(5):
        print(f'topk[{i}] = {topk[i]}')
    print('----- end -----\n')




# Package the chosen model: copy it to model.tflite in the working directory
# and zip it as submission.zip.
shutil.copyfile(tflite_file, 'model.tflite') 
!zip submission.zip  'model.tflite'
# List the working directory so the produced files show up in the cell output.
!ls

print('tflite_file:', tflite_file)
print(f'submit ok')

# '''

# 2.11.0
# import ok

# ######################################################
# embed_dim = 1024
# max_length=180

#      7900 / 8000   7 min 49 sec
# time_taken =  7 min 49 sec
# time_taken for LB = 58.693773 msec

# topk[0] = 0.588625
# topk[1] = 0.702
# topk[2] = 0.755375
# topk[3] = 0.785375
# topk[4] = 0.804125

 
# ----- end -----

# updating: model.tflite (deflated 8%)
# __notebook_source__.ipynb  model.tflite  submission.zip
# submit ok



# ######################################################
# embed_dim = 1024
# max_length=96

# import ok
#      7900 / 8000   6 min 23 sec

# time_taken =  6 min 23 sec
# time_taken for LB = 47.972998 msec

# topk[0] = 0.58425
# topk[1] = 0.696375
# topk[2] = 0.748125
# topk[3] = 0.77825
# topk[4] = 0.797125



# ######################################################
# embed_dim = 512
# max_length = 80

# 2.11.0
# import ok
#      7900 / 8000   5 min 44 sec

# time_taken =  5 min 44 sec
# time_taken for LB = 43.067440 msec

# topk[0] = 0.57525
# topk[1] = 0.690625
# topk[2] = 0.74
# topk[3] = 0.77175
# topk[4] = 0.79375
# ----- end -----

# transformer-pool-2c-512-80-cut.tflite

# time_taken =  4 min 53 sec
# time_taken for LB = 36.710013 msec

# topk[0] = 0.574875
# topk[1] = 0.69025
# topk[2] = 0.73975
# topk[3] = 0.7715
# topk[4] = 0.79375

# ######################################################
# ldd --version | head -n1

# 2.11.0
# import ok
#     17600 / 17670   9 min 39 sec

# time_taken =  9 min 39 sec
# time_taken for LB = 32.815303 msec

# topk[0] = 0.5780418788907753
# topk[1] = 0.6922467458970005
# topk[2] = 0.7432937181663837
# topk[3] = 0.7735144312393888
# topk[4] = 0.7942275042444822
# ----- end -----

# updating: model.tflite (deflated 8%)
# __notebook_source__.ipynb  model.tflite  submission.zip
# submit ok

# ---
# int8

#    17600 / 17670  11 min 34 sec

# time_taken = 11 min 34 sec
# time_taken for LB = 39.286630 msec

# topk[0] = 0.5782116581777024
# topk[1] = 0.6921335597057159
# topk[2] = 0.7434069043576683
# topk[3] = 0.7740237691001698
# topk[4] = 0.7953027730616865
# '''

# import ok
#     17600 / 17670   9 min 48 sec

# time_taken =  9 min 48 sec
# time_taken for LB = 33.307542 msec

# topk[0] = 0.5782116581777024
# topk[1] = 0.6921335597057159
# topk[2] = 0.7434634974533108
# topk[3] = 0.7740237691001698
# topk[4] = 0.7951895868704019
# ----- end -----

#   adding: model.tflite (deflated 15%)
# __notebook_source__.ipynb  model.tflite  submission.zip
# tflite_file: /kaggle/input/asl-demo/transformer-pool-2c-512-80-fixed-int8.tflite
# submit ok


import ok
  adding: model.tflite (deflated 13%)
__notebook__.ipynb  model.tflite  submission.zip
tflite_file: /kaggle/input/asl-demo/run20-aug3-xyz2.tflite
submit ok
