In [None]:
pip install python-levenshtein tqdm pyarrow wandb

In [None]:
import tensorflow as tf
import os
import numpy as np
import random
import json
import math
import pickle
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import tensorflow_addons as tfa
import Levenshtein as lev
import pyarrow.parquet as pq
from tqdm.auto import tqdm

# Set random seeds for reproducibility
SEED = 42


def set_seeds(seed=SEED):
    """Seed every random number generator used in this notebook.

    Exports ``PYTHONHASHSEED`` and applies the same seed to Python's
    ``random`` module, TensorFlow's global generator and NumPy's global
    generator.

    Args:
        seed: Integer seed shared by all generators. Defaults to ``SEED``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_generator in (random.seed, tf.random.set_seed, np.random.seed):
        seed_generator(seed)


set_seeds()

# Landmark indices and configurations
LPOSE = [13, 15, 17, 19, 21]
RPOSE = [14, 16, 18, 20, 22]
POSE = [*LPOSE, *RPOSE]
FRAME_LEN = 128  # Reduced from 384 to match competition
CHANNELS = 384  # Model dimension

# Feature column names. For every axis the order is: 21 right-hand columns,
# 21 left-hand columns, then one column per selected pose landmark.
_LANDMARK_GROUPS = (
    ('right_hand', range(21)),
    ('left_hand', range(21)),
    ('pose', POSE),
)
X, Y, Z = (
    [f'{axis}_{group}_{idx}' for group, ids in _LANDMARK_GROUPS for idx in ids]
    for axis in 'xyz'
)
SEL_COLS = [*X, *Y, *Z]

# Positions of each landmark group inside X (and therefore inside Y and Z,
# which share the same ordering).
RHAND_IDX, LHAND_IDX, POSE_IDX = (
    [position for position, column in enumerate(X) if tag in column]
    for tag in ('right_hand', 'left_hand', 'pose')
)

# Special tokens and the vocabulary indices reserved for them
pad_token, start_token, end_token = 'P', '<', '>'
pad_token_idx, start_token_idx, end_token_idx = 59, 60, 61

# Training configuration
BATCH_SIZE = 4 * 64  # Per-replica batch size
MAX_LEN = 340  # Maximum sequence length
DEBUG = False  # Debug mode flag
N_EPOCHS = 1500
N_WARMUP_EPOCHS = 5
LR_MAX = 1e-3
WD_RATIO = 0.05

# Configure TPU if available
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
    print("Using TPU:", tpu.master())
except Exception:
    # Any failure to resolve or initialise a TPU means "no TPU here": fall
    # back to the default strategy. `Exception` (rather than a bare except)
    # keeps KeyboardInterrupt and SystemExit propagating.
    strategy = tf.distribute.get_strategy()
    print("Using default strategy:", strategy)

print("Number of replicas:", strategy.num_replicas_in_sync)

# Adjust batch size based on strategy: BATCH_SIZE is per replica, so the
# global batch is that value multiplied by the number of replicas.
GLOBAL_BATCH_SIZE = BATCH_SIZE * strategy.num_replicas_in_sync

# Load character mapping
def load_character_map(vocab_path):
    """Load the character vocabulary and build both lookup directions.

    Args:
        vocab_path: Path to a JSON file containing a character -> index
            object.

    Returns:
        A ``(char_to_num, num_to_char)`` tuple. ``char_to_num`` is the loaded
        mapping extended with the pad, start and end tokens at their reserved
        indices; ``num_to_char`` is its inverse.
    """
    # JSON text is UTF-8 by specification, so do not depend on the locale's
    # default encoding when reading it.
    with open(vocab_path, "r", encoding="utf-8") as f:
        char_to_num = json.load(f)

    # Add special tokens (these overwrite any entry with the same key).
    char_to_num[pad_token] = pad_token_idx
    char_to_num[start_token] = start_token_idx
    char_to_num[end_token] = end_token_idx

    num_to_char = {index: char for char, index in char_to_num.items()}
    return char_to_num, num_to_char

# Data augmentation functions
def spatial_random_affine(
    xyz,
    scale=(0.8, 1.2),
    shear=(-0.15, 0.15),
    shift=(-0.1, 0.1),
    degree=(-30, 30),
):
    """Apply one random 2-D affine transform to the first two channels.

    The last axis of `xyz` is split into its first two entries (treated as
    x/y and transformed) and the remainder (passed through unchanged). A
    single scale, shear, rotation and shift is drawn per call and applied to
    the whole tensor, in that order.

    Args:
        xyz: Tensor whose last axis starts with the x and y coordinates.
        scale: ``(min, max)`` range of the uniform scale factor, or ``None``
            to skip scaling.
        shear: ``(min, max)`` range of the shear amount, or ``None`` to skip.
            The amount is applied to exactly one off-diagonal entry of the
            shear matrix, chosen by a fair coin flip.
        shift: ``(min, max)`` range of the offset added to both x and y, or
            ``None`` to skip.
        degree: ``(min, max)`` rotation range in degrees, or ``None`` to skip.

    Returns:
        Tensor with the same shape as `xyz`.
    """
    xy = xyz[..., :2]
    z = xyz[..., 2:]

    # Apply scale
    if scale is not None:
        xy = xy * tf.cast(tf.random.uniform((), *scale), xy.dtype)

    # Apply shear
    if shear is not None:
        shear_amount = tf.cast(tf.random.uniform((), *shear), xy.dtype)
        zero_out_x = tf.random.uniform(()) < 0.5
        zero = tf.zeros_like(shear_amount)
        one = tf.ones_like(shear_amount)
        # tf.where keeps the coin flip inside the graph; a Python `if` on a
        # tensor only works eagerly or after AutoGraph conversion.
        shear_x = tf.where(zero_out_x, zero, shear_amount)
        shear_y = tf.where(zero_out_x, shear_amount, zero)
        # The entries are tensors, so the matrix must be assembled with
        # tf.stack: tf.constant does not accept symbolic tensors and fails
        # when this function is traced (e.g. inside Dataset.map).
        shear_mat = tf.stack([tf.stack([one, shear_x]),
                              tf.stack([shear_y, one])])
        xy = tf.matmul(xy, shear_mat)

    # Apply rotation
    if degree is not None:
        angle = tf.random.uniform((), *degree) * math.pi / 180
        cos = tf.cast(tf.cos(angle), xy.dtype)
        sin = tf.cast(tf.sin(angle), xy.dtype)
        rot_mat = tf.stack([tf.stack([cos, -sin]),
                            tf.stack([sin, cos])])
        xy = tf.matmul(xy, rot_mat)

    # Apply shift (one scalar offset added to both x and y)
    if shift is not None:
        xy = xy + tf.cast(tf.random.uniform((), *shift), xy.dtype)

    return tf.concat([xy, z], axis=-1)

def temporal_mask(x, size=(0.2, 0.4), mask_value=float('nan')):
    """Overwrite one random contiguous run of frames with `mask_value`.

    Args:
        x: Tensor whose first axis is time; any number of trailing axes.
        size: ``(min, max)`` fraction of the sequence length to mask. The
            fraction is drawn uniformly and the frame count is truncated to
            an integer.
        mask_value: Value written into every element of the masked frames.

    Returns:
        Tensor with the same shape as `x` in which frames
        ``[offset, offset + mask_size)`` are replaced by `mask_value`.
    """
    x = tf.convert_to_tensor(x)
    length = tf.shape(x)[0]
    mask_fraction = tf.random.uniform((), *size)
    mask_size = tf.cast(tf.cast(length, tf.float32) * mask_fraction, tf.int32)

    # tf.random.uniform needs maxval > minval; clamp so that an empty
    # sequence (length == 0) does not make the draw invalid.
    offset_limit = tf.maximum(length - mask_size, 1)
    mask_offset = tf.random.uniform((), 0, offset_limit, dtype=tf.int32)

    frame_ids = tf.range(length)
    keep = tf.logical_or(frame_ids < mask_offset,
                         frame_ids >= mask_offset + mask_size)

    # Reshape the per-frame mask to (T, 1, ..., 1) so it broadcasts along the
    # time axis for any input rank. A fixed [:, None, None] is only correct
    # for rank-3 input; on rank-2 input it would broadcast to (T, T, F).
    rank = x.shape.rank
    if rank is not None:
        keep_shape = [-1] + [1] * (rank - 1)
    else:
        keep_shape = tf.concat(
            [[-1], tf.ones([tf.rank(x) - 1], dtype=tf.int32)], axis=0)
    keep = tf.reshape(keep, keep_shape)

    return tf.where(keep, x, mask_value)

def augment_fn(x, always=False):
    """Randomly apply the spatial and temporal augmentations to `x`.

    Args:
        x: Landmark tensor for one sequence.
        always: When true, both augmentations are applied regardless of the
            random draws.

    Returns:
        The (possibly) augmented tensor. The affine transform is applied when
        its draw is below 0.75, the temporal mask when its draw is below 0.5.
    """
    spatial_draw = tf.random.uniform(())
    if spatial_draw < 0.75 or always:
        x = spatial_random_affine(x)

    temporal_draw = tf.random.uniform(())
    if temporal_draw < 0.5 or always:
        x = temporal_mask(x)

    return x

# Model architecture components
class SqueezeformerBlock(tf.keras.layers.Layer):
    """Encoder block made of four pre-norm residual sub-modules.

    In call order: feed-forward, multi-head self-attention, gated
    convolution module, feed-forward. Each sub-module normalises its input
    with LayerNormalization and adds its output back onto the input.

    Args:
        dim: Channel width of the block's input and output.
        num_heads: Number of attention heads; each head uses
            ``dim // num_heads`` key dimensions.
        dropout_rate: Dropout rate used in attention, both feed-forward
            stacks and after the convolution module.
    """

    def __init__(self, dim, num_heads=8, dropout_rate=0.1):
        super().__init__()
        self.dim = dim

        # Self attention
        self.norm1 = self._make_norm()
        self.mhsa = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=dim // num_heads,
            dropout=dropout_rate,
        )

        # First feedforward
        self.ff1_norm = self._make_norm()
        self.ff1 = self._make_ffn(dim, dropout_rate)

        # Convolution module: pointwise expansion to 2*dim, GLU gate back to
        # dim, depthwise conv (kernel 3), batch norm, pointwise projection.
        self.conv_norm = self._make_norm()
        self.conv1 = tf.keras.layers.Conv1D(dim * 2, 1)
        self.glu = tf.keras.layers.Lambda(
            lambda t: t[:, :, :dim] * tf.nn.sigmoid(t[:, :, dim:])
        )
        self.depthwise_conv = tf.keras.layers.DepthwiseConv1D(3, padding='same')
        self.batch_norm = tf.keras.layers.BatchNormalization()
        self.pointwise_conv = tf.keras.layers.Conv1D(dim, 1)
        self.conv_dropout = tf.keras.layers.Dropout(dropout_rate)

        # Second feedforward
        self.ff2_norm = self._make_norm()
        self.ff2 = self._make_ffn(dim, dropout_rate)

    @staticmethod
    def _make_norm():
        """Return a LayerNormalization layer with the block's epsilon."""
        return tf.keras.layers.LayerNormalization(epsilon=1e-6)

    @staticmethod
    def _make_ffn(dim, dropout_rate):
        """Return the Dense(4*dim, swish) -> Dense(dim) stack with dropout."""
        return tf.keras.Sequential([
            tf.keras.layers.Dense(dim * 4, activation='swish'),
            tf.keras.layers.Dropout(dropout_rate),
            tf.keras.layers.Dense(dim),
            tf.keras.layers.Dropout(dropout_rate),
        ])

    def call(self, x, training=False):
        """Run the four residual sub-modules on `x` and return the result."""
        # First feedforward
        x = x + self.ff1(self.ff1_norm(x), training=training)

        # Self attention (query and value are the same normalised tensor)
        attn_input = self.norm1(x)
        x = x + self.mhsa(attn_input, attn_input, training=training)

        # Convolution module
        conv_branch = self.conv_norm(x)
        conv_branch = self.conv1(conv_branch)
        conv_branch = self.glu(conv_branch)
        conv_branch = self.depthwise_conv(conv_branch)
        conv_branch = self.batch_norm(conv_branch, training=training)
        conv_branch = self.pointwise_conv(conv_branch)
        conv_branch = self.conv_dropout(conv_branch, training=training)
        x = x + conv_branch

        # Second feedforward
        x = x + self.ff2(self.ff2_norm(x), training=training)

        return x

class ASLModel(tf.keras.Model):
    """Frame-wise classifier: input projection, encoder stack, dense head.

    Args:
        num_classes: Size of the per-frame output (logit) dimension.
        dim: Channel width of the encoder.
        num_layers: Number of ``SqueezeformerBlock`` layers.
        dropout_rate: Dropout rate passed to every encoder block.
    """

    def __init__(self, num_classes=62, dim=384, num_layers=4, dropout_rate=0.1):
        super().__init__()
        self.dim = dim

        # Input processing
        self.input_projection = tf.keras.layers.Dense(dim, use_bias=False)
        self.input_norm = tf.keras.layers.BatchNormalization()

        # Feature extractors. These three layers are created but never used
        # by call(); they are kept so existing attribute access keeps working.
        self.face_projection = tf.keras.layers.Dense(dim//4)
        self.pose_projection = tf.keras.layers.Dense(dim//4)
        self.hand_projection = tf.keras.layers.Dense(dim//2)

        # Encoder layers
        self.encoder_layers = [
            SqueezeformerBlock(dim, dropout_rate=dropout_rate)
            for _ in range(num_layers)
        ]

        # Output layers
        self.pre_classifier = tf.keras.layers.Dense(dim*2, activation='relu')
        self.dropout = tf.keras.layers.Dropout(0.5)
        self.classifier = tf.keras.layers.Dense(num_classes)

    def call(self, inputs, training=False):
        """Return per-frame logits for a batch of feature sequences.

        Args:
            inputs: Float tensor of per-frame features; NaN entries are
                treated as missing and replaced by zero.
            training: Forwarded to batch-norm and dropout layers.

        Returns:
            Tensor of logits whose last axis has ``num_classes`` entries.
        """
        # Missing landmarks and frames blanked by temporal_mask arrive as
        # NaN. A single NaN would propagate through the dense projection and
        # the batch-norm statistics, so zero-fill before any computation.
        inputs = tf.where(tf.math.is_nan(inputs), tf.zeros_like(inputs), inputs)

        # Project and normalize input
        x = self.input_projection(inputs)
        x = self.input_norm(x, training=training)

        # Process through encoder layers
        for encoder_layer in self.encoder_layers:
            x = encoder_layer(x, training=training)

        # Final classification
        x = self.pre_classifier(x)
        x = self.dropout(x, training=training)
        x = self.classifier(x)

        return x

# Data processing functions
def process_landmarks(data):
    """Pick one hand plus the pose columns and standardise them per frame.

    Columns are selected from axis 1 of `data` using RHAND_IDX, LHAND_IDX
    and POSE_IDX. The hand whose columns contain fewer NaN values over the
    whole sequence is kept (ties keep the right hand). Hand and pose values
    are then centred and scaled by the per-frame mean and standard deviation
    of the chosen hand's columns.

    NOTE(review): the index lists are positions within X, so on a tensor
    laid out like SEL_COLS only the x-coordinate columns are gathered;
    confirm whether dropping y/z is intended.
    NOTE(review): the mean/std reductions are not NaN-aware, so a frame
    whose hand columns contain NaN gets NaN statistics; confirm this is the
    intended handling of missing landmarks.

    Args:
        data: 2-D tensor of shape (frames, columns).

    Returns:
        Tensor of shape (frames, hand columns + pose columns).
    """
    def nan_count(values):
        return tf.reduce_sum(tf.cast(tf.math.is_nan(values), tf.float32))

    right_hand, left_hand, pose = (
        tf.gather(data, column_ids, axis=1)
        for column_ids in (RHAND_IDX, LHAND_IDX, POSE_IDX)
    )

    # Use hand with fewer NaN values
    right_is_worse = nan_count(right_hand) > nan_count(left_hand)
    hand = tf.cond(right_is_worse, lambda: left_hand, lambda: right_hand)

    # Per-frame statistics of the selected hand; a near-zero spread is
    # replaced by 1.0 so the division below stays finite.
    center = tf.reduce_mean(hand, axis=1, keepdims=True)
    spread = tf.math.reduce_std(hand, axis=1, keepdims=True)
    spread = tf.where(spread < 1e-6, 1.0, spread)

    normalized_hand = (hand - center) / spread
    normalized_pose = (pose - center) / spread

    return tf.concat([normalized_hand, normalized_pose], axis=1)

def create_tf_dataset(file_paths, char_to_num, batch_size=BATCH_SIZE, is_training=False):
    """Create a batched TensorFlow dataset from GZIP TFRecord files.

    Each record is expected to hold a flat float ``frames`` feature (reshaped
    to ``(-1, len(SEL_COLS))``) and a string ``phrase`` feature.

    Args:
        file_paths: TFRecord file names.
        char_to_num: Character -> index mapping used to encode phrases;
            unknown characters map to ``pad_token_idx``.
        batch_size: Number of examples per batch.
        is_training: When true, records are shuffled and frames augmented.

    Returns:
        A ``tf.data.Dataset`` of ``(frames, phrase)`` batches. Frames are
        zero-padded along unknown dimensions to the longest example in the
        batch; phrases are padded or truncated to ``MAX_LEN`` tokens.
    """
    # Create lookup table
    table = tf.lookup.StaticHashTable(
        tf.lookup.KeyValueTensorInitializer(
            keys=list(char_to_num.keys()),
            values=list(char_to_num.values()),
        ),
        default_value=pad_token_idx
    )

    feature_description = {
        'frames': tf.io.VarLenFeature(tf.float32),
        'phrase': tf.io.FixedLenFeature([], tf.string)
    }

    def parse_tfrecord(example_proto):
        parsed_features = tf.io.parse_single_example(example_proto, feature_description)

        # Process frames
        frames = tf.sparse.to_dense(parsed_features['frames'])
        frames = tf.reshape(frames, [-1, len(SEL_COLS)])
        frames = process_landmarks(frames)

        if is_training:
            frames = augment_fn(frames)

        # Process phrase
        phrase = parsed_features['phrase']
        phrase = tf.strings.unicode_split(phrase, 'UTF-8')
        phrase = table.lookup(phrase)
        # Truncate first so the pad amount below can never be negative.
        phrase = phrase[:MAX_LEN]
        phrase = tf.pad(phrase, [[0, MAX_LEN - tf.shape(phrase)[0]]], constant_values=pad_token_idx)

        return frames, phrase

    # Create dataset
    dataset = tf.data.TFRecordDataset(file_paths, compression_type='GZIP')

    if is_training:
        dataset = dataset.shuffle(10000)

    dataset = dataset.map(parse_tfrecord, num_parallel_calls=tf.data.AUTOTUNE)

    # Sequences have different frame counts, so a plain batch() cannot stack
    # them. padded_batch pads every unknown dimension to the longest example
    # in the batch and is identical to batch() when all shapes already agree.
    frames_spec, phrase_spec = dataset.element_spec
    dataset = dataset.padded_batch(
        batch_size,
        padded_shapes=(frames_spec.shape, [MAX_LEN]),
        padding_values=(
            tf.zeros((), dtype=frames_spec.dtype),
            tf.constant(pad_token_idx, dtype=phrase_spec.dtype),
        ),
    )

    return dataset.prefetch(tf.data.AUTOTUNE)

# Loss and metrics functions
def CTCLoss(labels, logits):
    """Mean CTC loss over a batch.

    Args:
        labels: Integer label matrix padded with ``pad_token_idx``; every
            entry different from the pad index counts towards the label
            length.
        logits: Batch-major logits; axis 0 is the batch and axis 1 is time.
            Every example is given the full time length.

    Returns:
        Scalar tensor: the per-example CTC losses averaged over the batch,
        with ``pad_token_idx`` used as the blank class.
    """
    logits_shape = tf.shape(logits)
    batch_size, time_steps = logits_shape[0], logits_shape[1]

    is_real_token = tf.cast(labels != pad_token_idx, tf.int32)

    per_example_loss = tf.nn.ctc_loss(
        labels=tf.cast(labels, tf.int32),
        logits=logits,
        label_length=tf.reduce_sum(is_real_token, axis=-1),
        logit_length=tf.fill([batch_size], time_steps),
        blank_index=pad_token_idx,
        logits_time_major=False,
    )
    return tf.reduce_mean(per_example_loss)

def calculate_levenshtein(predictions, targets, num_to_char):
    """Calculate the normalized Levenshtein score of a set of predictions.

    Each id sequence is decoded to text by dropping the pad, start and end
    token ids and mapping the remaining ids through `num_to_char` (ids
    missing from the mapping contribute nothing).

    Args:
        predictions: Iterable of predicted id sequences.
        targets: Iterable of reference id sequences, paired with
            `predictions` in order.
        num_to_char: Mapping from token id to character.

    Returns:
        ``1 - total_edit_distance / total_target_characters``. When the
        targets contain no characters at all the ratio is undefined; in that
        case 1.0 is returned if every prediction is also empty, else 0.0.
    """
    special_ids = {pad_token_idx, start_token_idx, end_token_idx}

    def to_text(ids):
        return ''.join(num_to_char.get(i, '') for i in ids if i not in special_ids)

    total_distance = 0
    total_target_chars = 0
    for pred, target in zip(predictions, targets):
        pred_text = to_text(pred)
        target_text = to_text(target)
        total_distance += lev.distance(pred_text, target_text)
        total_target_chars += len(target_text)

    # Guard the division: empty input or all-empty targets would otherwise
    # raise ZeroDivisionError.
    if total_target_chars == 0:
        return 1.0 if total_distance == 0 else 0.0

    return 1 - (total_distance / total_target_chars)

# Learning rate schedule
def get_lr_schedule(initial_lr, warmup_epochs, total_epochs, steps_per_epoch):
    """Build a linear-warm-up, cosine-decay learning-rate function.

    Args:
        initial_lr: Peak learning rate reached at the end of warm-up.
        warmup_epochs: Number of epochs spent ramping up linearly from 0.
        total_epochs: Total number of epochs covered by the schedule.
        steps_per_epoch: Number of schedule steps per epoch. Pass 1 to get a
            schedule indexed by epoch instead of by optimizer step.

    Returns:
        A function ``lr_schedule(step) -> float``. Steps past the end of the
        schedule keep the final (zero) learning rate.
    """
    warmup_steps = warmup_epochs * steps_per_epoch
    total_steps = total_epochs * steps_per_epoch
    decay_steps = total_steps - warmup_steps

    def lr_schedule(step):
        # Linear warmup
        if step < warmup_steps:
            return initial_lr * (step / warmup_steps)

        # No room left for a decay phase: hold the peak rate rather than
        # dividing by zero.
        if decay_steps <= 0:
            return initial_lr

        # Cosine decay. Progress is clamped to 1 so the cosine cannot swing
        # back up if the schedule is queried past total_steps.
        progress = min((step - warmup_steps) / decay_steps, 1.0)
        # math.cos keeps the return type a plain float in both phases.
        return initial_lr * 0.5 * (1 + math.cos(math.pi * progress))

    return lr_schedule

# Custom callbacks
class LevenshteinCallback(tf.keras.callbacks.Callback):
    """Score the model on validation data every fifth epoch.

    The score is the normalized Levenshtein score returned by
    ``calculate_levenshtein``. It is written to the epoch logs under
    ``'levenshtein_score'``, and the model weights are saved whenever the
    score improves on the best value seen so far.

    Args:
        validation_data: Iterable of ``(inputs, labels)`` batches.
        num_to_char: Mapping from token id to character.
    """

    def __init__(self, validation_data, num_to_char):
        super().__init__()
        self.validation_data = validation_data
        self.num_to_char = num_to_char
        self.best_score = -float('inf')

    @staticmethod
    def _collapse_repeats(frame_ids):
        """Merge runs of identical consecutive ids into a single id."""
        ids = np.asarray(frame_ids)
        if ids.size == 0:
            return ids
        keep = np.concatenate(([True], ids[1:] != ids[:-1]))
        return ids[keep]

    def on_epoch_end(self, epoch, logs=None):
        # Calculate every 5 epochs to save time
        if (epoch + 1) % 5 != 0:
            return

        predictions = []
        targets = []

        for batch in self.validation_data:
            logits = self.model(batch[0], training=False)
            frame_ids = tf.argmax(logits, axis=-1).numpy()
            # Greedy CTC decoding: merge repeated frame labels here; the
            # blank (pad) id is dropped later by calculate_levenshtein.
            # Scoring the raw per-frame argmax would count every repeated
            # frame as an extra character.
            predictions.extend(self._collapse_repeats(row) for row in frame_ids)
            targets.extend(batch[1].numpy())

        score = calculate_levenshtein(predictions, targets, self.num_to_char)
        # Keras may invoke callbacks with logs=None.
        if logs is not None:
            logs['levenshtein_score'] = score

        if score > self.best_score:
            self.best_score = score
            self.model.save_weights(f'best_model_epoch_{epoch}.h5')

        print(f'\nLevenshtein Score: {score:.4f}')

# Training function
def train_model(strategy, train_dataset, val_dataset, num_to_char, config):
    """Build, compile and fit an ``ASLModel`` under `strategy`.

    Args:
        strategy: ``tf.distribute`` strategy whose scope is used to create
            the model and optimizer.
        train_dataset: Dataset of ``(frames, phrase)`` training batches.
        val_dataset: Dataset of ``(frames, phrase)`` validation batches.
        num_to_char: Token id -> character mapping; its size sets the number
            of output classes.
        config: Dict with keys ``dim``, ``num_layers``, ``dropout_rate``,
            ``initial_lr``, ``weight_decay``, ``warmup_epochs`` and
            ``epochs``.

    Returns:
        ``(model, history)``: the trained model and the Keras History object.
    """
    with strategy.scope():
        # Create model
        model = ASLModel(
            num_classes=len(num_to_char),
            dim=config['dim'],
            num_layers=config['num_layers'],
            dropout_rate=config['dropout_rate']
        )

        # Optimizer
        optimizer = tfa.optimizers.RectifiedAdam(
            learning_rate=config['initial_lr'],
            weight_decay=config['weight_decay'],
            sma_threshold=4.0
        )
        optimizer = tfa.optimizers.Lookahead(optimizer, sync_period=5)

        # Compile model
        model.compile(
            optimizer=optimizer,
            loss=CTCLoss,
            #run_eagerly=True  # For debugging
        )

    # Learning rate schedule.
    # LearningRateScheduler calls its schedule once per epoch with the epoch
    # index, so the schedule is built with one step per epoch. This also
    # avoids len(train_dataset), which raises for a TFRecord pipeline whose
    # cardinality is unknown. The index is shifted by one (and the horizon
    # extended by one) so that neither the first nor the last epoch runs at
    # a learning rate of exactly zero.
    step_schedule = get_lr_schedule(
        config['initial_lr'],
        config['warmup_epochs'],
        config['epochs'] + 1,
        1
    )

    def epoch_schedule(epoch, current_lr):
        # current_lr is supplied by Keras but not needed by this schedule.
        return float(step_schedule(epoch + 1))

    # Callbacks
    callbacks = [
        tf.keras.callbacks.LearningRateScheduler(epoch_schedule),
        tf.keras.callbacks.ModelCheckpoint(
            'checkpoint_epoch_{epoch}.h5',
            save_weights_only=True,
            save_freq='epoch'
        ),
        LevenshteinCallback(val_dataset, num_to_char),
        tf.keras.callbacks.TensorBoard(log_dir='./logs'),
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=20,
            restore_best_weights=True
        )
    ]

    # Train
    history = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=config['epochs'],
        callbacks=callbacks,
        verbose=1
    )

    return model, history

# Main execution
def main():
    """Train the model on the local TFRecords, plot curves, save weights.

    Reads the vocabulary from ``character_to_prediction_index.json`` and the
    records from ``train/*.tfrecord`` and ``val/*.tfrecord``, trains with
    ``train_model``, plots the loss and Levenshtein curves, and writes the
    final weights to ``final_model.h5``.

    Raises:
        FileNotFoundError: If either TFRecord pattern matches no files.
    """
    # Configuration
    config = {
        'dim': 384,
        'num_layers': 4,
        'dropout_rate': 0.1,
        'initial_lr': 1e-3,
        'weight_decay': 0.05,
        'warmup_epochs': 5,
        'epochs': 100,
        'batch_size': GLOBAL_BATCH_SIZE
    }

    # Load character mapping
    char_to_num, num_to_char = load_character_map('character_to_prediction_index.json')

    # Create TFRecord datasets
    train_files = sorted(tf.io.gfile.glob('train/*.tfrecord'))
    val_files = sorted(tf.io.gfile.glob('val/*.tfrecord'))

    # Fail early with a clear message instead of training on an empty
    # dataset when a pattern matches nothing.
    for pattern, files in (('train/*.tfrecord', train_files),
                           ('val/*.tfrecord', val_files)):
        if not files:
            raise FileNotFoundError(f'No TFRecord files match {pattern!r}')

    train_dataset = create_tf_dataset(
        train_files,
        char_to_num,
        batch_size=config['batch_size'],
        is_training=True
    )

    val_dataset = create_tf_dataset(
        val_files,
        char_to_num,
        batch_size=config['batch_size'],
        is_training=False
    )

    # Train model
    model, history = train_model(
        strategy,
        train_dataset,
        val_dataset,
        num_to_char,
        config
    )

    # Plot training history
    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'])

    plt.subplot(1, 2, 2)
    # The score is only logged on the epochs where LevenshteinCallback runs,
    # so the key is absent when training stops before the first evaluation.
    plt.plot(history.history.get('levenshtein_score', []))
    plt.title('Levenshtein Score')
    plt.ylabel('Score')
    plt.xlabel('Epoch')
    plt.show()

    # Save final model
    model.save_weights('final_model.h5')

if __name__ == "__main__":
    main()