In [9]:
# Video Snipping
# frame = int(timestamp_ms * fps / 1000)
# Vocab is a dictionary mapping words to class IDs.

import numpy as np

def pad_or_truncate(seq, max_len):
    """
    Pads or truncates an array along its first axis to exactly max_len entries.

    Works for any array with at least one dimension: (seq_len, feature_dim),
    (seq_len, n_points, 2), or a plain (seq_len,) vector. Only axis 0 changes;
    the trailing dimensions and the dtype are preserved.

    Args:
        seq: NumPy array whose first axis is the sequence axis.
        max_len: target length of the first axis; must be >= 0.

    Returns:
        An array of shape (max_len, *seq.shape[1:]). If seq already has the
        requested length it is returned as-is (same object); a truncated result
        is a slice of seq; a padded result is a new array with zeros appended
        at the end.

    Raises:
        ValueError: if max_len is negative.
    """
    # A negative max_len would otherwise fall into the truncation branch and
    # silently drop frames from the end instead of producing max_len rows.
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    seq_len = seq.shape[0]

    if seq_len == max_len:
        return seq
    if seq_len > max_len:
        return seq[:max_len]

    # Build the zero block with the same trailing shape so concatenation along
    # axis 0 is valid regardless of how many feature dimensions there are.
    pad_shape = (max_len - seq_len,) + tuple(seq.shape[1:])
    pad = np.zeros(pad_shape, dtype=seq.dtype)
    return np.concatenate([seq, pad], axis=0)

def convert_word_segments_to_frames(word_segments, fps):
    """
    Translates millisecond word boundaries into frame indices.

    word_segments: iterable of (start_ms, end_ms, word) triples
    fps: frame rate used for the conversion

    Returns a list of (start_frame, end_frame, word) triples in the same order.
    Each boundary is computed as int(ms * fps / 1000), so fractional frame
    positions are truncated toward zero.
    """
    def to_frame(timestamp_ms):
        return int(timestamp_ms * fps / 1000)

    return [
        (to_frame(begin_ms), to_frame(finish_ms), label)
        for begin_ms, finish_ms, label in word_segments
    ]

# Word -> class-ID lookup; IDs are assigned in listing order, starting at 0.
vocab = {
    word: class_id
    for class_id, word in enumerate(('set', 'white', 'with', 'p', 'four', 'please'))
}

def extract_word_clips_from_features(features_dim, features_2d1, features_2d2, frame_segments, vocab, max_len=30):
    """
    Cuts fixed-length per-word clips out of three frame-aligned feature streams.

    features_dim: array of shape (total_frames, dim_features)
    features_2d1: array of shape (total_frames, 2d1_features)
    features_2d2: array of shape (total_frames, 2d2_features)
    frame_segments: list of (start_frame, end_frame, word); end_frame is inclusive
    vocab: dict mapping words to class indices
    max_len: number of frames to pad/truncate each clip to

    Segments whose word is 'sil' or missing from vocab are skipped. A segment
    that runs past the end of the feature arrays yields a shorter (possibly
    empty) slice, which is then zero-padded like any other short clip.

    Returns (clips_dim, clips_2d1, clips_2d2, labels): each clips array has
    shape (n_clips, max_len, <features of that stream>) and labels is an int64
    array of shape (n_clips,). When no segment is kept, the clip arrays are
    empty but keep that same layout (n_clips == 0) instead of collapsing to a
    shapeless empty array.
    """
    streams = (features_dim, features_2d1, features_2d2)
    buckets = ([], [], [])
    labels = []

    for start_f, end_f, word in frame_segments:
        if word == 'sil' or word not in vocab:
            continue

        # Same inclusive frame window for every stream keeps the clips aligned.
        for stream, bucket in zip(streams, buckets):
            bucket.append(pad_or_truncate(stream[start_f:end_f + 1], max_len))
        labels.append(vocab[word])

    def _stack(bucket, stream):
        # Stack the collected clips; with nothing collected, np.array([]) would
        # have shape (0,), so build an explicitly shaped empty array instead.
        if bucket:
            return np.array(bucket)
        stream = np.asarray(stream)
        return np.zeros((0, max_len) + tuple(stream.shape[1:]), dtype=stream.dtype)

    clips_dim, clips_2d1, clips_2d2 = (
        _stack(bucket, stream) for bucket, stream in zip(buckets, streams)
    )
    # Fixed integer dtype so an empty label list does not come back as float64.
    return clips_dim, clips_2d1, clips_2d2, np.array(labels, dtype=np.int64)



In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class LipReadingModel(nn.Module):
    """
    Three-stream LSTM classifier.

    Each input stream is encoded by its own single-layer, batch-first LSTM.
    The final hidden states of the three encoders are concatenated and passed
    through a two-layer MLP that outputs one score per class.
    """

    def __init__(self, dim_input_size, twoD_input_size, hidden_size, num_classes):
        super().__init__()

        # The two 2D streams share the same input width; all encoders share hidden_size.
        self.lstm_dim = nn.LSTM(input_size=dim_input_size, hidden_size=hidden_size, batch_first=True)
        self.lstm_2d_1 = nn.LSTM(input_size=twoD_input_size, hidden_size=hidden_size, batch_first=True)
        self.lstm_2d_2 = nn.LSTM(input_size=twoD_input_size, hidden_size=hidden_size, batch_first=True)

        fused_width = 3 * hidden_size  # one hidden vector per stream
        self.fc = nn.Sequential(
            nn.Linear(fused_width, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes),
        )

    def forward(self, x_dim, x_2d_1, x_2d_2):
        """Returns class scores of shape (batch, num_classes) for (batch, sequence, feature) inputs."""
        encoder_inputs = (
            (self.lstm_dim, x_dim),
            (self.lstm_2d_1, x_2d_1),
            (self.lstm_2d_2, x_2d_2),
        )

        final_states = []
        for encoder, inputs in encoder_inputs:
            # Discard per-step outputs and the cell state; keep the last layer's final hidden state.
            _, (hidden, _) = encoder(inputs)
            final_states.append(hidden[-1])

        return self.fc(torch.cat(final_states, dim=1))
    
    

In [11]:
# Number of output classes; passed to LipReadingModel as num_classes in the next cell.
# NOTE(review): the sample `vocab` defined earlier has 6 entries — confirm every
# class ID used as a label is < NUM_CLASSES before training.
NUM_CLASSES = 8  # Example number of classes, adjust as needed

In [12]:
model = LipReadingModel(
    dim_input_size = 74,
    twoD_input_size = 148, # Assuming 74 landmarks with 2D coordinates (x, y) and velocities
    hidden_size = 148,  # Hidden size for LSTM layers (can be adjusted)
    num_classes = NUM_CLASSES  # TODO Define the number of classes for your task (words, phonemes, etc.)
)
num_epochs = 10
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# NOTE: `data_loader` must already exist when this cell runs. It is iterated as
# (x_dim_batch, x_2d_1_batch, x_2d_2_batch, y_batch) tuples; no cell shown in
# this notebook creates it, so build it before executing the loop below.

for epoch in range(num_epochs):
    model.train()
    loss_sum = 0.0
    samples_seen = 0

    for x_dim_batch, x_2d_1_batch, x_2d_2_batch, y_batch in data_loader:
        outputs = model(x_dim_batch, x_2d_1_batch, x_2d_2_batch)
        loss = criterion(outputs, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # CrossEntropyLoss averages over the batch by default, so weight by the
        # batch size to get an exact per-sample mean even if the last batch is smaller.
        batch_size = outputs.size(0)
        loss_sum += loss.item() * batch_size
        samples_seen += batch_size

    # Without this guard an empty loader would surface as an unrelated
    # "name 'loss' is not defined" error on the print below.
    if samples_seen == 0:
        raise ValueError("data_loader yielded no batches; nothing to train on")

    # Report the epoch mean rather than whichever batch happened to come last.
    print(f'Epoch {epoch+1}, Loss: {loss_sum / samples_seen:.4f}')


NameError: name 'data_loader' is not defined