In [None]:
import sys
# Append the current working directory (".") to the module search path so the
# `datasets` and `util` packages imported below can be resolved from it
# (they appear to be project-local packages — confirm the notebook's cwd).
sys.path.append(".")
import importlib
import datasets.lip_to_word_dataset 
import util.cv_utils
# Reload both modules after importing them — presumably so that edits made to
# these files are picked up without restarting the kernel.
importlib.reload(datasets.lip_to_word_dataset)
importlib.reload(util.cv_utils)
# These from-imports run after the reloads, so the names bind to the objects
# defined by the reloaded modules rather than to stale ones.
from datasets.lip_to_word_dataset import LipReadingWordDataset
from util.cv_utils import collate_fn

import torch
from torch.utils.data import Dataset
from pathlib import Path
from torch.utils.data import DataLoader


In [None]:
# Load data
# The processed-data root defaults to the original location, but it can be
# overridden with the LIP_READING_DATA_DIR environment variable so the notebook
# runs on another machine without editing this cell.
import os

folder = Path(os.environ.get("LIP_READING_DATA_DIR", r"C:\Projects\Lip_Reading\GRID\processed"))
dataset = LipReadingWordDataset(folder)

# Batches of 32 samples, reshuffled on every pass; batching is delegated to the
# project's collate_fn imported from util.cv_utils.
train_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [None]:
# Data size for model
# Build the sample path from `folder` (the processed-data root defined in the
# "Load data" cell) instead of repeating the absolute path a second time.
processed_file = folder / "s1" / "bbaf2n_data.pth"
# NOTE(security): weights_only=False performs a full unpickle, which can execute
# arbitrary code. Only load files that this project produced itself.
data = torch.load(processed_file, map_location="cpu", weights_only=False)
# Access the first word/sample's feature tensor shape
x_feat = data["x_feat"][0]  # first sample's engineered features
x_coords = data["x_coords"][0]  # first sample's coords
x_veloc = data["x_veloc"][0]  # first sample's velocities
y_label = data["y_labels"][0]

# The last dimension of each shape is the per-frame width that the model's
# constructor needs for the corresponding stream.
print(f"x_feat shape: {x_feat.shape}")   # should print something like (seq_len, feat_dim)
print(f"x_coords shape: {x_coords.shape}")
print(f"x_veloc shape: {x_veloc.shape}")
print(f"Label: {y_label}")


x_feat shape: (6, 8)
x_coords shape: (6, 18)
x_veloc shape: (6, 18)
Label: 4


In [41]:
# Define a simple model
import torch.nn as nn
import torch.nn.functional as F

class LipReadingModel(nn.Module):
    """LSTM classifier over three concatenated per-frame feature streams.

    The three input streams are joined along the feature axis, fed through a
    single-layer LSTM, and the final hidden state is projected to one logit per
    class.

    Args:
        x_feat: Feature width (last dimension) of the first input stream.
        x_coords: Feature width of the second input stream.
        x_veloc: Feature width of the third input stream.
        hidden_size: Size of the LSTM hidden state.
        num_classes: Number of output logits.
    """

    def __init__(self, x_feat, x_coords, x_veloc, hidden_size, num_classes):
        super().__init__()

        # The LSTM sees the three streams concatenated, so its input width is
        # the sum of the individual widths.
        input_size = x_feat + x_coords + x_veloc
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x_dim, x_2d1, x_2d2):
        """Return class logits for a batch of sequences.

        Args:
            x_dim, x_2d1, x_2d2: 3-D tensors laid out as (batch, seq_len, width)
                (the LSTM is built with ``batch_first=True``). Their widths
                must add up to the ``input_size`` computed in ``__init__``.

        Returns:
            Tensor of shape (batch, num_classes) holding unnormalised logits.
        """
        # Join the streams along the feature axis (dim 2).
        x = torch.cat([x_dim, x_2d1, x_2d2], dim=2)
        # Only the final hidden state is needed; per-step outputs and the cell
        # state are discarded.
        _, (h_n, _) = self.lstm(x)
        # h_n is (num_layers, batch, hidden); index -1 selects the last layer.
        return self.fc(h_n[-1])

    # NOTE: the former __getitem__ was removed. It read self.X_dim, self.X_2d1,
    # self.X_2d2 and self.y, none of which this module defines, so indexing the
    # model could only raise AttributeError. Sample indexing belongs on the
    # Dataset, not on the model.

In [42]:
# Stream widths follow the last dimension of the sample shapes printed earlier
# in the notebook: (6, 8), (6, 18) and (6, 18).
model_kwargs = dict(
    x_feat=8,
    x_coords=18,
    x_veloc=18,
    hidden_size=128,
    num_classes=55,
)
model = LipReadingModel(**model_kwargs)