In [68]:
import sys
sys.path.append(".")
import importlib
import datasets.lip_to_word_dataset 
import util.cv_utils
importlib.reload(datasets.lip_to_word_dataset)
importlib.reload(util.cv_utils)
from datasets.lip_to_word_dataset import LipReadingWordDataset
from util.cv_utils import collate_fn

import torch
from torch.utils.data import Dataset
from pathlib import Path
from torch.utils.data import DataLoader


In [69]:
# Load data
folder = Path(r"C:\Projects\Lip_Reading\GRID\processed")
dataset = LipReadingWordDataset(folder)

train_loader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)

In [70]:
# Data size for model
processed_file = Path(r"C:\Projects\Lip_Reading\GRID\processed\s1\bbaf2n_data.pth") 
data = torch.load(processed_file, map_location='cpu', weights_only= False)
# Access the first word/sample's feature tensor shape
x_feat = data["x_feat"][0]  # first sample's engineered features
x_coords = data["x_coords"][0]  # first sample's coords
x_veloc = data["x_veloc"][0]  # first sample's velocities
y_label = data["y_labels"][0]

print(f"x_feat shape: {x_feat.shape}")   # should print something like (seq_len, feat_dim)
print(f"x_coords shape: {x_coords.shape}")
print(f"x_veloc shape: {x_veloc.shape}")
print(f"Label: {y_label}")


x_feat shape: (6, 8)
x_coords shape: (6, 18)
x_veloc shape: (6, 18)
Label: 4


In [71]:
# Define a simple model
import torch.nn as nn
import torch.nn.functional as F

class LipReadingModel(nn.Module):
    def __init__(self, x_feat, x_coords, x_veloc, hidden_size, num_classes):
        super(LipReadingModel, self).__init__()

        input_size = x_feat + x_coords + x_veloc
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x_dim, x_2d1, x_2d2):
        x = torch.cat([x_dim, x_2d1, x_2d2], dim=2) 
        _, (h_n, _) = self.lstm(x)  
        out = self.fc(h_n[-1])      
        return out

    def __getitem__(self, idx):
        x_dim = torch.tensor(self.X_dim[idx], dtype=torch.float32)
        x2d1  = torch.tensor(self.X_2d1[idx], dtype=torch.float32)
        x2d2  = torch.tensor(self.X_2d2[idx], dtype=torch.float32)
        label = torch.tensor(self.y[idx], dtype=torch.long)  # labels must be long for CrossEntropyLoss
        return x_dim, x2d1, x2d2, label

In [72]:
model = LipReadingModel(
    x_feat = 8,
    x_coords = 18,
    x_veloc = 18,
    hidden_size = 128,
    num_classes = 55
)

In [73]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# TODO investigate methods to optimize speed of each epoch
for epoch in range(10):
    model.train()
    total_loss = 0.0
    for x1, x2, x3, y in train_loader:

        # Move data to device
        x1 = x1.to(device)
        x2 = x2.to(device)
        x3 = x3.to(device)
        y = y.to(device)
        
        # Forward pass
        outputs = model(x1, x2, x3)
        loss = criterion(outputs, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        
        total_loss += loss.item()
        
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")


Epoch 1: Loss = 2.4321
Epoch 2: Loss = 3.5586
Epoch 3: Loss = 4.8392
Epoch 4: Loss = 2.5649
Epoch 5: Loss = 3.9665
Epoch 6: Loss = 5.1631
Epoch 7: Loss = 2.4857
Epoch 8: Loss = 2.9445
Epoch 9: Loss = 1.7474
Epoch 10: Loss = 1.0427
