In [33]:
import sys
sys.path.append(".")
import torch

In [34]:
from torch.utils.data import Dataset
from pathlib import Path


speakers = ['s1', 's2', 's3', 's4', 's5', 's6', 's7', 's8', 's9', 's10']

    
    

class LipReadingWordDataset(Dataset):
    # constructor
    # pass in path object of proccessed folder
    def __init__(self, processed_dir):
        self.index_map = []
        processed_dir = Path(processed_dir)
        for folder in processed_dir.iterdir():
            folder = Path(folder)
            for data_file in folder.glob("*.pth"):            
                data = torch.load(data_file, map_location= 'cpu', weights_only=False)
                num_words = len(data["y_labels"])
                for i in range(num_words):
                    self.index_map.append((data_file, i)) 
                    # load in the indicies and path of each word token, and load actual data in real time for optimization

    def __len__(self):
        # Return total amount of word tokens
        return len(self.index_map)

    def __getitem__(self, idx):
        # Return part of sequence for each word using index map (index being which word in the file)
        data_path, index = self.index_map[idx]
        if not hasattr(self, 'cache'):
            self.cache = {}  # Initialize per worker
        if data_path not in self.cache:
            self.cache[data_path] = torch.load(data_path, map_location= 'cpu', weights_only= False)
        # load from cache
        data = self.cache[data_path]
        
        # Load data into tensors at runtime
        x_feat, x_coords, x_veloc, x_acc, y_labels = data["x_feat"][index], data["x_coords"][index], data["x_veloc"][index], data["x_acc"][index], data["y_labels"][index]
        return torch.tensor(x_feat, dtype= torch.float32), torch.tensor(x_coords, dtype= torch.float32), torch.tensor(x_veloc, dtype= torch.float32), torch.tensor(x_acc, dtype=torch.float32), torch.tensor(y_labels, dtype= torch.long)
        


In [35]:
from torch.utils.data import Sampler
import random
from collections import defaultdict

# class SpeakerBalancedSampler(Sampler):
#     def __init__(self, dataset, batch_size):
#         self.dataset = dataset
#         self.batch_size = batch_size
        
#         self.speaker_to_indices = defaultdict(list)
#         #for idx, sample in enumerate(dataset.samples):
            

In [36]:
import importlib
import util.cv_utils
importlib.reload(util.cv_utils)
from util.cv_utils import collate_fn
from torch.utils.data import DataLoader

folder = Path(r"C:\Projects\Lip_Reading\GRID\training")
dataset = LipReadingWordDataset(folder)

train_loader = DataLoader(
    dataset, 
    batch_size=64, #  64
    shuffle=True, 
    collate_fn=collate_fn,
    pin_memory= True
    )


In [37]:
# CNN + RNN model
import torch.nn as nn
import torch.nn.functional as F

class LipReadingModel_CNN_BiLSTM(nn.Module):
    def __init__(self, x_feat, x_coords, x_veloc, x_acc, hidden_size, num_classes):
        super().__init__()

        # Combine all feature dims
        input_size = x_feat + x_coords + x_veloc + x_acc

        # --- CNN frontend ---
        self.cnn = nn.Sequential(
            nn.Conv1d(in_channels=input_size, out_channels=128, kernel_size=2, padding=1), # Test 2 instead of 3, as 2-d coordinates ...
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Conv1d(in_channels=128, out_channels=256, kernel_size=2, padding=1),  # Test 2 instead of 3, as 2-d coordinates ...
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3)
        )

        # --- BiLSTM backend --- (RNN)
        self.lstm = nn.LSTM(
            input_size=256,        # after CNN
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
            dropout=0.3
        )

        # --- Fully connected classifier ---
        self.fc = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x_1, x_2, x_3, x_4):
        # Concatenate along feature dimension
        x = torch.cat([x_1, x_2, x_3, x_4], dim=2)  # shape: [B, T, F]

        # CNN expects (B, F, T)
        x = x.transpose(1, 2)  # -> [B, F, T]
        x = self.cnn(x)        # -> [B, 256, T]
        x = x.transpose(1, 2)  # -> [B, T, 256]

        # LSTM
        _, (h_n, _) = self.lstm(x)  
        h_n = torch.cat([h_n[-2], h_n[-1]], dim=1)  # last layer's forward & backward

        # Classifier
        out = self.fc(h_n)
        return out


In [38]:
import torch.optim as optim
model = LipReadingModel_CNN_BiLSTM(
    x_feat = 8,
    x_coords = 18,
    x_veloc = 18,
    x_acc = 18,
    hidden_size = 256, # 256
    num_classes = 55
)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=3)


In [47]:
# Automatic Mixed Precision
from torch.cuda.amp import autocast, GradScaler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scaler = torch.GradScaler()
model = model.to(device)

for epoch in range(3):
    model.train()
    total_loss = 0.0
    for x1, x2, x3, x4, y in train_loader:
        x1, x2, x3, x4, y = x1.to(device), x2.to(device), x3.to(device), x4.to(device), y.to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        with torch.autocast(device_type= 'cuda'):
            outputs = model(x1, x2, x3, x4)
            loss = criterion(outputs, y)

        # Backpropagation
        scaler.scale(loss).backward()
        
        # Unscale before clipping
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        
        # Step with scaled
        scaler.step(optimizer)
        scaler.update()
    
        total_loss += loss.item()
        
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")


Epoch 1: Loss = 1.1811
Epoch 2: Loss = 0.7360
Epoch 3: Loss = 1.1534


In [40]:
vocab = {
    "<pad>": 0,
    "<sos>": 1,
    "<eos>": 2,
    "sp": 3,
    "bin": 4,
    "lay": 5,
    "place": 6,
    "set": 7,
    "blue": 8,
    "green": 9,
    "red": 10,
    "white": 11,
    "at": 12,
    "by": 13,
    "in": 14,
    "with": 15,
    "zero": 16,
    "one": 17,
    "two": 18,
    "three": 19,
    "four": 20,
    "five": 21,
    "six": 22,
    "seven": 23,
    "eight": 24,
    "nine": 25,
    "again": 26,
    "now": 27,
    "please": 28,
    "soon": 29,
    "a": 30,
    "b": 31,
    "c": 32,
    "d": 33,
    "e": 34,
    "f": 35,
    "g": 36,
    "h": 37,
    "i": 38,
    "j": 39,
    "k": 40,
    "l": 41,
    "m": 42,
    "n": 43,
    "o": 44,
    "p": 45,
    "q": 46,
    "r": 47,
    "s": 48,
    "t": 49,
    "u": 50,
    "v": 51,
    "x": 52,
    "y": 53,
    "z": 54
}
inverse_vocab = {v: k for k, v in vocab.items()}

In [63]:
# Adding Metrics and Evaluation
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
import pandas as pd

# Test off of unknown speakers!
folder =  Path(r"C:\Projects\Lip_Reading\GRID\testing")
testing_dataset = LipReadingWordDataset(folder)
val_loader = DataLoader(
    testing_dataset, 
    batch_size=64, #  64
    shuffle=True, 
    collate_fn=collate_fn
    )


all_preds = []
all_labels = []

model.eval()
with torch.no_grad():
    for x1, x2, x3, x4, y in val_loader:
        x1 = x1.to(device)
        x2 = x2.to(device)
        x3 = x3.to(device)
        x4 = x4.to(device)
        y = y.to(device)

        outputs = model(x1, x2, x3, x4)
        predicted = outputs.argmax(1)

        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(y.cpu().numpy())

# Compute metrics
acc = np.mean(np.array(all_preds) == np.array(all_labels))
print(f"Final Training Accuracy: {acc:.2%}")

labels_list = list(range(len(vocab)))
target_names = [inverse_vocab[i] for i in labels_list]

# Generate detailed report
report = classification_report(all_labels, all_preds, labels=labels_list, target_names=target_names, output_dict=True)
df = pd.DataFrame(report).transpose()  # Transpose to have labels as rows
df.to_csv("classification_report_v2.csv", index=True)

cm = confusion_matrix(all_labels, all_preds, labels = labels_list)
cm_df = pd.DataFrame(
    cm,
    index=[inverse_vocab[i] for i in labels_list],      # True labels as row names
    columns=[inverse_vocab[i] for i in labels_list]     # Predicted labels as column names
)
cm_df.to_csv("confusion_matrix_v2.csv")



ValueError: num_samples should be a positive integer value, but got num_samples=0