In [389]:
# Combine CSV files into a DataSet
import pandas as pd
import numpy as np
import os
import glob
import ast
import torch
from torch.utils.data import Dataset



In [390]:
# Pandas interprets tuples as strings in its cells, so we must convert back in friendly lists
def str_to_tuple(cell_str):
    if(isinstance(cell_str,str)):
        return list(ast.literal_eval(cell_str))
    else:
        return cell_str
    
def load_word_features(folder, vocab):
    X_dim_list, X_2d1_list, X_2d2_list, labels = [], [], [], []
    # Find all engineered features CSVs
    for file in glob.glob(os.path.join(folder, "Engineered_Features_*.csv")):
        # Extract word name (everything after last underscore, before .csv)
        word = os.path.basename(file).split("_")[-1].replace(".csv", "")
        # Construct matching filenames for other features
        file_coords = file.replace("Engineered_Features", "Coordinates")
        file_vel = file.replace("Engineered_Features", "Velocities")
        
        
        # Load CSVs into tensors
        df_dim_csv = pd.read_csv(file)
        df_dim_csv = df_dim_csv.drop(columns=["Unnamed: 0"])
        df_dim = torch.tensor(df_dim_csv.to_numpy(), dtype=torch.float32)
        
        df_2d1_csv = pd.read_csv(file_coords)
        df_2d1_csv = df_2d1_csv.drop(columns=["Unnamed: 0"])
        # print(df_2d1_csv)
        df_2d1_tuple = df_2d1_csv.map(str_to_tuple)
        df_2d1 = np.array(df_2d1_tuple.to_numpy().tolist(), dtype= np.float32) # convert from inferred object type to true list
        
        df_2d2_csv = pd.read_csv(file_vel)
        df_2d2_csv = df_2d2_csv.drop(columns=["Unnamed: 0"])
        df_2d2_tuple = df_2d2_csv.map(str_to_tuple)
        df_2d2 = np.array(df_2d2_tuple.to_numpy().tolist(), dtype= np.float32) # convert from inferred object type to true list

        
        # Append to lists
        X_dim_list.append(df_dim)  # Skip first row (header)
        X_2d1_list.append(df_2d1)
        X_2d2_list.append(df_2d2)
        labels.append(word)

    labels = [vocab[word] for word in labels]
    
    # Return a list containing features for each word
    return (X_dim_list), (X_2d1_list), (X_2d2_list), np.array(labels)



In [391]:
# D Path
# folder = r"C:\Users\User\OneDrive\Documents\Projects\Lip-Reading\notebooks\test_data"
# R Path
folder = r"C:\Projects\Lip_Reading\notebooks\test_data"
grid_vocab = {
    "<pad>": 0,
    "<sos>": 1,
    "<eos>": 2,
    "sp": 3,
    "bin": 4,
    "lay": 5,
    "place": 6,
    "set": 7,
    "blue": 8,
    "green": 9,
    "red": 10,
    "white": 11,
    "at": 12,
    "by": 13,
    "in": 14,
    "with": 15,
    "zero": 16,
    "one": 17,
    "two": 18,
    "three": 19,
    "four": 20,
    "five": 21,
    "six": 22,
    "seven": 23,
    "eight": 24,
    "nine": 25,
    "again": 26,
    "now": 27,
    "please": 28,
    "soon": 29,
    "a": 30,
    "b": 31,
    "c": 32,
    "d": 33,
    "e": 34,
    "f": 35,
    "g": 36,
    "h": 37,
    "i": 38,
    "j": 39,
    "k": 40,
    "l": 41,
    "m": 42,
    "n": 43,
    "o": 44,
    "p": 45,
    "q": 46,
    "r": 47,
    "s": 48,
    "t": 49,
    "u": 50,
    "v": 51,
    "x": 52,
    "y": 53,
    "z": 54
}
X_dim, X_2d1, X_2d2, labels = load_word_features(folder, grid_vocab)




In [392]:
# Encode Labels (Words -> IDs)
# ALREADY ENCODED
# words = sorted(set(labels))
# vocab = {w: i for i, w in enumerate(words)}
# y = np.array([vocab[w] for w in labels])



In [None]:
# Create Pytorch Dataset
class LipReadingWordDataset(Dataset):
    def __init__(self, X_dim, X_2d1, X_2d2, y):
        self.X_dim = X_dim
        self.X_2d1 = X_2d1
        self.X_2d2 = X_2d2
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        # Return part of sequence for each word
        return torch.tensor(self.X_dim[idx]), torch.tensor(self.X_2d1[idx]).flatten(start_dim=1), torch.tensor(self.X_2d2[idx]).flatten(start_dim=1), torch.tensor(self.y[idx])


# Create a DataLoader (Data conversion from lists into tensors)
from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    X_dim, X_2d1, X_2d2, y = zip(*batch) # unpack getItem data
    
    X_dim_padded = pad_sequence(X_dim, batch_first=True) # outputs 3dim tensor of (batch_size, max_seq_len, feature_size) "True Tensors"
    X_2d1_padded = pad_sequence(X_2d1, batch_first=True)
    X_2d2_padded = pad_sequence(X_2d2, batch_first=True)
    
    y_tensor = torch.stack(y)  # since labels are already tensors (dtype long)

    return X_dim_padded, X_2d1_padded, X_2d2_padded, y_tensor

# def collate_fn(batch):
#     X_dim, X_2d1, X_2d2, y = zip(*batch)  # unpack
    
#     # Stack directly (no pad_sequence needed)
#     X_dim_stacked = torch.stack(X_dim)
#     X_2d1_stacked = torch.stack(X_2d1)
#     X_2d2_stacked = torch.stack(X_2d2)
#     y_tensor = torch.stack(y)
    
#     return X_dim_stacked, X_2d1_stacked, X_2d2_stacked, y_tensor


    
from torch.utils.data import DataLoader
dataset = LipReadingWordDataset(X_dim, X_2d1, X_2d2, labels)

# Test for prediction 
# train_loader = DataLoader(dataset, batch_size=6, shuffle=False, collate_fn=collate_fn)

train_loader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

[12  4  8 35 27 18]


In [394]:
# Define a simple model
import torch.nn as nn
import torch.nn.functional as F

class LipReadingModel(nn.Module):
    def __init__(self, dim_features, coords_features, vel_features, hidden_size, num_classes):
        super(LipReadingModel, self).__init__()

        input_size = dim_features + coords_features*2 + vel_features*2
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x_dim, x_2d1, x_2d2):
        # Concatenate features (batch, frames, features)
        x = torch.cat([x_dim, x_2d1, x_2d2], dim=2) 

        # Pass through LSTM
        _, (h_n, _) = self.lstm(x)  
        
        # Use final hidden state
        out = self.fc(h_n[-1])      
        return out

    def __getitem__(self, idx):
        x_dim = torch.tensor(self.X_dim[idx], dtype=torch.float32)
        x2d1  = torch.tensor(self.X_2d1[idx], dtype=torch.float32)
        x2d2  = torch.tensor(self.X_2d2[idx], dtype=torch.float32)
        label = torch.tensor(self.y[idx], dtype=torch.long)  # labels must be long for CrossEntropyLoss
        return x_dim, x2d1, x2d2, label

In [395]:
# Instantiate the model
model = LipReadingModel(
    dim_features=X_dim[0].shape[1], 
    coords_features=X_2d1[0].shape[1], 
    vel_features=X_2d2[0].shape[1], 
    hidden_size=128, 
    num_classes=len(grid_vocab)
)


In [396]:
# Define loss and optimizer
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
# # Debugging
# for batch in train_loader:
#     print([type(t) for t in batch])
#     break


# Train the model 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

for epoch in range(100):
    model.train()
    total_loss = 0.0
    for x_dim_batch, x2d1_batch, x2d2_batch, y_batch in train_loader:

        # Move data to device
        x_dim_batch = x_dim_batch.to(device)
        x2d1_batch = x2d1_batch.to(device)
        x2d2_batch = x2d2_batch.to(device)
        y_batch = y_batch.to(device)

        # Forward pass
        outputs = model(x_dim_batch, x2d1_batch, x2d2_batch)
        loss = criterion(outputs, y_batch)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        
        total_loss += loss.item()
        
    print(f"Epoch {epoch+1}: Loss = {loss.item():.4f}")


tensor([25, 40, 25, 25, 40, 40])
['nine', 'k', 'nine', 'nine', 'k', 'k']
Epoch 1: Loss = 4.1288
tensor([25, 40, 25, 25, 40, 40])
['nine', 'k', 'nine', 'nine', 'k', 'k']
Epoch 2: Loss = 3.9838
tensor([25,  9, 25, 25,  9,  9])
['nine', 'green', 'nine', 'nine', 'green', 'green']
Epoch 3: Loss = 3.8538
tensor([25, 18, 25, 25, 18, 18])
['nine', 'two', 'nine', 'nine', 'two', 'two']
Epoch 4: Loss = 3.7351
tensor([12,  4, 25, 35,  4,  4])
['at', 'bin', 'nine', 'f', 'bin', 'bin']
Epoch 5: Loss = 3.6211
tensor([12,  4, 35, 35,  4,  4])
['at', 'bin', 'f', 'f', 'bin', 'bin']
Epoch 6: Loss = 3.5135
tensor([12,  4,  8, 35,  4,  4])
['at', 'bin', 'blue', 'f', 'bin', 'bin']
Epoch 7: Loss = 3.4142
tensor([12,  4,  8, 35,  4,  4])
['at', 'bin', 'blue', 'f', 'bin', 'bin']
Epoch 8: Loss = 3.3108
tensor([12,  4,  8, 35,  4, 18])
['at', 'bin', 'blue', 'f', 'bin', 'two']
Epoch 9: Loss = 3.2038
tensor([12,  4,  8, 35, 18, 18])
['at', 'bin', 'blue', 'f', 'two', 'two']
Epoch 10: Loss = 3.0971
tensor([12, 18,  8

  return torch.tensor(self.X_dim[idx]), torch.tensor(self.X_2d1[idx]).flatten(start_dim=1), torch.tensor(self.X_2d2[idx]).flatten(start_dim=1), torch.tensor(self.y[idx])


tensor([12, 18,  8, 35, 18, 18])
['at', 'two', 'blue', 'f', 'two', 'two']
Epoch 29: Loss = 1.0867
tensor([12, 18,  8, 35, 18, 18])
['at', 'two', 'blue', 'f', 'two', 'two']
Epoch 30: Loss = 1.0391
tensor([12, 18,  8, 35, 18, 18])
['at', 'two', 'blue', 'f', 'two', 'two']
Epoch 31: Loss = 0.9948
tensor([12, 18,  8, 35, 18, 18])
['at', 'two', 'blue', 'f', 'two', 'two']
Epoch 32: Loss = 0.9574
tensor([12,  4,  8, 35, 18, 18])
['at', 'bin', 'blue', 'f', 'two', 'two']
Epoch 33: Loss = 0.9223
tensor([12, 18,  8, 35, 18, 18])
['at', 'two', 'blue', 'f', 'two', 'two']
Epoch 34: Loss = 0.8932
tensor([12, 18,  8, 35, 18, 18])
['at', 'two', 'blue', 'f', 'two', 'two']
Epoch 35: Loss = 0.8648
tensor([12, 18,  8, 35, 18, 18])
['at', 'two', 'blue', 'f', 'two', 'two']
Epoch 36: Loss = 0.8399
tensor([12, 18,  8, 35, 18, 18])
['at', 'two', 'blue', 'f', 'two', 'two']
Epoch 37: Loss = 0.8166
tensor([12, 18,  8, 35, 18, 18])
['at', 'two', 'blue', 'f', 'two', 'two']
Epoch 38: Loss = 0.7946
tensor([12, 18,  8, 

In [398]:
# Test the model
model.eval()
index_to_word = {
    0: "<pad>",
    1: "<sos>",
    2: "<eos>",
    3: "sp",
    4: "bin",
    5: "lay",
    6: "place",
    7: "set",
    8: "blue",
    9: "green",
    10: "red",
    11: "white",
    12: "at",
    13: "by",
    14: "in",
    15: "with",
    16: "zero",
    17: "one",
    18: "two",
    19: "three",
    20: "four",
    21: "five",
    22: "six",
    23: "seven",
    24: "eight",
    25: "nine",
    26: "again",
    27: "now",
    28: "please",
    29: "soon",
    30: "a",
    31: "b",
    32: "c",
    33: "d",
    34: "e",
    35: "f",
    36: "g",
    37: "h",
    38: "i",
    39: "j",
    40: "k",
    41: "l",
    42: "m",
    43: "n",
    44: "o",
    45: "p",
    46: "q",
    47: "r",
    48: "s",
    49: "t",
    50: "u",
    51: "v",
    52: "x",
    53: "y",
    54: "z"
}

with torch.no_grad():
    pred = model(x_dim_batch, x2d1_batch, x2d2_batch).argmax(1)
    pred_list = pred.tolist()
    print(pred)
    pred_words = [index_to_word[i] for i in pred_list]
    print(pred_words)


tensor([12,  4,  8, 35, 27, 18])
['at', 'bin', 'blue', 'f', 'now', 'two']
