### Library Imports and Setup

In [1]:
# Libraries and Dependancies
import os  
import cv2
import imageio
import numpy as np
from typing import List 
import tensorflow as tf
from matplotlib import pyplot as plt

In [2]:
# GPU Setup
physical_devices = tf.config.list_physical_devices("GPU")
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
except:
    pass

### Video Preprocessing

In [3]:
def load_video(path:str) -> List[float]:
    # Loading video from directory
    cap = cv2.VideoCapture(path)
    frames = []
    # Convert video into frames
    for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
        ret, frame = cap.read()
        if ret == False:
            print("ERROR while accessing the file...")
        # Convert image to grayscale
        frame = tf.image.rgb_to_grayscale(frame)
        # Crop the image for mouth region only
        frames.append(frame[190:236, 80:220])
    cap.release()
    
    # Normalization of video frames
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    preprocessed_frames = tf.cast((frames - mean), tf.float32) / std
    return preprocessed_frames

In [4]:
# Sample testing 
load_video("scratchpad\\data\\s1\\swwv8p.mpg")
# The shape of the tensor is shown to be (75, 46, 140, 1)
# The format of the shape is :
#          (noofFrames, Height of frame, Width of frame, noofChannels)

<tf.Tensor: shape=(0,), dtype=float32, numpy=array([], dtype=float32)>

In [5]:
# Possible vocabulary for the model to predict
vocabulary = [x for x in "abcdefghijklmnopqrstuvwxyz123456789'?!"]

In [6]:
# Encoding the vocabulary to numbers and vice-versa
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocabulary, oov_token="")
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

print(f"The vocabulary is: {char_to_num.get_vocabulary()}")
print(f"size = {char_to_num.vocabulary_size()}")

The vocabulary is: ['', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '1', '2', '3', '4', '5', '6', '7', '8', '9', "'", '?', '!']
size = 39


In [7]:
# Loading textual data from .align files
def load_alignments(path:str) -> List[str]: 
    with open(path, 'r') as f: 
        lines = f.readlines() 
    tokens = []
    for line in lines:
        line = line.split()
        # Here, sil refers to silence according to the dataset. 
        # So it shouldnt be considered as a token.
        if line[2] != 'sil': 
            tokens = [*tokens,' ',line[2]]
        # Till here the data is extracted from .align files and kept in the tokens array
    # The tokens array is encoded to numbers 
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'),(-1)))[1:]

In [9]:
# Loading textual data(from .align files) and video clips(from .mpg files)
def load_data(path: str): 
    # Directory notations may differ depending on OS
    file_name = os.path.splitext(os.path.basename(path))[0] # Windows only 
    # Searching the correct .mpg file for the given file
    
    
    video_path = os.path.join('data','s1',f'{file_name}.mpg')
    # Searching the correct .align file for the given file 
    alignment_path = os.path.join(os.path.dirname(video_path), f"{file_name}.align")
    # Loading data
    frames = load_video(video_path) 
    alignments = load_alignments(alignment_path)
    
    return frames, alignments

In [11]:
# Testing
test_path = 'scratchpad\\data\\s1\\bbal6n.mpg'
frames, alignments = load_data(test_path)
plt.imshow(frames[40])


FileNotFoundError: [Errno 2] No such file or directory: 'data\\s1\\bbal6n.align'

### Data Pipelining