In [1]:
#1.installing and importing dependencies


In [3]:
pip list

Package                       Version
----------------------------- --------------------
absl-py                       1.4.0
alabaster                     0.7.12
altair                        5.0.1
anaconda-client               1.11.0
anaconda-navigator            2.3.2
anaconda-project              0.11.1
anyio                         3.5.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arrow                         1.2.2
astroid                       2.11.7
astropy                       5.1
astunparse                    1.6.3
atomicwrites                  1.4.0
attrs                         21.4.0
Automat                       20.2.0
autopep8                      1.6.0
Babel                         2.9.1
backcall                      0.2.0
backports.functools-lru-cache 1.6.4
backports.tempfile            1.0
backports.weakref             1.0.post1
bcrypt                        3.2.0
beautifulsoup4                4.11.1
bi

In [2]:
pip install opencv-python matplotlib imageio gdown tensorflow

Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.6.0 requires daal==2021.4.0, which is not installed.
numba 0.55.1 requires numpy<1.22,>=1.18, but you have numpy 1.24.3 which is incompatible.



Collecting opencv-python
  Downloading opencv_python-4.8.0.76-cp37-abi3-win_amd64.whl (38.1 MB)
     ---------------------------------------- 38.1/38.1 MB 6.3 MB/s eta 0:00:00
Collecting gdown
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Collecting tensorflow
  Using cached tensorflow-2.13.0-cp39-cp39-win_amd64.whl (1.9 kB)
Collecting tensorflow-intel==2.13.0
  Using cached tensorflow_intel-2.13.0-cp39-cp39-win_amd64.whl (276.5 MB)
Collecting libclang>=13.0.0
  Downloading libclang-16.0.6-py2.py3-none-win_amd64.whl (24.4 MB)
     ---------------------------------------- 24.4/24.4 MB 5.7 MB/s eta 0:00:00
Collecting termcolor>=1.1.0
  Downloading termcolor-2.3.0-py3-none-any.whl (6.9 kB)
Collecting grpcio<2.0,>=1.24.3
  Downloading grpcio-1.58.0-cp39-cp39-win_amd64.whl (4.3 MB)
     ---------------------------------------- 4.3/4.3 MB 5.8 MB/s eta 0:00:00
Collecting astunparse>=1.6.0
  Downloading astunparse-1.6.3-py2.py3-none-any.whl (12 kB)
Collecting gast<=0.4.0,>=0.2.1
  Downlo

In [4]:
import os 
import cv2
import tensorflow as tf
import numpy as np
from typing import List
from matplotlib import pyplot as plt
import imageio

In [None]:
#os: For file system operations.
#cv2: For computer vision and image manipulation.
#tensorflow (or tf): For machine learning and deep learning.
#numpy (or np): For numerical data manipulation.
#typing.List: For specifying function argument types.
#matplotlib.pyplot: For creating visualizations.
#imageio: For reading and writing image and video formats.

In [5]:
 # Show the GPUs TensorFlow can see; the recorded output below is an empty list.
 tf.config.list_physical_devices('GPU')

[]

In [7]:
# Ask TensorFlow to grow GPU memory on demand for the first visible GPU.
# When no GPU is visible the list is empty and there is nothing to configure.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        tf.config.experimental.set_memory_growth(physical_devices[0], True)
    except (RuntimeError, ValueError) as err:
        # Report the failure instead of silently ignoring it; the notebook
        # can still continue with TensorFlow's default allocation strategy.
        print(f"Could not enable GPU memory growth: {err}")

In [None]:
#2.Build data loading Function

In [None]:
import gdown


In [None]:
# Download the dataset archive from Google Drive and unpack it into the
# current working directory.
# NOTE(review): confirm the Drive file id below is still valid.
url = 'https://drive.google.com/uc?id=1Y1vDLix35-U8fd-gqwRcWXAXm8JwjL'
output = 'data.zip'
if not os.path.exists(output):
    # Skip the download when the archive is already present so that
    # "Restart & Run All" does not fetch it again.
    gdown.download(url, output, quiet=False)
gdown.extractall(output)

In [None]:
#This code downloads a file from a Google Drive URL using the gdown library, saves it as 'data.zip', and then extracts its contents into the current working directory with gdown.extractall.

In [None]:
#2.data loading function 

In [None]:
def load_video(path: str) -> tf.Tensor:
    """Load a video, crop every frame and return the standardised frames.

    Each decoded frame is converted to grayscale with
    ``tf.image.rgb_to_grayscale`` and cropped to rows 190-235 and columns
    80-219. The collected frames are then shifted by their mean and divided
    by their standard deviation.

    Args:
        path: Location of the video file to decode.

    Returns:
        A ``tf.float32`` tensor with one cropped grayscale frame per decoded
        video frame.

    Raises:
        ValueError: If no frame could be read from ``path``.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # Stop at the first frame that fails to decode instead of
                # handing None to TensorFlow.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236, 80:220, :])
    finally:
        # Release the capture handle even if decoding raised.
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be read from video: {path}")

    # NOTE(review): the mean and the subtraction are evaluated before the
    # cast to float32, exactly as in the original code. Presumably that
    # happens in the frames' integer dtype; verify before changing it, since
    # previously trained weights may depend on this normalisation.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std

In [None]:
#This function takes a video file path, loads the video, processes its frames by converting them to grayscale and cropping a specific region, calculates statistical values (mean and standard deviation) for the frames, and returns the frames as a float32 tensor that has been shifted by the mean and scaled by the standard deviation, making it suitable for further analysis or machine learning tasks.
#frames.append(frame[190:236:,80:220,:]) appends a cropped version of the frame (the lip part) to the frames list.

In [None]:
# Characters the model can emit: the full lowercase alphabet (including "w"),
# the punctuation marks ' ? !, the digits 1-9, and the space that
# load_alignments inserts between words.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

In [None]:


# Forward lookup: character -> integer id, with "" as the out-of-vocabulary token.
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
# Inverse lookup: integer id -> character. It is built from the forward
# layer's own vocabulary and without a mask token, so both layers use the
# same index for every character. (Adding mask_token here would reserve an
# extra leading index and shift every decoded character by one.)
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True
)

print(f"The vocabulary is: {char_to_num.get_vocabulary()} (size = {char_to_num.vocabulary_size()})")


In [None]:
#So, the code initializes two layers for character-to-number and number-to-character mappings, and it prints out information about the vocabulary size and the actual vocabulary list
#These layers are a crucial part of text preprocessing in NLP tasks, enabling the conversion between text and numerical representations while providing flexibility to handle out-of-vocabulary tokens. The printed vocabulary information is helpful for understanding the characteristics of your text data and configuring your models accordingly.

In [None]:
from typing import List

def load_alignments(path: str, char_to_num) -> tf.Tensor:
    """Read an alignment file and encode its words as character ids.

    Every line is split on whitespace and its third field is used as the
    token. Tokens equal to ``'sil'`` are dropped; each remaining token is
    preceded by a single space, the result is split into characters and the
    characters are mapped through ``char_to_num``.

    Args:
        path: Location of the alignment text file.
        char_to_num: Callable (e.g. a lookup layer) that maps a tensor of
            characters to ids.

    Returns:
        The ids produced by ``char_to_num`` for the character sequence, with
        the first element (the leading space) removed.
    """
    tokens = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            # Skip blank or truncated lines instead of raising IndexError,
            # and drop the 'sil' entries.
            if len(fields) < 3 or fields[2] == 'sil':
                continue
            tokens.extend([' ', fields[2]])

    chars = tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1))
    # [1:] removes the space that was inserted before the first word.
    return char_to_num(chars)[1:]


In [None]:
#It opens a file you specify (with a path) and reads what's inside.
#It looks at each line in the file and breaks it into smaller pieces (like words).
#If a certain condition is met (in this case, if the third piece is not the word "sil"), it keeps track of those pieces.
#It takes those pieces and turns them into numbers (imagine turning letters into numbers in a secret code).
#It gives you the list of numbers it found.
#This code could be used for things like turning spoken words into a series of numbers that a computer can understand or aligning words in a transcription to words in an audio recording.

In [None]:
def load_data(path: str):
    """Load the frames and the encoded alignment for one video.

    Args:
        path: Tensor holding the video path as bytes (it must expose
            ``.numpy()``, as the values passed in by ``tf.py_function`` do).

    Returns:
        A ``(frames, alignment)`` pair: the output of ``load_video`` for the
        video and the output of ``load_alignments`` for its alignment file.
    """
    path = bytes.decode(path.numpy())
    # Accept both Windows ('\\') and POSIX ('/') separators when extracting
    # the clip name without its extension.
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
    video_path = os.path.join('data', 's1', f'{file_name}.mpg')
    # NOTE(review): assumes the extracted archive stores alignments as
    # data/alignments/s1/<name>.align -- confirm against the dataset layout.
    alignment_path = os.path.join('data', 'alignments', 's1', f'{file_name}.align')
    frames = load_video(video_path)
    alignment = load_alignments(alignment_path, char_to_num)

    return frames, alignment

In [None]:
# this function is designed to prepare and retrieve data related to video frames and alignment, using a provided path and other assumptions about the directory structure and data sources. It's important to ensure that the necessary functions (load_video and load_alignment) and data sources (alignment_path) are correctly defined and set up in your code for this function to work as intended.

In [None]:
# Relative Windows-style path to one sample clip (starts with '.', not ',').
test_path = '.\\data\\s1\\bba16n.mpg'

In [None]:
# Demonstrate how the clip name is recovered from a path tensor:
# decode the bytes, keep the last path component, drop the extension.
tf.convert_to_tensor(test_path).numpy().decode('utf-8').split('\\')[-1].split('.')[0]

In [None]:
# Run load_data on the sample path wrapped in a tensor and display the result.
load_data(tf.convert_to_tensor(test_path))

In [None]:
#code takes a test path, processes it to obtain path components, and then loads data using the load_data function 

In [None]:
# Keep the two values returned by load_data for the sample clip.
frames,alignments=load_data(tf.convert_to_tensor(test_path))

In [None]:
def mappable_function(path: str) -> List[tf.Tensor]:
    """Wrap ``load_data`` so it can be used inside ``Dataset.map``.

    Args:
        path: Path element handed over by the dataset.

    Returns:
        The outputs of ``tf.py_function`` for ``load_data``: a ``tf.float32``
        tensor followed by a ``tf.int64`` tensor.
    """
    result = tf.py_function(load_data, [path], (tf.float32, tf.int64))
    return result

In [None]:
#The mappable_function takes a path as input, applies the load_data function to it as a TensorFlow computation, and returns the result as a tuple of a float tensor and an integer tensor.

In [None]:
#2.Create Data Pipeline

In [None]:
from matplotlib import pyplot as plt 

In [None]:

# Build the input pipeline from the clips extracted into ./data/s1
# (relative to the working directory, matching the paths used by load_data).
file_pattern = './data/s1/*.mpg'
# List files in a fixed order and shuffle once without reshuffling, so the
# train/test split below contains the same clips on every epoch.
file_paths = tf.data.Dataset.list_files(file_pattern, shuffle=False)
data = file_paths.shuffle(500, reshuffle_each_iteration=False)
data = data.map(mappable_function)
# Each element is a (frames, alignment) pair, so both components need a
# padded shape: frames are padded to 75 steps, alignments to the longest
# sequence in the batch.
data = data.padded_batch(2, padded_shapes=([75, None, None, None], [None]))
data = data.prefetch(tf.data.AUTOTUNE)

# Split the batches 90% / 10% into the train and test datasets that the
# training and prediction cells use.
num_batches = int(data.cardinality().numpy())
if num_batches <= 0:
    raise ValueError(f"Could not determine the number of batches for {file_pattern}")
train_batches = int(num_batches * 0.9)
train = data.take(train_batches)
test = data.skip(train_batches)


In [None]:
#In summary, this code sets up a data pipeline that loads and preprocesses video data from files, shuffles the data for randomness, batches it with padding for model input, and prefetches data for efficient training.

In [None]:
#3.Design the Deep Neural Network

In [1]:
import math
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler


In [None]:
#import os: This statement imports the os module, which is used for interacting with the operating system, allowing you to work with files, directories, and paths.
#import tensorflow as tf: This imports the TensorFlow library, a popular deep learning framework used for building and training neural networks.
#from tensorflow.keras.models import Sequential: It imports the Sequential class, which is a Keras feature used to create a linear stack of neural network layers.
#from tensorflow.keras.layers import ...: These lines import various layer types and components from Keras, which are building blocks for constructing neural networks. Some notable components include convolutional layers (Conv3D), recurrent layers (LSTM), fully connected layers (Dense), dropout layers (Dropout), bidirectional layers (Bidirectional), pooling layers (MaxPool3D), activation functions (Activation), reshaping layers (Reshape), dropout layers for 3D data (SpatialDropout3D), batch normalization layers (BatchNormalization), and time-distributed layers (TimeDistributed).
#from tensorflow.keras.optimizers import Adam: This line imports the Adam optimizer, a widely used optimization algorithm for training neural networks.
#from tensorflow.keras.callbacks import ...: These lines import various callback functions provided by Keras for enhancing the training process. Notable callbacks include ModelCheckpoint for saving model checkpoints during training and LearningRateScheduler for adjusting learning rates during training based on a predefined schedule.

In [None]:

# Network layout: three Conv3D/ReLU/MaxPool3D stages, a per-time-step
# flatten, two bidirectional LSTMs with dropout, and a per-time-step softmax.
model = Sequential()

# Convolutional stages; pooling uses (1, 2, 2), i.e. a factor of 1 along the
# first (75-long) axis so that axis keeps its length.
model.add(Conv3D(128, 3, input_shape=(75, 46, 140, 1), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D(pool_size=(1, 2, 2)))

model.add(Conv3D(256, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D(pool_size=(1, 2, 2)))

model.add(Conv3D(75, 3, padding='same'))
model.add(Activation('relu'))
model.add(MaxPool3D(pool_size=(1, 2, 2)))

# Flatten the feature maps of every time step into one vector per step.
model.add(TimeDistributed(Flatten()))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(0.5))

model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
model.add(Dropout(0.5))

# One output unit per vocabulary entry plus one extra class.
model.add(Dense(char_to_num.vocabulary_size() + 1, kernel_initializer='he_normal', activation='softmax'))

# The model is intentionally not compiled here: the training section compiles
# it with the CTC loss, and a sparse categorical cross-entropy loss does not
# fit per-time-step predictions paired with shorter label sequences.

model.summary()


In [None]:
#The model is defined as a Sequential model, which means that layers are added sequentially.
#Convolutional layers (Conv3D) are added with ReLU activations and max-pooling layers.
#The TimeDistributed layer is used to apply the Flatten operation to the output of the convolutional layers, making it compatible with the LSTM layers.
#Bidirectional LSTM layers are added with dropout to improve model performance and reduce overfitting.
#Finally, a Dense output layer is added with a softmax activation function to produce the model's predictions.
#The model is compiled with an Adam optimizer and sparse categorical cross-entropy loss ( it's a classification task). Also, accuracy is monitored as a metric.
#A summary of the model's architecture is printed to provide an overview of the layer shapes and the total number of trainable parameters.

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
#provides a summary of your neural network model, including information about the layers, the number of trainable parameters, and the output shapes at each layer. This summary is incredibly useful for understanding the architecture of your model and diagnosing potential issues.

In [None]:
#4.Set up and Training the model

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule for ``LearningRateScheduler``.

    Args:
        epoch: Index of the epoch that is about to start.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged while ``epoch`` is below 30; afterwards ``lr``
        multiplied by ``exp(-0.1)``, returned as a plain Python float.
    """
    if epoch < 30:
        return lr
    # math.exp keeps the result a float (the callback's documented return
    # type) rather than a TensorFlow tensor.
    return float(lr * math.exp(-0.1))

In [None]:
#In summary, this scheduler function keeps the learning rate constant for the first 30 epochs and then applies an exponential decay to reduce the learning rate as training continues. Learning rate scheduling is a technique used to improve the convergence and stability of training deep neural networks.

In [None]:
def CTCLoss(y_true, y_pred):
    """CTC loss wrapper around ``tf.keras.backend.ctc_batch_cost``.

    Args:
        y_true: Label tensor; axis 0 is read as the batch size and axis 1 as
            the label length.
        y_pred: Prediction tensor; axis 1 is read as the input length.

    Returns:
        The value returned by ``ctc_batch_cost``.
    """
    batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
    input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

    # Broadcast the scalar lengths to one (batch, 1) entry per sample.
    input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
    label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")

    loss = tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)
    return loss

In [None]:
#This custom CTC loss function can be used as the loss function in a Keras model for training tasks that involve sequence-to-sequence mapping, where aligning input and target sequences can be challenging. CTC loss helps the model learn to make predictions while handling variable-length input sequences and their corresponding labels.

In [None]:
class ProduceExample(tf.keras.callbacks.Callback):
    """Callback that prints label/prediction pairs for one batch per epoch."""

    def __init__(self, dataset) -> None:
        """Store the dataset and open a NumPy iterator over it.

        Args:
            dataset: Dataset yielding ``(inputs, labels)`` batches.
        """
        super().__init__()
        # Keep the dataset itself so the iterator can be reopened later.
        self._source = dataset
        self.dataset = dataset.as_numpy_iterator()

    def on_epoch_end(self, epoch, logs=None) -> None:
        """Predict on the next batch and print original vs. decoded text."""
        try:
            data = next(self.dataset)
        except StopIteration:
            # The dataset may hold fewer batches than there are epochs;
            # start over instead of aborting training.
            self.dataset = self._source.as_numpy_iterator()
            data = next(self.dataset)
        yhat = self.model.predict(data[0])
        # One input length per sample, taken from the prediction's shape
        # rather than assuming a batch of exactly two 75-step sequences.
        input_length = [yhat.shape[1]] * yhat.shape[0]
        decoded = tf.keras.backend.ctc_decode(yhat, input_length, greedy=False)[0][0].numpy()
        for x in range(len(yhat)):
            print('Original:', tf.strings.reduce_join(num_to_char(data[1][x])).numpy().decode('utf-8'))
            print('Prediction:', tf.strings.reduce_join(num_to_char(decoded[x])).numpy().decode('utf-8'))
            print('~'*100)

In [None]:
#This callback provides a way to visually inspect and compare the model's predictions with the actual ground truth labels at the end of each training epoch. It is particularly useful for tasks involving sequence data, such as text recognition or speech recognition, to track the model's performance and identify any issues or improvements during training.

In [None]:

# Compile the model with Adam (learning rate 0.0001) and the custom CTCLoss.
model.compile(optimizer=Adam(learning_rate=0.0001), loss=CTCLoss)

In [None]:
# model to use the Adam optimizer with a specific learning rate and to minimize the CTC loss during training. When you call model.fit() to train the model later on, it will use these settings for optimization.

In [None]:
# Checkpoint callback writing weights only to models/checkpoint, monitoring 'loss'.
checkpoint_callback = ModelCheckpoint(os.path.join('models','checkpoint'), monitor='loss', save_weights_only=True)

In [None]:
#ModelCheckpoint callback is a valuable tool for managing and monitoring the training of machine learning models, allowing you to save model states at key points during training.

In [None]:
# Learning-rate callback driven by scheduler().
schedule_callback = LearningRateScheduler(scheduler)
# Example-printing callback fed from the `test` dataset.
example_callback = ProduceExample(test)

In [None]:
#These callbacks enhance your training process by providing mechanisms for fine-tuning the learning rate and for monitoring and visualizing model predictions, making it easier to understand and improve the performance of your machine learning model.






In [None]:
# Train for 100 epochs on `train`, validating on `test`, with the three callbacks above.
model.fit(train, validation_data=test, epochs=100, callbacks=[checkpoint_callback, schedule_callback, example_callback])

In [None]:
#this model.fit call trains your model on the train dataset, evaluates it on the test dataset, and applies the specified callbacks to optimize and monitor the training process over 100 epochs.

In [None]:
#5.make a prediction

In [None]:
# Download the checkpoint archive from Google Drive and unpack it into the
# 'models' directory.
url = 'https://drive.google.com/uc?id=1vWscXs4Vt0a_1IH1-ct2TCgXAZT-N3_Y'
output = 'checkpoints.zip'
if not os.path.exists(output):
    # Skip the download when the archive is already present so re-running
    # the notebook does not fetch it again.
    gdown.download(url, output, quiet=False)
# Make sure the extraction target exists before unpacking into it.
os.makedirs('models', exist_ok=True)
gdown.extractall(output, 'models')

In [None]:
# Restore the model weights from models/checkpoint.
model.load_weights('models/checkpoint')

In [None]:
# Pull one batch from the test dataset as NumPy arrays and run the model on
# its first component (the inputs).
test_data = test.as_numpy_iterator()
sample = next(test_data)
yhat = model.predict(sample[0])

In [None]:
# Print a banner, then decode every label sequence of the sample batch
# (sample[1]) back to text with num_to_char.
print('~'*100, 'REAL TEXT')
[tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in sample[1]]

In [None]:
# Greedy CTC decoding of the predictions.
# NOTE(review): input_length is hard-coded for two sequences of 75 steps -- confirm it matches the batch.
decoded = tf.keras.backend.ctc_decode(yhat, input_length=[75,75], greedy=True)[0][0].numpy()

In [None]:
# Print a banner, then map every decoded id sequence back to text with num_to_char.
print('~'*100, 'PREDICTIONS')
[tf.strings.reduce_join([num_to_char(word) for word in sentence]) for sentence in decoded]