<a href="https://colab.research.google.com/github/tasnimislamraisa/Python_Learning/blob/deep-Learning/LipReading_Optimized.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🎥 Lip Reading with TensorFlow (Optimized with .npz Preprocessing)

In [3]:
!pip install gdown
!apt install -y ffmpeg

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [4]:
import os
import cv2
import gdown
import zipfile
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping
from matplotlib import pyplot as plt

In [5]:
url = 'https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8JwjL'

# The archive is ~423 MB, so only download it when it is not already on disk;
# this keeps re-running the cell cheap.
if not os.path.exists('data.zip'):
    gdown.download(url, 'data.zip', quiet=False)

# Fail here with a clear message instead of an obscure ZipFile error below.
if not os.path.exists('data.zip'):
    raise FileNotFoundError(f"Download failed: 'data.zip' was not created from {url}")

# Extraction overwrites existing files, so repeating it is safe.
with zipfile.ZipFile('data.zip', 'r') as zip_ref:
    zip_ref.extractall('data')

Downloading...
From (original): https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8JwjL
From (redirected): https://drive.google.com/uc?id=1YlvpDLix3S-U8fd-gqRwPcWXAXm8JwjL&confirm=t&uuid=fc42da1b-7574-492e-9712-9f48abdaae07
To: /content/data.zip
100%|██████████| 423M/423M [00:06<00:00, 62.4MB/s]


In [6]:
# Character inventory for the transcripts: lowercase letters, a few punctuation
# marks, the digits 1-9 and the space character.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")

# Forward lookup (character -> integer id); the empty string is the OOV token.
char_to_num = tf.keras.layers.StringLookup(
    vocabulary=vocab,
    oov_token="",
)

# Inverse lookup (integer id -> character), built from the forward layer's
# vocabulary so both directions agree.
num_to_char = tf.keras.layers.StringLookup(
    vocabulary=char_to_num.get_vocabulary(),
    oov_token="",
    invert=True,
)

def preprocess_and_save(video_dir='./data/s1', align_dir='./data/alignments/s1', out_dir='./npz_data'):
    """Convert every ``.mpg`` clip in ``video_dir`` into a compressed ``.npz`` file.

    For each clip the frames are read with OpenCV, converted to a single
    channel, cropped to rows 190:236 and columns 80:220, and standardized with
    the clip-wide mean and standard deviation. The matching ``.align`` file is
    parsed (third whitespace-separated field of each line, ``'sil'`` entries
    dropped, words joined by single spaces) and encoded character by character
    with ``char_to_num``.

    Args:
        video_dir: Directory containing the ``.mpg`` videos.
        align_dir: Directory containing ``<name>.align`` files.
        out_dir: Destination directory; created if missing. Each clip is
            written to ``<out_dir>/<name>.npz`` with arrays ``frames`` and
            ``alignments``.

    Clips with no alignment file, no readable frames, or no non-silence tokens
    are skipped with a printed message instead of aborting the whole run.
    """
    os.makedirs(out_dir, exist_ok=True)
    # Sorted so the processing order is deterministic across runs.
    for file in sorted(os.listdir(video_dir)):
        if not file.endswith('.mpg'):
            continue
        file_name = os.path.splitext(file)[0]
        video_path = os.path.join(video_dir, file)
        align_path = os.path.join(align_dir, f"{file_name}.align")

        if not os.path.isfile(align_path):
            print(f"Skipping {file}: alignment file not found at {align_path}")
            continue

        cap = cv2.VideoCapture(video_path)
        frames = []
        try:
            for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
                ret, frame = cap.read()
                # The reported frame count can exceed what is decodable; a
                # failed read returns ret=False and frame=None.
                if not ret:
                    break
                # NOTE(review): OpenCV yields BGR frames, so the RGB luminance
                # weights are applied to swapped channels. Left unchanged to
                # keep the preprocessed data identical to earlier runs.
                frame = tf.image.rgb_to_grayscale(frame)
                frames.append(frame[190:236, 80:220, :])
        finally:
            # Always free the decoder handle, even if a frame fails to convert.
            cap.release()

        if not frames:
            print(f"Skipping {file}: no frames could be read")
            continue

        frames = tf.cast(frames, tf.float32)
        frames = (frames - tf.math.reduce_mean(frames)) / tf.math.reduce_std(frames)

        with open(align_path, 'r') as f:
            lines = f.readlines()
        tokens = []
        for line in lines:
            line = line.split()
            # Ignore blank or malformed lines that lack the word field.
            if len(line) < 3:
                continue
            if line[2] != 'sil':
                tokens.extend([' ', line[2]])

        if not tokens:
            print(f"Skipping {file}: alignment contains no non-silence tokens")
            continue

        # Split into characters, flatten, encode, and drop the leading space
        # that precedes the first word.
        alignments = char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]

        np.savez_compressed(os.path.join(out_dir, f"{file_name}.npz"),
                            frames=frames.numpy(), alignments=alignments.numpy())




In [7]:
import os

# Sanity check: show where the notebook is running and what sits next to it.
working_dir = os.getcwd()
print("Current working directory:", working_dir)

entries = os.listdir()
print("Contents:", entries)


Current working directory: /content
Contents: ['.config', 'data', 'data.zip', 'sample_data']


In [8]:
import os

# Print the directory tree (directories only) below the extracted archive to
# find where the videos and alignments actually live.
for current_dir, subdirs, _filenames in os.walk("data"):
    print("üìÅ", current_dir)
    for subdir in subdirs:
        print("   ‚îî‚îÄ‚îÄ", subdir)


üìÅ data
   ‚îî‚îÄ‚îÄ data
üìÅ data/data
   ‚îî‚îÄ‚îÄ alignments
   ‚îî‚îÄ‚îÄ s1
üìÅ data/data/alignments
   ‚îî‚îÄ‚îÄ s1
üìÅ data/data/alignments/s1
üìÅ data/data/s1


In [None]:
# The archive extracts into a nested data/data/ folder (see the directory
# listing above), so the default paths are overridden here.
preprocess_and_save(video_dir='data/data/s1', align_dir='data/data/alignments/s1')


In [None]:
def load_npz(file_path):
    """Read one preprocessed sample from disk.

    Args:
        file_path: Path to an ``.npz`` archive holding ``frames`` and
            ``alignments`` arrays.

    Returns:
        Tuple ``(frames, alignments)`` of NumPy arrays.
    """
    archive = np.load(file_path)
    try:
        # Materialize both arrays before the archive is closed.
        frames = archive['frames']
        alignments = archive['alignments']
    finally:
        archive.close()
    return frames, alignments

def tf_data_wrapper(file_path):
    """Load one ``.npz`` sample inside a ``tf.data`` pipeline.

    Args:
        file_path: Scalar string tensor with the path of the ``.npz`` file.

    Returns:
        Tuple ``(video, labels)``: a float32 tensor with static shape
        ``[None, 46, 140, 1]`` and an int64 tensor with static shape ``[None]``.
    """
    def _read_sample(path_tensor):
        # py_function hands over an eager tensor; turn it into a Python str.
        path = path_tensor.numpy().decode()
        return load_npz(path)

    video, labels = tf.py_function(
        func=_read_sample,
        inp=[file_path],
        Tout=(tf.float32, tf.int64),
    )
    # py_function drops static shape information; restore it for downstream
    # batching and model input.
    video.set_shape([None, 46, 140, 1])
    labels.set_shape([None])
    return video, labels

# Earlier version of get_datasets without a train/validation split, kept for
# reference only. It is written as comments rather than a bare triple-quoted
# string, because a string that ends a cell is echoed as the cell's output.
#
# def get_datasets(npz_dir='./npz_data', batch_size=8):
#     files = tf.data.Dataset.list_files(f"{npz_dir}/*.npz")
#     dataset = files.map(tf_data_wrapper, num_parallel_calls=tf.data.AUTOTUNE)
#     dataset = dataset.cache()
#     dataset = dataset.shuffle(100)
#     dataset = dataset.padded_batch(batch_size, padded_shapes=([None, 46, 140, 1], [None]))
#     dataset = dataset.prefetch(tf.data.AUTOTUNE)
#     return dataset

In [29]:
def get_datasets(npz_dir='./npz_data', batch_size=8, val_split=0.2, limit=50):
    """Build training and validation ``tf.data`` pipelines from ``.npz`` samples.

    Args:
        npz_dir: Directory containing the preprocessed ``.npz`` files.
        batch_size: Number of samples per padded batch.
        val_split: Fraction of the selected files reserved for validation.
        limit: Maximum number of files to use, taken from the sorted listing.
            ``None`` uses every file.

    Returns:
        Tuple ``(train_dataset, val_dataset)`` of padded, prefetched datasets
        yielding ``(video, labels)`` batches.

    Raises:
        FileNotFoundError: If ``npz_dir`` contains no ``.npz`` files.
    """
    # Sorted so the train/validation split is reproducible; `limit` caps the
    # number of samples.
    files = sorted(tf.io.gfile.glob(f"{npz_dir}/*.npz"))[:limit]
    if not files:
        raise FileNotFoundError(
            f"No .npz files found in {npz_dir!r}; run preprocess_and_save first."
        )
    split_idx = int(len(files) * (1 - val_split))

    def file_dataset(paths):
        # An explicit string dtype keeps an empty split from defaulting to
        # float32, which would not be a valid file-path dataset.
        return tf.data.Dataset.from_tensor_slices(tf.constant(paths, dtype=tf.string))

    def pipeline(file_ds, shuffle):
        ds = file_ds.map(tf_data_wrapper, num_parallel_calls=tf.data.AUTOTUNE)
        ds = ds.cache()
        if shuffle:
            ds = ds.shuffle(100)
        ds = ds.padded_batch(batch_size, padded_shapes=([None, 46, 140, 1], [None]))
        return ds.prefetch(tf.data.AUTOTUNE)

    train_files = file_dataset(files[:split_idx])
    val_files = file_dataset(files[split_idx:])

    # Only training data is shuffled; a fixed validation order keeps
    # evaluation output comparable between runs.
    return pipeline(train_files, shuffle=True), pipeline(val_files, shuffle=False)


In [31]:
import glob

# Count the preprocessed samples written to ./npz_data as a quick check that
# preprocessing produced output.
npz_files = glob.glob('./npz_data/*.npz')
print(f"üì¶ Total preprocessed video samples: {len(npz_files)}")


üì¶ Total preprocessed video samples: 1000


In [33]:
def CTCLoss(y_true, y_pred):
    """CTC loss for a batch, using the full padded lengths of both tensors.

    Args:
        y_true: Label tensor; axis 0 is the batch and axis 1 the label length.
        y_pred: Prediction tensor; axis 1 is the number of time steps.

    Returns:
        The result of ``tf.keras.backend.ctc_batch_cost`` for the batch.
    """
    batch_size = tf.cast(tf.shape(y_true)[0], dtype="int64")
    time_steps = tf.cast(tf.shape(y_pred)[1], dtype="int64")
    label_steps = tf.cast(tf.shape(y_true)[1], dtype="int64")

    # Column of ones, one row per sample, used to broadcast the scalar lengths
    # to the (batch, 1) shape ctc_batch_cost expects.
    per_sample = tf.ones((batch_size, 1), dtype="int64")
    input_lengths = time_steps * per_sample
    label_lengths = label_steps * per_sample

    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_lengths, label_lengths)

def build_model():
    """Create and compile the lip-reading network.

    Architecture: three Conv3D + MaxPool3D stages (128, 256 and 75 filters),
    a per-frame flatten, two bidirectional LSTM layers of 128 units each
    followed by 50% dropout, and a softmax layer with one more unit than the
    vocabulary size.

    Returns:
        A Keras model compiled with the Adam optimizer and ``CTCLoss``.
    """
    inputs = layers.Input(shape=(None, 46, 140, 1))

    # Convolutional front end; pooling is spatial only, so the time axis keeps
    # its length.
    features = inputs
    for n_filters in (128, 256, 75):
        features = layers.Conv3D(n_filters, 3, padding='same', activation='relu')(features)
        features = layers.MaxPool3D((1, 2, 2))(features)

    # Collapse each frame's feature map into a vector for the recurrent layers.
    features = layers.TimeDistributed(layers.Flatten())(features)

    for _ in range(2):
        features = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(features)
        features = layers.Dropout(0.5)(features)

    n_classes = char_to_num.vocabulary_size() + 1
    outputs = layers.Dense(n_classes, activation='softmax')(features)

    model = models.Model(inputs, outputs)
    model.compile(optimizer='adam', loss=CTCLoss)
    return model

In [34]:
from tensorflow.keras.callbacks import EarlyStopping

def get_callbacks():
    """Return the training callbacks: checkpointing, LR decay and early stopping.

    Returns:
        List ``[ModelCheckpoint, LearningRateScheduler, EarlyStopping]``.
        * The checkpoint writes weights to ``checkpoint.weights.h5`` whenever
          ``val_loss`` improves.
        * The scheduler keeps the learning rate unchanged for epochs 0-29 and
          then multiplies it by ``exp(-0.1)`` every epoch.
        * Early stopping ends training after 5 epochs without ``val_loss``
          improvement and restores the best weights.
    """
    import math

    # `monitor` is only honoured together with save_best_only=True; without it
    # the file is overwritten every epoch regardless of val_loss.
    checkpoint = ModelCheckpoint(
        'checkpoint.weights.h5',
        monitor='val_loss',
        save_best_only=True,
        save_weights_only=True,
    )

    # Constant decay factor, computed once instead of through a TF op per epoch.
    decay = math.exp(-0.1)

    def scheduler(epoch, lr):
        return float(lr) if epoch < 30 else float(lr) * decay

    earlystop = EarlyStopping(
        monitor='val_loss',
        patience=5,            # Stop if no improvement for 5 consecutive epochs
        restore_best_weights=True,
        verbose=1
    )

    return [checkpoint, LearningRateScheduler(scheduler), earlystop]


In [2]:
# Build the input pipelines from the first 50 preprocessed samples.
train_data, val_data = get_datasets(batch_size=8, limit=50)

model = build_model()
callbacks = get_callbacks()

# Keep the History object so loss curves can be inspected afterwards; assigning
# it also stops the cell from echoing the object's repr.
history = model.fit(
    train_data,
    validation_data=val_data,
    epochs=50,
    callbacks=callbacks
)

NameError: name 'get_datasets' is not defined

In [1]:
# Run the trained model on one preprocessed clip and compare the decoded text
# with its reference transcript.
sample_path = './npz_data/bras9a.npz'
frames, labels = load_npz(sample_path)

# The model expects a batch axis, so wrap the single clip in a batch of one.
sample_batch = tf.expand_dims(frames, 0)
yhat = model.predict(sample_batch)

# Beam-search CTC decoding (beam width 10) over every predicted time step.
time_steps = yhat.shape[1]
decode_result = tf.keras.backend.ctc_decode(
    yhat, input_length=[time_steps], greedy=False, beam_width=10
)
decoded = decode_result[0][0]

# Map integer ids back to characters and join them into plain strings.
predicted_chars = num_to_char(decoded[0])
predicted = tf.strings.reduce_join(predicted_chars).numpy().decode()
actual_chars = num_to_char(tf.convert_to_tensor(labels))
actual = tf.strings.reduce_join(actual_chars).numpy().decode()

print("‚úÖ Actual:   ", actual)
print("üîÆ Predicted:", predicted)

NameError: name 'load_npz' is not defined

# **Test accuracy (Character Error Rate)**

In [None]:
def evaluate_batch(dataset, num_to_char, num_batches=1):
    """Print per-sample and average character error rate (CER) for some batches.

    Uses the notebook-level ``model`` for prediction and beam-search CTC
    decoding (beam width 10).

    Args:
        dataset: ``tf.data`` dataset yielding ``(videos, labels)`` batches.
        num_to_char: Inverse lookup layer mapping integer ids to characters.
        num_batches: How many batches to evaluate; defaults to 1.

    Returns:
        None. Results are printed.
    """
    import editdistance
    total_cer = 0
    total_samples = 0

    for videos, labels in dataset.take(num_batches):
        yhat = model.predict(videos)
        # Every sample in the batch is decoded over the full padded length.
        decoded = tf.keras.backend.ctc_decode(yhat, input_length=[yhat.shape[1]] * yhat.shape[0], greedy=False, beam_width=10)[0][0]

        for i in range(len(decoded)):
            pred = tf.strings.reduce_join(num_to_char(decoded[i])).numpy().decode().strip()
            actual = tf.strings.reduce_join(num_to_char(labels[i])).numpy().decode().strip()
            # max(..., 1) avoids dividing by zero for an empty reference.
            cer = editdistance.eval(actual, pred) / max(len(actual), 1)
            total_cer += cer
            total_samples += 1
            print(f"‚úÖ Actual:   {actual}\nüîÆ Predicted: {pred}\nüìè CER: {cer:.2%}\n{'-'*40}")

    # An empty dataset would otherwise raise ZeroDivisionError below.
    if total_samples == 0:
        print("No samples were evaluated: the dataset yielded no batches.")
        return

    print(f"üìä Average CER over batch: {(total_cer / total_samples):.2%}")


In [None]:
# Report per-sample and average CER on the validation data.
evaluate_batch(val_data, num_to_char)


In [13]:
!pip install editdistance




In [14]:
import editdistance

def calculate_cer(true_text, pred_text):
    """Character error rate between a reference and a predicted string.

    Both strings are stripped of surrounding whitespace. The edit distance is
    divided by the reference length, using 1 for an empty reference to avoid
    division by zero.

    Args:
        true_text: Reference transcript.
        pred_text: Predicted transcript.

    Returns:
        The CER as a float.
    """
    reference = true_text.strip()
    hypothesis = pred_text.strip()
    distance = editdistance.eval(reference, hypothesis)
    return distance / max(len(reference), 1)


In [15]:
# Score the single-sample prediction made earlier (`actual` vs `predicted`).
cer_score = calculate_cer(actual, predicted)
print(f"üìè CER (Character Error Rate): {cer_score:.2%}")


üìè CER (Character Error Rate): 86.96%
