In [6]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, BatchNormalization, Reshape
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from glob import glob
import cv2

class LipReadingDataGenerator(tf.keras.utils.Sequence):
    def __init__(self, data_path, alignment_path, batch_size=32, frame_length=75, 
                 image_height=46, image_width=140, **kwargs):
        super().__init__(**kwargs)
        self.data_path = data_path
        self.alignment_path = alignment_path
        self.batch_size = batch_size
        self.frame_length = frame_length
        self.image_height = image_height
        self.image_width = image_width

        self.video_paths = sorted(glob(os.path.join(data_path, '*.mpg')))
        self.alignment_paths = sorted(glob(os.path.join(alignment_path, '*.align')))
        
        print(f"Found {len(self.video_paths)} video files and {len(self.alignment_paths)} alignment files")
        self.vocabulary = self._create_word_vocabulary()
            
        self.char_to_num = tf.keras.layers.StringLookup(
            vocabulary=self.vocabulary, oov_token="")
        self.num_to_char = tf.keras.layers.StringLookup(
            vocabulary=self.vocabulary, oov_token="", invert=True)

    def _create_word_vocabulary(self):
        words = set()
        print(f"Processing alignment files from: {self.alignment_path}")
        
        for align_path in self.alignment_paths:
            try:
                with open(align_path, 'r') as f:
                    content = f.read().strip().split()
                    words.update([content[i] for i in range(2, len(content), 3)])
            except Exception as e:
                print(f"Error processing {align_path}: {str(e)}")
        
        words.discard('sil')
        vocabulary = sorted(list(words))
        
        if not vocabulary:
            print("No words found in alignment files. Using default vocabulary.")
            vocabulary = ['bin', 'blue', 'at', 'f', 'two', 'now']
        
        print(f"Vocabulary size: {len(vocabulary)}")
        return vocabulary

    def __len__(self):
        return max(1, len(self.video_paths) // self.batch_size)
    
    def _process_video(self, video_path):
        frames = []
        cap = cv2.VideoCapture(video_path)
        
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            mouth = gray[190:236, 80:220]
            mouth = cv2.resize(mouth, (self.image_width, self.image_height))
            frames.append(mouth)
        
        cap.release()
        
        frames = np.array(frames, dtype=np.float32)
        frames = (frames - frames.mean()) / (frames.std() + 1e-6)
        
        if len(frames) < self.frame_length:
            pad_length = self.frame_length - len(frames)
            frames = np.pad(frames, ((0, pad_length), (0, 0), (0, 0)), mode='constant')
        else:
            frames = frames[:self.frame_length]
        
        return frames
    
    def _process_alignment(self, alignment_path):
        with open(alignment_path, 'r') as f:
            content = f.read().strip().split()
        
        words = [content[i] for i in range(2, len(content), 3) if content[i] != 'sil']
        text = ' '.join(words)
        return self.char_to_num(tf.convert_to_tensor(text.split()))
    
    def __getitem__(self, idx):
        batch_videos = self.video_paths[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_alignments = self.alignment_paths[idx * self.batch_size:(idx + 1) * self.batch_size]
        
        X = np.zeros((len(batch_videos), self.frame_length, self.image_height, self.image_width, 1))
        Y = np.zeros((len(batch_videos), len(self.vocabulary)))
        
        for i, (video_path, align_path) in enumerate(zip(batch_videos, batch_alignments)):
            frames = self._process_video(video_path)
            X[i] = frames.reshape(self.frame_length, self.image_height, self.image_width, 1)
            
            labels = self._process_alignment(align_path)
            Y[i] = tf.reduce_max(tf.one_hot(labels, len(self.vocabulary)), axis=0)
        
        return X, Y

def build_model(frame_length, image_height, image_width, vocabulary_size):
    model = Sequential([
        tf.keras.Input(shape=(frame_length, image_height, image_width, 1)), 
        Conv3D(64, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        Conv3D(128, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        Conv3D(256, kernel_size=(3, 3, 3), activation='relu'),
        MaxPool3D(pool_size=(1, 2, 2)),
        BatchNormalization(),
        
        Reshape((-1, 256)),
        
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.5),
        
        Bidirectional(LSTM(64)),
        Dropout(0.5),
        
        Dense(256, activation='relu'),
        Dropout(0.5),
        Dense(vocabulary_size, activation='softmax')
    ])
    
    return model

def train_and_save_main_model(data_dir, alignment_dir, batch_size=32):
    print("Starting training for sentence-level prediction...")
    
    model_dir = "models_main"
    os.makedirs(model_dir, exist_ok=True)
    
    data_generator = LipReadingDataGenerator(data_dir, alignment_dir, batch_size=batch_size)
    
    model = build_model(
        frame_length=75,
        image_height=46,
        image_width=140,
        vocabulary_size=len(data_generator.vocabulary)
    )
    
    model.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    callbacks = [
        ModelCheckpoint(
            os.path.join(model_dir, 'lip_reading_main_best.keras'),
            save_best_only=True,
            monitor='accuracy'
        ),
        EarlyStopping(
            monitor='loss',
            patience=10,
            restore_best_weights=True
        )
    ]

    print("Training started...")
    model.fit(
        data_generator,
        epochs=10,
        callbacks=callbacks
    )

    final_model_path = os.path.join(model_dir, 'lip_reading_main_final.h5')
    model.save(final_model_path)
    print(f"Final model saved: {final_model_path}")

    vocab_path = os.path.join(model_dir, 'vocabulary_main.txt')
    with open(vocab_path, 'w') as f:
        f.write('\n'.join(data_generator.vocabulary))
    print(f"Vocabulary saved: {vocab_path}")

if __name__ == "__main__":
    data_dir = r"/Users/sureshkumar/Desktop/Mouthmap/datas/s1"
    alignment_dir = r"/Users/sureshkumar/Desktop/Mouthmap/datas/alignments/s1"
    
    print("Training main model...")
    train_and_save_main_model(data_dir, alignment_dir)

Training main model...
Starting training for sentence-level prediction...
Found 50 video files and 50 alignment files
Processing alignment files from: /Users/sureshkumar/Desktop/Mouthmap/datas/alignments/s1
Vocabulary size: 31


2025-01-23 22:01:31.135273: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1
2025-01-23 22:01:31.135338: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-01-23 22:01:31.135343: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-01-23 22:01:31.135371: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-01-23 22:01:31.135401: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Training started...
Epoch 1/10


2025-01-23 22:01:36.819374: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 174s/step - accuracy: 0.0000e+00 - loss: 20.1747
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 238s/step - accuracy: 0.1250 - loss: 20.1745
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m187s[0m 187s/step - accuracy: 0.1250 - loss: 20.1744
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 174s/step - accuracy: 0.1250 - loss: 20.1742
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 158s/step - accuracy: 0.1250 - loss: 20.1741
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 160s/step - accuracy: 0.1250 - loss: 20.1739
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m239s[0m 239s/step - accuracy: 0.1250 - loss: 20.1738
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 146s/step - accuracy: 0.1250 - loss: 20.1736
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━



Final model saved: models_main/lip_reading_main_final.h5
Vocabulary saved: models_main/vocabulary_main.txt


In [19]:
def evaluate_model(model_dir, validation_data_dir):
    model = load_model(os.path.join(model_dir, 'lip_reading_main_final.h5'))
    validation_generator = LipReadingDataGenerator(validation_data_dir, val_alignment_dir, batch_size=32)
    loss, accuracy = model.evaluate(validation_generator)
    print(f"Validation Loss: {loss}, Validation Accuracy: {accuracy}")

# Example usage
if __name__ == "__main__":
    model_directory = "/Users/sureshkumar/Desktop/Mouthmap/tf_envi/models_main"
    validation_data_dir = r"/Users/sureshkumar/Desktop/Mouthmap/datas/validatation_data"
    val_alignment_dir=r"/Users/sureshkumar/Desktop/Mouthmap/datas/validatation_alignments"
    # Update this path
    evaluate_model(model_directory, validation_data_dir)



Found 51 video files and 51 alignment files
Processing alignment files from: /Users/sureshkumar/Desktop/Mouthmap/datas/validatation_alignments
Vocabulary size: 31
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 28s/step - accuracy: 0.2188 - loss: 20.3880
Validation Loss: 20.388019561767578, Validation Accuracy: 0.21875


In [26]:
def predict_with_main_model(model_dir, video_path):
    vocab_path = os.path.join(model_dir, 'vocabulary_main.txt')
    with open(vocab_path, 'r') as f:
        vocabulary = f.read().splitlines()

    data_generator = LipReadingDataGenerator("", "")
    data_generator.vocabulary = vocabulary
    data_generator.char_to_num = tf.keras.layers.StringLookup(
        vocabulary=vocabulary, oov_token="")
    data_generator.num_to_char = tf.keras.layers.StringLookup(
        vocabulary=vocabulary, oov_token="", invert=True)

    model = load_model(os.path.join(model_dir, 'lip_reading_main_final.h5'))

    frames = data_generator._process_video(video_path)
    frames = frames.reshape(1, data_generator.frame_length, 
                            data_generator.image_height, 
                            data_generator.image_width, 1)

    prediction = model.predict(frames)
    predicted_indices = tf.argmax(prediction, axis=1).numpy()  # Convert to NumPy array

    # Convert indices to characters
    predicted_text = ' '.join([vocabulary[int(idx)] for idx in predicted_indices])  # Use vocabulary directly

    return predicted_text

# Example usage
if __name__ == "__main__":
    model_directory = "/Users/sureshkumar/Desktop/Mouthmap/tf_envi/models_main"
    test_video = r"/Users/sureshkumar/Desktop/Mouthmap/Mouthmap/data/s1/bbbf6n.mpg"

    print("\nMaking predictions...")
    sentence_prediction = predict_with_main_model(model_directory, test_video)
    print(f"Predicted sentence: {sentence_prediction}")


Making predictions...
Found 0 video files and 0 alignment files
Processing alignment files from: 
No words found in alignment files. Using default vocabulary.
Vocabulary size: 6




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
Predicted sentence: in
