# Auto-AVSR Tutorial
**Authors**: [Pingchuan Ma](https://mpc001.github.io/), [Alexandros Haliassos](https://dblp.org/pid/257/3052.html), [Adriana Fernandez-Lopez](https://scholar.google.com/citations?user=DiVeQHkAAAAJ), [Honglie Chen](https://scholar.google.com/citations?user=HPwdvwEAAAAJ), [Stavros Petridis](https://ibug.doc.ic.ac.uk/people/spetridis), [Maja Pantic](https://ibug.doc.ic.ac.uk/people/mpantic).

This tutorial shows how to use the Auto-AVSR model to perform speech recognition (ASR, VSR, and AV-ASR), crop mouth ROIs, or extract visual speech features.

**Disclaimer**: Please note that both the VSR model and AV-ASR model have been trained with videos that were pre-processed by RetinaFace. For the purpose of improving inference speed, we use mediapipe instead.

In [None]:
#!git clone https://github.com/mpc001/Visual_Speech_Recognition_for_Multiple_Languages.git

In [None]:
#!pip install torch torchvision torchaudio
#!pip install opencv-python
#!pip install scipy
#!pip install scikit-image
#!pip install av
#!pip install six
#!pip install mediapipe
#!pip install ffmpeg-python
#!pip install openai

In [1]:
import os
import torch
from pipelines.model import AVSR
from pipelines.data.data_module import AVSRDataLoader
from pipelines.detectors.mediapipe.detector import LandmarksDetector

In [None]:
# Ask PyTorch to fall back to the CPU for operators the MPS backend does not implement.
# NOTE(review): torch is already imported by the previous cell; this variable
# may have to be set before that import to take effect — confirm.
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

## Building an inference pipeline


In [2]:
class InferencePipeline(torch.nn.Module):
    """Run Auto-AVSR on a media file: load it, then transcribe it or encode it.

    ``__init__`` builds the collaborators: an ``AVSRDataLoader`` (file to model
    input), an ``AVSR`` model wrapper (model input to transcript) and,
    optionally, a ``LandmarksDetector`` whose output is handed to the loader.

    Args:
        modality: ``"audio"``, ``"video"`` or ``"audiovisual"``.
        model_path: Checkpoint path, forwarded to ``AVSR``.
        model_conf: Model configuration path, forwarded to ``AVSR``.
        detector: Detector name, forwarded to ``AVSRDataLoader``.
        face_track: When true and the modality is "video" or "audiovisual",
            create a ``LandmarksDetector``. Those modalities cannot be
            processed without it, because landmarks always come from the
            detector.
        device: Device string, forwarded to ``AVSR`` and used to place the
            inputs in ``extract_features``.
    """

    def __init__(self, modality, model_path, model_conf, detector="mediapipe", face_track=False, device="cpu"):
        super().__init__()
        self.device = device
        # modality configuration
        self.modality = modality
        self.dataloader = AVSRDataLoader(modality, detector=detector)
        # Decoding options are fixed here: no rnnlm, lm_weight 0.0, beam size 40.
        self.model = AVSR(
            modality,
            model_path,
            model_conf,
            rnnlm=None,
            rnnlm_conf=None,
            penalty=0.0,
            ctc_weight=0.1,
            lm_weight=0.0,
            beam_size=40,
            device=device,
        )
        if face_track and self.modality in ["video", "audiovisual"]:
            self.landmarks_detector = LandmarksDetector()
        else:
            self.landmarks_detector = None

    def process_landmarks(self, data_filename, landmarks_filename):
        """Return the landmarks for ``data_filename``, or ``None`` when not applicable.

        Args:
            data_filename: Path of the media file handed to the detector.
            landmarks_filename: Accepted for interface compatibility; it is
                currently not read, landmarks are always detected on the fly.

        Returns:
            Whatever the landmarks detector returns for "video" and
            "audiovisual"; ``None`` for any other modality.

        Raises:
            RuntimeError: If the modality needs landmarks but the pipeline was
                built without a detector (``face_track=False``).
        """
        if self.modality not in ("video", "audiovisual"):
            return None
        if self.landmarks_detector is None:
            # Without this guard the call below fails with an opaque
            # "'NoneType' object is not callable".
            raise RuntimeError(
                f"modality '{self.modality}' requires face landmarks, but no landmarks detector "
                "was created; build the InferencePipeline with face_track=True."
            )
        return self.landmarks_detector(data_filename)

    def _load_data(self, data_filename, landmarks_filename):
        """Check that the file exists and convert it into model input."""
        assert os.path.isfile(data_filename), f"data_filename: {data_filename} does not exist."
        landmarks = self.process_landmarks(data_filename, landmarks_filename)
        return self.dataloader.load_data(data_filename, landmarks)

    def forward(self, data_filename, landmarks_filename=None):
        """Transcribe ``data_filename`` and return the model's transcript.

        Raises:
            AssertionError: If ``data_filename`` is not an existing file.
            RuntimeError: See ``process_landmarks``.
        """
        data = self._load_data(data_filename, landmarks_filename)
        return self.model.infer(data)

    def extract_features(self, data_filename, landmarks_filename=None, extract_resnet_feats=False):
        """Return encoder features for ``data_filename`` instead of a transcript.

        Args:
            data_filename: Path of the media file.
            landmarks_filename: Forwarded to ``process_landmarks`` (unused there).
            extract_resnet_feats: Forwarded as the last argument of the
                underlying model's ``encode``.

        Returns:
            The value returned by ``self.model.model.encode``, computed with
            gradients disabled.

        Raises:
            AssertionError: If ``data_filename`` is not an existing file.
            RuntimeError: See ``process_landmarks``.
        """
        data = self._load_data(data_filename, landmarks_filename)
        with torch.no_grad():
            if isinstance(data, tuple):
                # The loader produced two streams; both are moved to the device.
                enc_feats = self.model.model.encode(
                    data[0].to(self.device), data[1].to(self.device), extract_resnet_feats
                )
            else:
                enc_feats = self.model.model.encode(data.to(self.device), extract_resnet_feats)
        return enc_feats

## Auto-AVSR functions

### Infer the noisy clip using a video stream


1. Download a VSR checkpoint

In [3]:
#https://huggingface.co/quakumei/Visual_Speech_Recognition_for_Multiple_Languages/tree/main

2. Build a VSR pipeline

In [4]:
# Visual-only (VSR) configuration: checkpoint and config sit next to each other.
modality = "video"
model_path = "../models/LRS3_V_WER19.1/model.pth"
model_conf = "../models/LRS3_V_WER19.1/model.json"

# face_track=True makes the pipeline create its landmarks detector.
pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
    face_track=True,
)

I0000 00:00:1761408717.715261 22169651 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M4 Max
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
I0000 00:00:1761408717.718799 22169651 gl_context.cc:369] GL version: 2.1 (2.1 Metal - 89.4), renderer: Apple M4 Max
W0000 00:00:1761408717.718991 22172761 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


W0000 00:00:1761408717.729809 22172773 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.


3. Infer the noisy clip using the video stream

In [6]:
# Run the VSR pipeline on one clip and show the decoded text.
# NOTE(review): this is an absolute path on the author's machine; replace it
# with the location of your own clip before running.
transcript = pipeline(data_filename="/Users/vishnou/Documents/echo-charlie/data/videos/macron_1.mp4")
print(transcript)



TO YOU SHE WAS YOUR QUEEN TO US SHE WAS THE QUEEN TO US ALL SHE WOULD BE WITH US FOREVER


### Infer the noisy clip using both audio and visual streams

1. Download an AV-ASR checkpoint

In [None]:
#https://huggingface.co/quakumei/Visual_Speech_Recognition_for_Multiple_Languages/tree/main

2. Build an AV-ASR pipeline

In [None]:
# Audio-visual (AV-ASR) configuration: checkpoint and config sit next to each other.
modality = "audiovisual"
model_path = "../models/LRS3_AV_WER0.9/model.pth"
model_conf = "../models/LRS3_AV_WER0.9/model.json"

# face_track=True makes the pipeline create its landmarks detector.
pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
    face_track=True,
)

3. Infer the noisy clip using both audio and video streams

In [None]:
# Run the AV-ASR pipeline on the noisy clip and show the decoded text.
transcript = pipeline(data_filename="../data/noisy_clip.mp4")
print(transcript)

### Crop mouth ROIs


In [None]:
import cv2
import torchvision
from pipelines.data.data_module import AVSRDataLoader
from pipelines.detectors.mediapipe.detector import LandmarksDetector

def save2vid(filename, vid, frames_per_second):
    """Write ``vid`` to ``filename`` as a video, creating the parent directory if needed.

    Args:
        filename: Destination path of the video file.
        vid: Frames to write, passed unchanged to ``torchvision.io.write_video``.
        frames_per_second: Frame rate passed to ``torchvision.io.write_video``.
    """
    parent_dir = os.path.dirname(filename)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    torchvision.io.write_video(filename, vid, frames_per_second)

def preprocess_video(src_filename, dst_filename):
    """Process ``src_filename`` with the data loader and save the result as a video.

    Uses the module-level ``landmarks_detector`` and ``dataloader`` objects,
    which must exist before this function is called.

    Args:
        src_filename: Path of the input video.
        dst_filename: Path of the video to write; it keeps the source frame rate.
    """
    landmarks = landmarks_detector(src_filename)
    data = dataloader.load_data(src_filename, landmarks)
    # Open the source only to read its frame rate, and always release the
    # capture handle so the file is not left open.
    capture = cv2.VideoCapture(src_filename)
    try:
        fps = capture.get(cv2.CAP_PROP_FPS)
    finally:
        capture.release()
    save2vid(dst_filename, data, fps)

# Module-level helpers used by preprocess_video: a video-only loader with
# transforms and grayscale conversion switched off, plus a landmarks detector.
dataloader = AVSRDataLoader(
    modality="video",
    speed_rate=1,
    transform=False,
    detector="mediapipe",
    convert_gray=False,
)
landmarks_detector = LandmarksDetector()

In [None]:
# Process the source clip and save the result as a new video file.
# NOTE(review): the source path is relative but the destination "/data/roi.mp4"
# is absolute (filesystem root) — confirm this is intended.
preprocess_video(src_filename="data/clip.mp4", dst_filename="/data/roi.mp4")

### Extract visual-only features

In [None]:
# Visual-only pipeline used for feature extraction.
# NOTE(review): these paths are relative to the current directory
# ("models/..."), whereas the VSR cell executed earlier in this notebook used
# "../models/..." — confirm which location matches your checkout.
modality = "video"
model_path = "models/LRS3_V_WER19.1/model.pth"
model_conf = "models/LRS3_V_WER19.1/model.json"

pipeline = InferencePipeline(
    modality=modality,
    model_path=model_path,
    model_conf=model_conf,
    face_track=True,
)

[**Option 1**]. Extract features from the output of Conformer.

In [None]:
# extract_resnet_feats keeps its default (False) for this option.
features = pipeline.extract_features(data_filename="/data/clip.mp4")
print(features.size())

[**Option 2**]. Extract features from the output of ResNet.

In [None]:
# Same call as option 1, but with extract_resnet_feats switched on.
features = pipeline.extract_features(
    data_filename="/data/clip.mp4",
    extract_resnet_feats=True,
)
print(features.size())

In [None]:
import openai
import os

# Post-correct a lipreading transcript with a chat model served through an
# OpenAI-compatible endpoint.

# The API key is read from the environment so the secret is never stored in
# the notebook. SECURITY: a key was previously committed in this cell; it
# should be treated as leaked and revoked.
BOSON_API_KEY = os.environ.get("BOSON_API_KEY")
if not BOSON_API_KEY:
    raise RuntimeError("Set the BOSON_API_KEY environment variable before running this cell.")

client = openai.Client(
    api_key=BOSON_API_KEY,
    base_url="https://hackathon.boson.ai/v1"
)

# The system message states the correction task; the user message carries the
# raw upper-case transcript. temperature=0.0 is passed to the API.
response = client.chat.completions.create(
    model="Qwen3-32B-non-thinking-Hackathon",
    messages=[
        {"role": "system", "content": "I got this text from a lipreading model. Please correct any misheard or miswritten or incoherent words given the context to make it sound like what the speaker likely said, without adding or removing any content."},
        {"role": "user", "content": "THE OTHER DAY I HEARD SOMEONE COMPARE TRUMP TO THE NEIGHBOR WHO KEEPS RUNNING HIS LEAF BLOWER OUTSIDE DOOR WINDOW EVERY MINUTE OF EVERY DAY"}
    ],
    max_tokens=2048,
    temperature=0.0
)

print(response.choices[0].message.content)