In [None]:
import os, sys
import git

def _prepend_to_path(directory):
    """Put ``directory`` first on the PATH environment variable.

    Does nothing when ``directory`` is already the first PATH entry, so
    re-running the cell does not keep growing PATH with duplicates. A missing
    PATH variable is treated as empty instead of raising ``KeyError``.
    """
    current = os.environ.get('PATH', '')
    if current.split(os.pathsep)[0] == directory:
        return
    os.environ['PATH'] = f'{directory}{os.pathsep}{current}' if current else directory


# Locate the enclosing git working tree (searching parent directories) and
# make it the working directory, so paths built from `root` and relative
# paths agree regardless of where the notebook was launched from.
root = git.Repo('.', search_parent_directories=True).working_tree_dir
os.chdir(root)
print(f"Changed working directory to {root}")

# NOTE(review): machine-specific absolute path; adjust per environment.
FFMPEG = '/mmfs1/gscratch/cse/bandhav/miniconda3/envs/avhubert_gpu/bin/ffmpeg'
# Make the directory containing this ffmpeg binary take priority on PATH.
_prepend_to_path(os.path.dirname(FFMPEG))

In [None]:
from inference.avhubert import *

In [None]:
import IPython.display as ipd
from IPython.display import HTML

def play_video(video_path, width=200):
    """Embed a local video file in the notebook output as an inline HTML player.

    The file is read fully into memory and inlined as a base64 ``data:`` URL,
    so the size of the rendered output grows with the size of the video.

    Args:
        video_path: Path of the video file to embed.
        width: Value used for the ``width`` attribute of the ``<video>`` tag.

    Returns:
        The return value of ``ipd.display`` for the generated HTML element.
    """
    # Imported locally so this function does not rely on ``b64encode`` being
    # leaked into the namespace by a wildcard import elsewhere in the notebook.
    from base64 import b64encode

    # The context manager guarantees the file handle is closed after reading.
    with open(video_path, 'rb') as video_file:
        mp4 = video_file.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
    return ipd.display(HTML(f"""
    <video width={width} controls>
        <source src="{data_url}" type="video/mp4">
    </video>
    """))

In [None]:
# Model checkpoint handed to the checkpoint loader further down.
ckpt_path = f"{root}/data/checkpoints/base_vox_433h.pt"

# Auxiliary assets passed to `preprocess_video`; all of them are resolved
# against the repository root.
face_predictor_path, mean_face_path, cnn_detector_path = (
    f"{root}/data/misc/shape_predictor_68_face_landmarks.dat",
    f"{root}/data/misc/20words_mean_face.npy",
    f'{root}/data/misc/mmod_human_face_detector.dat',
)

In [None]:
def show_roi(video_path, cnn_detector_path=None):
    """Preprocess ``video_path`` into a temporary video and display the result inline.

    ``preprocess_video`` is called with a temporary ``.mp4`` path as its output
    argument together with the module-level ``face_predictor_path`` and
    ``mean_face_path``; the temporary file is then shown with ``play_video``
    and removed when the function returns.

    Args:
        video_path: Path of the input video.
        cnn_detector_path: Forwarded unchanged to ``preprocess_video`` as the
            ``cnn_detector_path`` keyword argument.

    Returns:
        None.
    """
    # Imported locally: the notebook only imports ``tempfile`` in a later cell,
    # so on a fresh top-to-bottom run this function would otherwise depend on
    # the wildcard import happening to provide it.
    import tempfile

    # The file must stay open (i.e. not yet deleted) while it is written by
    # name and played back, so both calls happen inside the ``with`` block.
    with tempfile.NamedTemporaryFile(suffix='.mp4') as mouth_roi:
        mouth_roi_path = mouth_roi.name

        preprocess_video(
            video_path, mouth_roi_path, face_predictor_path, mean_face_path,
            cnn_detector_path=cnn_detector_path
        )

        play_video(mouth_roi_path)

In [None]:
# Register the user module found at `work_dir` before loading the checkpoint
# (presumably so the checkpoint's custom task/model classes can be resolved —
# confirm).
# NOTE(review): `utils`, `Namespace`, `checkpoint_utils` and `work_dir` are not
# imported/defined in any earlier visible cell; they presumably come from the
# wildcard import of `inference.avhubert` — verify on a fresh kernel.
utils.import_user_module(Namespace(user_dir=work_dir))
# Load the single checkpoint at `ckpt_path`; the three returned values are kept
# as `models`, `saved_cfg` and `task` and later passed to `predict`.
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([ckpt_path])

In [None]:
import cv2
import tempfile
from argparse import Namespace
import fairseq
from fairseq import checkpoint_utils, options, tasks, utils
from fairseq.dataclass.configs import GenerationConfig
from IPython.display import HTML

def predict(video_path, models, saved_cfg, task):
    """Decode a single video with ``models`` and return the decoded hypothesis.

    A throw-away one-entry manifest (``test.tsv`` plus a placeholder
    ``test.wrd`` label file) is written for ``video_path``, a task is set up
    from ``saved_cfg.task`` pointing at that manifest, and the first batch is
    run through a beam-20 generator on the GPU.

    Args:
        video_path: Path of the video to decode. In this notebook the caller
            passes the output of ``preprocess_video``.
        models: Iterable of models; each one is switched to eval mode and
            moved to CUDA.
        saved_cfg: Config whose ``task`` section is used to set up the task.
            Its ``task.modalities``, ``task.data`` and ``task.label_dir``
            fields are overwritten in place.
        task: Ignored; a fresh task is always built from ``saved_cfg.task``.
            The parameter is kept so existing callers keep working.

    Returns:
        ``hypos[0][0]['tokens']`` decoded by the dataset's first label
        processor (presumably the best hypothesis for the only sample).
    """
    # Release the capture handle explicitly instead of leaving it to the GC.
    capture = cv2.VideoCapture(video_path)
    try:
        num_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
    finally:
        capture.release()

    # Presumably the manifest expects 25 fps video and 16 kHz audio — the
    # last column converts the frame count to a sample count on that basis.
    video_fps = 25
    audio_sample_rate = 16_000
    num_samples = int(audio_sample_rate * num_frames / video_fps)
    # The third column is the literal text "None", as in the original manifest.
    tsv_cont = ["/\n", f"test-0\t{video_path}\tNone\t{num_frames}\t{num_samples}\n"]
    label_cont = ["DUMMY\n"]

    modalities = ["video"]
    gen_subset = "test"
    gen_cfg = GenerationConfig(beam=20)
    models = [model.eval().cuda() for model in models]

    # All dataset access happens inside this block, so the manifest directory
    # can be removed on exit instead of leaking one temp dir per call.
    with tempfile.TemporaryDirectory() as data_dir:
        with open(f"{data_dir}/test.tsv", "w") as fo:
            fo.write("".join(tsv_cont))
        with open(f"{data_dir}/test.wrd", "w") as fo:
            fo.write("".join(label_cont))

        saved_cfg.task.modalities = modalities
        saved_cfg.task.data = data_dir
        saved_cfg.task.label_dir = data_dir
        task = tasks.setup_task(saved_cfg.task)
        task.load_dataset(gen_subset, task_cfg=saved_cfg.task)
        generator = task.build_generator(models, gen_cfg)

        def decode_fn(x):
            """Decode token ids to text, ignoring padding and stripped symbols."""
            dictionary = task.target_dictionary
            symbols_ignore = generator.symbols_to_strip_from_output
            symbols_ignore.add(dictionary.pad())
            return task.datasets[gen_subset].label_processors[0].decode(x, symbols_ignore)

        itr = task.get_batch_iterator(dataset=task.dataset(gen_subset)).next_epoch_itr(shuffle=False)
        sample = next(itr)
        sample = utils.move_to_cuda(sample)
        hypos = task.inference_step(generator, models, sample)
        # The placeholder reference label is not decoded: its value was never used.
        hypo = decode_fn(hypos[0][0]['tokens'].int().cpu())

    return hypo

# NOTE(review): machine-specific absolute path; point this at a local video
# when running elsewhere.
video_path = "/mmfs1/gscratch/intelligentsystems/common_datasets/VoxCeleb2/mp4/id00017/7t6lfzvVaTM/00003.mp4"

play_video(video_path)

# The temporary file must exist for the whole block: it is handed to
# `preprocess_video` as the output path and then to `predict` as the input.
with tempfile.NamedTemporaryFile(suffix='.mp4') as mouth_roi:
    mouth_roi_path = mouth_roi.name

    preprocess_video(
        video_path, mouth_roi_path, face_predictor_path, mean_face_path,
        cnn_detector_path=cnn_detector_path
    )

    # `ckpt_path` is deliberately not reassigned here: the models are already
    # loaded, and overwriting it would clobber the value set in the config cell.
    hypo = predict(mouth_roi_path, models, saved_cfg, task)
    print(f'{hypo=}')