# Download & Install

In [None]:
# Install the ffmpeg-python bindings and the TTS package (-q keeps pip output quiet).
!pip install -q ffmpeg-python TTS

In [None]:
import librosa
import ffmpeg
import torch
import logging
# from torch.serialization import safe_globals
import builtins
from TTS.api import TTS
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer

In [None]:
# Fetch the LatentSync sources and switch the notebook's working directory into the repo.
# NOTE(review): later cells use repo-relative paths (configs/, checkpoints/,
# scripts.inference), so they presumably rely on this working directory.
!git clone https://github.com/bytedance/LatentSync.git
%cd LatentSync 
# Install LatentSync's Python requirements, then run its environment setup script.
!pip install -q -r requirements.txt
!source setup_env.sh

In [None]:
# Speech-to-text stage: load the pretrained processor and model for the
# checkpoint below; the processor is created with Vietnamese / transcribe options.
stt_id_model = "openai/whisper-large-v3-turbo"
stt_processor = AutoProcessor.from_pretrained(
    stt_id_model,
    language="vi",
    task="transcribe",
)
stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    stt_id_model,
)

In [None]:
# Translation stage: load the pretrained tokenizer (source language set to
# "vi_VN") and seq2seq model for the checkpoint below.
t2t_id_model = "vinai/vinai-translate-vi2en-v2"
t2t_tokenizer = AutoTokenizer.from_pretrained(
    t2t_id_model,
    src_lang="vi_VN",
)
t2t_model = AutoModelForSeq2SeqLM.from_pretrained(
    t2t_id_model,
)

In [None]:
# Fix UnpicklingError when loading xtts_v2
#
# SECURITY: after this cell runs, every `torch.load` call is made with
# `weights_only=False`, which allows arbitrary pickled objects to be
# deserialized. Only load checkpoints that come from sources you trust.

# If `torch.load` is already a wrapper installed by an earlier run of this
# cell, recover the real loader it stored instead of wrapping the wrapper.
# Without this, every re-run stacks another layer and the warning below is
# logged once per layer on each load.
orig_torch_load = getattr(torch.load, "_unpatched_torch_load", torch.load)


def torch_wrapper(*args, **kwargs):
    """Call the original `torch.load` with `weights_only` forced to False.

    All positional and keyword arguments are forwarded unchanged, except that
    any caller-supplied `weights_only` value is overwritten. A warning is
    logged on every call. Returns whatever the original `torch.load` returns.
    """
    logging.warning("[comfyui-unsafe-torch] I have unsafely patched `torch.load`.  The `weights_only` option of `torch.load` is forcibly disabled.")
    kwargs['weights_only'] = False

    return orig_torch_load(*args, **kwargs)


# Remember the real loader on the wrapper so a later re-run can find it.
torch_wrapper._unpatched_torch_load = orig_torch_load
torch.load = torch_wrapper

# NOTE(review): these two names are not used by the visible cells; they look
# like leftovers from the ComfyUI plugin named in the log message above.
# Kept so any existing reference still resolves.
NODE_CLASS_MAPPINGS = {}
__all__ = ['NODE_CLASS_MAPPINGS']

In [None]:
# Temporarily replace `input()` so that any interactive prompt raised while the
# model is being set up is answered with "y" (presumably the model's
# terms-of-use confirmation -- verify that auto-accepting is acceptable).
_real_input = builtins.input
builtins.input = lambda prompt="": "y" # auto enter 'y'
try:
    tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
finally:
    # Always put the real `input()` back so the override does not leak into
    # the rest of the notebook session, even if model loading fails.
    builtins.input = _real_input

# Pipeline

## 1. Extract audio from video - ffmpeg

In [None]:
def getAudio(src, dst):
    """Convert the media file at `src` into a mono, 16 kHz audio file at `dst`.

    Runs ffmpeg through the ffmpeg-python bindings; an existing `dst` file is
    overwritten. Returns None.
    """
    stream = ffmpeg.input(src)
    stream = stream.output(dst, ac=1, ar='16000')  # mono, 16kHz
    stream = stream.overwrite_output()
    stream.run()

## 2. Automatic Speech Recognition - [whisper-large-v3-turbo](https://github.com/openai/whisper)

In [None]:
def speech2Text(src, model, processor):
    """Transcribe the audio file at `src` and return the decoded text.

    The audio is loaded at 16 kHz, turned into input features by `processor`,
    passed to `model.generate`, and the generated ids are decoded with special
    tokens skipped. Only the first decoded string is returned.
    """
    target_rate = 16000
    waveform, _ = librosa.load(src, sr=target_rate)
    encoded = processor(waveform, sampling_rate=target_rate, return_tensors="pt")
    features = encoded.input_features

    # Gradients are not needed for inference.
    with torch.no_grad():
        token_ids = model.generate(features)
        transcripts = processor.batch_decode(token_ids, skip_special_tokens=True)

    return transcripts[0]

## 3. Text to Text - [vinai-translate-vi2en-v2](https://github.com/VinAIResearch/VinAI_Translate)

In [None]:
def translate(vi_text, model, tokenizer):
    """Translate `vi_text` and return the first decoded output string.

    The text is tokenized (padded, PyTorch tensors) and passed to
    `model.generate` with beam search (5 beams, early stopping, one returned
    sequence). Decoding starts from the tokenizer's "en_XX" language code.
    The generated ids are decoded with special tokens skipped.
    """
    encoded = tokenizer(vi_text, padding=True, return_tensors="pt")

    generation_options = {
        "decoder_start_token_id": tokenizer.lang_code_to_id["en_XX"],
        "num_return_sequences": 1,
        "num_beams": 5,
        "early_stopping": True,
    }
    generated = model.generate(**encoded, **generation_options)

    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]

## 4. Text to Speech - [XTTS-v2](https://github.com/coqui-ai/TTS)

In [None]:
def text2Speech(en_text, model, save_path, speaker_wav):
    """Synthesize `en_text` as English speech and write it to `save_path`.

    Delegates to `model.tts_to_file`, passing `speaker_wav` as the speaker
    reference and "en" as the language. Returns None.
    """
    synthesis_args = {
        "text": en_text,
        "file_path": save_path,
        "speaker_wav": speaker_wav,
        "language": "en",
    }
    # default sample rate: 24000
    model.tts_to_file(**synthesis_args)

## 5. Audio to Video - [LatentSync](https://github.com/bytedance/LatentSync/tree/main)

In [None]:
import subprocess

def audio2video(src_video, src_audio, dst_video, checkpoint="checkpoints/latentsync_unet.pt", steps=10, scale=1.0):
    """Run the `scripts.inference` module in a subprocess to produce `dst_video`.

    Args:
        src_video: path passed as --video_path.
        src_audio: path passed as --audio_path.
        dst_video: path passed as --video_out_path.
        checkpoint: path passed as --inference_ckpt_path.
        steps: value passed as --inference_steps.
        scale: value passed as --guidance_scale.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero status.

    The config and default checkpoint paths are relative, so the command is
    resolved against the current working directory.
    """
    options = {
        "--unet_config_path": "configs/unet/stage2.yaml",
        "--inference_ckpt_path": checkpoint,
        "--video_path": src_video,
        "--audio_path": src_audio,
        "--video_out_path": dst_video,
        "--inference_steps": str(steps),
        "--guidance_scale": str(scale),
    }

    command = ["python", "-m", "scripts.inference"]
    for flag, value in options.items():
        command.extend((flag, value))

    subprocess.run(command, check=True)

# Inference

In [None]:
def inference(vi_video, vi_audio, en_audio, en_video, stt_model=stt_model, stt_processor=stt_processor, t2t_model=t2t_model, t2t_tokenizer=t2t_tokenizer):
    """Run the whole pipeline on one video and print progress and total time.

    Args:
        vi_video: path of the input video.
        vi_audio: path where the audio extracted from `vi_video` is written;
            also passed to the TTS step as the speaker reference.
        en_audio: path where the synthesized English speech is written.
        en_video: path where the final video is written.
        stt_model, stt_processor: speech-recognition model and processor.
        t2t_model, t2t_tokenizer: translation model and tokenizer.

    The TTS step uses the module-level `tts_model`. Returns None.
    """
    import time

    started_at = time.time()

    # Step 1: pull the audio track out of the source video.
    getAudio(vi_video, vi_audio)

    # Step 2: speech recognition on the extracted audio.
    vi_text = speech2Text(vi_audio, stt_model, stt_processor)
    print('Extract Vietnamese text:', vi_text)

    # Step 3: translate the transcript.
    en_text = translate(vi_text, t2t_model, t2t_tokenizer)
    print('Translate to English text:', en_text)

    # Step 4: synthesize speech, using the extracted audio as speaker reference.
    text2Speech(en_text, tts_model, en_audio, vi_audio)
    print('Convert English text to speech oke')

    # Step 5: generate the output video from the source video and new audio.
    audio2video(vi_video, en_audio, en_video)
    print('Generate new video oke')

    elapsed = time.time() - started_at

    print(f"Inference time: {elapsed:.2f}s")

In [None]:
# Input clip and output locations for the first sample.
VI_VIDEO = "/kaggle/input/vid-translator/video.mp4"
VI_AUDIO = "/kaggle/working/vi_audio-1.wav"
EN_AUDIO = "/kaggle/working/en_audio-1.wav"
EN_VIDEO = "/kaggle/working/en_video-1.mp4"

# The fourth argument is the output video path: it must be EN_VIDEO. Passing
# EN_AUDIO here would point the video output at the .wav path and leave
# EN_VIDEO unwritten.
inference(VI_VIDEO, VI_AUDIO, EN_AUDIO, EN_VIDEO)

In [None]:
from IPython.display import Audio
# Last expression of the cell: shows an audio player for the file at EN_AUDIO.
Audio(EN_AUDIO)

In [None]:
import moviepy.editor
# Display the video file at EN_VIDEO in the notebook output.
moviepy.editor.ipython_display(EN_VIDEO)

In [None]:
# Input clip and output locations for the third sample.
VI_VIDEO = "/kaggle/input/vid-translator/video-3.mp4"
VI_AUDIO = "/kaggle/working/vi_audio-3.wav"
EN_AUDIO = "/kaggle/working/en_audio-3.wav"
EN_VIDEO = "/kaggle/working/en_video-3.mp4"

# The fourth argument is the output video path: it must be EN_VIDEO. Passing
# EN_AUDIO here would point the video output at the .wav path and leave
# EN_VIDEO unwritten.
inference(VI_VIDEO, VI_AUDIO, EN_AUDIO, EN_VIDEO)

In [None]:
from IPython.display import Audio
# Last expression of the cell: shows an audio player for the file at EN_AUDIO.
Audio(EN_AUDIO)

In [None]:
import moviepy.editor
# Display the video file at EN_VIDEO in the notebook output.
moviepy.editor.ipython_display(EN_VIDEO)