### Import Libraries

In [2]:
import os

os.environ["IMAGEIO_FFMPEG_EXE"] = "/opt/homebrew/bin/ffmpeg"
from IPython.display import Audio
import utils
from transformers import AutoProcessor, pipeline, SeamlessM4Tv2Model
import torchaudio
import torch
import numpy as np
import scipy
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

### Load processor and model

In [3]:
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large").to(DEVICE)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.04it/s]


### Translate Speech from PT to EN

In [38]:
final_array = []

for i in range(0, 30, 10):

    # cut the video in chunks of 10 seconds and convert the audio into wav format
    # the division of the video in chunks is due to a model limitation that cannot handle long audio inputs
    ffmpeg_extract_subclip("data/ZAAI_intro.mp4", 0 + i, 10 + i, targetname=f"data/chunk_{i}.mp4")
    utils.convert_to_wav(f"data/chunk_{i}.mp4")

    # load audio
    audio, orig_freq = torchaudio.load(f"data/chunk_{i}.wav")
    # convert into 16 kHz waveform array to match the training data
    audio = torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=16_000)
    # encode the speech into a speech embedding
    audio_inputs = processor(
        audios=audio,
        sampling_rate=16000,
        return_tensors="pt",
    ).to(DEVICE)
    # translate the audio from portuguese to english and generate a new waveform array
    audio_array_from_audio = (
        model.generate(**audio_inputs, tgt_lang="por", speaker_id=2)[0]
        .cpu()
        .numpy()
        .squeeze()
    )

    # append the different chunks
    final_array.append(audio_array_from_audio)

    torch.cuda.empty_cache()

Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
MoviePy - Writing audio in data/chunk_0.wav


                                                        

MoviePy - Done.




Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
MoviePy - Writing audio in data/chunk_10.wav


                                                        

MoviePy - Done.




Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
MoviePy - Writing audio in data/chunk_20.wav


                                                        

MoviePy - Done.




### Render the translated audio

In [39]:
sample_rate = model.config.sampling_rate
Audio(np.concatenate(final_array), rate=sample_rate)

### Save Audio

In [40]:
scipy.io.wavfile.write("data/translated_audio.wav", rate=sample_rate, data=np.concatenate(final_array))

### Combine Whisper with Seamless4MT v2 for better results

##### Whisper Transcription

In [41]:
# load Whisper Pipeline
# Set chunk_length_s to 60s to avoid output truncation since Whisper was trained with chunks of 30s
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium",
    chunk_length_s=60,
    device=DEVICE,
)

# to avoid direct translation from PT to EN
pipe.model.config.forced_decoder_ids = None

# transcript PT audio
audio, orig_freq = torchaudio.load("data/translated_audio.wav")
audio = torchaudio.functional.resample(
    audio, orig_freq=orig_freq, new_freq=16_000
)  # must be a 16 kHz waveform array
prediction = pipe(audio.numpy()[0], batch_size=8)["text"]

##### SeamlessM4T for T2ST

In [43]:
# T2ST
audio_array = []
for i in prediction.split("."):
    text_inputs = processor(text=i, src_lang="eng", return_tensors="pt").to(DEVICE)
    audio_array_from_text = (
        model.generate(**text_inputs, tgt_lang="por")[0].cpu().numpy().squeeze()
    )
    audio_array.append(audio_array_from_text)

audio_array = np.concatenate(audio_array)

# you can also check the translated test by running this
for i in prediction.split("."):
    text_inputs = processor(text=i, src_lang="eng", return_tensors="pt").to(DEVICE)
    output_tokens = model.generate(**text_inputs, tgt_lang="por", generate_speech=False)
    translated_text_from_text = processor.decode(
        output_tokens[0].tolist()[0], skip_special_tokens=True
    )
    print(translated_text_from_text)

Olá a todos, eu sou o Luiz, fundador e parceiro da ZI
Não temos limites de inovação e tecnologia da IA
Temos duas áreas principais em que a gente se concentra, a primeira é
Falo, nós construímos projetos de consultoria altamente complexos para clientes de empresas, bloqueando o valor real com IA, previsão de tempo, sistemas de recomendação
E a segunda área principal é de financiar startups de IA, onde ajudamos a desenvolver a tecnologia, mas também a ir em peromercada
Não, não.


##### Render Audio

In [44]:
sample_rate = model.config.sampling_rate
Audio(audio_array, rate=sample_rate)

##### Save Audio

In [None]:
scipy.io.wavfile.write(
    "data/best_translated_audio.wav", rate=sample_rate, data=audio_array
)