### Import Libraries

In [1]:
from IPython.display import Audio
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import utils
from transformers import AutoProcessor, pipeline, SeamlessM4Tv2Model
import torchaudio
import torch
import numpy as np
import scipy

# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda:0"
else:
    DEVICE = "cpu"

### Load processor and model

In [None]:
# The processor and the model weights are loaded from the same checkpoint name.
checkpoint = "facebook/seamless-m4t-v2-large"

processor = AutoProcessor.from_pretrained(checkpoint)
# Move the model to the device selected in the import cell.
model = SeamlessM4Tv2Model.from_pretrained(checkpoint).to(DEVICE)

### Translate Speech from PT to EN

In [None]:
final_array = []

# The model cannot handle long audio inputs, so the first 60 seconds of the
# video are processed as six consecutive 10-second windows.
for start_s in range(0, 60, 10):
    end_s = start_s + 10

    # Cut the current window out of the video and convert its audio to WAV.
    ffmpeg_extract_subclip(
        "data/video.mp4", start_s, end_s, targetname="data/chunk.mp4"
    )
    utils.convert_to_wav("data/chunk.mp4")

    # NOTE(review): this assumes utils.convert_to_wav writes "chunk.wav" into
    # the working directory rather than next to the input file -- confirm.
    waveform, source_rate = torchaudio.load("chunk.wav")

    # Resample to a 16 kHz waveform array to match the training data.
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=source_rate, new_freq=16_000
    )

    # Encode the speech into model inputs on the target device.
    audio_inputs = processor(
        audios=waveform, sampling_rate=16_000, return_tensors="pt"
    ).to(DEVICE)

    # Translate the Portuguese speech into English speech; element 0 of the
    # generate() result is taken as the new waveform and moved to NumPy.
    translated_chunk = (
        model.generate(**audio_inputs, tgt_lang="eng", speaker_id=2)[0]
        .cpu()
        .numpy()
        .squeeze()
    )

    # Collect the translated chunks in order.
    final_array.append(translated_chunk)

    torch.cuda.empty_cache()

### Render the translated audio

In [None]:
# Join the per-chunk waveforms into one clip and play it back at the sampling
# rate reported by the model configuration.
sample_rate = model.config.sampling_rate
translated_waveform = np.concatenate(final_array)
Audio(translated_waveform, rate=sample_rate)

### Save Audio

In [None]:
# `final_array` is a Python list holding one waveform per 10-second chunk, but
# scipy.io.wavfile.write needs a single NumPy array. Concatenate the chunks
# (as the render cell does) before writing the WAV file.
scipy.io.wavfile.write(
    "data/translated_audio.wav",
    rate=sample_rate,
    data=np.concatenate(final_array),
)

### Combine Whisper with SeamlessM4T v2 for better results

##### Whisper Transcription

In [None]:
# Build the Whisper speech-recognition pipeline.
# chunk_length_s is set to 60 s by the author to avoid output truncation, given
# that Whisper was trained with chunks of 30 s.
# NOTE(review): confirm that 60 s windows are not themselves cut down to 30 s
# by the Whisper feature extractor.
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium",
    chunk_length_s=60,
    device=DEVICE,
)

# Clear the forced decoder ids to avoid direct translation from PT to EN.
pipe.model.config.forced_decoder_ids = None

# Transcribe the Portuguese audio.
waveform, source_rate = torchaudio.load("video.wav")

# The input must be a 16 kHz waveform array.
waveform = torchaudio.functional.resample(
    waveform, orig_freq=source_rate, new_freq=16_000
)

# Only the first row of the loaded tensor (index 0) is passed to the pipeline.
first_channel = waveform.numpy()[0]
prediction = pipe(first_channel, batch_size=8)["text"]

##### SeamlessM4T for T2ST

In [1]:
# T2ST: translate the Portuguese transcript into English speech, one sentence
# at a time.
#
# Splitting on "." leaves an empty string after the final period and keeps the
# whitespace that followed each period. Empty or whitespace-only pieces are
# dropped and the rest are stripped, so the model is never asked to synthesise
# or translate empty text. The list is built once and shared by both loops.
sentences = [part.strip() for part in prediction.split(".") if part.strip()]

audio_segments = []
for sentence in sentences:
    text_inputs = processor(
        text=sentence, src_lang="por", return_tensors="pt"
    ).to(DEVICE)
    # Element 0 of the generate() result is taken as the waveform.
    audio_array_from_text = (
        model.generate(**text_inputs, tgt_lang="eng")[0].cpu().numpy().squeeze()
    )
    audio_segments.append(audio_array_from_text)

# Single waveform used by the render and save cells.
audio_array = np.concatenate(audio_segments)

# You can also check the translated text by running this.
for sentence in sentences:
    text_inputs = processor(
        text=sentence, src_lang="por", return_tensors="pt"
    ).to(DEVICE)
    # generate_speech=False requests text tokens instead of a waveform.
    output_tokens = model.generate(
        **text_inputs, tgt_lang="eng", generate_speech=False
    )
    translated_text_from_text = processor.decode(
        output_tokens[0].tolist()[0], skip_special_tokens=True
    )
    print(translated_text_from_text)

Hi, my name is Luis Roque and I'm the co-founder and CEO of Hub.
Hub is a tech startup focused on supply chain management.
We are talking about a very broad scope of services to brands in the fashion segment, essentially, since from being a data flow based on slices of the chain, we actually have the entire chain and therefore have total visibility.
On top of that, we can develop all the component algorithms that we do and seek efficiency.
Currently, it manages more than 40 supply chains.


##### Render Audio

In [None]:
# Play the speech generated from the transcript at the sampling rate reported
# by the model configuration.
sample_rate = model.config.sampling_rate
Audio(data=audio_array, rate=sample_rate)

##### Save Audio

In [None]:
# Write the concatenated translated speech to a WAV file.
output_path = "data/best_translated_audio.wav"
scipy.io.wavfile.write(output_path, sample_rate, audio_array)