### Import Libraries

In [1]:
from IPython.display import Audio
from moviepy.video.io.ffmpeg_tools import ffmpeg_extract_subclip
import utils
from transformers import AutoProcessor, pipeline, SeamlessM4Tv2Model
import torchaudio
import torch
import numpy as np
import scipy

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

### Load processor and model

In [2]:
processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large").to(DEVICE)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:07<00:00,  3.97s/it]


### Direct Speech to Speech Translation using Seamless
 - Converted a 35 second sound to a 11 second sound which is not good since the cutting was not due to lack of speech

In [3]:
# convert video to wav
utils.convert_to_wav("data/ZAAI_intro.mp4")
# load audio
audio, orig_freq = torchaudio.load("data/ZAAI_intro.wav")
# convert into 16 kHz waveform array to match the training data
audio = torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=16_000)
# encode the speech into a speech embedding
audio_inputs = processor(audios=audio, sampling_rate=16000, return_tensors="pt")
# translate the audio from english to portuguese and generate a new waveform array
audio_array_from_audio = model.generate(**audio_inputs, tgt_lang="por", speaker_id=2)[0].cpu().numpy().squeeze()

MoviePy - Writing audio in data/ZAAI_intro.wav


                                                                    

MoviePy - Done.




In [4]:
sample_rate = model.config.sampling_rate
Audio(audio_array_from_audio, rate=sample_rate)

### Translate Speech from EN to PT
- Better than before but not perfect, since we are creating chunks that do not take into consideration if the audio is in the middle of a sentence, it degradates the final result. Ideally, we would create chunks based on speech pauses.

In [5]:
final_array = []

for i in range(0, 30, 10):

    # cut the video in chunks of 5 seconds and convert the audio into wav format
    # the division of the video in chunks is due to a model limitation that cannot handle long audio inputs
    ffmpeg_extract_subclip("data/ZAAI_intro.mp4", 0 + i, 10 + i, targetname="data/chunk.mp4")
    utils.convert_to_wav("data/chunk.mp4")

    # load audio
    audio, orig_freq = torchaudio.load("data/chunk.wav")
    # convert into 16 kHz waveform array to match the training data
    audio = torchaudio.functional.resample(audio, orig_freq=orig_freq, new_freq=16_000)
    # encode the speech into a speech embedding
    audio_inputs = processor(
        audios=audio,
        sampling_rate=16000,
        return_tensors="pt",
    ).to(DEVICE)
    # translate the audio from english to portuguese and generate a new waveform array
    audio_array_from_audio = (
        model.generate(**audio_inputs, tgt_lang="por", speaker_id=2)[0]
        .cpu()
        .numpy()
        .squeeze()
    )

    # append the different chunks
    final_array.append(audio_array_from_audio)

    torch.cuda.empty_cache()

Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
MoviePy - Writing audio in data/chunk.wav


                                                        

MoviePy - Done.




Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
MoviePy - Writing audio in data/chunk.wav


                                                        

MoviePy - Done.




Moviepy - Running:
>>> "+ " ".join(cmd)
Moviepy - Command successful
MoviePy - Writing audio in data/chunk.wav


                                                        

MoviePy - Done.




### Render the translated audio

In [6]:
sample_rate = model.config.sampling_rate
Audio(np.concatenate(final_array), rate=sample_rate)

### Save Audio

In [7]:
scipy.io.wavfile.write("data/translated_audio.wav", rate=sample_rate, data=np.concatenate(final_array))

### Combine Whisper with Seamless4MT v2 for better results

##### Whisper Transcription

In [8]:
# load Whisper Pipeline
# Set chunk_length_s to 30s (if you have a larger audio you should set it up to a different value
# to avoid output truncation since Whisper was trained with chunks of 30s)
pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-medium",
    device=DEVICE,
)

# to avoid direct translation from PT to EN
pipe.model.config.forced_decoder_ids = None

# transcript PT audio
audio, orig_freq = torchaudio.load("data/ZAAI_intro.wav")
audio = torchaudio.functional.resample(
    audio, orig_freq=orig_freq, new_freq=16_000
)  # must be a 16 kHz waveform array
prediction = pipe(audio.numpy()[0], batch_size=8)["text"]

Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


##### SeamlessM4T for T2ST

In [9]:
# T2ST
audio_array = []
for i in prediction.split("."):
    text_inputs = processor(text=i, src_lang="eng", return_tensors="pt").to(DEVICE)
    audio_array_from_text = (
        model.generate(**text_inputs, tgt_lang="por")[0].cpu().numpy().squeeze()
    )
    audio_array.append(audio_array_from_text)

audio_array = np.concatenate(audio_array)

# you can also check the translated test by running this
for i in prediction.split("."):
    text_inputs = processor(text=i, src_lang="eng", return_tensors="pt").to(DEVICE)
    output_tokens = model.generate(**text_inputs, tgt_lang="por", generate_speech=False)
    translated_text_from_text = processor.decode(
        output_tokens[0].tolist()[0], skip_special_tokens=True
    )
    print(translated_text_from_text)

Olá a todos, sou o Luís, fundador e sócio da Zai.
Estamos na beira da inovação e tecnologia de IA.
Temos duas áreas principais em que nos concentramos
Em primeiro lugar, construímos projetos de consultoria altamente complexos para clientes empresariais, desbloqueando valor real com IA gerativa, previsão de séries temporais, sistemas de recomendação, modelos de marketing
E a segunda área principal é que financiamos startups de núcleo de IA onde os ajudamos a desenvolver a tecnologia, mas também em sua estratégia de lançamento no mercado
Não, não.


##### Render Audio

In [10]:
sample_rate = model.config.sampling_rate
Audio(audio_array, rate=sample_rate)

##### Save Audio

In [11]:
scipy.io.wavfile.write(
    "data/best_translated_audio.wav", rate=sample_rate, data=audio_array
)