## Interface

In [3]:
import gradio as gr
import numpy as np
import os
import subprocess
import torch
import wave, struct
from transformers import pipeline, AutoProcessor, BarkModel

# --- Audio settings -------------------------------------------------------
RATE = 16000   # sample rate that incoming audio is resampled to before transcription
CHUNK = 1024   # block size

# --- Speech-to-text: Whisper ASR pipeline ---------------------------------
pipe_v2t = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
)

# --- Text-to-speech: Bark processor + model -------------------------------
model_id = "suno/bark-small"
# NOTE(review): `torch_dtype` is passed to the processor here, not to the model;
# it looks like it was meant for `BarkModel.from_pretrained` -- confirm.
processor = AutoProcessor.from_pretrained(model_id, torch_dtype=torch.float16)
model = BarkModel.from_pretrained(model_id).to_bettertransformer()
# (model.enable_cpu_offload() is intentionally left disabled.)
sample_rate = model.generation_config.sample_rate

# --- Translation: one Spanish -> target pipeline and Bark voice per language
pipe_translation = {
    "en": dict(
        translator=pipeline("translation", model="Helsinki-NLP/opus-mt-es-en"),
        voice_preset="v2/en_speaker_5",
    ),
    "fr": dict(
        translator=pipeline("translation", model="Helsinki-NLP/opus-mt-es-fr"),
        voice_preset="v2/fr_speaker_5",
    ),
    "ru": dict(
        translator=pipeline("translation", model="Helsinki-NLP/opus-mt-es-ru"),
        voice_preset="v2/ru_speaker_5",
    ),
}

def _load_audio_as_float32(filepath, sample_rate):
    """Decode an audio file to mono float32 samples scaled by 1/32768.

    ffmpeg converts ``filepath`` to a mono, 16-bit little-endian PCM WAV at
    ``sample_rate`` inside a private temporary directory, so concurrent calls
    never share an intermediate file and nothing is left behind on error.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    import shutil
    import tempfile

    # Prefer whatever ffmpeg is on PATH; fall back to the Homebrew location
    # this notebook originally hard-coded.
    ffmpeg_bin = shutil.which('ffmpeg') or '/opt/homebrew/bin/ffmpeg'

    with tempfile.TemporaryDirectory() as workdir:
        wav_path = os.path.join(workdir, 'tmp.wav')
        # Argument list + no shell: paths containing spaces or shell
        # metacharacters are passed through verbatim.
        subprocess.run(
            [
                ffmpeg_bin, '-hide_banner', '-loglevel', 'error', '-y',
                '-i', str(filepath),
                '-ac', '1',              # force mono so one frame == one int16 sample
                '-ar', str(sample_rate),
                '-c:a', 'pcm_s16le',     # 16-bit little-endian PCM
                wav_path,
            ],
            check=True,
        )
        with wave.open(wav_path, 'rb') as wav_file:
            pcm_bytes = wav_file.readframes(wav_file.getnframes())

    # float32 keeps every int16 value exact (float16 cannot above 2048).
    samples = np.frombuffer(pcm_bytes, dtype='<i2').astype(np.float32)
    return samples / 32768.0


def transcribe(filepath, target_lang='en'):
    """Transcribe Spanish speech, translate it, and synthesise the translation.

    Args:
        filepath: Path to the recorded audio file (any format ffmpeg can read).
        target_lang: Key into ``pipe_translation`` selecting the translator and
            Bark voice preset. ``None`` is treated as ``'en'``.

    Returns:
        ``(translation_text, (sample_rate, audio_array))`` where ``audio_array``
        is the generated speech as a NumPy array.

    Raises:
        ValueError: if ``filepath`` is empty or ``None``.
        KeyError: if ``target_lang`` is not a key of ``pipe_translation``.
        subprocess.CalledProcessError: if the ffmpeg conversion fails.
    """
    if not filepath:
        raise ValueError('No audio input was provided.')
    if target_lang is None:
        # The UI passes None when no language has been picked.
        target_lang = 'en'
    # Look the language up first so an unknown key fails before the slow ASR step.
    lang_config = pipe_translation[target_lang]

    array = _load_audio_as_float32(filepath, RATE)

    output = pipe_v2t(array,
        max_new_tokens = 256,
        generate_kwargs = {
            "task": "transcribe",
            "language": "spanish",
        }
    )
    output = output['text']

    translation_output = lang_config['translator'](output, max_length=256)[0]['translation_text']

    inputs = processor(translation_output, voice_preset=lang_config['voice_preset'])
    audio_array = model.generate(**inputs)
    audio_array = audio_array.cpu().numpy().squeeze()
    return translation_output, (sample_rate, audio_array)


# Dropdown entries as (label shown in the UI, value handed to `transcribe`).
choices = [('English', 'en'), ('French', 'fr'), ('Russian', 'ru')]
interface = gr.Interface(fn=transcribe,
                         inputs=[
                             gr.Audio(sources='microphone', type='filepath'),
                             # Pre-select English: without a default the dropdown
                             # submits None, which is not a key of `pipe_translation`.
                             gr.Dropdown(choices=choices, value='en', label='Translate to'),
                             ],
                         outputs=[
                             gr.Textbox(label='Voice to Text'),
                             # Hidden player with autoplay enabled.
                             gr.Audio(label='Read Aloud', autoplay=True, visible=False)
                             ],
                         allow_flagging="never",
                         )

# Close servers left over from earlier runs of this cell, then start a new one.
gr.close_all()
interface.launch()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


Closing server running on port: 7860
Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




ffmpeg version 7.0.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.3.9.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.0.1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --

fr_speaker_5_semantic_prompt.npy:   0%|          | 0.00/4.55k [00:00<?, ?B/s]

fr_speaker_5_coarse_prompt.npy:   0%|          | 0.00/13.4k [00:00<?, ?B/s]

fr_speaker_5_fine_prompt.npy:   0%|          | 0.00/26.7k [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
ffmpeg version 7.0.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with Apple clang version 15.0.0 (clang-1500.3.9.4)
  configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.0.1 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --

In [5]:
# Shut down any Gradio servers that are still running.
gr.close_all()

## Recording

In [18]:
import pyaudio # Soundcard audio I/O access library
import wave # Python 3 module for reading / writing simple .wav files

# Recording / playback settings used by the PyAudio cells below
FORMAT = pyaudio.paInt16 # Sample format handed to the PyAudio streams
CHANNELS = 1 # Adjust to your number of channels
RATE = 16000 # Sample rate for the streams and the WAV header
CHUNK = 1024 # Frames read or written per stream call
RECORD_SECONDS = 15 # Recording duration in seconds
WAVE_OUTPUT_FILENAME = "file.wav" # WAV file the recording is written to and later read from



In [25]:
# Record RECORD_SECONDS of microphone audio and save it to WAVE_OUTPUT_FILENAME.
# The stream and the PyAudio instance are released in `finally` blocks so an
# error or a kernel interrupt during recording does not leave the device open.
audio = pyaudio.PyAudio()
frames = []
try:
    # Query the sample width while the PyAudio instance is still alive.
    sample_width = audio.get_sample_size(FORMAT)

    stream = audio.open(format=FORMAT, channels=CHANNELS,
                        rate=RATE, input=True,
                        frames_per_buffer=CHUNK)
    try:
        print("recording...")
        # Number of whole CHUNK-sized reads that fit in RECORD_SECONDS.
        for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
            frames.append(stream.read(CHUNK))
        print("finished recording")
    finally:
        stream.stop_stream()
        stream.close()
finally:
    audio.terminate()

# Write the captured frames with the built-in wave module; the context manager
# closes the file even if writing fails.
with wave.open(WAVE_OUTPUT_FILENAME, 'wb') as waveFile:
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(sample_width)
    waveFile.setframerate(RATE)
    waveFile.writeframes(b''.join(frames))

recording...
finished recording


## Playing wav file

In [26]:
import pyaudio
import wave

# Play WAVE_OUTPUT_FILENAME through the default output device.
# The stream is configured from the file's own header (sample width, channel
# count, frame rate) instead of the recording constants, so any PCM WAV file
# plays correctly.
audio = pyaudio.PyAudio()
try:
    with wave.open(WAVE_OUTPUT_FILENAME, 'rb') as waveFile:
        stream = audio.open(format=audio.get_format_from_width(waveFile.getsampwidth()),
                            channels=waveFile.getnchannels(),
                            rate=waveFile.getframerate(),
                            output=True)
        try:
            # readframes() returns b'' at end of file, which stops the iteration.
            for data in iter(lambda: waveFile.readframes(CHUNK), b''):
                # writing to the stream is what *actually* plays the sound.
                stream.write(data)
        finally:
            stream.stop_stream()
            stream.close()
finally:
    audio.terminate()

## Reading the wav file

In [27]:
import wave, struct

# Decode WAVE_OUTPUT_FILENAME into a list of floats scaled by 1/32768.
frames = []
with wave.open(WAVE_OUTPUT_FILENAME, 'rb') as waveFile:
    if waveFile.getsampwidth() != 2:
        raise ValueError("expected 16-bit PCM samples in " + WAVE_OUTPUT_FILENAME)
    # readframes() returns b'' at end of file, which stops the iteration.
    for data in iter(lambda: waveFile.readframes(CHUNK), b''):
        # Size the format from the bytes actually read: the last chunk is
        # usually shorter than CHUNK and a fixed count would raise struct.error.
        n_samples = len(data) // 2
        frames.extend(struct.unpack(f'<{n_samples}h', data))

# Scale signed 16-bit integers by 1/32768.
frames = [frame/32768.0 for frame in frames]

print(frames[:10])

[-0.001983642578125, -0.002777099609375, -0.00250244140625, -0.004486083984375, -0.00518798828125, -0.005859375, -0.0074462890625, -0.007568359375, -0.00732421875, -0.00885009765625]


## Transcribing

In [28]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the Whisper processor and model.
# NOTE(review): this rebinds the notebook-level names `processor` and `model`,
# which the `transcribe` function in the Interface cell also reads.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# Decoder prompt ids requesting a Spanish transcription.
forced_decoder_ids = processor.get_decoder_prompt_ids(language="spanish", task="transcribe")

# Convert the decoded samples in `frames` into model input features (PyTorch tensors).
input_features = processor(frames, sampling_rate=RATE, return_tensors="pt").input_features

# generate token ids
predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
# decode token ids to text
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

print(transcription[0])

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


 Se han detectado un elemento ISR y un puesto de mando enemigo de entidad grupo táctico. Identifica tres líneas de acción posibles.


In [29]:
# Persist the first decoded transcription so later cells can reload it.
# NOTE(review): no explicit encoding is passed, so the platform default is used.
with open("transcription.txt", "w") as transcript_file:
    transcript_file.write(transcription[0])

## Text to Speech

In [21]:
# Read the saved transcription back; `transcription` becomes a plain string.
# NOTE(review): no explicit encoding is passed, so the platform default is used.
with open("transcription.txt", "r") as transcript_file:
    transcription = transcript_file.read()

In [22]:
from transformers import AutoProcessor, BarkModel

# Load the full-size Bark checkpoint and its processor.
bark_checkpoint = "suno/bark"
processor = AutoProcessor.from_pretrained(bark_checkpoint)
model = BarkModel.from_pretrained(bark_checkpoint)

# Spanish speaker preset used for synthesis.
voice_preset = "v2/es_speaker_5"

# Encode the transcription, generate speech, and convert the result to a
# squeezed NumPy array.
inputs = processor(transcription, voice_preset=voice_preset)
audio_array = model.generate(**inputs).cpu().numpy().squeeze()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [24]:
# Synthesise a short fixed sentence with the same voice preset.
sample_text = ['¿Cuánto tiempo tomará esto?']
inputs = processor(sample_text, voice_preset=voice_preset)

# Generate speech and convert the result to a squeezed NumPy array.
audio_array = model.generate(**inputs).cpu().numpy().squeeze()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [26]:
import pyaudio, struct

# Play the generated `audio_array` through the default output device.
import numpy as np

PLAYBACK_RATE = 24000  # output rate used for the generated audio

# Scale the float samples to int16 at half of full scale (peak -> 16384).
samples = np.asarray(audio_array)
peak = float(np.abs(samples).max()) if samples.size else 0.0
if peak > 0:
    normalized_array = samples / (peak * 2) * 32768
else:
    # All-zero (or empty) audio: avoid 0/0 -> NaN -> garbage int16 values.
    normalized_array = np.zeros_like(samples)
# Explicit little-endian int16 so the raw bytes match the stream format.
normalized_array = normalized_array.astype('<i2')

audio = pyaudio.PyAudio()
try:
    stream = audio.open(format = FORMAT,
                        channels = CHANNELS,
                        rate = PLAYBACK_RATE,
                        output = True)
    try:
        for index in range(0, len(normalized_array), CHUNK):
            # writing to the stream is what *actually* plays the sound.
            stream.write(normalized_array[index:index+CHUNK].tobytes())
    finally:
        stream.stop_stream()
        stream.close()
finally:
    # Release PortAudio even if playback failed part-way through.
    audio.terminate()