In [136]:
!pip install speechbrain librosa matplotlib torchaudio

Collecting transformers
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.4 kB ? eta -:--:--
     ----------------- -------------------- 20.5/44.4 kB 330.3 kB/s eta 0:00:01
     -------------------------- ----------- 30.7/44.4 kB 262.6 kB/s eta 0:00:01
     -------------------------------------- 44.4/44.4 kB 313.2 kB/s eta 0:00:00
Collecting librosa
  Using cached librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.0-cp312-none-win_amd64.whl.metadata (6.9 kB)
Collecting audioread>=2.1.9 (from librosa)
  Using cached audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Using cached soundfile-0.12.1-py2.py3-none-win_amd64.whl.metadata (14 kB)
Collecting pooch>=1.1 (

In [8]:
"""Author: yassine ibork"""

# Both model wrappers come from speechbrain.inference; speechbrain.pretrained
# is the older, deprecated import location for the same Tacotron2 class.
from speechbrain.inference.TTS import Tacotron2
from speechbrain.inference.vocoders import HIFIGAN
import tempfile
import torchaudio
import os

# --- Configuration -----------------------------------------------------------
# Text to synthesize.
TEXT = "Text-to-Speech (TTS) is a technology that converts written text into spoken words. It uses algorithms and voice synthesis techniques to produce speech that can be either robotic or human-like, depending on the quality of the synthesis engine. TTS systems are used in various applications, such as reading content aloud for people with visual impairments, voice assistants (e.g., Amazon Alexa, Google Assistant), and automated customer service. TTS solutions typically allow customization of language, voice type, speed, and tone, enabling more natural and context-appropriate audio output."

# Sample rate (Hz) written into the WAV header.
# NOTE(review): 22050 is assumed to match the output rate of the LJSpeech
# checkpoints loaded below -- confirm against the model cards.
SAMPLE_RATE = 22050

# Destination of the generated audio.
output_directory = os.path.expanduser("~/GeneratedSpeech")
output_file_path = os.path.join(output_directory, "generated_speech.wav")

# --- Synthesis ---------------------------------------------------------------
# The checkpoints are fetched into temporary directories that are removed when
# the `with` block exits; only `waveforms` is used after the block.
with tempfile.TemporaryDirectory() as tmpdir_tts, tempfile.TemporaryDirectory() as tmpdir_vocoder:
    # Text -> mel-spectrogram model
    tacotron2 = Tacotron2.from_hparams(
        source="speechbrain/tts-tacotron2-ljspeech",
        savedir=tmpdir_tts,
    )

    # Mel-spectrogram -> waveform vocoder
    hifi_gan = HIFIGAN.from_hparams(
        source="speechbrain/tts-hifigan-ljspeech",
        savedir=tmpdir_vocoder,
    )

    # Running the TTS (text-to-mel-spectrogram).
    # NOTE(review): the whole paragraph is encoded in a single call; if the
    # model caps the number of decoder steps the audio may end before the
    # text does -- listen to the output and split TEXT into sentences if so.
    mel_output, mel_length, alignment = tacotron2.encode_text(TEXT)

    # Running the vocoder (mel-spectrogram-to-waveform)
    waveforms = hifi_gan.decode_batch(mel_output)

# --- Save --------------------------------------------------------------------
# Create the output folder first: torchaudio.save does not create missing
# parent directories, so the cell would fail on a machine without the folder.
os.makedirs(output_directory, exist_ok=True)

# squeeze(1) drops dimension 1 of the vocoder output before saving.
# NOTE(review): presumably this turns (batch, 1, time) into the
# (channels, time) layout torchaudio.save expects -- confirm.
torchaudio.save(output_file_path, waveforms.squeeze(1), SAMPLE_RATE)

print(f"Audio saved to {output_file_path}")


Audio saved to C:\Users\yibor/Documents/wsu/fall 2024/spoken language processing/presentation/GeneratedSpeech\generated_speech.wav
