### Import libraries

In [None]:
from transformers import pipeline
from transformers import AutoProcessor, AutoModel
from transformers import VitsModel, AutoTokenizer, set_seed

import scipy
from scipy.io.wavfile import write as write_wav

from IPython.display import Audio
import IPython.display as ipd

from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

import torch
from ruaccent import RUAccent

### Bark

In [None]:
# Bark through the high-level `pipeline` helper: build the synthesiser,
# generate speech for one English sentence and save it as a WAV file.
model_id = "suno/bark"
task = "text-to-speech"
text = "Hello, my dog is cooler than you!"
synthesiser = pipeline(task, model_id)

speech = synthesiser(
    text_inputs=text,
    forward_params={"do_sample": True},
)

filename = "bark_out.wav"
# scipy interprets a 2-D array as (n_samples, n_channels). The pipeline may
# return the waveform with a leading singleton axis, i.e. (1, n_samples),
# which would then be written as one sample with n_samples channels.
# Dropping singleton axes yields a plain mono signal; on an already 1-D
# array `squeeze()` is a no-op.
scipy.io.wavfile.write(
    filename=filename,
    rate=speech["sampling_rate"],
    data=speech["audio"].squeeze(),
)

### Bark-small

In [None]:
# Bark-small driven through the processor/model API directly (no pipeline):
# tokenize the Russian prompt, sample a waveform, play it and save it.
model_id = "suno/bark-small"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id)
text = ["""Нажмите на кнопку Р У 5. Это включит свет"""
]

inputs = processor(
    text=text,
    return_tensors="pt",
)

speech_values = model.generate(
    **inputs,
    do_sample=True,
    pad_token_id=processor.tokenizer.pad_token_id
)

# Read the sample rate and convert the generated tensor once; both the
# player and the WAV writer below use the same values.
sampling_rate = model.generation_config.sample_rate
waveform = speech_values.cpu().numpy().squeeze()

# A bare `Audio(...)` expression is rendered only when it is the last
# statement of a cell. It is followed by the file write here, so the player
# has to be displayed explicitly.
ipd.display(Audio(waveform, rate=sampling_rate))

# NOTE: this is the same file name the "suno/bark" cell writes to, so running
# both cells leaves only the most recent result on disk.
write_wav(
    "bark_out.wav",
    rate=sampling_rate,
    data=waveform
)

### facebook/tts_transformer-ru-cv7_css10

In [None]:
# fairseq transformer TTS: fetch the checkpoint from the Hugging Face Hub,
# build a generator for it and synthesise one Russian sentence.
model_id = "facebook/tts_transformer-ru-cv7_css10"
hub_overrides = dict(vocoder="hifigan", fp16=False)
models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
    model_id, arg_overrides=hub_overrides
)

# Only the first model of the returned ensemble is used.
model = models[0]
TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
generator = task.build_generator([model], cfg)

# The number is spelled out as a word ("ПЯТЬ") in this prompt.
text = "Нажмите на кнопку Р У ПЯТЬ. Это включит свет"

sample = TTSHubInterface.get_model_input(task, text)
prediction = TTSHubInterface.get_prediction(task, model, generator, sample)
wav, rate = prediction

# Last expression of the cell: renders the audio player.
ipd.Audio(wav, rate=rate)

### utrobinmv/tts_ru_free_hf_vits_low_multispeaker

In [None]:
# VITS multispeaker model: load model + tokenizer, put stress marks on the
# text with RUAccent, synthesise with the chosen speaker and save a WAV file.
model_name = "utrobinmv/tts_ru_free_hf_vits_low_multispeaker"

model = VitsModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model.eval()

# Accentizer that annotates the text before it is tokenized.
accentizer = RUAccent()
accentizer.load(omograph_model_size='turbo', use_dictionary=True)

# Fixed seed so repeated runs produce the same output.
set_seed(555)

# Speaker index: 0 = male voice, 1 = female voice.
speaker = 1

text = """Нажмите на кнопку Р У 5. Это включит свет"""

# Place the accents and show the annotated text that is actually synthesised.
text = accentizer.process_all(text)
print(text)

inputs = tokenizer(text, return_tensors="pt")
inputs["speaker_id"] = speaker

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    output = model(**inputs).waveform

# First item of the batch, converted to a NumPy array for the WAV writer.
first_waveform = output[0].cpu().numpy()
scipy.io.wavfile.write(
    "tts_audio.wav",
    rate=model.config.sampling_rate,
    data=first_waveform,
)

### Coqui TTS (XTTS v2)

In [None]:
import os

import torch
from TTS.api import TTS

# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reference recording passed to `tts_to_file` as `speaker_wav`.
# The default is the original machine-specific location; set the
# XTTS_SPEAKER_WAV environment variable to run the cell elsewhere
# without editing the notebook.
speaker_wav = os.environ.get(
    "XTTS_SPEAKER_WAV",
    "/home/vitalii/projects/voice-assistant-interactive-guide/experiments/test.wav",
)
# Fail before the model is loaded if the recording is missing.
if not os.path.isfile(speaker_wav):
    raise FileNotFoundError(
        f"Speaker reference recording not found: {speaker_wav!r}. "
        "Set XTTS_SPEAKER_WAV to the path of a WAV file."
    )

# List available 🐸TTS models
print(TTS().list_models())

# Init TTS
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)

# Text to speech to a file
tts.tts_to_file(
    text="Нажмите на кнопку Р У ПЯТЬ. Это включит свет.",
    speaker_wav=speaker_wav,
    language="ru",
    file_path="output.wav",
)

### parler-tts/parler_tts_mini_v0.1

In [None]:
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf

# Parler-TTS: the voice is steered by a free-text description, and the text
# to be spoken is passed separately as the prompt.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

parler_checkpoint = "parler-tts/parler_tts_mini_v0.1"
model = ParlerTTSForConditionalGeneration.from_pretrained(parler_checkpoint)
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(parler_checkpoint)

prompt = "Нажмите на кнопку РУ ПЯТЬ. Это включит свет."
description = "A female speaker with a slightly low-pitched voice delivers her words quite expressively, in a very confined sounding environment with clear audio quality. She speaks very fast."

# Description and prompt go through the same tokenizer and are moved to the
# same device as the model.
input_ids = tokenizer(description, return_tensors="pt").input_ids
input_ids = input_ids.to(device)
prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids
prompt_input_ids = prompt_input_ids.to(device)

generation = model.generate(
    input_ids=input_ids,
    prompt_input_ids=prompt_input_ids,
)

# Bring the result back to the CPU and drop singleton axes before writing.
audio_arr = generation.cpu().numpy().squeeze()
sf.write("parler_tts_out.wav", audio_arr, model.config.sampling_rate)


### Vosk

In [1]:
from vosk_tts import Model, Synth

# Vosk TTS: load the multi-speaker Russian model and wrap it in a synthesiser.
model = Model(model_name="vosk-model-tts-ru-0.6-multi")
synth = Synth(model)

# Synthesise the sentence with speaker 2 and write it to "out.wav".
vosk_sentence = "Нажмите на кнопку РУ ПЯТЬ. Это включит свет."
vosk_wav_path = "out.wav"
synth.synth(vosk_sentence, vosk_wav_path, speaker_id=2)

### snakers4/silero-models

In [None]:
# Silero TTS, V4 Russian model, loaded through torch.hub.
import torch

language = 'ru'
model_id = 'v4_ru'
sample_rate = 48000
speaker = 'baya'
device = torch.device('cpu')

# torch.hub returns the model together with an example text.
model, example_text = torch.hub.load(
    'snakers4/silero-models',
    'silero_tts',
    language=language,
    speaker=model_id,
)
model.to(device)  # gpu or cpu

# Replace the bundled example with the text to synthesise (placeholder).
example_text = "ТУТ ВАШ ТЕКСТ"

audio = model.save_wav(
    text=example_text,
    speaker=speaker,
    sample_rate=sample_rate,
)