<a href="https://colab.research.google.com/github/trandinhson3086/End-to-End-Text-To-Speech/blob/main/E2E_TensorflowTTS_KSS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TensorFlowTTS real-time E2E-TTS demonstration

This notebook provides a demonstration of real-time E2E-TTS using TensorFlowTTS for Korean (using the KSS dataset).

- Github: https://github.com/TensorSpeech/TensorflowTTS
- Colab for English: https://colab.research.google.com/drive/1akxtrLZHKuMiQup00tzO2olCaN-y3KiD?usp=sharing

## Install

In [1]:
import os
!git clone https://github.com/TensorSpeech/TensorFlowTTS.git
os.chdir("TensorFlowTTS")
!pip install .
os.chdir("..")
import sys
sys.path.append("TensorFlowTTS/")

Cloning into 'TensorFlowTTS'...
remote: Enumerating objects: 10609, done.[K
remote: Counting objects: 100% (319/319), done.[K
remote: Compressing objects: 100% (161/161), done.[K
remote: Total 10609 (delta 183), reused 234 (delta 153), pack-reused 10290[K
Receiving objects: 100% (10609/10609), 133.30 MiB | 29.46 MiB/s, done.
Resolving deltas: 100% (5122/5122), done.
Processing /content/TensorFlowTTS
[33m  DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.
   pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.[0m
Collecting tensorflow-gpu==2.6.0
  Downloading tensorflow_gpu-2.6.0-cp37-cp37m-manylinux2010_x86_64.whl (458.3 MB)
[K     |████████████████████████████████| 458.3 MB 11 kB/s 
[?25hCollec

In [2]:
!pip install git+https://github.com/repodiac/german_transliterate.git#egg=german_transliterate
!pip install h5py==2.10.0
!pip install gradio

Collecting german_transliterate
  Cloning https://github.com/repodiac/german_transliterate.git to /tmp/pip-install-nyg47tos/german-transliterate_b0cf0a8713434d50b120b54c4c298794
  Running command git clone -q https://github.com/repodiac/german_transliterate.git /tmp/pip-install-nyg47tos/german-transliterate_b0cf0a8713434d50b120b54c4c298794
Collecting num2words
  Downloading num2words-0.5.10-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 3.5 MB/s 
Building wheels for collected packages: german-transliterate
  Building wheel for german-transliterate (setup.py) ... [?25l[?25hdone
  Created wheel for german-transliterate: filename=german_transliterate-0.1.3-py3-none-any.whl size=20830 sha256=dcc2b9ebe533f7eb9870b67a600a0e5130cf391dcdd4a1721b4668c5e36c38aa
  Stored in directory: /tmp/pip-ephem-wheel-cache-mt87uub9/wheels/77/17/55/6c6d2d33bd2b3b8a3741e12b17f0b18278861f64858bbcc228
Successfully built german-transliterate
Installing collected packages: num2words,

In [8]:
import tensorflow as tf

import yaml
import numpy as np
import matplotlib.pyplot as plt

import IPython.display as ipd

from tensorflow_tts.inference import AutoConfig
from tensorflow_tts.inference import TFAutoModel
from tensorflow_tts.inference import AutoProcessor

## Load Model

### Tacotron 2
# Load the pretrained Tacotron 2 text2mel model for KSS (Korean) by its model id.
# NOTE(review): presumably downloads the weights on first use — confirm against TensorFlowTTS docs.
tacotron2 = TFAutoModel.from_pretrained("tensorspeech/tts-tacotron2-kss-ko", name="tacotron2")

### FastSpeech2
# Load the pretrained FastSpeech2 text2mel model for KSS (Korean) by its model id.
fastspeech2 = TFAutoModel.from_pretrained("tensorspeech/tts-fastspeech2-kss-ko", name="fastspeech2")

### Multi-band MelGAN
# Load the pretrained Multi-band MelGAN vocoder for KSS (Korean) by its model id.
mb_melgan = TFAutoModel.from_pretrained("tensorspeech/tts-mb_melgan-kss-ko", name="mb_melgan")

### Inference
# Module-level text processor; do_synthesis() reads this global to turn text into ids.
# It is loaded from the Tacotron 2 KSS model id and used for both text2mel models.
processor = AutoProcessor.from_pretrained("tensorspeech/tts-tacotron2-kss-ko")

def do_synthesis(input_text, text2mel_model, vocoder_model, text2mel_name, vocoder_name):
  """Synthesize audio from text with a text2mel model followed by a vocoder.

  Args:
    input_text: Text to synthesize; converted to ids with the module-level `processor`.
    text2mel_model: Model whose `inference` is called; must match `text2mel_name`.
    vocoder_model: Model whose `inference` turns the mel output into audio.
    text2mel_name: Either "TACOTRON" or "FASTSPEECH2".
    vocoder_name: Must be "MB-MELGAN".

  Returns:
    For "TACOTRON": a 3-tuple `(mel_outputs, alignment_history, audio)`.
    For "FASTSPEECH2": a 2-tuple `(mel_outputs, audio)`.
    Every element is the result of calling `.numpy()` on the model output.

  Raises:
    ValueError: If `text2mel_name` or `vocoder_name` is not supported.
  """
  # Validate both names up front so a typo fails immediately instead of after
  # text processing and a full text2mel inference pass.
  if text2mel_name not in ("TACOTRON", "FASTSPEECH2"):
    raise ValueError("Only TACOTRON, FASTSPEECH2 are supported on text2mel_name")
  if vocoder_name != "MB-MELGAN":
    raise ValueError("Only MB-MELGAN is supported on vocoder_name")

  input_ids = processor.text_to_sequence(input_text)
  # Add a leading batch axis of size 1; both models receive the same id tensor.
  batched_ids = tf.expand_dims(tf.convert_to_tensor(input_ids, dtype=tf.int32), 0)
  # Speaker id 0 is passed to both models.
  speaker_ids = tf.convert_to_tensor([0], dtype=tf.int32)

  # text2mel part
  alignment_history = None
  if text2mel_name == "TACOTRON":
    _, mel_outputs, _, alignment_history = text2mel_model.inference(
        batched_ids,
        tf.convert_to_tensor([len(input_ids)], tf.int32),
        speaker_ids,
    )
  else:  # "FASTSPEECH2"
    # All ratios are fixed at 1.0.
    _, mel_outputs, _, _, _ = text2mel_model.inference(
        batched_ids,
        speaker_ids=speaker_ids,
        speed_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        f0_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
        energy_ratios=tf.convert_to_tensor([1.0], dtype=tf.float32),
    )

  # vocoder part
  # Assumes the vocoder output is (batch, samples, channels) — TODO confirm;
  # the indexing keeps item 0 of the first and last axes.
  audio = vocoder_model.inference(mel_outputs)[0, :, 0]

  if text2mel_name == "TACOTRON":
    return mel_outputs.numpy(), alignment_history.numpy(), audio.numpy()
  return mel_outputs.numpy(), audio.numpy()

def visualize_attention(alignment_history):
  """Show the given alignment matrix as a heat map with a colorbar.

  Args:
    alignment_history: 2-D array-like passed straight to `imshow`. It is drawn
      with the origin at the bottom; the x axis is labelled as decoder
      timesteps and the y axis as encoder timesteps.
  """
  import matplotlib.pyplot as plt

  figure, axes = plt.subplots(figsize=(8, 6))
  axes.set_title('Alignment steps')
  heatmap = axes.imshow(
      alignment_history,
      aspect='auto',
      origin='lower',
      interpolation='none',
  )
  figure.colorbar(heatmap, ax=axes)
  axes.set_xlabel('Decoder timestep')
  axes.set_ylabel('Encoder timestep')
  figure.tight_layout()
  plt.show()
  # Release the figure once it has been displayed.
  plt.close(figure)

def visualize_mel_spectrogram(mels):
  """Show the predicted mel spectrogram as an image with a horizontal colorbar.

  Args:
    mels: Mel output; reshaped to rows of 80 values before plotting.
  """
  mel_frames = tf.reshape(mels, [-1, 80]).numpy()
  # Rotate by 90 degrees before display, as np.rot90 does by default.
  rotated = np.rot90(mel_frames)

  figure = plt.figure(figsize=(10, 8))
  # Only the top slot of a 3-row grid is used.
  top_axes = figure.add_subplot(3, 1, 1)
  top_axes.set_title('Predicted Mel-after-Spectrogram')
  heatmap = top_axes.imshow(rotated, aspect='auto', interpolation='none')
  figure.colorbar(mappable=heatmap, shrink=0.65, orientation='horizontal', ax=top_axes)
  plt.show()
  plt.close()

# -*- coding: utf-8 -*-
import numpy as np
import torch
import gradio as gr
import sys
from scipy.io.wavfile import write

def synthesize(input_text, speed):
    """Gradio callback: synthesize `input_text` and write it to a WAV file.

    Runs `do_synthesis` with the module-level `tacotron2` and `mb_melgan`
    models and writes the audio to 'output.wav' at 22050 Hz.

    Args:
        input_text: Text to synthesize.
        speed: Value of the speed slider.
            NOTE(review): accepted but not used anywhere in this function.

    Returns:
        The output path twice, one value for each of the two Gradio outputs.
    """
    wav_path = 'output.wav'
    _mel, _alignment, waveform = do_synthesis(
        input_text, tacotron2, mb_melgan, "TACOTRON", "MB-MELGAN"
    )
    print('\n\nDone.\n')
    write(wav_path, 22050, waveform)
    return wav_path, wav_path

# User-facing text for the demo page. The app turns typed Korean text into speech,
# so the title and description describe text-to-speech and a text input.
title = "A Text-to-Speech Engine for Korean"
description = "To use it, simply type your Korean text, or click one of the examples to load them."
# Each example row supplies one value per input, in order: [text, speed].
examples = [
 ["사무실에 잠깐 들러 주실 수 있으세요?", 1],
 ["연말에는 한 해를 돌아볼 필요가 있다.", 1]
]

ints=gr.inputs.Textbox(lines=15, label="Text Input")
# NOTE(review): synthesize() receives this slider value but does not use it.
speed=gr.inputs.Slider(minimum=0, maximum=2, default=1, label="Speed Regulator")
# synthesize() returns the same file path for both outputs: one is played back,
# the other is offered as a download.
output1=gr.outputs.Audio( type="file", label="Audio")
output2=gr.outputs.File(label='Download File')
# NOTE(review): share=True presumably exposes the app through a public link — confirm this is intended.
gr.Interface(fn=synthesize,  inputs=[ints, speed], outputs=[output1, output2],
    title=title,
    description=description, examples=examples).launch(share=True)