<a href="https://colab.research.google.com/github/vishnubharadwaj1234/JoeTTS/blob/main/JoeTTS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
# Upgrade pip first, then install Coqui TTS, then force the newest librosa.
# NOTE(review): the order matters. Per the recorded output below, `pip install TTS`
# resolves to numpy==1.22.0 / librosa 0.10.0, and the following librosa upgrade
# replaces them with numpy 1.26.4 / librosa 0.10.2.post1, after which pip prints a
# dependency-resolver ERROR (truncated in the saved output). Confirm TTS still
# works with that combination and consider pinning exact versions.
!pip install -U pip
!pip install TTS
!pip install --upgrade librosa

Collecting numpy==1.22.0 (from TTS)
  Using cached numpy-1.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
INFO: pip is looking at multiple versions of librosa to determine which version is compatible with other requirements. This could take a while.
Collecting librosa>=0.10.0 (from TTS)
  Using cached librosa-0.10.2-py3-none-any.whl.metadata (8.6 kB)
  Using cached librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
  Using cached librosa-0.10.0.post2-py3-none-any.whl.metadata (8.3 kB)
  Using cached librosa-0.10.0.post1-py3-none-any.whl.metadata (8.3 kB)
  Using cached librosa-0.10.0-py3-none-any.whl.metadata (8.3 kB)
Using cached numpy-1.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
Using cached librosa-0.10.0-py3-none-any.whl (252 kB)
Installing collected packages: numpy, librosa
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1

Collecting librosa
  Using cached librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting numpy!=1.22.0,!=1.22.1,!=1.22.2,>=1.20.3 (from librosa)
  Using cached numpy-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
  Using cached numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Using cached librosa-0.10.2.post1-py3-none-any.whl (260 kB)
Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Installing collected packages: numpy, librosa
  Attempting uninstall: numpy
    Found existing installation: numpy 1.22.0
    Uninstalling numpy-1.22.0:
      Successfully uninstalled numpy-1.22.0
  Attempting uninstall: librosa
    Found existing installation: librosa 0.10.0
    Uninstalling librosa-0.10.0:
      Successfully uninstalled librosa-0.10.0
[31mERROR: pip's depe

In [2]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the dataset, run folders and model files
# referenced later in this notebook all live under /content/gdrive/MyDrive/JoeTTS.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
import os

from trainer import Trainer, TrainerArgs

from TTS.tts.configs.shared_configs import BaseDatasetConfig, CharactersConfig
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.models.vits import Vits, VitsArgs, VitsAudioConfig
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio import AudioProcessor

# Configuring Model

In [None]:
# Project root on Google Drive; the dataset folder sits directly below it.
output_path = "/content/gdrive/MyDrive/JoeTTS"

# Dataset description: transcript file name and dataset folder under the project root.
dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    meta_file_train="transcript.txt",
    path=os.path.join(output_path, "MyTTSDataset/"),
)

In [None]:
# Audio front-end settings handed to the VITS config.
# NOTE(review): sample_rate presumably has to match the training wavs - confirm.
audio_config = VitsAudioConfig(
    sample_rate=44100,
    win_length=1024,
    hop_length=256,
    num_mels=80,
    mel_fmin=0,
    mel_fmax=None,
)

In [None]:
# Character inventory for the tokenizer: ASCII letters, digits, a small set of
# punctuation marks, plus the special pad / eos / bos / blank symbols.
character_config = CharactersConfig(
    characters_class="TTS.tts.models.vits.VitsCharacters",
    characters="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890",
    punctuations=" ,.-—':()",
    pad="<PAD>",
    eos="<EOS>",
    bos="<BOS>",
    blank="<BLNK>",
)

In [None]:
# Full VITS training configuration, grouped by topic. All values are passed as
# keyword arguments, so the grouping below does not affect the resulting config.
config = VitsConfig(
    # --- identity and locations ---
    # The run folders referenced later in this notebook start with this run name.
    run_name="vits_vctk",
    output_path=output_path,
    datasets=[dataset_config],
    # --- audio and text front-end ---
    audio=audio_config,
    characters=character_config,
    text_cleaner="basic_cleaners",
    use_phonemes=False,
    phoneme_language="en-us",
    phoneme_cache_path=os.path.join(output_path, "phoneme_cache"),
    compute_input_seq_cache=True,
    max_text_len=250,  # change this if you have a larger VRAM than 16GB
    # --- batching and data loading ---
    batch_size=16,
    eval_batch_size=4,
    num_loader_workers=4,
    num_eval_loader_workers=4,
    # --- training schedule and evaluation ---
    epochs=1000,
    run_eval=True,
    test_delay_epochs=0,
    mixed_precision=True,
    cudnn_benchmark=False,
    # --- logging and checkpointing ---
    print_step=25,
    print_eval=False,
    save_best_after=1000,
    save_checkpoints=True,
    save_all_best=True,
    # --- sentences listed for test synthesis ---
    test_sentences=[
        ["Hello world, my name is Joe text to speech."],
        ["Space, the final frontier."],
        ["These are the voyages of the star ship enterprise."],
        ["Its continuing mission, to explore strange new worlds."],
        ["To seek out new life and new civilizations."],
        ["To boldly go where no man has gone before."],
    ],
)

In [None]:
# Build the audio processor from the config. It handles feature extraction and
# audio I/O, mainly for the dataloader and the training loggers.
ap = AudioProcessor.init_from_config(config)

# Build the tokenizer, which converts text into sequences of token IDs.
# The call also returns a config object, which replaces `config` here; it is
# filled in with the default characters when the config does not define them.
tokenizer, config = TTSTokenizer.init_from_config(config)

In [None]:
def formatter(root_path, manifest_file, **kwargs):  # pylint: disable=unused-argument
    """Parse a ``<filename>|<transcription>`` manifest into TTS sample dicts.

    Each non-blank line of the manifest names one clip (file name without the
    ``.wav`` extension) followed by its transcription, separated by ``|``.
    Audio files are looked up in the ``wavs`` sub-folder of ``root_path``.

    Args:
        root_path: Dataset root folder containing ``manifest_file`` and ``wavs/``.
        manifest_file: Manifest file name, relative to ``root_path``.
        **kwargs: Accepted so the function can be called like other formatters; ignored.

    Returns:
        A list with one dict per sample, holding the keys ``text``,
        ``audio_file``, ``speaker_name`` and ``root_path``.

    Raises:
        ValueError: If a non-blank line does not contain a ``|`` separator.
    """
    txt_file = os.path.join(root_path, manifest_file)
    # Derive the wav folder from root_path instead of a hard-coded absolute path,
    # so the formatter follows whatever dataset path the config points at.
    wav_dir = os.path.join(root_path, "wavs")
    speaker_name = "my_speaker"
    items = []
    # utf-8-sig transparently drops a BOM if the transcript was saved with one.
    with open(txt_file, "r", encoding="utf-8-sig") as ttf:
        for line_no, raw_line in enumerate(ttf, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the file) carry no sample.
                continue
            cols = line.split("|")
            if len(cols) < 2:
                raise ValueError(
                    f"{txt_file}:{line_no}: expected '<filename>|<transcription>', got {line!r}"
                )
            wav_file = os.path.join(wav_dir, f"{cols[0].strip()}.wav")
            # Strip so the line's trailing newline does not end up in the transcription.
            text = cols[1].strip()
            items.append(
                {"text": text, "audio_file": wav_file, "speaker_name": speaker_name, "root_path": root_path}
            )
    return items


In [None]:
# Read the dataset through the custom formatter and split off an evaluation subset.
train_samples, eval_samples = load_tts_samples(
    dataset_config,
    eval_split=True,
    formatter=formatter,
)

In [None]:
# Build the VITS model from the config, audio processor and tokenizer.
# No speaker manager is passed (speaker_manager=None).
model = Vits(config, ap, tokenizer, speaker_manager=None)

# Create the trainer with default TrainerArgs, the training config and the
# output location, and hand it the model plus the train/eval sample lists.
trainer = Trainer(
    TrainerArgs(),
    config,
    output_path,
    model=model,
    train_samples=train_samples,
    eval_samples=eval_samples,
)

# Training Model

In [None]:
# Start training with the trainer created in the previous cell.
trainer.fit()

Run the next 3 cells only if you are continuing training from an existing run. You also have to edit the `train_tts.py` file first (see Zahrizhal's tutorial).

In [None]:
!git clone https://github.com/coqui-ai/TTS.git

In [None]:
!pip install --upgrade numpy

In [None]:
# Resume training by running train_tts.py from the cloned TTS repository
# (expected at /content/TTS) with --continue_path pointing at a run folder on Drive.
# NOTE(review): the run folder name is hard-coded; change it to the run you want to continue.
!python /content/TTS/TTS/bin/train_tts.py --continue_path /content/gdrive/MyDrive/JoeTTS/vits_vctk-May-21-2024_06+31PM-0000000

# Synthesizing Audio

In [None]:
import torch
from TTS.api import TTS

In [None]:
# Import necessary modules
from TTS.utils.synthesizer import Synthesizer

# Paths to the trained model weights and the matching configuration file.
# NOTE(review): the run folder name is hard-coded; update both paths for a different run.
model_path = "/content/gdrive/MyDrive/JoeTTS/vits_vctk-May-21-2024_06+31PM-0000000/best_model.pth"
config_path = "/content/gdrive/MyDrive/JoeTTS/vits_vctk-May-21-2024_06+31PM-0000000/config.json"

# Create a synthesizer object with the model and configuration files
synthesizer = Synthesizer(model_path, config_path)

# Text to be synthesized. This must be defined in this cell: it was previously
# commented out, so the tts() call below failed with a NameError on a fresh kernel.
text = "Space, the final frontier. These are the voyages of the star ship enterprise. Its continuing mission, to explore strange new worlds. To seek out new life and new civilizations. To boldly go where no man has gone before."

# Synthesize speech
output_wav = synthesizer.tts(text)

# Save the output to a file in the current working directory
synthesizer.save_wav(output_wav, "output.wav")

# Gradio Interface

In [4]:
!pip install gradio
import gradio as gr
import torch
from TTS.api import TTS



In [5]:
# Import necessary modules
from TTS.utils.synthesizer import Synthesizer
# Paths to the trained model weights and the matching configuration file.
# NOTE(review): the run folder name is hard-coded; update both paths for a different run.
model_path = "/content/gdrive/MyDrive/JoeTTS/vits_vctk-May-21-2024_06+31PM-0000000/best_model.pth"
config_path = "/content/gdrive/MyDrive/JoeTTS/vits_vctk-May-21-2024_06+31PM-0000000/config.json"
# Create the synthesizer used by the Gradio callback defined in the next cell.
synthesizer = Synthesizer(model_path, config_path)

 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:44100
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024


  return torch.load(f, map_location=map_location, **kwargs)


In [6]:
def synthesize(text):
  """Convert ``text`` to speech and return the path of the written WAV file.

  Relies on the module-level ``synthesizer``. Every call writes to
  ``output.wav`` in the current working directory, replacing any previous file.
  """
  wav_path = "output.wav"
  waveform = synthesizer.tts(text)
  synthesizer.save_wav(waveform, wav_path)
  return wav_path

In [16]:
# Gradio UI: one text box feeds synthesize(), whose returned WAV file path is
# shown in an audio player. Title and description are displayed on the page.
interface = gr.Interface(
    fn=synthesize,
    inputs=gr.Textbox(label='Input text'),
    outputs=gr.Audio(label='Generated speech', type='filepath'),
    title='JoeTTS: The Voice Clone of Mr. Kim',
    description='This is a voice clone trained on audio samples of former Java teacher Mr. Kim that was trained in only 11 hours!<br>Sentences to try:<br>- The sun sets behind the mountains, painting the sky in an orange hue<br>- The smell of coffee in the morning fills the room<br>- Star Trek: The Original Series paved the way for future science fiction franchises',
)

In [17]:
# share=True requests a temporary public gradio.live URL (see the recorded output
# below, which also notes that the link expires and that debug=True shows errors in Colab).
interface.launch(share=True)

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://e2b4028b8635aae652.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


