<a href="https://colab.research.google.com/github/tomarskt/voiceCollab/blob/main/skiveit_voice_multi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DeepVoice3: Multi-speaker text-to-speech demo

In this notebook, you can try DeepVoice3-based multi-speaker text-to-speech (English) using a model trained on the [VCTK dataset](http://homepages.inf.ed.ac.uk/jyamagis/page3/page58/page58.html). The notebook is meant to be run on [Google Colab](https://colab.research.google.com), so you don't have to set up anything on your local machine.

**Estimated time to complete**: 5 minutes.

- Code: https://github.com/r9y9/deepvoice3_pytorch
- Audio samples: https://r9y9.github.io/deepvoice3_pytorch/

## Setup

### Install dependencies

In [None]:
import os
from os.path import exists, join, expanduser

# Clone
name = "deepvoice3_pytorch"
if not exists(name):
  ! git clone https://github.com/r9y9/$name
arr = os.listdir()
print(arr)
retval = os.getcwd()
print(retval)

! ls -la

In [None]:
# Colab-specific line magic: ask the runtime to use TensorFlow 1.x.
# NOTE(review): presumably this has to run before TensorFlow is first imported
# in the session - confirm against the Colab documentation.
%tensorflow_version 1.x

In [None]:
# Change working directory to the project dir 
#os.chdir(join(expanduser("~"), name))
os.chdir(join(os.getcwd(), name))

# Use pytorch v0.3.1
# skiveit - upgraded torch version to 1.7.0 for compatability
!pip install -q torch==1.7.0

# WARNING:tensorflow:The TensorFlow contrib module will not be included in TensorFlow 2.0.For more information, please see: * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md * https://github.com/tensorflow/addons * https://github.com/tensorflow/io (for I/O related ops)If you depend on functionality not listed there, please file an issue./usr/local/lib/python3.6/dist-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 10010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:100.) return torch._C._cuda_getDeviceCount() > 0

! pip install torch==1.7.0 torchvision==0.8.1+cpu torchaudio==0.7.0 -f https://download.pytorch.org/whl/torch_stable.html

In [None]:
# %pylab pulls the numpy / matplotlib names into the global namespace and
# renders figures inline. The plotting helper defined later in this notebook
# calls figure(), subplot(), imshow(), xlabel(), colorbar(), ... unqualified,
# so it depends on this magic having run.
%pylab inline
# Colab line magic selecting TensorFlow 1.x (also issued in an earlier cell).
%tensorflow_version 1.x
# Extra packages imported below; note that no versions are pinned here.
! pip install -q librosa nltk

import torch
import numpy as np
import librosa
import librosa.display
import IPython
from IPython.display import Audio
# need this for English text processing frontend
import nltk
# Fetch the `cmudict` resource through nltk's command-line downloader.
! python -m nltk.downloader cmudict

### Download a pre-trained model

In [None]:
# File name of the pre-trained checkpoint, resolved relative to the current
# working directory. The download cell saves the file under this name and the
# model-loading cell reads it back.
# NOTE(review): the name suggests VCTK, 108 speakers, training step 300000 -
# confirm against the upstream project.
checkpoint_path = "20171222_deepvoice3_vctk108_checkpoint_step000300000.pth"

In [None]:
if not exists(checkpoint_path):
  !curl -O -L "https://www.dropbox.com/s/uzmtzgcedyu531k/20171222_deepvoice3_vctk108_checkpoint_step000300000.pth"

### Check out the working commit

In [None]:
# Copy the preset file (JSON) from master.
# The preset file describes the hyper parameters. It is copied to the project
# root while master is checked out, so that it is still available after
# switching to the older commit below.
! git checkout master --quiet

# `preset` first names the source of the copy ...
preset = "./presets/deepvoice3_vctk.json"
# (directory listing, printed for inspection only)
! ls -la
! cp -v $preset .
# ... and is then re-pointed at the copy in the project root, which is the
# path the hyper-parameter cell opens.
preset = "./deepvoice3_vctk.json"

# Then check out the working commit.
# This is needed because the model was trained a few months ago and is not
# compatible with the current master.
! git checkout 0421749 --quiet
# Nov 10th 9:55pm - Aruneesh // Removed the -q (quiet) option to see what
# pip install is really doing.
# Editable install of the checked-out project with its `train` extras.
! pip install  -e '.[train]'


## Synthesis

### Setup hyper parameters

In [None]:
import hparams
import json

# Newer code expects these hyper parameters; register a dummy value for every
# one that `hparams.hparams` does not define yet.
placeholder_hparams = (
    ("fmin", 0),
    ("fmax", 0),
    ("rescaling", False),
    ("rescaling_max", 0.999),
    ("allow_clipping_in_normalization", False),
)
for param_name, placeholder in placeholder_hparams:
  if hparams.hparams.get(param_name) is None:
    hparams.hparams.add_hparam(param_name, placeholder)

# Overlay the values stored in the preset JSON file.
with open(preset) as preset_file:
  preset_json = preset_file.read()
hparams.hparams.parse_json(preset_json)

# Tell the project we are using the multi-speaker DeepVoice3 builder.
hparams.hparams.builder = "deepvoice3_multispeaker"

# Inject the English frontend text processor into both modules that use it.
import synthesis
import train
from deepvoice3_pytorch import frontend
english_frontend = getattr(frontend, "en")
synthesis._frontend = english_frontend
train._frontend = english_frontend

# Aliases used by the plotting / playback helpers.
fs = hparams.hparams.sample_rate
hop_length = hparams.hparams.hop_size

### Define utility functions

In [None]:
def tts(model, text, p=0, speaker_id=0, fast=False, figures=True):
  """Synthesize `text` with `model`, optionally plot it, and play the audio.

  `model`, `text`, `p`, `speaker_id` and `fast` are forwarded positionally and
  unchanged to `synthesis.tts`. When `figures` is truthy the alignment and the
  spectrogram returned by that call are passed to `visualize`. The waveform
  is shown as an inline audio player at the notebook-level sample rate `fs`.
  Nothing is returned.
  """
  # Imported at call time, under an alias so it does not clash with this
  # wrapper's own name.
  from synthesis import tts as _tts

  waveform, alignment, spectrogram, _mel = _tts(model, text, p, speaker_id, fast)
  if figures:
    visualize(alignment, spectrogram)

  player = Audio(waveform, rate=fs)
  IPython.display.display(player)
  
def visualize(alignment, spectrogram):
  """Draw the alignment (top) and the spectrogram (bottom) in one 16x16 figure.

  Both inputs are plotted transposed. Relies on the unqualified pylab names
  (figure, subplot, imshow, ...) and on the notebook globals `fs` and
  `hop_length`.
  """
  axis_label_style = dict(fontsize=16)
  figure(figsize=(16, 16))

  # Top panel: alignment, labelled decoder steps (x) against encoder steps (y).
  subplot(2, 1, 1)
  imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
  xlabel("Decoder timestamp", **axis_label_style)
  ylabel("Encoder timestamp", **axis_label_style)
  colorbar()

  # Bottom panel: spectrogram on a time axis and a linear frequency axis.
  subplot(2, 1, 2)
  specshow_options = dict(sr=fs, hop_length=hop_length,
                          x_axis="time", y_axis="linear")
  librosa.display.specshow(spectrogram.T, **specshow_options)
  xlabel("Time", **axis_label_style)
  ylabel("Hz", **axis_label_style)
  tight_layout()
  colorbar()

### Load the model checkpoint

In [None]:
import importlib

# Reload `train` BEFORE importing names from it. importlib.reload() does not
# rebind names that were already imported with `from train import ...`, so
# importing first and reloading afterwards leaves the notebook holding the
# pre-reload functions.
importlib.reload(train)
# A reload re-executes the module's top-level code, so re-apply the frontend
# injection done in the hyper-parameter cell.
# NOTE(review): this only matters if train.py assigns `_frontend` at module
# level (not visible from this notebook); the assignment is harmless otherwise.
train._frontend = getattr(frontend, "en")

from train import build_model
from train import restore_parts, load_checkpoint

# Build the model, then load the pre-trained checkpoint into it.
# NOTE(review): build_model() takes no arguments, so it presumably reads the
# hyper parameters configured earlier; the trailing (None, True) passed to
# load_checkpoint presumably mean "no optimizer" and "reset optimizer state" -
# confirm both against train.py.
model = build_model()
model = load_checkpoint(checkpoint_path, model, None, True)
print(model)

In [None]:
# from train import build_model
# from train import restore_parts, load_checkpoint

# print("HERE - 1")
# model = build_model()
# print(model)
# print("HERE - 2")
# model = load_checkpoint(checkpoint_path, model, None, True)
# print("HERE - 3")

In [None]:
# from train import build_model
# from train import restore_parts, load_checkpoint

# print("HERE - 1")
# model = build_model()
# #print(model)
# print("HERE - 2")
# model = load_checkpoint(checkpoint_path, model, None, True)
# print("HERE - 3")

In [None]:
# from train import build_model
# from train import restore_parts, load_checkpoint

# print("HERE - 1")
# model = build_model()
# #print(model)
# print("HERE - 2")
# model = load_checkpoint(checkpoint_path, model, None, True)
# print("HERE - 3")

### Generate speech

In [None]:
# Try your favorite sentences :)
text = "Hi Aditya good morning, this is the progress with a sample data set and sample trained model.Generative adversarial network or variational auto-encoder.Once upon a time there was a dear little girl who was loved by every one who looked at her, but most of all by her grandmother, and there was nothing that she would not have given to the child. A text-to-speech synthesis system typically consists of multiple stages, such as a text analysis frontend, an acoustic model and an audio synthesis module."

# Nov 10th 11pm - Aruneesh / testing with only a couple of speakers to start with
# N = 108
N = 2
announcement = "Synthesizing \"{}\" with {} different speakers".format(text, N)
print(announcement)
for speaker_id in range(N):
  print(speaker_id)
  # Per-speaker synthesis is currently disabled; only the speaker id is printed.
  # tts(model, text, speaker_id=speaker_id, figures=False)

In [None]:
# With attention plot.
import importlib

# Re-execute synthesis.py so that on-disk edits to it are picked up.
importlib.reload(synthesis)
# A reload re-executes the module's top-level code, so re-apply the frontend
# injection done in the hyper-parameter cell.
# NOTE(review): this only matters if synthesis.py assigns `_frontend` at
# module level (not visible from this notebook); harmless otherwise.
synthesis._frontend = getattr(frontend, "en")

# The `tts` wrapper from the utility-functions cell is reused as-is. It imports
# `synthesis.tts` at call time, so it already sees the reloaded module; the
# verbatim second definition of `tts` that used to live in this cell was
# redundant and has been removed.
tts(model, text, speaker_id=0, figures=True)


For details, please visit https://github.com/r9y9/deepvoice3_pytorch