# 1. Install necessary package
1. espnet(need installation: pip install espnet)
2. torch(need installation: pip install torch)
3. display tool(will change according to needs)

*This is not needed for data generation, but will be needed for future step:\
pip install -q espnet==0.10.3 pyopenjtalk==0.1.5 parallel_wavegan==0.5.3 espnet_model_zoo

# 2. Text2speech model demo


## 2.1 Configure params - English

In [49]:
#@title Choose English model { run: "auto" }
lang = 'English'
tag = 'kan-bayashi/ljspeech_vits' #@param ["kan-bayashi/ljspeech_tacotron2", "kan-bayashi/ljspeech_fastspeech", "kan-bayashi/ljspeech_fastspeech2", "kan-bayashi/ljspeech_conformer_fastspeech2", "kan-bayashi/ljspeech_joint_finetune_conformer_fastspeech2_hifigan", "kan-bayashi/ljspeech_joint_train_conformer_fastspeech2_hifigan", "kan-bayashi/ljspeech_vits"] {type:"string"}
vocoder_tag = "none" #@param ["none", "parallel_wavegan/ljspeech_parallel_wavegan.v1", "parallel_wavegan/ljspeech_full_band_melgan.v2", "parallel_wavegan/ljspeech_multi_band_melgan.v2", "parallel_wavegan/ljspeech_hifigan.v1", "parallel_wavegan/ljspeech_style_melgan.v1"] {type:"string"}

## 2.2 build model using params

In [55]:
import numpy as np
import espnet
from espnet2.bin.tts_inference import Text2Speech
from espnet2.utils.types import str_or_none

text2speech = Text2Speech.from_pretrained(
    model_tag=str_or_none(tag),
    vocoder_tag=str_or_none(vocoder_tag),
    #device="cuda",
    # Only for Tacotron 2 & Transformer
    threshold=0.5,
    # Only for Tacotron 2
    minlenratio=0.0,
    maxlenratio=10.0,
    use_att_constraint=False,
    backward_window=1,
    forward_window=3,
    # Only for FastSpeech & FastSpeech2 & VITS
    speed_control_alpha=1.0,
    # Only for VITS
    noise_scale=0.333,
    noise_scale_dur=0.333,
)

## 2.3 display audio from text

In [176]:
import time
import torch
from IPython.display import display, Audio

text = "Hello, this is new text"

def displayTTS(text):
    # synthesis
    wav = text2speech(text)["wav"]

    # let us listen to generated samples
    display(Audio(wav.view(-1).cpu().numpy(), rate=text2speech.fs))
    

In [177]:
TTS("I MEAN OF COURSE IF YOU'LL TAKE A COMPANION")

<class 'torch.Tensor'>


In [178]:
text = "I MEAN OF COURSE IF YOU'LL TAKE A COMPANION".lower()
TTS(text)

<class 'torch.Tensor'>


# 3. output audio from text

## 3.1 Save TTS to wav file

In [73]:
import soundfile as sf

def saveTTS(text, path, fileName):
    # synthesis
    wav = text2speech(text)["wav"]

    # output audio as fileName
    sf.write(path+fileName+".wav", wav.numpy(), text2speech.fs, "PCM_16")
    


In [79]:
text = "test save audio file"
rootPath = "LibriSpeech/"
fileName = "testOutput"
#saveTTS(text, rootPath, fileName)

## 3.2 Utility Functions: make empty directory

In [76]:
from pathlib import Path
def makeDir(pathName):
    # if under current dir, pathName will just be dirName
    Path(pathName).mkdir(parents=True, exist_ok=True)

## 3.3 Utility Functions: find path that contains all the goodie goodies

In [183]:
import os
def whereGoodiesDFS(rootPath, trainSetName):
    # trainSetName = "train-clean-100"
    goodies = []
    for dir1 in os.listdir(rootPath+trainSetName):
        newDir = rootPath+trainSetName+"/"+dir1
        if not os.path.isdir(newDir):
            continue
            
        if dir1 == ".DS_Store":
            continue
        for dir2 in os.listdir(rootPath+trainSetName+"/"+dir1):
            goodies.append(rootPath+trainSetName+"/"+dir1+"/"+dir2)
    return goodies

In [108]:
trainSetName = "train-clean-100"
allDir = whereGoodiesDFS(rootPath, trainSetName)
#print(allDir)

['LibriSpeech/train-clean-100/1069/133709', 'LibriSpeech/train-clean-100/1069/133699', 'LibriSpeech/train-clean-100/8580/287364', 'LibriSpeech/train-clean-100/8580/287363', 'LibriSpeech/train-clean-100/3168/173564', 'LibriSpeech/train-clean-100/3168/173565', 'LibriSpeech/train-clean-100/909/131045', 'LibriSpeech/train-clean-100/909/131044', 'LibriSpeech/train-clean-100/909/131041', 'LibriSpeech/train-clean-100/307/127535', 'LibriSpeech/train-clean-100/307/127539', 'LibriSpeech/train-clean-100/307/127540', 'LibriSpeech/train-clean-100/1263/139804', 'LibriSpeech/train-clean-100/1263/141777', 'LibriSpeech/train-clean-100/1263/138246', 'LibriSpeech/train-clean-100/2843/152918', 'LibriSpeech/train-clean-100/6880/216547', 'LibriSpeech/train-clean-100/103/1240', 'LibriSpeech/train-clean-100/103/1241', 'LibriSpeech/train-clean-100/8770/295465', 'LibriSpeech/train-clean-100/8770/295462', 'LibriSpeech/train-clean-100/8770/295463', 'LibriSpeech/train-clean-100/1034/121119', 'LibriSpeech/train-cle

## 3.4 Utility Functions: find text and retrieve label and sentence

In [134]:
import glob
import pandas as pd

def getMetaInfo(dirPath):
    txtFileName = glob.glob(dirPath+'/*.txt')[0]
    column_names = ["label", "text"]
    df = pd.DataFrame(columns = column_names)
    with open(txtFileName) as f:
        lines = f.readlines()
    for line in lines:
        label = line.split()[0]
        text = line[len(label)+1:]
        row = {"label": label, "text":text}
        df = df.append(row, ignore_index = True)
    return df

In [151]:
samplePath = allDir[0]
metaInfo = getMetaInfo(samplePath)
print(samplePath)
metaInfo

LibriSpeech/train-clean-100/1069/133709


Unnamed: 0,label,text
0,1069-133709-0000,HAD LAID BEFORE HER A PAIR OF ALTERNATIVES NOW...
1,1069-133709-0001,PROPERTY ERECTS A KIND OF BARRIER YOU CAN DO A...
2,1069-133709-0002,I MEAN OF COURSE IF YOU'LL TAKE A COMPANION SO...
3,1069-133709-0003,SHE'D KEEP PEOPLE OFF VERY WELL I THINK\n
4,1069-133709-0004,THAT IT'S A GREAT DEAL BETTER YOU SHOULD REMAI...
5,1069-133709-0005,SHE HAD A GREAT REGARD FOR WHAT WAS USUALLY DE...
6,1069-133709-0006,WHEN SHE SAT IN HER DAMP WATERPROOF AND SKETCH...
7,1069-133709-0007,AND HER IMAGINATION CONSTANTLY ANTICIPATED THE...
8,1069-133709-0008,BUT WAS NEVER OVER INQUISITIVE AS REGARDS THE ...
9,1069-133709-0009,SO LITTLE SURFACE OFFERED SO LIMITED A FACE TO...


## 3.4 Utility Functions: generate TTS wav and put into TTS dir

In [173]:
def generateTTS(dirPath):
    # Ex. dirPath = LibriSpeech/train-clean-100/1069/133709
    
    #1. setup empty dir inside
    ttsPath = dirPath+"/TTS"
    makeDir(ttsPath)
    
    #2. get metaInfo
    metaInfo = getMetaInfo(dirPath)
    
    #3. generate TTS wav and put inside TTS
    for _, row in metaInfo.iterrows():
        label = row[0]
        text = row[1]
        saveTTS(text, ttsPath+"/", label)
    print(dirPath+" done!")

In [174]:
#generateTTS(samplePath)

KeyboardInterrupt: 

## 3.4 Final Step: GO GO GO!

In [186]:
def generateTTSforAll(trainSetName):
    rootPath = "LibriSpeech/"
    # 1. get all
    allDir = whereGoodiesDFS(rootPath, trainSetName)
    n = len(allDir)
    for i, dirPath in enumerate(allDir):
        if ".DS_Store" in dirPath:
            continue
        # 2. generate tts for each
        generateTTS(dirPath)
        print(str(i+1)+"/"+str(n)+" DONE")
    

In [187]:
trainSetName = "train-clean-100"
generateTTSforAll(trainSetName)

LibriSpeech/train-clean-100/1069/133709 done!
2/587 DONE
LibriSpeech/train-clean-100/1069/133699 done!
3/587 DONE
LibriSpeech/train-clean-100/8580/287364 done!
4/587 DONE
LibriSpeech/train-clean-100/8580/287363 done!
5/587 DONE
LibriSpeech/train-clean-100/3168/173564 done!
6/587 DONE
LibriSpeech/train-clean-100/3168/173565 done!
7/587 DONE
LibriSpeech/train-clean-100/909/131045 done!
8/587 DONE
LibriSpeech/train-clean-100/909/131044 done!
9/587 DONE
LibriSpeech/train-clean-100/909/131041 done!
10/587 DONE
LibriSpeech/train-clean-100/307/127535 done!
11/587 DONE
LibriSpeech/train-clean-100/307/127539 done!
12/587 DONE
LibriSpeech/train-clean-100/307/127540 done!
13/587 DONE
LibriSpeech/train-clean-100/1263/139804 done!
14/587 DONE
LibriSpeech/train-clean-100/1263/141777 done!
15/587 DONE
LibriSpeech/train-clean-100/1263/138246 done!
16/587 DONE
LibriSpeech/train-clean-100/2843/152918 done!
17/587 DONE
LibriSpeech/train-clean-100/6880/216547 done!
18/587 DONE
LibriSpeech/train-clean-100/

LibriSpeech/train-clean-100/1743/142914 done!
145/587 DONE
LibriSpeech/train-clean-100/2764/36616 done!
146/587 DONE
LibriSpeech/train-clean-100/2764/36619 done!
147/587 DONE
LibriSpeech/train-clean-100/2764/36617 done!
148/587 DONE
LibriSpeech/train-clean-100/412/126975 done!
149/587 DONE
LibriSpeech/train-clean-100/1578/6379 done!
150/587 DONE
LibriSpeech/train-clean-100/1578/140045 done!
151/587 DONE
LibriSpeech/train-clean-100/1578/140049 done!
152/587 DONE
LibriSpeech/train-clean-100/5789/57195 done!
153/587 DONE
LibriSpeech/train-clean-100/5789/57158 done!
154/587 DONE
LibriSpeech/train-clean-100/5789/70653 done!
155/587 DONE
LibriSpeech/train-clean-100/4813/248641 done!
156/587 DONE
LibriSpeech/train-clean-100/4813/248638 done!
157/587 DONE
LibriSpeech/train-clean-100/8629/261139 done!
158/587 DONE
LibriSpeech/train-clean-100/8629/261140 done!
159/587 DONE
LibriSpeech/train-clean-100/3235/28433 done!
160/587 DONE
LibriSpeech/train-clean-100/3235/28452 done!
161/587 DONE
LibriSpe

LibriSpeech/train-clean-100/1246/135815 done!
287/587 DONE
LibriSpeech/train-clean-100/1246/124550 done!
288/587 DONE
LibriSpeech/train-clean-100/587/41619 done!
289/587 DONE
LibriSpeech/train-clean-100/587/41611 done!
290/587 DONE
LibriSpeech/train-clean-100/587/54108 done!
291/587 DONE
LibriSpeech/train-clean-100/3982/182255 done!
292/587 DONE
LibriSpeech/train-clean-100/3982/178459 done!
293/587 DONE
LibriSpeech/train-clean-100/8975/270782 done!
294/587 DONE
LibriSpeech/train-clean-100/198/129977 done!
295/587 DONE
LibriSpeech/train-clean-100/198/209 done!
296/587 DONE
LibriSpeech/train-clean-100/198/126831 done!
297/587 DONE
LibriSpeech/train-clean-100/5463/39173 done!
298/587 DONE
LibriSpeech/train-clean-100/5463/39174 done!
299/587 DONE
LibriSpeech/train-clean-100/7302/86815 done!
300/587 DONE
LibriSpeech/train-clean-100/7302/86814 done!
301/587 DONE
LibriSpeech/train-clean-100/196/122150 done!
302/587 DONE
LibriSpeech/train-clean-100/196/122159 done!
303/587 DONE
LibriSpeech/tra

LibriSpeech/train-clean-100/831/130739 done!
428/587 DONE
LibriSpeech/train-clean-100/2182/181173 done!
429/587 DONE
LibriSpeech/train-clean-100/2182/150130 done!
430/587 DONE
LibriSpeech/train-clean-100/2182/181183 done!
431/587 DONE
LibriSpeech/train-clean-100/1363/135842 done!
432/587 DONE
LibriSpeech/train-clean-100/1363/139304 done!
433/587 DONE
LibriSpeech/train-clean-100/200/124140 done!
434/587 DONE
LibriSpeech/train-clean-100/200/124139 done!
435/587 DONE
LibriSpeech/train-clean-100/200/126784 done!
436/587 DONE
LibriSpeech/train-clean-100/5192/19396 done!
437/587 DONE
LibriSpeech/train-clean-100/5192/19397 done!
438/587 DONE
LibriSpeech/train-clean-100/7067/76048 done!
439/587 DONE
LibriSpeech/train-clean-100/7067/76047 done!
440/587 DONE
LibriSpeech/train-clean-100/405/130894 done!
441/587 DONE
LibriSpeech/train-clean-100/405/130895 done!
442/587 DONE
LibriSpeech/train-clean-100/298/126791 done!
443/587 DONE
LibriSpeech/train-clean-100/298/126790 done!
444/587 DONE
LibriSpee

LibriSpeech/train-clean-100/125/121342 done!
569/587 DONE
LibriSpeech/train-clean-100/2092/145709 done!
570/587 DONE
LibriSpeech/train-clean-100/2092/145706 done!
571/587 DONE
LibriSpeech/train-clean-100/40/121026 done!
572/587 DONE
LibriSpeech/train-clean-100/40/222 done!
573/587 DONE
LibriSpeech/train-clean-100/4195/186236 done!
574/587 DONE
LibriSpeech/train-clean-100/4195/186238 done!
575/587 DONE
LibriSpeech/train-clean-100/4195/186237 done!
576/587 DONE
LibriSpeech/train-clean-100/4195/17507 done!
577/587 DONE
LibriSpeech/train-clean-100/78/368 done!
578/587 DONE
LibriSpeech/train-clean-100/78/369 done!
579/587 DONE
LibriSpeech/train-clean-100/1447/17506 done!
580/587 DONE
LibriSpeech/train-clean-100/1447/130552 done!
581/587 DONE
LibriSpeech/train-clean-100/1447/130550 done!
582/587 DONE
LibriSpeech/train-clean-100/1447/130551 done!
583/587 DONE
LibriSpeech/train-clean-100/2836/5354 done!
584/587 DONE
LibriSpeech/train-clean-100/2836/5355 done!
585/587 DONE
LibriSpeech/train-cle