In [None]:
# Copyright 2020 NVIDIA Corporation. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

In [None]:
##%tensorboard --logdir ../output_1500/train
# Install the extra Python packages that later cells import (seaborn, pandas,
# textgrids, pydub, pyrle, ipywidgets).
# NOTE(review): `parselmouth` is imported further down but is not installed
# here — confirm the container already provides it.
!pip install seaborn
!pip install pandas
!pip install praat-textgrids
!pip install pydub
!pip install pyrle
!pip install ipywidgets
# System package: the ffmpeg binary.
!apt-get install -y ffmpeg
# JupyterLab extension for the widgets manager.
!jupyter labextension install @jupyter-widgets/jupyterlab-manager

# FastPitch: Voice Modification with Pre-defined Pitch Transformations

The [FastPitch](https://arxiv.org/abs/2006.06873) model is based on the [FastSpeech](https://arxiv.org/abs/1905.09263) model. Similarly to [FastSpeech2](https://arxiv.org/abs/2006.04558), which has been developed concurrently, it learns to predict the pitch contour and conditions the generation on such contour.

The simple mechanism of predicting the pitch on grapheme-level (rather than frame-level, as FastSpeech2 does) makes it easy to alter the pitch during synthesis. FastPitch can thus change the perceived emotional state of the speaker, or slightly emphasise certain lexical units.

## Requirements

Run the notebook inside the container. By default the container forwards port `8888`.
```
bash scripts/docker/interactive.sh

# inside the container
cd notebooks
jupyter notebook --ip='*' --port=8888
```
Please refer to the Requirements section in `README.md` for more details and for instructions on running outside the container.

In [None]:
import os
assert os.getcwd().split('/')[-1] == 'notebooks'

import IPython
from IPython.display import display, Markdown, Audio
import requests
import torch
import numpy as np
import io
import json
from matplotlib import pyplot as plt
from scipy.io.wavfile import read, write
import warnings
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import tempfile
%matplotlib inline
plt.rcParams['figure.figsize'] = [30, 15]
from ipywidgets import HBox, Label, VBox
from ipywidgets import widgets
from IPython.display import Audio, display
import parselmouth
import random



## Generate audio samples

Training a FastPitch model from scratch takes 3 to 27 hours depending on the type and number of GPUs; performance numbers can be found in Section "Training performance results" in `README.md`. Therefore, to save time when running this notebook, we recommend downloading the pretrained FastPitch checkpoints from NGC for inference.

You can find the FP32 checkpoint at [NGC](https://ngc.nvidia.com/catalog/models/nvidia:fastpitch_pyt_fp32_ckpt_v1/files), and the AMP (Automatic Mixed Precision) checkpoint at [NGC](https://ngc.nvidia.com/catalog/models/nvidia:fastpitch_pyt_amp_ckpt_v1/files).

To synthesize audio, you will need a WaveGlow model, which generates waveforms based on mel-spectrograms generated by FastPitch. You can download a pre-trained WaveGlow AMP model at [NGC](https://ngc.nvidia.com/catalog/models/nvidia:waveglow256pyt_fp16).

In [None]:
# ! mkdir -p output
# ! MODEL_DIR='../pretrained_models' ../scripts/download_fastpitch.sh
# ! MODEL_DIR='../pretrained_models' ../scripts/download_waveglow.sh

You can perform inference using the respective checkpoints that are passed as `--fastpitch` and `--waveglow` arguments. Next, you will use the FastPitch model to generate audio samples for input text, including the basic version and variations in pace, fade out, pitch transforms, etc.

In [None]:
# Checkpoint locations, kept in short aliases for the inference commands below.
waveg = '../pretrained_models/waveglow/nvidia_waveglow256pyt_fp16.pt'
fastp0 = '../output/FastPitch_checkpoint_100.pt'
fastp1500 = '../output/FastPitch_checkpoint_1500.pt'
fastp3000 = '../output/FastPitch_checkpoint_3000.pt'

# One command-line flag string per FastPitch checkpoint; every variant shares
# the same WaveGlow vocoder and differs only in the --fastpitch path.
flags0, flags1500, flags3000 = [
    f'--cuda --fastpitch {ckpt} --waveglow {waveg} --wn-channels 256'
    for ckpt in (fastp0, fastp1500, fastp3000)
]

### 1. Basic speech synthesis

You need to create an input file with some text, or just input the text in the below cell:

In [None]:
%%writefile text.txt
I felt somehow it would have been an easier job.

In [None]:

# Base URL of the TTS service; get_current_jenny appends "/tts/generate" to it.
tts_node = "https://tts.test.vocacloud.net"
# Alternative endpoints, left here for switching by hand:
#tts_node = "http://172.36.72.216:5000" #172.20.128.3
#tts_node = "http://172.20.128.2:5000" 

def get_temp_filename(mydir=".",prefix="temp_", suffix=""):
    """Create an empty, uniquely named file and return its path.

    :param mydir: directory in which the file is created
    :param prefix: leading part of the generated file name
    :param suffix: trailing part of the generated file name (e.g. an extension)
    :return: path of the newly created file, as returned by tempfile.mkstemp
    """
    fd, path = tempfile.mkstemp(suffix=suffix, prefix=prefix, dir=mydir)
    # mkstemp returns an open descriptor; only the name is wanted, so close it.
    os.close(fd)
    return path

def generate_kwargs_for_voca(text,what):
    """Build the JSON payload and the request keyword arguments for the TTS service.

    The bearer token is a credential, so it is read from the ``VOCA_TTS_TOKEN``
    environment variable instead of being committed in the notebook.

    :param text: sentence to synthesize
    :param what: list of reply field names to request from the service
    :return: tuple ``(json_data, kwargs)`` where ``json_data`` is the request
             body and ``kwargs`` holds the ``headers`` for ``requests.post``
    :raises RuntimeError: if ``VOCA_TTS_TOKEN`` is unset or empty
    """
    token = os.environ.get("VOCA_TTS_TOKEN")
    if not token:
        raise RuntimeError(
            "Set the VOCA_TTS_TOKEN environment variable to the TTS API bearer token"
        )
    headers = {'Authorization': f'Bearer {token}',
               'Content-Type': 'application/json; version=1'}
    kwargs = {'headers': headers}
    json_data = {"sentence": text, "volume": 1, "speed": 1, "sampleRate": 22050, "reply_fields": what}
    return json_data, kwargs

def getDetailsFromTTS(text,what,url):
    """Post a synthesis request to the TTS service and return the decoded reply.

    :param text: sentence to synthesize
    :param what: list of reply field names to request
    :param url: full URL of the generate endpoint
    :return: the object deserialized from the response body with ``torch.load``
             (callers index it by the requested field names)
    :raises requests.HTTPError: if the service answers with an error status
    """
    json_data, kwargs = generate_kwargs_for_voca(text, what)
    # NOTE(review): verify=False turns off TLS certificate verification for this
    # request — confirm this is acceptable for the target host.
    rawResponse = requests.post(url, json=json_data, stream=True, verify=False, **kwargs)
    # Fail on HTTP errors here rather than letting torch.load choke on an error body.
    rawResponse.raise_for_status()
    # SECURITY: torch.load unpickles the payload, which can execute arbitrary code.
    # Only point this at a trusted service.
    reply = torch.load(io.BytesIO(rawResponse.content))
    return reply

def get_current_jenny(input):
    """Synthesize the first line of a text file through the TTS service.

    :param input: path of a text file; only its first line is sent
    :return: tuple ``(signal, sample_rate, mel, align_logist, align_text)`` with
             the array fields converted via ``.numpy()`` and the sample rate
             converted to ``int``
    """
    with open(input,"r") as f:
        text = f.readline()
    wanted_fields = ['mel', 'align_logist', 'frame_length', 'sample_rate',"signal","align_text"]
    rep = getDetailsFromTTS(text, wanted_fields, tts_node+"/tts/generate")
    signal = rep["signal"].numpy()
    sample_rate = int(rep["sample_rate"])
    mel = rep["mel"].numpy()
    align_logits = rep["align_logist"].numpy()
    align_text = rep["align_text"].numpy()
    return signal, sample_rate, mel, align_logits, align_text
    
# out,sr,mel,align, align_text = get_current_jenny("text.txt")    
# write("/tmp/dummy.wav", sr,out)
# plt.imshow(mel)
# plt.show()
# plt.imshow(align)
# plt.show()

# a = get_temp_filename("/tmp","temp_",'.wav')
# print(a)

# Smoke test: send a short sentence to the TTS service and show the raw response.
text = "hi there"
what = ['mel', 'align_logist', 'frame_length', 'sample_rate',"signal","align_text"]
# Build the URL from tts_node so that switching the endpoint above also
# switches this request (previously the host was repeated as a literal).
url = tts_node+"/tts/generate"
json_data, kwargs = generate_kwargs_for_voca(text, what)
# NOTE(review): verify=False turns off TLS certificate verification.
rawResponse = requests.post(url, json=json_data, stream=True, verify=False, **kwargs)
rawResponse

In [None]:
# Display the list of reply fields requested from the TTS service above.
what

Run the script below to generate audio from the input text file:

In [None]:
# Index the training metadata: transcript text -> recording id.
# Each line is split on "|"; field 0 is used as the id and field 1 as the text.
sentences = {}
with open("../Jenny/metadata.csv","r") as f:
    lines = f.readlines()
for line in lines:
    # Strip the line terminator first so that, when the text is the last field,
    # the dictionary key does not carry a trailing newline.
    s = line.rstrip("\r\n").split("|")
    if len(s) < 2:
        # Blank or malformed line: there is no text field to index.
        continue
    sentences[s[1]] = s[0]
    
def find_in_train_set(text):
    """Return the training-set id stored for ``text``, or ``None`` if it is not indexed.

    :param text: transcript to look up in the module-level ``sentences`` dict
    :return: the id mapped to ``text``, or ``None`` when there is no entry
    """
    return sentences.get(text)
        
# List every transcript that was indexed from the metadata file.
for s in sentences:
    print(s)

In [None]:
def compare_sound(text,include_tacotron):
    # gen temps
    filename = get_temp_filename("/tmp","temp_",".txt")
    filename2 = get_temp_filename("/tmp","temp_",".wav")

    # write file
    with open(filename,"w") as f:
        f.write(text)
        
    display(Markdown("Original "+find_in_train_set(text)))
    train_id = find_in_train_set(text)
    if train_id:
        train_file = "../Jenny/wavs/{}.wav".format(train_id)
        display(Audio(train_file))
    else:
        display(Markdown("Not in train set"))

    mel = None
    align = None
    align_text = text
    if include_tacotron:
        display(Markdown("Tacotron2"))
        # get current
        out,sr,mel,align,align_text = get_current_jenny(filename)    
        write(filename2, sr,out)
        display(Audio(filename2))
    
    display(Markdown("fastPitch 0"))
    !python3 ../inference.py {flags0} -i {filename} -o output/0 --speaker 0 --n-speakers 2 >/dev/null
    display(Audio("output/0/audio_0.wav"))
    
    #display(Markdown("fastPitch 1500"))
    #!python3 ../inference.py {flags1500} -i {filename} -o output/1500 --speaker 0 --n-speakers 2 >/dev/null
    #IPython.display.display(IPython.display.Audio("output/1500/audio_0.wav"))
    
    display(Markdown("fastPitch 3000"))
    !python3 ../inference.py {flags3000} -i {filename} -o output/3000 --speaker 0 --n-speakers 2 >/dev/null
    display(Audio("output/3000/audio_0.wav"))
    return train_id, mel,filename2, align, align_text
    
#compare_sound("There alone could lovers see each other and communicate.")

In [None]:
# Compare the training recording with the FastPitch outputs; the TTS-service
# branch is skipped (include_tacotron=False), so mel and align2 come back as None.
# NOTE(review): the name `id` shadows the Python builtin for the rest of the session.
id,mel,tc2_filename,align2,align_text2 = compare_sound("But it involves a happiness that will last throughout our lives, will it not?",False)

In [None]:
import os

# Synthesize one sentence through the TTS service (labelled "Tacotron2" in the
# output) and play it back.
text = "But it involves a happiness that will last throughout our lives, will it not?"
display(Markdown("Tacotron2"))
# Temp files: `fn` holds the input text, `filename` receives the synthesized wav.
fn = get_temp_filename("/tmp","temp_",".txt")
filename = get_temp_filename("/tmp","temp_",".wav")
with open(fn,"w") as f:
    f.write(text)

# get_current_jenny reads the sentence back from `fn` and returns the signal,
# sample rate, mel and alignment data.
out,sr,mel,align,align_text = get_current_jenny(fn)    
write(filename, sr,out)
display(Audio(filename))

In [None]:
# now collect the examples and get their pitch
def get_pitch(wav):
    """Return the 'frequency' track of the pitch analysis of a sound.

    :param wav: value passed straight to ``parselmouth.Sound``
    :return: the ``selected_array['frequency']`` array of ``Sound.to_pitch()``
    """
    # to_pitch() uses its defaults here; a custom step was once considered:
    # time_step=snd.duration / (mel_len + 3)
    return parselmouth.Sound(wav).to_pitch().selected_array['frequency']

# p1 = get_pitch("../Jenny/wavs/05363.wav")
# print(len(p1))
# p2 = get_pitch("output/0/audio_0.wav")
# print(len(p2))
# p3 = get_pitch("output/3000/audio_0.wav")
# print(len(p3))
# p4 = get_pitch(filename2)
# print(len(p4))
# t = min(len(p1),len(p2),len(p3))
# plt.plot(np.column_stack((p1[0:t],p2[0:t],p3[0:t])))

In [None]:
# now let's do force aligner
def mfa_align(text,in_wav_file,start_sec = 0,duration_sec=None):
    in_folder = "/tmp/test"
    with open(in_folder+"/0.txt","w") as f:
        f.write(text)
        
    #cat ../../Jenny/metadata.csv | awk -F '|' '$1==05363 {print $2}' > /tmp/test/0.txt
    out_folder = "/tmp/test2"
    wav_file = in_folder + "/0.wav"
    !rm {wav_file}
    trim = ""
    if duration_sec:
        trim = f"trim {start_sec} {duration_sec}"
    !sox {in_wav_file} -r 22050 -b 16 {wav_file} {trim} 
    !/workspace/fastpitch/MFA/montreal-forced-aligner/bin/mfa_align {in_folder} /workspace/fastpitch/MFA/montreal-forced-aligner/librispeech-lexicon.txt /workspace/fastpitch/MFA/montreal-forced-aligner/pretrained_models/english.zip {out_folder}
    
    return out_folder+"/test/0.TextGrid",wav_file
        



In [None]:
import seaborn as sns
import pandas as pd

# snd = parselmouth.Sound("/tmp/test/0.wav")
# sns.set() # Use seaborn's default style to make attractive graphs
# plt.rcParams['figure.dpi'] = 100 # Show nicely large images in this notebook
# snd_part = snd.extract_part(from_time=0.9, preserve_times=True)
# plt.figure()
# plt.plot(snd_part.xs(), snd_part.values.T, linewidth=0.5)
# plt.xlim([snd_part.xmin, snd_part.xmax])
# plt.xlabel("time [s]")
# plt.ylabel("amplitude")
# plt.show()
# def draw_spectrogram(spectrogram, dynamic_range=70):
#     X, Y = spectrogram.x_grid(), spectrogram.y_grid()
#     sg_db = 10 * np.log10(spectrogram.values)
#     plt.pcolormesh(X, Y, sg_db, vmin=sg_db.max() - dynamic_range, cmap='afmhot')
#     plt.ylim([spectrogram.ymin, spectrogram.ymax])
#     plt.xlabel("time [s]")
#     plt.ylabel("frequency [Hz]")

# def draw_intensity(intensity):
#     plt.plot(intensity.xs(), intensity.values.T, linewidth=3, color='w')
#     plt.plot(intensity.xs(), intensity.values.T, linewidth=1)
#     plt.grid(False)
#     plt.ylim(0)
#     plt.ylabel("intensity [dB]")
# intensity = snd.to_intensity()
# spectrogram = snd.to_spectrogram()
# plt.figure()
# draw_spectrogram(spectrogram)
# plt.twinx()
# draw_intensity(intensity)
# plt.xlim([snd.xmin, snd.xmax])
# plt.show()

# def draw_pitch(pitch):
#     # Extract selected pitch contour, and
#     # replace unvoiced samples by NaN to not plot
#     pitch_values = pitch.selected_array['frequency']
#     pitch_values[pitch_values==0] = np.nan
#     plt.plot(pitch.xs(), pitch_values, 'o', markersize=5, color='w')
#     plt.plot(pitch.xs(), pitch_values, 'o', markersize=2)
#     plt.grid(False)
#     plt.ylim(0, pitch.ceiling)
#     plt.ylabel("fundamental frequency [Hz]")
    
# pitch = snd.to_pitch()
# pre_emphasized_snd = snd.copy()
# pre_emphasized_snd.pre_emphasize()
# spectrogram = pre_emphasized_snd.to_spectrogram(window_length=0.03, maximum_frequency=8000)
# plt.figure()
# draw_spectrogram(spectrogram)
# plt.twinx()
# draw_pitch(pitch)
# plt.xlim([snd.xmin, snd.xmax])
# plt.show()



In [None]:
import sys
import textgrids
import pandas as pd

# Working folders shared with mfa_align, and the files it reads/produces there.
in_folder, out_folder = "/tmp/test", "/tmp/test2"
gridFile = f"{out_folder}/test/0.TextGrid"
wavFile = f"{in_folder}/0.wav"

def get_pitch_for_grid(gridFile,wavFile,res):
    """Append per-phone pitch statistics for one aligned recording to ``res``.

    For every interval in the ``'phones'`` tier of the TextGrid, the pitch
    samples whose time lies strictly inside the interval are averaged.

    :param gridFile: path of the TextGrid file
    :param wavFile: path of the matching audio file
    :param res: DataFrame to extend (may be an empty ``pd.DataFrame()``)
    :return: a new DataFrame made of ``res`` followed by one row per phone with
             columns ``file, id, text, mean, std, min, max``. As before, every
             added row carries index label 0, so callers that need positional
             labels should ``reset_index(drop=True)``.
    """
    snd = parselmouth.Sound(wavFile)
    grid = textgrids.TextGrid(gridFile)
    pitch = snd.to_pitch()
    freq = pitch.selected_array['frequency']
    # Zero-frequency samples are excluded from the statistics by turning them into NaN.
    freq[freq==0] = np.nan
    # Sample times do not change per phone; compute them once.
    times = pitch.xs()
    rows = []
    for i,t in enumerate(grid['phones']):
        inside = np.logical_and(t.xmax>times , t.xmin<times)
        rows.append({"file":wavFile,"id":i,"text":t.text,
                     "mean":np.nanmean(freq[inside]),"std":np.nanstd(freq[inside]),
                     "min":t.xmin,"max":t.xmax})
    if not rows:
        return res
    # Build the new rows in one go: DataFrame.append was removed in pandas 2.0
    # and appending row by row is quadratic.
    new_rows = pd.DataFrame(rows, index=[0]*len(rows))
    if res.shape == (0, 0):
        # Nothing to prepend; skip concat so an empty frame cannot affect dtypes.
        return new_rows
    return pd.concat([res, new_rows])

# Make sure the aligner's input and output folders exist (no error if they already do).
!mkdir -p {in_folder}
!mkdir -p {out_folder}
# out = {}
# res = pd.DataFrame()
# for file in ["../Jenny/wavs/05363.wav",filename2,"output/3000/audio_0.wav"]:
#     tg,wf = mfa_align("But it involves a happiness that will last throughout our lives, will it not?",file)
#     res = get_pitch_for_grid(tg,wf,res)
# res    

In [None]:
from pydub import AudioSegment
import requests
import torch
import numpy as np
from pyrle import Rle as rle
import io
from scipy.io.wavfile import write
# Symbol inventory used by the aligned text; '@' is the padding symbol.
lables = [    ' ', '!', '"', "'", '(', ')', ',', '-', '.', ':', ';', '?', 'A', 'B',
              'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
              'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', 'a', 'b',
              'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
              'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z' ,'@']
# Symbol -> code, with codes starting at 1 in list order ...
maping = {symbol: code for code, symbol in enumerate(lables, start=1)}
# ... and the inverse table, code -> symbol.
demapping = {code: symbol for symbol, code in maping.items()}
    
def get_aligment(logits, padded_text,frame_length):
    """
    Gets the alignment of the padded text using the logits of the generated speech.

    ``logits`` is indexed as ``logits[char_position, frame]``: the frame axis is
    axis 1 (its length is halved to find the middle of the sentence) and a
    per-frame argmax is taken over axis 0.

    :param logits: matrix of character probabilities for each ts
    :param padded_text: encoded padded text; compared against ``maping['@']`` to
                        find the padding (presumably a numpy array of codes
                        from ``maping`` — TODO confirm)
    :param frame_length: factor the frame indices are multiplied by before
                         returning (the visible caller passes 256/22050)
    :return: tuple ``(times, trimmed_text)``: ``times`` has one row per
             non-padding character with columns (start, end) already multiplied
             by ``frame_length``; ``trimmed_text`` is ``padded_text`` cut down to
             the span between the first and last non-padding characters
    """
    # For each frame: the character position with the highest score, and the
    # character code found at that position.
    best_prob_per_frame = np.argmax(logits, axis=0).tolist()
    best_chars = [padded_text[frame] for frame in best_prob_per_frame]
    mid_sentence = int(logits.shape[1] / 2)
    # get first non-padding char
    first_non_pad_char = np.min(np.where(padded_text != maping['@']))
    # The start frame is searched only in the first half of the frames ...
    start_frame = np.argmax(logits[first_non_pad_char, :mid_sentence])
    # get last non padding char
    last_non_pad_char = np.max(np.where(padded_text != maping['@']))
    # ... and the end frame only in the second half.
    end_frame = mid_sentence+np.argmax(logits[last_non_pad_char, mid_sentence:])
    #get RLE of best_chars - removing repeating chars
    rle_all = rle(best_chars[start_frame:end_frame])
    rle_dur = rle_all.runs
    rle_char = rle_all.values.astype(int)
    # note that the beginning of the alignment is not very "focused" so skip to start pos
    padded_text = padded_text[first_non_pad_char:(last_non_pad_char+1)]
    r_pos = 0
    frame = start_frame
    # One (start, end) row per character; -1 marks "not matched yet".
    match = -1*np.ones((len(padded_text),2))
    #
    # The following matches the RLE encoding and the aligned_chars map
    #
    # NOTE(review): r_pos is never bounds-checked, so rle_char[r_pos] /
    # rle_char[r_pos+1] look like they can raise IndexError once the runs are
    # exhausted before the text is — confirm.
    for i,c in enumerate(padded_text):
        if c == rle_char[r_pos]:
            # there is a match
            match[i,:] = (frame,frame+rle_dur[r_pos])
            frame += rle_dur[r_pos]
            r_pos += 1
        elif rle_char[r_pos]==maping[' '] and c == rle_char[r_pos+1]:
            # allow floating spaces in the middle of the words
            match[i,:] = (frame+rle_dur[r_pos],frame+rle_dur[r_pos]+rle_dur[r_pos+1])
            frame += (rle_dur[r_pos]+rle_dur[r_pos+1])
            r_pos += 2
        # if char was not found - it keeps the -1 placeholder
    # fix chars that were not found: give them a zero-length span at the end of
    # the previous character
    # NOTE(review): for an unmatched first character, index 0-1 wraps around to
    # the last row — confirm this is intended.
    missing_values = np.where(match[:,0]==-1)[0]
    if len(missing_values)>0:
        prev_ends = match[(np.array(missing_values)-1),1]
        match[missing_values,] = np.stack((prev_ends,prev_ends),axis=1)
    # Second pass with the same rule for rows that are still negative (e.g. when
    # the previous row was itself unmatched during the first pass).
    negative_values = np.where(match[:,0]<0)[0]
    if len(negative_values)>0:
        prev_ends = match[(np.array(negative_values)-1),1]
        match[negative_values,] = np.stack((prev_ends,prev_ends),axis=1)
    # concat everything together: scale frame indices by frame_length
    get_all = match* frame_length #np.hstack((match*frame_length,np.array([demapping[c] for c in padded_text]).reshape(match.shape[0],-1)))
    # get the places with spaces
    return get_all,padded_text

# fl = 256/22050
# alignment,padded_text = get_aligment(align, align_text,fl)

# print(alignment.shape)
# print(len(padded_text))


In [None]:

#
# let's rerun everything....
#
# Smoke test of the TTS service with a short sentence.
# NOTE(review): this URL repeats the host literal instead of using tts_node.
text = "hi there"
what = ['mel', 'align_logist', 'frame_length', 'sample_rate',"signal","align_text"]
url = "https://tts.test.vocacloud.net"+"/tts/generate"
json_data, kwargs = generate_kwargs_for_voca(text, what)
rawResponse = requests.post(url, json=json_data, stream=True, verify=False, **kwargs)
rawResponse

#
# from Tacotron!
#
# Synthesize the sentence through the TTS service and save it as a wav.
text = "But it involves a happiness that will last throughout our lives, will it not?"
fn = get_temp_filename("/tmp","temp_",".txt")
filename = get_temp_filename("/tmp","temp_",".wav")
with open(fn,"w") as f:
    f.write(text)
out,sr,mel,align,align_text = get_current_jenny(fn)    
write(filename, sr,out)
# Factor applied to frame indices by get_aligment.
# Presumably seconds per frame (hop of 256 samples at 22050 Hz) — TODO confirm.
fl = 256/22050
letters_alignment,padded_text = get_aligment(align, align_text,fl)
# Span from the start of the first character to the end of the last one;
# only that part of the audio is handed to the forced aligner.
s = 0
e = -1
duration = letters_alignment[e,1]-letters_alignment[s,0]
start = letters_alignment[s,0]
tg,wf = mfa_align(text,filename,start,duration)
phones = get_pitch_for_grid(tg,wf,pd.DataFrame())
# The aligner saw trimmed audio, so shift its times back by the trim start.
phones['min'] = phones['min'] + letters_alignment[0,0]
phones['max'] = phones['max'] + letters_alignment[0,0]

#
# from ground_truth
#
# NOTE(review): find_in_train_set returns None for unknown sentences, which
# would produce the path "../Jenny/wavs/None.wav" here.
train_id = find_in_train_set(text)
train_file = "../Jenny/wavs/{}.wav".format(train_id)
gt_tg,gt_wf = mfa_align(text,train_file)
gt_phones = get_pitch_for_grid(gt_tg,gt_wf,pd.DataFrame())
#gt_phones['min'] = gt_phones['min'] + letters_alignment[0,0]
#gt_phones['max'] = gt_phones['max'] + letters_alignment[0,0]


In [None]:
#now view all together
def draw_phones(phones,colour,compare=None,compare_colour="k"):
    """Draw one horizontal segment per phone at its mean pitch, labelled with its text.

    Rows are addressed with ``phones.loc[0..n-1]``, so ``phones`` needs index
    labels 0..n-1 (e.g. after ``reset_index(drop=True)``).

    :param phones: DataFrame with columns ``mean, min, max, text, id``
    :param colour: colour of the segments and labels
    :param compare: optional DataFrame with ``id`` and ``mean`` columns; for each
                    phone, the means of its rows with the same id are drawn over
                    the phone's own time span
    :param compare_colour: colour of the comparison segments
    """
    plt.hlines(phones['mean'],phones['min'],phones['max'],color=colour)
    n_rows = phones['min'].shape[0]
    for row in range(n_rows):
        seg_start = phones.loc[row,'min']
        mean_pitch = phones.loc[row,'mean']
        # Phones with a NaN mean get no label.
        if np.isfinite(mean_pitch):
            plt.text(float(seg_start),mean_pitch,phones.loc[row,'text'],fontsize=18,color=colour)
        if compare is None:
            continue
        phone_id = phones.loc[row,'id']
        reference_mean = compare['mean'][(compare['id']==phone_id)]
        plt.hlines(reference_mean,phones.loc[row,'min'],phones.loc[row,'max'],color=compare_colour)
    
def draw_letters(padded_text,letters_alignment,colour):
    """Mark every aligned character on the current plot.

    Each character with a finite, positive start gets dotted vertical lines at
    its start and end, its symbol as a text label and a short horizontal bar
    under the label. Spaces are shown as "-".

    :param padded_text: encoded characters (decoded through ``demapping``)
    :param letters_alignment: array whose row ``i`` holds (start, end) of character ``i``
    :param colour: colour of the labels and bars
    """
    symbols = [demapping[code] for code in padded_text.tolist()]
    for i, symbol in enumerate(symbols):
        # Alternate the label height so neighbouring characters do not overlap.
        h = 100 if i % 2 else 90
        label = "-" if symbol == " " else symbol
        start = letters_alignment[i,0]
        if not (np.isfinite(start) and start > 0):
            continue
        end = letters_alignment[i,1]
        plt.vlines(start,1,500,linestyle=":",color="gray")
        plt.vlines(end,1,500,linestyle=":",color="gray")
        plt.text(start,h,label,fontsize=18,color=colour)
        plt.hlines(h-20,start,end,color=colour)

# draw_phones addresses rows by label 0..n-1, so give both frames a fresh index
# (get_pitch_for_grid returns rows that all carry label 0).
phones = phones.reset_index(drop=True)
gt_phones = gt_phones.reset_index(drop=True)

# Synthesized phones in red, with the ground-truth means overlaid in the
# default comparison colour; character boundaries in blue.
draw_phones(phones,"r",gt_phones)
draw_letters(padded_text,letters_alignment,"b")
# plt.show()
# draw_phones(gt_phones,"k")
# Players for the two recordings being compared.
print("GT")
display(Audio(train_file))
print("tacotron")
display(Audio(filename))


In [None]:
%%writefile ../pitch_transform.py
import torch

def pitch_transform_custom(pitch, duration):
    """Custom pitch/duration hook; currently returns both inputs unchanged.

    The commented-out lines below are earlier experiments (constant offsets,
    per-position overrides, duration scaling) kept for reference.

    :param pitch: pitch values to transform
    :param duration: duration values to transform
    :return: tuple ``(pitch, duration)``
    """
# pitch = pitch + 110
#     print("---!")
#     #duration = duration * 0 + 1.5
#     #pitch = pitch * 0 + 300
#     print(duration)
#     print(pitch)
#     #pitch_offset = [0, -11.734620465350275, -28.689850792933385, 0, -44.61186523887815, -45.70688894982871, 0, -56.560140114505515, -52.77171128844202, -58.20461132565032, -105.01983787614819, -124.48284897082337, -116.23204895517284, -116.23204895517284, 0, 0, -64.71739562158871, 0, -38.5914461297364, -42.897411924509385, 0, 0, -3.3426176814817268, -12.247741709341824, 5.828423138490507, 0, 0, 0, 0, 0, 0, 295.5051434349008, 0, -13.84992685544674, -8.756527112814183, -11.744156656968755, -11.744156656968755, 0, -16.215855559479365, -43.01594103179767, 0, 0, 0, 0, 0, -10.952073349230062, -16.445555763011413, -16.445555763011413, -16.445555763011413, -16.445555763011413, -22.59699129071464, -22.59699129071464, -26.005503905906323, 0, -30.1506943181019, -30.1506943181019, -33.673773790807786, 0, -33.591902348573, 33.02369931724701, 16.906020670352575, 16.906020670352575, 0, 0, 0, -18.83333666886324, -25.158673490755348, -35.46426430051292, -35.46426430051292, 0, 294.5591855863818, 0, 0, -83.69600586493456, -22.06781578151788, 0, 0   ]
#     #pitch = pitch + torch.tensor(pitch_offset).to("cuda")
#     print("---!")
#     pitch[0][-6] = 180  
#     pitch[0][-5] = 260  
#     pitch[0][-4] = 360  
#     pitch[0][-3] = 360  
#     pitch[0][-2] = 380  
#     pitch[0][-1] = 400  
#     duration = duration*1.3

    return pitch ,duration

In [None]:
# Run FastPitch inference on the text file `fn` with the --pitch-transform-custom
# flag, then play the generated audio.
!python ../inference.py {flags3000} -i {fn} -o output/out --pitch-transform-custom  --speaker 0 --n-speakers 2 
IPython.display.Audio("output/out/audio_0.wav")

In [None]:
import json

# Endpoint that show_samples posts a mel file to, and the fields requested back.
# NOTE(review): this uses http:// while the other TTS URLs use https:// — confirm.
url = "http://tts.test.vocacloud.net/tts/mel2wav"
reply_fields=['signal','sample_rate','mel_before_denoiser','mel_after_denoiser']

# Candidate sentences; each assignment overwrites the previous one, so only the
# last value is in effect after this cell runs.
text = "But it involves a happiness that will last throughout our lives, will it not?"
text = "I felt somehow it would have been an easier job."
text = "Notice, no fear. No sense of impending doom. We came on the wrong night."
train_id = '05322'

def show_samples(text):
    train_id = sentences[text]

    file = get_temp_filename("/tmp","temp_",".txt")
    with open(file,"w") as f:
        f.write(text)

    wav_name = "../Jenny/wavs/{}.wav".format(train_id)
    a = wav_name
    #print("gt_full - {}".format(wav_name))
    #display(Audio(wav_name, normalize=True))

    #print("api_full")
    out,sr,mel,align, align_text = get_current_jenny(file)
    api_file = get_temp_filename("/tmp","temp_",".wav")
    write(api_file, 22050, out)
    b = api_file
    #display(Audio(api_file, normalize=True))


    # print("mel from mels folder. vocoder is Jenny")

    # mel1 = torch.load('../Jenny/mels/{}.pt'.format(train_id))
    # mel = mel1
    # melFile = '/tmp/mel.pt'
    # torch.save(mel,melFile, _use_new_zipfile_serialization=False)
    # with open(melFile, 'rb') as fin:
    #     files = {'file': fin,'reply_fields': json.dumps(reply_fields)}
    #     rawResponse = requests.post(url, files=files, stream=True,verify=False)
    # try:
    #     reply = torch.load(io.BytesIO(rawResponse.content))
    # except Exception as ex:
    #     print(rawResponse.content)

    # write("/tmp/temp222.wav", reply["sample_rate"], reply["signal"].numpy())
    # display(Audio("/tmp/temp222.wav", normalize=True))


    # print("mel from tacotron. vocoder is Jenny")

    # mel2 = torch.tensor(mel)
    # mel = mel2
    # melFile = '/tmp/mel.pt'
    # torch.save(mel,melFile, _use_new_zipfile_serialization=False)
    # with open(melFile, 'rb') as fin:
    #     files = {'file': fin,'reply_fields': json.dumps(reply_fields)}
    #     rawResponse = requests.post(url, files=files, stream=True,verify=False)
    # try:
    #     reply = torch.load(io.BytesIO(rawResponse.content))
    # except Exception as ex:
    #     print(rawResponse.content)

    # write("/tmp/temp222.wav", reply["sample_rate"], reply["signal"].numpy())
    # display(Audio("/tmp/temp222.wav", normalize=True))




    #print("mel from fastPitch. vocoder is Universal")

    !python ../inference.py {flags3000} -i {file} -o output/out --pitch-transform-custom  --speaker 0 --n-speakers 2 >/dev/null
    c = "output/out/audio_0.wav"
    #display(Audio(c, normalize=True))



    #print("mel from fastPitch. vocoder is Jenny")
    d = "/tmp/MFVJ.wav"
    melFile = 'output/out/audio_0_mel.pt'
    with open(melFile, 'rb') as fin:
        files = {'file': fin,'reply_fields': json.dumps(reply_fields)}
        rawResponse = requests.post(url, files=files, stream=True,verify=False)
    try:
        reply = torch.load(io.BytesIO(rawResponse.content))
    except Exception as ex:
        print(rawResponse.content)

    write(d, reply["sample_rate"], reply["signal"].numpy())
    #display(Audio(d, normalize=True))





    audio_widgets_files  = [a,b,c,d]
    audio_widgets = []
    for widget_file in audio_widgets_files:
        with open(widget_file,"rb") as wf:
            data = wf.read()
        sample_rate = 22050
        out = widgets.Output()
        with out:
            display(Audio(data=data, rate=sample_rate))
        audio_widgets.append(out)
    display(HBox(audio_widgets))

# Fixed seed so the same 15 sentences are picked on every run.
random.seed(15)
sentences2 = list(sentences.keys())
random.shuffle(sentences2)
# Compare the audio sources for the first 15 shuffled sentences.
for s in sentences2[0:15]:
    show_samples(s)