In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pylab as plt
import IPython.display as ipd

import json
import sys
import torch
from torch.distributions import Normal

from flowtron import Flowtron
from data import Data
from train import update_params
from train import load_flowtron
sys.path.insert(0, "tacotron2")
sys.path.insert(0, "tacotron2/waveglow")
from denoiser import Denoiser


In [2]:
def plot_data(data, figsize=(16, 4)):
    fig, axes = plt.subplots(1, len(data), figsize=figsize)
    for i in range(len(data)):
        axes[i].imshow(data[i], aspect='auto', origin='lower', 
                       interpolation='none')

In [3]:
config_path = "config.json"
params = ["model_config.dummy_speaker_embedding=0",
          "data_config.p_arpabet=1.0"]

with open(config_path) as f:
    data = f.read()

config = json.loads(data)
update_params(config, params)

data_config = config["data_config"]
model_config = config["model_config"]

model_config.dummy_speaker_embedding=0
dummy_speaker_embedding=0
data_config.p_arpabet=1.0
p_arpabet=1.0


In [4]:
#model_path = "outdir/model_682000"
model_path = "flowtron_1006000"
# model_path = "flow/model_112000"
state_dict = torch.load(model_path, map_location='cpu')['model'].state_dict()
model = Flowtron(**model_config).cuda()
model.load_state_dict(state_dict)
_ = model.eval().cuda()

##### Load WaveGlow

In [5]:
waveglow_path = 'waveglow_256channels_final.pt'
waveglow = torch.load(waveglow_path)['model']
_ = waveglow.eval().cuda()
denoiser = Denoiser(waveglow).cuda().eval()

In [6]:
ignore_keys = ['training_files', 'validation_files']
trainset = Data(
    data_config['training_files'],
    **dict((k, v) for k, v in data_config.items() if k not in ignore_keys))

Number of speakers : 1
Number of speakers : 1


In [7]:
speaker_vecs = trainset.get_speaker_id(0).cuda()
speaker_vecs = speaker_vecs[None]

In [14]:
# Say luận mô hình Flowtron với các tham số sigma khác nhau. 
# Nếu sigma bằng 0, ko có sự thay đổi giữa các lần suy luận.
# Càng tăng sigma, sẽ có sự thay đổi giữa các lần suy lận.

sigma =0
n_frames = 500
text = "Kiểm tra sự thay đổi giọng nói giữa các mô hình"
text_encoded = trainset.get_text(text).cuda()[None]

with torch.no_grad():
    residual =    torch.cuda.FloatTensor(1, 80, n_frames).normal_() * sigma
    mels, attentions = model.infer(residual, speaker_vecs, text_encoded)


audio = denoiser(waveglow.infer(mels, sigma=0.8), 0.001)
ipd.Audio(audio[0].data.cpu().numpy(), rate=data_config['sampling_rate'])    

Hitting gate limit


In [18]:
fig, axes = plt.subplots()
axes.imshow(mels[0].cpu(), aspect='auto', origin='lower', interpolation='none')
fig.savefig('flowtron_sigma132.png')

In [1]:
from prior import *
import matplotlib.pyplot as plot

In [4]:
demoScatterPlot()

-----Sigma 0-----
[4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155]
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-----Sigma 0.5-----
[3.378503401360544, 3.81968253968254, 3.81968253968254, 3.2043537414965986, 4.098321995464852, 3.1579138321995464, 3.610702947845805, 3.4829931972789114, 4.2260317460317465, 3.7151927437641725]
[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
-----Sigma 1-----
[3.8545124716553287, 3.2275736961451247, 3.5526530612244898, 3.900952380952381, 4.179591836734694, 2.8212244897959184, 3.0650340136054424, 3.7732426303854876, 3.0650340136054424, 3.877732426303855]
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
-----Tacotron-----
[3.2856235827664397, 3.215963718820862, 3.076643990929705, 3.0418140589569163, 3.239183673469388, 3.215963718820862, 3.4017233560090703, 3.006984126984127, 3.1927437641723357, 2.9

In [5]:
colors = ['b', 'c', 'y', 'm', 'r']
x = [4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155, 4.1099319727891155]
y = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
sigma0 = plot.scatter(x, y, marker = 'o', color=colors[0])

x = [3.378503401360544, 3.81968253968254, 3.81968253968254, 3.2043537414965986, 4.098321995464852, 3.1579138321995464, 3.610702947845805, 3.4829931972789114, 4.2260317460317465, 3.7151927437641725]
y = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
sigma0p5 = plot.scatter(x, y, marker = 'o', color=colors[1])

x = [3.8545124716553287, 3.2275736961451247, 3.5526530612244898, 3.900952380952381, 4.179591836734694, 2.8212244897959184, 3.0650340136054424, 3.7732426303854876, 3.0650340136054424, 3.877732426303855]
y = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
sigma1 = plot.scatter(x, y, marker = 'o', color=colors[2])

x = [3.2856235827664397, 3.215963718820862, 3.076643990929705, 3.0418140589569163, 3.239183673469388, 3.215963718820862, 3.4017233560090703, 3.006984126984127, 3.1927437641723357, 2.995374149659864]
y = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]                                                            
t2 = plot.scatter(x, y, label = '+', color=colors[3])
plot.legend((sigma0, sigma0p5, sigma1, t2),
            ('Sigma=0', 'Sigma=0.5', 'Sigma=1', 'Tacotron'),
           scatterpoints=1,
           fancybox=True,
            loc= 'lower left'
           )
plot.xlabel("Duration(s)")
plot.ylabel("Parameters")
plot.savefig('scatterplot.png')

In [7]:
filename = "results/prior/sigma1/0.wav"
y, sr = librosa.load(path=filename)
f0, _, _ = librosa.pyin(y=y, fmin=librosa.note_to_hz('D♯2'), fmax=librosa.note_to_hz('A4') )
x = [2.5541950113378684, 2.391655328798186, 3.1811337868480725, 5.7817687074829935, 2.4961451247165534, 2.693514739229025, 2.5425850340136056, 3.900952380952381, 2.6354648526077096, 2.6354648526077096]
plt.contour(x, f0)

AttributeError: module 'librosa' has no attribute 'pyin'

In [1]:
for i in range(2):
    print(i)

0
1
