# Synthesis

This notebook calls the trained cyc-noise-hn-nsf models and generate the waveforms on CPU. 

## Load packages

In [1]:
# At the begining, let's load packages 
from __future__ import absolute_import
from __future__ import print_function
import sys, os
import numpy as np
import torch
import torch.nn as torch_nn
import torch.nn.functional as torch_nn_func
import time

# misc functions for this demonstration book
import tool_lib

## Initialize original Cyc-Noise-Hn-NSF 
We first use the cyc-noise-hn-nsf in data_models/pre_trained_hn_nsf.

#### Load model definition

In [2]:
# load the basic function blocks
REPO_PATH = "/home/tianyu_zhao/repos/nsf"
PROJ_PATH = "/home/tianyu_zhao/repos/nsf/project/cyc-noise-nsf-4"
sys.path.append(REPO_PATH)
sys.path.append(PROJ_PATH)
import model as nii_nn_blocks

# input feature dim (80 dimension Mel-spec + 1 dimension F0)
mel_dim = 80
f0_dim = 1
input_dim = mel_dim + f0_dim

# output dimension = 1 for waveform
output_dim = 1
# sampling rate of waveform (Hz)
sampling_rate = 16000
# up-sampling rate of acoustic features (sampling_rate * frame_shift)
feat_upsamp_rate = int(16000 * 0.01)

# sampling rate and up-sampling rate have been written in data_models/pre_trained_hn_nsf/model.py for this tutorial.
# no need to provide them as arguments
# declare the model
hn_nsf_model = nii_nn_blocks.Model(input_dim, output_dim, None)

#### Load pre-trained model and data

In [66]:
# load pre-trained model
checkpoint_path = os.path.join(PROJ_PATH, "ckpt_lili_10ms", "trained_network.pt")
device=torch.device("cuda")
hn_nsf_model.to(device, dtype=torch.float32)
checkpoint = torch.load(checkpoint_path, map_location="cpu")
hn_nsf_model.load_state_dict(checkpoint)

<All keys matched successfully>

In [70]:
# load mel and F0
input_mel_path = os.path.join(REPO_PATH, 'project', 'DATA', 'binbin', '10ms', 'mspec', 'scov1110.mspec')
input_f0_path = os.path.join(REPO_PATH, 'project', 'DATA', 'binbin', '10ms', 'f0', 'scov1110.f0')
# input_mel_path = '/data2/tianyu/cache/synthesized/f32/tst_04.mel'
# input_f0_path = '/data2/tianyu/cache/synthesized/f32/tst_04.f0'
input_mel = tool_lib.read_raw_mat(input_mel_path, mel_dim)
input_f0 = tool_lib.read_raw_mat(input_f0_path, f0_dim)

print("Input Mel shape:" + str(input_mel.shape))
print("Input F0 shape:" + str(input_f0.shape))

# compose the input tensor
input_length = min([input_mel.shape[0], input_f0.shape[0]])
input_tensor = torch.zeros(1, input_length, mel_dim + f0_dim, dtype=torch.float32)
input_tensor[0, :, 0:mel_dim] = torch.tensor(input_mel[0:input_length, :])
input_tensor[0, :, mel_dim:] = torch.tensor(input_f0[0:input_length]).unsqueeze(-1)
print("Input data tensor shape:" + str(input_tensor.shape))

Input Mel shape:(517, 80)
Input F0 shape:(517,)
Input data tensor shape:torch.Size([1, 517, 81])


### Do generation and evaluate speed

In [71]:
num_iter = 1

print("Generate a waveform for %d times:" % (num_iter))
time_start = time.time()
with torch.no_grad():
    for idx in range(num_iter):
        output_waveform = hn_nsf_model(input_tensor.cuda())
        print("%d" % (idx), end=', ')
time_end = time.time()


Generate a waveform for 1 times:
0, 

In [72]:
print("Generation done")
output_waveform_array = output_waveform[0][0].cpu().numpy()
output_duration = output_waveform_array.shape[0] / sampling_rate

time_average = (time_end - time_start) / num_iter
speed_per_s = output_waveform_array.shape[0] / time_average
real_time_factor =  output_duration / time_average
print("Speed (waveform sampling points per second): %f" % (speed_per_s))
print("Real time factor: %f" % (real_time_factor))

print("Generated sample:")
import IPython.display
IPython.display.Audio(output_waveform_array, rate=sampling_rate, normalize=False)

Generation done
Speed (waveform sampling points per second): 559452.447139
Real time factor: 34.965778
Generated sample:


In [65]:
input_mel.min(), input_mel.max(), input_f0.min(), input_f0.max()

(0.0, 0.6474179, 0.0, 298.922)

In [58]:
input_f0

array([  0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      ,   0.      ,   0.      ,
         0.      ,   0.      ,   0.      , 240.66783 , 253.020