This notebook interactively generates the online supplement page (`index.html`) and copies the audio samples that the page links to.

In [1]:
import shutil
from os import system
from os.path import expanduser
from pathlib import Path

In [2]:
from templates.audio_table import table_with_texts
from templates.frame import header, footer

In [16]:
# Human-readable titles for each model, keyed by the name of the model's
# sample directory. Insertion order is kept identical to the original mapping
# because the directory-creation cell iterates over it.
code_map = dict(
    nsf_adv="NSF-adv",
    nsf_adv_tts="Tacotron2 + NSF-adv",
    pwg="Parallel WaveGAN",
    pwg_tts="Tacotron2 + Parallel WaveGAN",
    h_sinc_nsf="hn-sinc-NSF",
    h_sinc_nsf_tts="Tacotron2 + hn-sinc-NSF",
    nhv_noadv="NHV-noadv",
    original="Original",
    nhv="NHV(cGAN)",
    nhv_non_conditional_gan="NHV(GAN)",
    nhv_tts="Tacotron2 + NHV",
    wavenet="MoL WaveNet",
    ddsp="DDSP(Sinusoids + Noise)",
    spn_adv="DDSP(S+N, cGAN)",
)

# Samples are read from `file_root/<model>/` and mirrored into
# `target_root/<model>/`, which is the tree the generated page links to.
file_root = Path("/home/sorcerer/Experiments/samples/")
target_root = Path("samples/")

In [17]:
# Utterance ids whose samples appear in the copy-synthesis table.
copy_id = [
    9913, 9926, 9933, 9963,
    9977, 9979, 9988,
]

# Utterance ids for the text-to-speech table.
# Cherry-picked for sentences with good prosody.
tts_id = [
    9917, 9918, 9920, 9925,
    9929, 9942, 9957,
]

In [18]:
# Models shown (in this column order) for each copy-synthesis utterance.
copy_model = [
    "original",
    "nhv_non_conditional_gan",
    "nhv",
    "pwg",
    "nsf_adv",
    "wavenet",
    "h_sinc_nsf",
    "nhv_noadv",
    "ddsp",
    "spn_adv",
]

# Models shown (in this column order) for each text-to-speech utterance.
tts_model = [
    "original",
    "nhv_tts",
    "pwg_tts",
    "nsf_adv_tts",
    "h_sinc_nsf_tts",
]

In [19]:
class CNTextLoader:
    """Look up the text and pinyin of an utterance by its numeric id.

    The label file is read as alternating lines: every even line (0-based)
    starts with the utterance id followed by the text, and the line after it
    holds the whitespace-separated pinyin syllables for that utterance.

    Attributes:
        idxs: utterance ids, in file order.
        texts: second whitespace-separated token of each id line.
        pinyins: list of pinyin tokens for each utterance, in file order.
        idx_text: mapping from utterance id to text.
        idx_pinyin: mapping from utterance id to pinyin token list.
    """

    # Manual corrections to the parsed pinyin, keyed by 0-based position in
    # the file (not by utterance id).
    _PINYIN_FIXES = {
        2364: ['zhe4', 'tu2', 'nan2', 'bu4', 'cheng2', 'shi4', 'pi1', 'guo4', 'de5'],
    }

    def __init__(self, text_file_path):
        """Parse the label file at ``text_file_path`` (``~`` is expanded).

        Raises:
            AssertionError: if the file does not exist.
        """
        text_file_path = Path(expanduser(str(text_file_path)))
        assert text_file_path.exists(), "Text File Not Found"
        # The file contains Chinese characters, so decode it explicitly as
        # UTF-8 instead of relying on the platform's default locale encoding.
        with open(text_file_path, "r", encoding="utf-8") as f:
            lines = f.readlines()
        self.idxs = idxs = [int(l.strip().split()[0]) for l in lines[::2]]
        self.texts = texts = [l.strip().split()[1] for l in lines[::2]]
        self.pinyins = pinyins = [l.strip().split() for l in lines[1::2]]
        # Apply the positional corrections only when the entry actually
        # exists, so shorter label files load instead of raising IndexError.
        for position, corrected in self._PINYIN_FIXES.items():
            if position < len(pinyins):
                pinyins[position] = list(corrected)
        self.idx_text = {}
        self.idx_pinyin = {}
        for idx, text, pinyin in zip(idxs, texts, pinyins):
            self.idx_text[idx] = text
            self.idx_pinyin[idx] = pinyin

    def __len__(self):
        """Return the number of utterance ids parsed from the file."""
        return len(self.idxs)

    def __getitem__(self, idx: int):
        """Return ``(text, pinyin_tokens)`` for utterance id ``idx``.

        Raises:
            KeyError: if ``idx`` was not present in the file.
        """
        return self.idx_text[idx], self.idx_pinyin[idx]

In [20]:
# Parse the prosody-labelling file once; later cells look utterances up by id.
cn_text_loader = CNTextLoader(
    text_file_path="/home/sorcerer/datasets/BZNSYP/ProsodyLabeling/000001-010000.txt",
)

In [21]:
# Each lookup yields (text, pinyin); only the text is shown next to the audio.
copy_text = [text for text, _pinyin in (cn_text_loader[utt] for utt in copy_id)]
tts_text = [text for text, _pinyin in (cn_text_loader[utt] for utt in tts_id)]

In [22]:
# Rebuild the output tree from scratch so samples from an earlier run never
# linger. Uses `target_root` rather than a second hard-coded "samples/" path,
# and the standard library rather than shelling out to `rm`/`mkdir`.
# `ignore_errors=True` keeps the original "rm -rf" tolerance for a directory
# that does not exist yet.
shutil.rmtree(target_root, ignore_errors=True)
target_root.mkdir(parents=True, exist_ok=True)
for model_id in code_map:
    # One sub-directory per model, named after its key in `code_map`.
    (target_root / model_id).mkdir(exist_ok=True)

In [23]:
# Copy the copy-synthesis samples into the web tree and collect, per
# utterance, the audio URLs and the matching column titles for the table.
copy_audios = []
copy_titles = []
for i in copy_id:
    audios = []
    titles = []
    for model_id in copy_model:
        source_file = file_root / model_id / f"{i:06d}.wav"
        target_file = target_root / model_id / f"{i:06d}.wav"
        # Make sure the destination directory exists even if the
        # directory-reset cell was not run first.
        target_file.parent.mkdir(parents=True, exist_ok=True)
        # shutil.copy raises if the source sample is missing, instead of
        # silently leaving a broken audio link on the page like `cp` run
        # through os.system did. It also avoids building a shell command
        # from unquoted paths.
        shutil.copy(source_file, target_file)
        # URLs always use forward slashes, regardless of the host OS.
        audios.append("/nhv-web/" + target_file.as_posix())
        titles.append(code_map[model_id])
    copy_audios.append(audios)
    copy_titles.append(titles)

In [24]:
# Copy the text-to-speech samples into the web tree and collect, per
# utterance, the audio URLs and the matching column titles for the table.
tts_audios = []
tts_titles = []
for i in tts_id:
    audios = []
    titles = []
    for model_id in tts_model:
        source_file = file_root / model_id / f"{i:06d}.wav"
        target_file = target_root / model_id / f"{i:06d}.wav"
        # Make sure the destination directory exists even if the
        # directory-reset cell was not run first.
        target_file.parent.mkdir(parents=True, exist_ok=True)
        # shutil.copy raises if the source sample is missing, instead of
        # silently leaving a broken audio link on the page like `cp` run
        # through os.system did. It also avoids building a shell command
        # from unquoted paths.
        shutil.copy(source_file, target_file)
        # URLs always use forward slashes, regardless of the host OS.
        audios.append("/nhv-web/" + target_file.as_posix())
        titles.append(code_map[model_id])
    tts_audios.append(audios)
    tts_titles.append(titles)

In [25]:
# Assemble the whole page as one string: shared header, static intro HTML,
# the copy-synthesis table, the text-to-speech table, and the shared footer.
page = (
    header("Online Supplement")
    + """
    <div class="page-header">
        <h1>Neural Homomorphic Vocoder <small>Online supplement for InterSpeech 2020</small></h1>
    </div>
    <h3> Authors </h3>
    <div class="row">
        <div class="col-md-4">
            <address>
                <strong>Zhijun Liu</strong><br>
                <a href="#">sorcerer~at~sjtu.edu.cn</a>
            </address>
        </div>
        <div class="col-md-4">
            <address>
                <strong>Kuan Chen</strong><br>
                <a href="#">azrealkuan~at~sjtu.edu.cn</a>
            </address>
        </div>
        <div class="col-md-4">
            <address>
                <strong>Kai Yu</strong><br>
                <a href="#">kai.yu~at~sjtu.edu.cn</a>
            </address>
        </div>
    </div>
    <ul>
        <li>
            <strong>
            <a href="/nhv-web/phase.html">This jupyter notebook</a> demonstrates the importance of phase in speech.
            </strong>
        </li>
        <li>
        The <a href="https://www.data-baker.com/open_source.html">Chinese Standard Mandarin Speech Corpus</a> is used for the demo. 
        </li>
        <li>
        All <strong>generated samples</strong> can be found on the <a href="https://drive.google.com/drive/folders/1IY28v3kHZh6e12wB0y0p2IXjqI9vwmH9?usp=sharing">google drive</a>.
        </li>
        <li>
        The analysis of <strong>computational complexity</strong> can be found in <a href="/nhv-web/computational_complexity.html">this jupyter notebook</a>.
        </li>
        <li>
        Further <strong>training details</strong> can be found in <a href="/nhv-web/training_details.html">this jupyter notebook</a>.
        </li>
    </ul>
    <h3> Copy Synthesis Demos </h3>
    <div class="alert alert-info" role="alert">
        <p>
        NHV(cGAN) is the model described in the paper and used in evaluation. NHV(GAN) was trained with the same discriminator used in b-NSF-adv. They have slightly different loss in audio quality. 
        </p>
        <p>
        DDSP(Sinusoids + Noise) uses the same network structure and inputs as NHV models. The output of the networks are replaced with `harmonic distributions` and `noise filter FFT amplitudes` as described in the DDSP paper. The F0 is encoded with 80 dimensional one-hot vectors in log-scale.
        This model was not used in evaluations.
        </p>
        <p>
        DDSP(S+N, cGAN) is the same as DDSP(S+N). Except that it is trained with cGAN. The GAN structure and hyper-parameters are the same as that used in NHV(cGAN).
        </p>
    </div>
    """
    + table_with_texts(
        copy_text,
        copy_audios,
        copy_titles,
        width=3
    )
    + """<h3> Text-to-Speech Demos </h3>"""
    + table_with_texts(
        tts_text,
        tts_audios,
        tts_titles,
        width=3
    )
    + footer()
)
# The page embeds the Chinese transcripts, so write it explicitly as UTF-8
# rather than in whatever the platform's default encoding happens to be.
with open("index.html", "w", encoding="utf-8") as f:
    f.write(page)

In [26]:
# Copy phase.ipynb from the experiment directory into the working directory.
# The generated page links to /nhv-web/phase.html, which the recorded output
# of the next cell shows being converted from phase.ipynb.
!cp /home/sorcerer/Experiments/2020/Tacotron+NHV/Final/phase.ipynb phase.ipynb

In [27]:
# Run the conversion script. Per the recorded output below, it converts
# computational_complexity.ipynb, phase.ipynb and training_details.ipynb to
# HTML via nbconvert; these are the pages index.html links to.
!source ./convert_all_ipynb_to_html.sh

[34mConverting computational_complexity.ipynb to HTML.[32m
[NbConvertApp] Converting notebook computational_complexity.ipynb to html
[NbConvertApp] Writing 297898 bytes to computational_complexity.html
[34mConverting phase.ipynb to HTML.[32m
[NbConvertApp] Converting notebook phase.ipynb to html
[NbConvertApp] Writing 1680105 bytes to phase.html
[34mConverting training_details.ipynb to HTML.[32m
[NbConvertApp] Converting notebook training_details.ipynb to html
[NbConvertApp] Writing 297915 bytes to training_details.html
