<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#uberduck_ml_dev.exec.force_spectrogram" data-toc-modified-id="uberduck_ml_dev.exec.force_spectrogram-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>uberduck_ml_dev.exec.force_spectrogram</a></span></li></ul></div>

In [None]:
# default_exp exec.force_spectrogram

# uberduck_ml_dev.exec.force_spectrogram

In [None]:
# export
import argparse
from collections import namedtuple
from dataclasses import dataclass
import json
import os
from pathlib import Path
from shutil import copyfile, copytree
import sys
from typing import List, Optional, Set

import numpy as np
from tqdm import tqdm
import torch

from nemo.collections.tts.models import TalkNetSpectModel
from nemo.collections.asr.data.audio_to_text import AudioToCharWithDursF0Dataset


def parse_args(args):
    """Parse the command-line arguments of the force-spectrogram script.

    Args:
        args: List of argument strings, e.g. ``sys.argv[1:]``.

    Returns:
        ``argparse.Namespace`` with the attributes ``model``, ``filelist``,
        ``model_type``, ``durations``, ``f0s``, ``rel_path`` and ``cuda``.
        ``cuda`` is always a ``bool``.
    """

    def _flag_value(value):
        # argparse stores the raw string when no converter is given, so
        # "--cuda False" used to be truthy. Recognised "false" spellings map to
        # False; any other value keeps its previous (truthy) meaning.
        return value.strip().lower() not in ("", "0", "false", "f", "no", "n", "off", "none")

    parser = argparse.ArgumentParser()
    # The descriptions below were previously passed as ``default=`` and never
    # shown to the user; they belong in ``help=``.
    parser.add_argument(
        "-m", "--model", help="Path to model state dict", required=True
    )
    parser.add_argument("-f", "--filelist", help="Path to filelist", required=True)
    parser.add_argument("-t", "--model-type", help="model type", default="talknet")
    parser.add_argument("--durations", help="Path to the durations file (torch.load)")
    parser.add_argument("--f0s", help="Path to the f0s file (torch.load)")
    parser.add_argument(
        "--rel-path", help="Directory prepended to the audio paths in the filelist"
    )
    parser.add_argument(
        "--cuda",
        type=_flag_value,
        default=torch.cuda.is_available(),
        help="Run on CUDA (true/false); defaults to torch.cuda.is_available()",
    )
    return parser.parse_args(args)


# expected_tokens = torch.tensor([[34, 18,  0,  6, 36, 23,  0, 41,  0, 15, 31,  8, 35,  0, 41,  0,  8, 33,
#           0, 23, 35,  0, 41,  0, 18, 39, 14, 28,  9,  0, 27, 12,  3,  0,  1, 34,
#           6, 35,  0, 42,  0, 25, 12,  3, 15, 33,  0,  5, 15, 27, 11,  0, 29, 18,
#           9, 26, 16, 18,  0, 41,  0,  8, 25,  3, 27,  0, 41,  0,  9, 22, 38, 15,
#          14, 27, 18,  0, 41,  0, 12, 26, 16,  0, 27, 12,  3,  0,  4, 31, 12,  0,
#          11, 35,  0, 42]])
# expected_durs = torch.tensor([[10,  1,  0,  1, 11,  1,  1,  1,  0,  1,  0,  2, 19,  2,  0,  1,  0,  1,
#           0,  1,  0,  1,  0,  1, 15,  1, 38,  1,  0,  1,  0,  2,  1,  1,  0,  1,
#           0,  1,  0,  1, 14,  1, 14,  1,  0,  1,  0,  2,  0,  2,  0,  1, 20,  1,
#           0,  1,  0,  2, 12,  3,  1,  1,  0,  1,  0,  1, 11,  3,  0,  1,  0,  1,
#           0,  1, 10,  1, 10,  2,  0,  1,  0,  2,  1,  1,  0,  1, 10,  1,  0,  1,
#           0,  1,  6,  3,  0,  1,  0,  1,  0,  1,  0,  1,  8,  2,  1,  1,  0,  1,
#          20,  1,  0,  1,  0,  1,  0,  1, 19,  1,  0,  1,  0,  1,  0,  1,  0,  1,
#           0,  1,  9,  1,  9,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
#          10,  1,  0,  1,  0,  1, 23,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,
#          19,  3,  1,  1,  0,  1,  0,  1,  8,  3,  0,  1,  0,  1,  0,  1, 15,  2,
#           0,  1,  0,  1, 17,  1,  0,  1,  0]])


def _interleave_durations(blanks, tokens):
    """Interleave blank and token durations into a single ``(1, 2n + 1)`` row.

    ``blanks`` must hold exactly one more entry than ``tokens``; the result is
    ``[b0, t0, b1, t1, ..., t(n-1), bn]``.
    """
    # Pad tokens with one dummy zero so both tensors can be stacked pairwise;
    # the dummy ends up last after flattening and is dropped again.
    padded_tokens = torch.cat((tokens, torch.zeros(1).int()))
    return torch.stack((blanks, padded_tokens), dim=1).view(-1)[:-1].view(1, -1)


def run(args):
    """Generate forced-alignment spectrograms for every line of a filelist.

    Each filelist line is read as ``<audio path>|<text>``. The text is parsed
    into tokens by the model, combined with the stored durations and f0 for
    that utterance (looked up by the audio file's base name without
    extension), and the spectrogram returned by ``model.force_spectrogram`` is
    saved next to the audio file with a ``.npy`` extension.

    Lines that are empty, malformed, contain arpabet braces, or whose blanked
    token count does not match the stored durations are reported and skipped.

    Args:
        args: Namespace produced by ``parse_args``.

    Raises:
        Exception: If ``args.model_type`` is not ``"talknet"``.
        KeyError: If an utterance has no entry in the durations or f0s file.
    """
    if args.model_type != "talknet":
        raise Exception("Supported model types: talknet")

    # Keep the model and every input on the same device; previously only the
    # durations and f0 were moved to CUDA while the model stayed on the CPU.
    device = torch.device("cuda" if args.cuda else "cpu")
    model = TalkNetSpectModel.restore_from(args.model, map_location="cpu")
    model = model.to(device)
    model.eval()

    # NOTE(review): torch.load unpickles arbitrary objects -- only point these
    # arguments at trusted files.
    durs = torch.load(args.durations)
    f0s = torch.load(args.f0s)
    rel_path = args.rel_path
    with open(args.filelist, encoding="utf-8") as f:
        lines = f.readlines()

    for line in tqdm(lines):
        if not line.strip():
            continue
        if "{" in line or "}" in line:
            print("arpabet is not supported, skipping")
            print(line)
            continue

        fields = line.split("|")
        if len(fields) < 2:
            print("malformed filelist line, skipping")
            print(line)
            continue
        path = fields[0].strip()
        text = fields[1].strip()
        if rel_path:
            path = os.path.join(rel_path, path)
        line_name, _ = os.path.splitext(os.path.basename(path))

        # assumes model.parse returns a (1, n_tokens) tensor -- TODO confirm
        line_tokens = model.parse(text=text).to(device)
        line_durs = _interleave_durations(
            durs[line_name]["blanks"], durs[line_name]["tokens"]
        ).to(device)
        x_f0s = f0s[line_name].view(1, -1).to(device)

        if model.blanking:
            # Rebuild the blank-interleaved token sequence only to validate
            # that the stored durations line up with the parsed text.
            blanked_tokens = [
                AudioToCharWithDursF0Dataset.interleave(
                    x=torch.empty(len(t) + 1, dtype=torch.long, device=t.device).fill_(
                        model.vocab.blank
                    ),
                    y=t,
                )
                for t in line_tokens
            ]
            blanked_tokens = AudioToCharWithDursF0Dataset.merge(
                blanked_tokens, value=model.vocab.pad, dtype=torch.long
            )
            text_len = blanked_tokens.shape[-1]
            durs_len = line_durs.shape[-1]
            if text_len != durs_len:
                print(
                    f"token/duration length mismatch for {line_name} "
                    f"({text_len} != {durs_len}), skipping"
                )
                print([model.vocab._id2label[x] for x in blanked_tokens[0]])
                continue

        with torch.no_grad():
            spect = model.force_spectrogram(
                tokens=line_tokens, durs=line_durs, f0=x_f0s
            )
        # Swap only the file extension; str.replace would also rewrite ".wav"
        # occurring elsewhere in the path.
        out_path = os.path.splitext(path)[0] + ".npy"
        np.save(out_path, spect.detach().cpu().numpy())


# nbdev provides IN_NOTEBOOK so the command-line entry point below is not
# triggered when this cell is executed inside Jupyter. When nbdev cannot be
# imported (e.g. the exported module is run standalone) assume a plain script.
try:
    from nbdev.imports import IN_NOTEBOOK
except ImportError:
    # Only a failed import is expected here; a bare ``except`` would also
    # swallow KeyboardInterrupt and SystemExit.
    IN_NOTEBOOK = False

if __name__ == "__main__" and not IN_NOTEBOOK:
    args = parse_args(sys.argv[1:])
    run(args)

In [None]:
# Scratch invocation of the exported entry point against a local "zwf" model.
# Every flag is paired with its value so the simulated command line is easy to
# read and edit.
#
# A second voice can be run by swapping in the "sam-lachow" files instead:
#   -m          ../talknet/sam-lachow/TalkNetSpect.nemo
#   -f          /Users/zwf/data/voice/dvc-managed/uberduck-multispeaker/sam-lachow/list.txt
#   --durations ../talknet/sam-lachow/durations.pt
#   --f0s       ../talknet/sam-lachow/f0s.pt
#   --rel-path  /Users/zwf/data/voice/dvc-managed/uberduck-multispeaker/sam-lachow
cli_pairs = (
    ("-m", "../talknet/zwf/TalkNetSpect.nemo"),
    # Alternative filelist: "/Users/zwf/data/voice/dvc-managed/zwf/list.txt"
    ("-f", "/Users/zwf/Downloads/allfiles.txt"),
    ("--durations", "../talknet/zwf/durations2.pt"),
    ("--f0s", "../talknet/zwf/f0s2.pt"),
    ("--rel-path", "/Users/zwf/data/voice/dvc-managed/zwf"),
)
args = parse_args([token for pair in cli_pairs for token in pair])
run(args)

[NeMo W 2022-01-10 08:03:05 modelPT:138] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharWithDursF0Dataset
      manifest_filepath: trainfiles.json
      max_duration: null
      min_duration: 0.1
      int_values: false
      load_audio: true
      normalize: false
      sample_rate: 22050
      trim: false
      durs_file: /content/drive/My Drive/talknet/zwf/durations.pt
      f0_file: /content/drive/My Drive/talknet/zwf/f0s.pt
      blanking: true
      vocab:
        notation: phonemes
        punct: true
        spaces: true
        stresses: false
        add_blank_at: last
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 32
      num_workers: 4
    
[NeMo W 2022-01-10 08:03:05 modelPT:145] If you intend to do validation, please

[NeMo I 2022-01-10 08:03:05 features:252] PADDING: 1
[NeMo I 2022-01-10 08:03:05 features:269] STFT using torch
[NeMo I 2022-01-10 08:03:05 modelPT:439] Model TalkNetSpectModel was successfully restored from ../talknet/zwf/TalkNetSpect.nemo.


  1%|█▏                                                                                                                       | 4/392 [00:00<00:08, 45.44it/s]

TOKENS:  tensor([[34, 18,  0,  6, 36, 23,  0, 41,  0, 15, 31,  8, 35,  0, 41,  0,  8, 33,
          0, 23, 35,  0, 41,  0, 18, 39, 14, 28,  9,  0, 27, 12,  3,  0,  1, 34,
          6, 35,  0, 42,  0, 25, 12,  3, 15, 33,  0,  5, 15, 27, 11,  0, 29, 18,
          9, 26, 16, 18,  0, 41,  0,  8, 33,  3, 27,  0, 41,  0,  9, 27, 15, 27,
         14, 18,  0, 41,  0, 12, 25, 23,  0, 27, 12,  3,  0,  4, 31, 12,  0, 11,
         35,  0, 42]])
DURS:  tensor([[10,  1,  0,  1, 11,  1,  1,  1,  0,  1,  0,  2, 19,  2,  0,  1,  0,  1,
          0,  1,  0,  1,  0,  1, 15,  1, 38,  1,  0,  1,  0,  2,  1,  1,  0,  1,
          0,  1,  0,  1, 14,  1, 14,  1,  0,  1,  0,  2,  0,  2,  0,  1, 20,  1,
          0,  1,  0,  2, 12,  3,  1,  1,  0,  1,  0,  1, 11,  3,  0,  1,  0,  1,
          0,  1, 10,  1, 10,  2,  0,  1,  0,  2,  1,  1,  0,  1, 10,  1,  0,  1,
          0,  1,  6,  3,  0,  1,  0,  1,  0,  1,  0,  1,  8,  2,  1,  1,  0,  1,
         20,  1,  0,  1,  0,  1,  0,  1, 19,  1,  0,  1,  0,  1,  0,  




RuntimeError: The size of tensor a (93) must match the size of tensor b (94) at non-singleton dimension 1

In [None]:
import tarfile

# The .nemo checkpoint is opened as a gzipped tarball and unpacked so the raw
# weights file can be loaded directly with torch.load.
# NOTE(review): extractall() trusts the member paths in the archive and
# torch.load() unpickles arbitrary objects -- only use trusted checkpoints.
with tarfile.open(
    os.path.join("../talknet/sam-lachow/", "TalkNetSpect.nemo"), "r:gz"
) as tf:
    tf.extractall("/tmp")
model_weights_path = "/tmp/model_weights.ckpt"
state_dict = torch.load(model_weights_path, map_location="cpu")
# Keep only the entries under the "model." prefix and strip that prefix.
# Slicing removes just the leading occurrence; str.replace would also delete
# "model." inside the key (e.g. "model.submodel.w" -> "subw").
state_dict = {
    k[len("model.") :]: v for k, v in state_dict.items() if k.startswith("model.")
}

In [None]:
# IPython `??` introspection: show the source of the model's force_spectrogram
# method (development aid).
# NOTE(review): at top level `model` is only assigned in a later cell, so this
# appears to rely on out-of-order execution -- confirm before re-running.
model.force_spectrogram??

In [None]:
# Restore the "sam-lachow" TalkNet spectrogram model on the CPU so it can be
# inspected interactively. The import repeats the one in the export cell, which
# lets this cell run without executing that cell first.
from nemo.collections.tts.models import TalkNetSpectModel

model = TalkNetSpectModel.restore_from(
    "../talknet/sam-lachow/TalkNetSpect.nemo", map_location="cpu"
)

[NeMo W 2022-01-09 22:54:24 modelPT:138] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.asr.data.audio_to_text.AudioToCharWithDursF0Dataset
      manifest_filepath: trainfiles.json
      max_duration: null
      min_duration: 0.1
      int_values: false
      load_audio: true
      normalize: false
      sample_rate: 22050
      trim: false
      durs_file: /content/drive/My Drive/talknet/sam-lachow/durations.pt
      f0_file: /content/drive/My Drive/talknet/sam-lachow/f0s.pt
      blanking: true
      vocab:
        notation: phonemes
        punct: true
        spaces: true
        stresses: false
        add_blank_at: last
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 32
      num_workers: 4
    
[NeMo W 2022-01-09 22:54:24 modelPT:145] If you intend to do vali

[NeMo I 2022-01-09 22:54:24 features:252] PADDING: 1
[NeMo I 2022-01-09 22:54:24 features:269] STFT using torch
[NeMo I 2022-01-09 22:54:24 modelPT:439] Model TalkNetSpectModel was successfully restored from ../talknet/sam-lachow/TalkNetSpect.nemo.


In [None]:
# IPython `??` introspection: show the source of force_spectrogram as defined
# on the TalkNetSpectModel class (development aid).
TalkNetSpectModel.force_spectrogram??

In [None]:
# Look at the stored durations for the entry keyed "136": print the whole
# record first, then the shape of each of its two duration tensors.
durs = torch.load("../talknet/zwf/durations.pt")["136"]
print(durs)
for field in ("blanks", "tokens"):
    print(durs[field].shape)

{'blanks': tensor([10,  0,  0, 16,  0,  0, 16,  0,  0,  0,  0,  0,  5,  1,  0,  7,  0,  0,
         0,  7,  0,  0, 11,  0,  0,  0, 12,  0,  0,  0, 15,  0,  0, 15,  0,  0,
         1,  0,  0,  0,  8,  0,  0, 11,  0,  0,  0, 17,  0,  0,  0,  0,  0, 28,
         0,  0,  2,  0, 10,  0,  0, 17,  0,  0,  0,  0,  0, 11,  1,  0, 13,  0,
         0,  0, 12,  0,  0,  1,  9,  0,  0, 19,  0,  0,  0, 14,  0,  0,  0,  0,
        20,  0,  0,  0,  0, 12,  0,  0,  0,  0, 18,  0,  0,  0,  0, 13,  0,  0,
         0, 11,  0,  0, 19,  0,  0]), 'tokens': tensor([1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1,
        1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1, 1, 1, 2, 3, 1, 1, 2, 1, 1, 1, 2,
        2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 3, 1,
        1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 2,
        1, 2, 1, 1, 3, 1, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, 1])}
torch.Size([115])
torch.Size([114])


In [None]:
# Shape of the f0 entry stored under key "136" in the f0s file.
torch.load("../talknet/zwf/f0s.pt")["136"].shape

In [None]:
# Show the model's blanking flag; run() builds the blank-interleaved token
# sequence only when this is true.
model.blanking

True

In [None]:
# Show the vocabulary's id-to-label table; run() indexes it by token id to
# print readable labels.
model.vocab._id2label

[' ',
 'B',
 'CH',
 'D',
 'DH',
 'F',
 'G',
 'HH',
 'JH',
 'K',
 'L',
 'M',
 'N',
 'NG',
 'P',
 'R',
 'S',
 'SH',
 'T',
 'TH',
 'V',
 'W',
 'Y',
 'Z',
 'ZH',
 'AA',
 'AE',
 'AH',
 'AO',
 'AW',
 'AY',
 'EH',
 'ER',
 'EY',
 'IH',
 'IY',
 'OW',
 'OY',
 'UH',
 'UW',
 "'",
 ',',
 '.',
 '!',
 '?',
 '-',
 ':',
 ';',
 '/',
 '"',
 '(',
 ')',
 '[',
 ']',
 '{',
 '}',
 '<pad>',
 '<oov>',
 '<blank>']