In [1]:
import json
import numpy as np
import torch
import soundfile as sf
import librosa

from pathlib import Path

from nemo.collections.tts.torch.helpers import BetaBinomialInterpolator

[NeMo W 2022-05-09 01:52:59 experimental:27] Module <class 'nemo.collections.nlp.data.language_modeling.megatron.megatron_batch_samplers.MegatronPretrainingRandomBatchSampler'> is experimental, not ready for production and is not fully supported. Use at your own risk.


In [16]:
generate_mels = "PREDICTED" # one of "PREDICTED" (mels produced by the FastPitch model) or "ORIGINAL" (mels computed from the audio with librosa)
dataset_part = "train" # one of "train" or "val"; selects which <dataset_part>_manifest.json is processed

In [3]:
# The FastPitch acoustic model is only needed when the mels are predicted;
# in "ORIGINAL" mode nothing is loaded and `spec_model` stays undefined.
if generate_mels == "PREDICTED":
    from nemo.collections.tts.models import FastPitchModel

    spec_model = FastPitchModel.load_from_checkpoint(
        "/akshita/R3priorckpts/FastPitch--v_loss=0.6679-epoch=149.ckpt"
    )
    # Switch to evaluation mode, then move the model to the GPU.
    spec_model.eval()
    spec_model.cuda()

[NeMo I 2022-05-09 01:53:04 tokenize_and_classify:93] Creating ClassifyFst grammars.


[NeMo W 2022-05-09 01:53:37 g2ps:84] apply_to_oov_word=None, it means that some of words will remain unchanged if they are not handled by one of rule in self.parse_one_word(). It is useful when you use tokenizer with set of phonemes and chars together, otherwise it can be not.
[NeMo W 2022-05-09 01:53:37 modelPT:148] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: /Data/openslr-95-german-neutral-tts/thorsten-de/train_manifest.json
      sample_rate: 22050
      sup_data_path: /Data/openslr-95-german-neutral-tts/thorsten-de/sup_data
      sup_data_types:
      - align_prior_matrix
      - pitch
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
      lowfreq: 0
      highfreq: 8000
      max_duration: null

[NeMo I 2022-05-09 01:53:38 features:259] PADDING: 1
[NeMo I 2022-05-09 01:53:38 features:276] STFT using torch


In [17]:
def infer(spec_gen_model, vocoder_model, str_input, speaker=None):
    """
    Turn a text string into a spectrogram and a waveform.

    The text is parsed by the spectrogram generator, the resulting tokens are
    converted to a spectrogram, and the vocoder converts that spectrogram to
    audio. Everything runs under ``torch.no_grad()``.

    Args:
        spec_gen_model: Spectrogram generator model (FastPitch in our case)
        vocoder_model: Vocoder model (HiFiGAN in our case)
        str_input: Text input for the synthesis
        speaker: Optional speaker ID; when given it is wrapped in a long
            tensor on the generator's device, otherwise ``None`` is passed on.

    Returns:
        Tuple ``(spectrogram, audio)``. Tensors are moved to the CPU and
        converted to numpy arrays; a 3-dimensional spectrogram is reduced to
        its first element.
    """
    def _as_numpy(value):
        # Only torch tensors are converted; anything else is passed through.
        if isinstance(value, torch.Tensor):
            return value.to('cpu').numpy()
        return value

    speaker_arg = speaker
    with torch.no_grad():
        tokens = spec_gen_model.parse(str_input)
        if speaker_arg is not None:
            speaker_arg = torch.tensor([speaker_arg]).long().to(device=spec_gen_model.device)
        mel = spec_gen_model.generate_spectrogram(tokens=tokens, speaker=speaker_arg)
        waveform = vocoder_model.convert_spectrogram_to_audio(spec=mel)

    if mel is not None:
        mel = _as_numpy(mel)
        # Drop the leading (batch) axis when one is present.
        if len(mel.shape) == 3:
            mel = mel[0]
    return mel, _as_numpy(waveform)

def get_best_ckpt_from_last_run(
        base_dir, 
        new_speaker_id, 
        duration_mins, 
        mixing_enabled, 
        original_speaker_id, 
        model_name="FastPitch"
    ):
    """
    Locate the ``*-last.ckpt`` checkpoint of the most recent experiment run.

    The run directory is
    ``<base_dir>/<original>_to_<new>_<mixing|no_mixing>_<duration>_mins/<model_name>``.
    Its sub-directories are sorted by path and the last one is treated as the
    most recent experiment. Note that, despite the function name, the
    checkpoint returned is the one matching ``*-last.ckpt``.

    Args:
        base_dir: Root directory that contains the run directories.
        new_speaker_id: ID of the target speaker (part of the run name).
        duration_mins: Duration in minutes (part of the run name).
        mixing_enabled: Selects the ``mixing`` / ``no_mixing`` run-name part.
        original_speaker_id: ID of the source speaker (part of the run name).
        model_name: Sub-directory of the run holding the experiments.

    Returns:
        Path of the checkpoint as a string.

    Raises:
        ValueError: If the model directory contains no experiment directory,
            or the last experiment has no ``*-last.ckpt`` file.
    """
    mixing = "mixing" if mixing_enabled else "no_mixing"
    run_name = f"{original_speaker_id}_to_{new_speaker_id}_{mixing}_{duration_mins}_mins"
    model_dir = Path(base_dir) / run_name / model_name

    exp_dirs = sorted(entry for entry in model_dir.iterdir() if entry.is_dir())
    if not exp_dirs:
        # Without this guard the [-1] lookup fails with an uninformative IndexError.
        raise ValueError(f"There is no experiment directory in {model_dir}.")

    last_checkpoint_dir = exp_dirs[-1] / "checkpoints"

    # glob() yields entries in arbitrary order; sort so the choice is deterministic.
    last_ckpt = sorted(last_checkpoint_dir.glob('*-last.ckpt'))
    if not last_ckpt:
        raise ValueError(f"There is no last checkpoint in {last_checkpoint_dir}.")

    return str(last_ckpt[0])

In [18]:
def load_wav(audio_file):
    """Read an audio file as float32 samples and return the transposed array."""
    with sf.SoundFile(audio_file, 'r') as wav_handle:
        data = wav_handle.read(dtype='float32')
    # .T is the same operation as .transpose() without arguments.
    return data.T

In [19]:
# Load the manifest of the selected split (`dataset_part`): one JSON record per line.
manifest_path = "/Data/openslr-95-german-neutral-tts/thorsten-de/"+dataset_part+"_manifest.json"
records = []
# The transcripts contain non-ASCII characters (e.g. "ß"), so decode explicitly
# instead of relying on the platform default encoding.
# NOTE(review): assumes the manifest is UTF-8 encoded — confirm.
with open(manifest_path, "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the file).
        if line.strip():
            records.append(json.loads(line))

In [20]:
beta_binomial_interpolator = BetaBinomialInterpolator()

# The mel files and the HiFi-GAN manifest are written below a mode-specific root.
if generate_mels == "PREDICTED":
    spec_model.eval()
    device = spec_model.device
    output_root = Path("/Data/openslr-95-german-neutral-tts/synmels")
else:
    output_root = Path("/Data/openslr-95-german-neutral-tts/orig_synmels")

save_dir = output_root / dataset_part
save_dir.mkdir(exist_ok=True, parents=True)

# Report the mode once instead of once per record, which floods the cell output.
if generate_mels == "PREDICTED":
    print("loading predicted melspecs")
else:
    print("loading original melspecs")

# Generate a spectrograms (we need to use ground truth alignment for correct matching between audio and mels)
for i, r in enumerate(records):

    if generate_mels == "PREDICTED":
        audio = load_wav(r["audio_filepath"])

        audio = torch.from_numpy(audio).unsqueeze(0).to(device)
        audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)

        # Again, our finetuned FastPitch model doesn't use multiple speakers,
        # but we keep the code to support it here for reference
        if spec_model.fastpitch.speaker_emb is not None and "speaker" in r:
            speaker = torch.tensor([r['speaker']]).to(device)
        else:
            speaker = None

        with torch.no_grad():
            if "normalized_text" in r:
                text = spec_model.parse(r["normalized_text"], normalize=False)
            else:
                text = spec_model.parse(r['text'])

            text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)

            spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)

            # Generate attention prior and spectrogram inputs for HiFi-GAN
            attn_prior = torch.from_numpy(
                beta_binomial_interpolator(spect_len.item(), text_len.item())
            ).unsqueeze(0).to(text.device)

            spectrogram = spec_model.forward(
                text=text,
                input_lens=text_len,
                spec=spect,
                mel_lens=spect_len,
                attn_prior=attn_prior,
                speaker=speaker,
            )[0]
        mel = spectrogram[0].to('cpu').numpy()

    else:
        # original melspecs
        y, sr = librosa.load(r["audio_filepath"], sr=22050)
        # Build the 80-band, 0-8000 Hz mel directly. Slicing the first 80 rows out
        # of librosa's default 128-band full-bandwidth mel yields different filters
        # and does not match the logged dataset config (n_mels 80, lowfreq 0,
        # highfreq 8000).
        # NOTE(review): librosa defaults to a power (magnitude**2) spectrogram —
        # confirm this matches the features the vocoder is trained on.
        mel_linear = librosa.feature.melspectrogram(
            y=y,
            sr=sr,
            n_fft=1024,
            win_length=1024,
            hop_length=256,
            n_mels=80,
            fmin=0,
            fmax=8000,
        )
        # Guard against log(0) = -inf on silent frames.
        # NOTE(review): 2**-24 is meant to mirror NeMo's additive log-zero guard —
        # confirm against the model's preprocessor config.
        spectrogram = np.log(mel_linear + 2 ** -24)
        mel = spectrogram

    # Save the mel and record its location so the manifest below can reference it.
    save_path = save_dir / f"mel_{i}.npy"
    np.save(save_path, mel)
    r["mel_filepath"] = str(save_path)


# Kept as a string, as before, for any later cell that uses it.
hifigan_manifest_path = str(output_root / f"hifigan_{dataset_part}_ft.json")

with open(hifigan_manifest_path, "w", encoding="utf-8") as f:
    for r in records:
        f.write(json.dumps(r) + '\n')
# To produce the validation manifest, set dataset_part = "val" and re-run from the config cell.

[NeMo W 2022-05-09 02:02:16 tts_tokenizers:351] Text: [das KAY1ND HHAE1T neununddreißig komma acht GRAE1D FIY1BER0.] contains unknown char/phoneme: [ß]. Original text: [Das Kind hat neununddreißig Komma acht Grad Fieber.]. Symbol will be skipped.
[NeMo W 2022-05-09 02:02:16 tts_tokenizers:351] Text: [AY1N medizinmann, DER1 blutsbruderschaft gutheißTIY1, HHAE1T VAO1N medizin keine ahnung.] contains unknown char/phoneme: [ß]. Original text: [Ein Medizinmann, der Blutsbruderschaft gutheißt, hat von Medizin keine Ahnung.]. Symbol will be skipped.
[NeMo W 2022-05-09 02:02:17 tts_tokenizers:351] Text: [MIY1N NEY1M ist MAA1RLAH0N AH1ND IH1CH grußIY1 alle, DAY1 MIH1CH kennen.] contains unknown char/phoneme: [ß]. Original text: [Mein Name ist Marlon und ich gruße alle, die mich kennen.]. Symbol will be skipped.
[NeMo W 2022-05-09 02:02:18 tts_tokenizers:351] Text: [funfundsiebzig pferdestarken MAE1KAH0N auf DER1 AO1TOW0BAA2N einfach keinen SPAA1ß.] contains unknown char/phoneme: [ß]. Original t