In [1]:
import json
import numpy as np
import torch
import soundfile as sf
import time
import random
import librosa
import os
from pathlib import Path

from nemo.collections.tts.torch.helpers import BetaBinomialInterpolator
from nemo.collections.tts.models.base import SpectrogramGenerator
from nemo.collections.tts.models import FastPitchModel, MixerTTSModel
from nemo.collections.asr.parts.preprocessing.segment import AudioSegment

[NeMo W 2022-07-18 10:05:29 optimizers:55] Apex was not found. Using the lamb or fused_adam optimizer will error out.
[NeMo W 2022-07-18 10:05:29 experimental:27] Module <class 'nemo.collections.tts.torch.tts_tokenizers.IPATokenizer'> is experimental, not ready for production and is not fully supported. Use at your own risk.


In [2]:
def json_reader(filename):
    """Lazily yield one parsed JSON object per line of a JSON-lines manifest.

    Args:
        filename: Path to a text file holding one JSON document per line.

    Yields:
        The decoded Python object for each non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # machine's locale encoding.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of crashing inside json.loads.
            if not line.strip():
                continue
            yield json.loads(line)

# Materialise both manifests as lists of dicts so entries can be indexed and
# updated in place later (a "mel_filepath" key is added to each entry below).
train = list(json_reader('/home/chsieh/manifest/mtts-dataset/train-100.json'))
val = list(json_reader('/home/chsieh/manifest/mtts-dataset/val-100.json'))
# Sanity check: number of entries in each split.
print(len(train), len(val))

25364 512


In [3]:
def load_wav(audio_file, target_sr=None):
    """Read an audio file as float32 samples, optionally resampling it.

    Args:
        audio_file: Path (or file-like object) accepted by ``soundfile.SoundFile``.
        target_sr: Desired sample rate in Hz. ``None`` keeps the file's own rate.

    Returns:
        A float32 numpy array with time as the last axis: 1-D for mono input,
        ``(channels, frames)`` for multi-channel input.
    """
    with sf.SoundFile(audio_file, 'r') as f:
        samples = f.read(dtype='float32')
        sample_rate = f.samplerate
    # soundfile returns multi-channel audio as (frames, channels). Put time on
    # the last axis *before* resampling: librosa resamples along the last axis,
    # so resampling the untransposed array would resample across channels.
    # For mono (1-D) input the transpose is a no-op.
    samples = samples.transpose()
    if target_sr is not None and sample_rate != target_sr:
        samples = librosa.core.resample(samples, orig_sr=sample_rate, target_sr=target_sr)
    return samples


# Earlier checkpoint (the "emb-all" run), kept for reference:
# ckpt_file = '/home/chsieh/chengping-ws/fastpitch/completed/emb-all/FastPitch/2022-07-08_07-44-15/checkpoints/FastPitch--v_loss=0.8746-epoch=199.ckpt'
ckpt_file = '/home/chsieh/chengping-ws/fastpitch/completed/emb-all-cln/FastPitch/2022-07-13_21-39-05/checkpoints/FastPitch--v_loss=0.8200-epoch=479.ckpt'
spec_model = FastPitchModel.load_from_checkpoint(ckpt_file)
# Prefer the first GPU, but fall back to CPU so the notebook still runs on a
# machine without CUDA instead of raising here. Later cells read the device
# back from `spec_model.device`.
spec_model.to('cuda:0' if torch.cuda.is_available() else 'cpu')
# Inference only: switch to eval mode. As the last expression of the cell this
# also displays the model's repr.
spec_model.eval()

[NeMo I 2022-07-18 10:05:35 tokenize_and_classify:87] Creating ClassifyFst grammars.


[NeMo W 2022-07-18 10:05:58 experimental:27] Module <class 'nemo.collections.tts.torch.g2ps.IPAG2P'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2022-07-18 10:05:58 g2ps:86] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2022-07-18 10:05:58 modelPT:149] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: /ws/mtts-dataset/train-100.json
      sample_rate: 22050
      sup_data_path: /raid/speech_sup/fastpitch_sup_data_folder
      sup_data_types:
      - align_prior_matrix
  

[NeMo I 2022-07-18 10:05:58 features:200] PADDING: 1


FastPitchModel(
  (mel_loss): MelLoss()
  (pitch_loss): PitchLoss()
  (duration_loss): DurationLoss()
  (aligner): AlignmentEncoder(
    (softmax): Softmax(dim=3)
    (log_softmax): LogSoftmax(dim=3)
    (key_proj): Sequential(
      (0): ConvNorm(
        (conv): Conv1d(384, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      )
      (1): ReLU()
      (2): ConvNorm(
        (conv): Conv1d(768, 80, kernel_size=(1,), stride=(1,))
      )
    )
    (query_proj): Sequential(
      (0): ConvNorm(
        (conv): Conv1d(80, 160, kernel_size=(3,), stride=(1,), padding=(1,))
      )
      (1): ReLU()
      (2): ConvNorm(
        (conv): Conv1d(160, 80, kernel_size=(1,), stride=(1,))
      )
      (3): ReLU()
      (4): ConvNorm(
        (conv): Conv1d(80, 80, kernel_size=(1,), stride=(1,))
      )
    )
  )
  (forward_sum_loss): ForwardSumLoss(
    (log_softmax): LogSoftmax(dim=3)
    (ctc_loss): CTCLoss()
  )
  (bin_loss): BinLoss()
  (preprocessor): AudioToMelSpectrogramPreprocessor(
  

In [4]:
# Reuse whatever device the spectrogram model lives on for every tensor built
# in the extraction loops, and create the attention-prior helper once.
device = spec_model.device
beta_binomial_interpolator = BetaBinomialInterpolator()
print(f"device: {device}")

device: cuda:0


In [7]:
# Name of this mel set; later cells interpolate it into the mel output
# directory and into the manifest directories.
save_name = 'Libritts-FP-embadd-cln-mels-22k'

In [9]:
# Directory that receives one .npy mel file per utterance; created (with any
# missing parents) if it does not exist yet.
save_dir = Path(f"/home/chsieh/{save_name}/")
save_dir.mkdir(exist_ok=True, parents=True)

In [10]:
# Alias (not a copy) of the training manifest: entries updated through
# `cur_manifest` are the same dict objects held in `train`.
cur_manifest = train

In [11]:
# For every training utterance, run FastPitch with the ground-truth audio's
# spectrogram and an attention prior as inputs, save the model's output
# spectrogram to disk, and record its path on the manifest entry.
# `train` is iterated directly (rather than through the mutable `cur_manifest`
# alias) so re-running this cell always processes the training split.
start_time = time.time()
for i, r in enumerate(train):
    save_path = save_dir / f"mel_train_{i}.npy"

    # Resume support: a mel that is already on disk is reused as-is.
    if os.path.exists(save_path):
        r["mel_filepath"] = str(save_path)
        continue

    # 22050 Hz matches the sample_rate in the model's training config.
    audio = load_wav(r["audio_filepath"], 22050)
    audio = torch.from_numpy(audio).unsqueeze(0).to(device)
    # NOTE(review): audio.shape[1] is the sample count only for mono audio
    # (audio is then (1, samples)) - confirm the dataset is mono.
    audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)

    # Pass a speaker id only when the model has a speaker embedding and the
    # manifest entry provides one; otherwise run without speaker conditioning.
    if spec_model.fastpitch.speaker_emb is not None and "speaker" in r:
        speaker = torch.tensor([r['speaker']]).to(device)
    else:
        speaker = None

    with torch.no_grad():
        # Prefer pre-normalized text when the manifest has it, and skip the
        # model's own normalization in that case.
        if "normalized_text" in r:
            text = spec_model.parse(r["normalized_text"], normalize=False)
        else:
            text = spec_model.parse(r['text'])

        text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)

        spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)

        # Generate attention prior and spectrogram inputs for HiFi-GAN
        attn_prior = torch.from_numpy(
            beta_binomial_interpolator(spect_len.item(), text_len.item())
        ).unsqueeze(0).to(text.device)

        spectrogram = spec_model.forward(
            text=text,
            input_lens=text_len,
            spec=spect,
            mel_lens=spect_len,
            attn_prior=attn_prior,
            speaker=speaker,
        )[0]

        mel = spectrogram[0].to('cpu').numpy()

    # Write to a temporary file and rename it into place. The resume check
    # above trusts any existing file, so an interrupted run must never leave a
    # truncated .npy under the final name. Saving through a file handle also
    # stops np.save from appending ".npy" to the temporary name.
    tmp_path = save_path.with_name(save_path.name + ".tmp")
    with open(tmp_path, "wb") as tmp_file:
        np.save(tmp_file, mel)
    os.replace(tmp_path, save_path)
    r["mel_filepath"] = str(save_path)

    if i % 500 == 0:
        tm = time.time() - start_time
        print("done {} in {} sec".format(i, tm))

done 0 in 1.722810983657837 sec
done 500 in 74.01316928863525 sec
done 1000 in 139.46378183364868 sec
done 1500 in 203.0050597190857 sec
done 2000 in 266.296373128891 sec
done 2500 in 326.1671657562256 sec
done 3000 in 386.45816016197205 sec
done 3500 in 447.59197425842285 sec
done 4000 in 505.961065530777 sec
done 4500 in 566.7194142341614 sec
done 5000 in 626.1484844684601 sec
done 5500 in 683.803308725357 sec
done 6000 in 743.7966949939728 sec
done 6500 in 808.656904220581 sec
done 7000 in 870.1506638526917 sec
done 7500 in 930.0228970050812 sec
done 8000 in 987.2079255580902 sec
done 8500 in 1046.0684022903442 sec
done 9000 in 1107.9700260162354 sec
done 9500 in 1172.7521977424622 sec
done 10000 in 1232.4664885997772 sec
done 10500 in 1293.0866334438324 sec
done 11000 in 1351.3352448940277 sec
done 18000 in 2167.26926612854 sec
done 18500 in 2223.825672388077 sec
done 19000 in 2281.038801431656 sec
done 19500 in 2339.4216430187225 sec
done 20000 in 2396.7035009860992 sec
done 20500

In [13]:
# Write the training manifest (now carrying a "mel_filepath" per entry) as
# JSON lines for HiFi-GAN fine-tuning.
hifigan_manifest_path = f"/home/chsieh/manifest/mel-dataset/{save_name}"
os.makedirs(hifigan_manifest_path, exist_ok=True)

# Iterate `train` explicitly rather than `cur_manifest`: that alias is later
# re-pointed at the validation split, and re-running this cell afterwards
# would otherwise write validation entries into hifigan_train.json.
with open(os.path.join(hifigan_manifest_path, 'hifigan_train.json'), "w") as f:
    for r in train:
        f.write(json.dumps(r) + '\n')

In [15]:
# Same extraction as for the training split, applied to the validation split:
# run FastPitch with the ground-truth audio's spectrogram and an attention
# prior as inputs, save the output spectrogram, and record its path on the
# manifest entry. Afterwards write the updated entries as hifigan_val.json.
cur_manifest = val  # kept so the alias still reflects the split processed last
start_time = time.time()
for i, r in enumerate(val):
    save_path = save_dir / f"mel_val_{i}.npy"

    # Resume support: a mel that is already on disk is reused as-is.
    if os.path.exists(save_path):
        r["mel_filepath"] = str(save_path)
        continue

    # 22050 Hz matches the sample_rate in the model's training config.
    audio = load_wav(r["audio_filepath"], 22050)
    audio = torch.from_numpy(audio).unsqueeze(0).to(device)
    # NOTE(review): audio.shape[1] is the sample count only for mono audio
    # (audio is then (1, samples)) - confirm the dataset is mono.
    audio_len = torch.tensor(audio.shape[1], dtype=torch.long, device=device).unsqueeze(0)

    # Pass a speaker id only when the model has a speaker embedding and the
    # manifest entry provides one; otherwise run without speaker conditioning.
    if spec_model.fastpitch.speaker_emb is not None and "speaker" in r:
        speaker = torch.tensor([r['speaker']]).to(device)
    else:
        speaker = None

    with torch.no_grad():
        # Prefer pre-normalized text when the manifest has it, and skip the
        # model's own normalization in that case.
        if "normalized_text" in r:
            text = spec_model.parse(r["normalized_text"], normalize=False)
        else:
            text = spec_model.parse(r['text'])

        text_len = torch.tensor(text.shape[-1], dtype=torch.long, device=device).unsqueeze(0)

        spect, spect_len = spec_model.preprocessor(input_signal=audio, length=audio_len)

        # Generate attention prior and spectrogram inputs for HiFi-GAN
        attn_prior = torch.from_numpy(
            beta_binomial_interpolator(spect_len.item(), text_len.item())
        ).unsqueeze(0).to(text.device)

        spectrogram = spec_model.forward(
            text=text,
            input_lens=text_len,
            spec=spect,
            mel_lens=spect_len,
            attn_prior=attn_prior,
            speaker=speaker,
        )[0]

        mel = spectrogram[0].to('cpu').numpy()

    # Write to a temporary file and rename it into place. The resume check
    # above trusts any existing file, so an interrupted run must never leave a
    # truncated .npy under the final name. Saving through a file handle also
    # stops np.save from appending ".npy" to the temporary name.
    tmp_path = save_path.with_name(save_path.name + ".tmp")
    with open(tmp_path, "wb") as tmp_file:
        np.save(tmp_file, mel)
    os.replace(tmp_path, save_path)
    r["mel_filepath"] = str(save_path)

    if i % 500 == 0:
        tm = time.time() - start_time
        print("done {} in {} sec".format(i, tm))

hifigan_manifest_path = f"/home/chsieh/manifest/mel-dataset/{save_name}"
os.makedirs(hifigan_manifest_path, exist_ok=True)

with open(os.path.join(hifigan_manifest_path, 'hifigan_val.json'), "w") as f:
    for r in val:
        f.write(json.dumps(r) + '\n')

In [16]:
import os
import json
import fnmatch

In [None]:
# NOTE: this re-points `hifigan_manifest_path` from the /home/chsieh/manifest
# location used by the earlier cells to the chengping-ws workspace location,
# and makes sure that directory exists.
hifigan_manifest_path = f"/home/chsieh/chengping-ws/mel-dataset/{save_name}"
os.makedirs(hifigan_manifest_path, exist_ok=True)

In [44]:
# `dataset_path` is the training manifest written by the extraction cells;
# `dataset_path_ws` is a second copy under the chengping-ws workspace that
# receives the /raid-style paths further below.
dataset_path = f'/home/chsieh/manifest/mel-dataset/{save_name}/hifigan_train.json'
dataset_path_ws = f'/home/chsieh/chengping-ws/mel-dataset/{save_name}/hifigan_train.json'

In [40]:
# Reload the manifest as a list of dicts. Each line is a JSON document, so
# parse it with json.loads: eval() would execute arbitrary code found in the
# file and fails on the JSON literals true/false/null. Blank lines are skipped.
with open(dataset_path) as f:
    dataset = [json.loads(line) for line in f if line.strip()]

In [41]:
# Inspect the first manifest entry to check its keys and path prefixes.
dataset[0]

{'audio_filepath': '/data/speech/LibriTTS/LibriTTS/train-clean-360/8410/278217/8410_278217_000100_000004.wav',
 'duration': 3.730083,
 'text': 'I have not yet recovered from the shock of that horrible business at Winchester.',
 'speaker': 32,
 'old_speaker_id': 8410,
 'mel_filepath': '/data/speech/Libritts-FP-embadd-cln-mels-22k/mel_train_0.npy'}

In [42]:
# Re-root the manifest for the /data/speech layout: audio paths move from
# /raid/speech, mel paths move from the home directory. Entries are edited in
# place, then the manifest at `dataset_path` is overwritten with the result.
for entry in dataset:
    relocated_audio = entry['audio_filepath'].replace('/raid/speech', '/data/speech')
    entry['audio_filepath'] = relocated_audio
    relocated_mel = entry['mel_filepath'].replace('/home/chsieh', '/data/speech')
    entry['mel_filepath'] = relocated_mel

with open(dataset_path, 'w') as manifest_out:
    manifest_out.writelines(json.dumps(entry) + '\n' for entry in dataset)

In [45]:
# Produce the /raid variant of the manifest: audio goes back under
# /raid/speech and mels go under /raid/speech_mel. This operates on the
# entries as they currently are in memory (edited in place again) and writes
# them to the workspace copy at `dataset_path_ws`.
for entry in dataset:
    relocated_audio = entry['audio_filepath'].replace('/data/speech', '/raid/speech')
    entry['audio_filepath'] = relocated_audio
    relocated_mel = entry['mel_filepath'].replace('/data/speech', '/raid/speech_mel')
    entry['mel_filepath'] = relocated_mel

with open(dataset_path_ws, 'w') as manifest_out:
    manifest_out.writelines(json.dumps(entry) + '\n' for entry in dataset)