In [1]:
# Expose only GPU index 3 to this kernel; this cell runs before torch is imported in the next cell.
%env CUDA_VISIBLE_DEVICES=3

env: CUDA_VISIBLE_DEVICES=3


In [2]:
from pathlib import Path
import librosa
import torch
import resampy
import numpy as np
import os
import soundfile as sf

from data_objects.kaldi_interface import KaldiInterface

from utils.load_yaml import HpsYaml
from src.audio_utils import MAX_WAV_VALUE, load_wav, mel_spectrogram, normalize

from speaker_encoder.audio import preprocess_wav

from src.transformer_vqbnf_translate import Transformer as Translator
from src.bnftocode import Quantizer

from speaker_encoder.voice_encoder import SpeakerEncoder

from synthesizer.src.transformer_bnftomel_prosody_ecapa import Transformer as Synthesizer

from vocoder.hifigan_model import load_hifigan_generator

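## Load models

Restore the BNF translator, the BNF quantizer, the BNF-to-mel synthesizer, the speaker encoder and the HiFi-GAN vocoder.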

In [3]:
def build_model(model_class, model_config, model_file, device):
    """Instantiate a network, restore its weights and put it in eval mode.

    Args:
        model_class: Callable that builds the network from the ``"model"``
            entry of ``model_config``.
        model_config: Subscriptable config; only ``model_config["model"]``
            is read here.
        model_file: Checkpoint path handed to ``torch.load``. The loaded
            object must hold the weights under the ``"model"`` key.
        device: Device the network is moved to and onto which the
            checkpoint tensors are mapped.

    Returns:
        The restored network, switched to evaluation mode.
    """
    network = model_class(model_config["model"]).to(device)

    # Map the stored tensors straight onto the requested device.
    checkpoint = torch.load(model_file, map_location=device)
    state_dict = checkpoint["model"]
    network.load_state_dict(state_dict)

    network.eval()
    return network

In [4]:
# Device every model in this cell is placed on.
device = 'cuda'

# --- Disabled: mel -> BNF acoustic model -------------------------------------
# NOTE(review): `AcousticModel` is not imported anywhere in this notebook, so
# this stage cannot be re-enabled without adding that import.
# # mel2bnf
# mel2bnf_model_train_config = Path('/mnt/nvme-data1/waris/repo/accent_conversion/acoustic_model/config/am_config.yaml')
# mel2bnf_config = HpsYaml(mel2bnf_model_train_config) 
# mel2bnf_model_file = Path('/mnt/nvme-data1/waris/model_checkpoints/acoustic_model/acoustic_model/best_loss_step_910000.pth')
# mel2bnf_model = build_model(AcousticModel, mel2bnf_config, mel2bnf_model_file, device)

# --- Disabled: earlier translator loaded from a directory --------------------
# # bnf2bnf
# syn_dir_trans = Path("/mnt/data1/waris/model_outputs/translator/sythesizer_like_train_set/logs-translator_train/taco_pretrained")
# translator = Translator(syn_dir_trans)

# --- Disabled: accent encoder (`encoder_accent` is not defined in this notebook)
# encoder_accent_weights = Path("/home/grads/q/quamer.waris/projects/Accentron/pretrained_model/pretrained/encoder/saved_models/encoder_accent.pt")
# encoder_accent.load_model(encoder_accent_weights)

# bnf2bnf: BNF translator. It is loaded here, but its only call site in
# `convert` is currently commented out.
ppg2ppg_model_train_config = Path('/mnt/nvme-data1/waris/repo/vq-bnf-translator/conf/translator_vq128.yaml')
ppg2ppg_config = HpsYaml(ppg2ppg_model_train_config) 
ppg2ppg_model_file = Path('/mnt/nvme-data1/waris/model_checkpoints/translator_vq/translator-vq128/best_loss_step_200000.pth')
ppg2ppg_model = build_model(Translator, ppg2ppg_config, ppg2ppg_model_file, device) 

# bnfQuantize: quantizer used by `translate2code`.
vq_train_config = Path('/mnt/data1/waris/repo/vq-bnf/conf/vq_128.yaml')
bnf2code_config = HpsYaml(vq_train_config) 
bnf2code_model_file = Path('/mnt/data1/waris/repo/vq-bnf/ckpt/vq128/loss_step_100000.pth')
bnf2code_model = build_model(Quantizer, bnf2code_config, bnf2code_model_file, device)

# bnf2mel: synthesizer that `convert` calls to predict mel frames.
ppg2mel_model_train_config = Path('/mnt/data1/waris/repo/vc-vq-subset/conf/transformer_vc_vq128_prosody_ecapa.yaml')
ppg2mel_config = HpsYaml(ppg2mel_model_train_config) 
ppg2mel_model_file = Path('/mnt/nvme-data1/waris/model_checkpoints/vc-vq/transformer-vc-vq128-all-prosody-ecapa/best_loss_step_940000.pth')
ppg2mel_model = build_model(Synthesizer, ppg2mel_config, ppg2mel_model_file, device) 

# Speaker encoder used by `compute_spk_dvec`. The weights path is relative, so
# it resolves against the kernel's working directory.
# NOTE(review): unlike the other models, no device is passed here — confirm
# where SpeakerEncoder places its weights.
weights_fpath = "speaker_encoder/ckpt/pretrained_bak_5805000.pt"
encoder = SpeakerEncoder(weights_fpath)

# mel2wav: HiFi-GAN generator that `convert` uses to turn mel frames into audio.
hifigan_model = load_hifigan_generator(device)

In [5]:

def compute_spk_dvec(wav_path):
    """Embed one utterance with the notebook-level speaker ``encoder``.

    Args:
        wav_path: Location of the audio file; it is wrapped in ``Path``
            before being handed to ``preprocess_wav``.

    Returns:
        Whatever ``encoder.embed_utterance`` produces for the preprocessed
        waveform (the callers treat it as the speaker embedding).
    """
    waveform = preprocess_wav(Path(wav_path))
    return encoder.embed_utterance(waveform)

def compute_mel(wav_path):
    """Load a wav file and compute its 80-bin mel spectrogram at 24 kHz.

    Args:
        wav_path: Path of the audio file, read through ``load_wav``.

    Returns:
        A ``(mel, n_samples)`` tuple: ``mel`` is the transposed numpy output
        of ``mel_spectrogram`` and ``n_samples`` is the length of the audio
        as loaded, i.e. measured before any resampling.
    """
    target_sr = 24000

    samples, native_sr = load_wav(wav_path)
    n_samples = len(samples)  # taken before resampling on purpose: mirrors the loaded file

    if native_sr != target_sr:
        samples = resampy.resample(samples, native_sr, target_sr)

    # Scale by the integer full-scale value, then normalise and leave 5% headroom.
    samples = normalize(samples / MAX_WAV_VALUE) * 0.95
    batch = torch.FloatTensor(samples).unsqueeze(0)

    mel_options = dict(
        n_fft=1024,
        num_mels=80,
        sampling_rate=target_sr,
        hop_size=240,
        win_size=1024,
        fmin=0,
        fmax=8000,
    )
    melspec = mel_spectrogram(batch, **mel_options)

    return melspec.squeeze(0).numpy().T, n_samples

def bin_level_min_max_norm(melspec):
    """Linearly map mel values from [-12.0, 2.5] onto [-4, 4] and clip.

    Args:
        melspec: Array of mel values.

    Returns:
        Array of the same shape; values outside the source range are
        clipped to -4 or 4.
    """
    lower, upper = -12.0, 2.5
    rescaled = (melspec - lower) / (upper - lower) * 8.0 - 4.0
    return np.clip(rescaled, -4., 4.)

In [6]:
from types import SimpleNamespace

# Decoding options exposed through attribute access. Their only consumer in
# this notebook is the ppg2ppg_model.inference call in `convert`, which is
# currently commented out.
inference_args = SimpleNamespace(
    threshold=0.5,
    minlenratio=0.5,
    maxlenratio=1.5,
)

In [7]:
def get_bnf_kaldi(spk_fpath, utterance_id):
    """Fetch the BNF features of one utterance through the Kaldi interface.

    Args:
        spk_fpath: Speaker directory (``str`` or path-like). Its last path
            component is used as the speaker name, and it must contain
            ``kaldi/wav.scp`` and ``kaldi/bnf/feats.scp``.
        utterance_id: Utterance identifier; the Kaldi key is
            ``"<speaker>_<utterance_id>"``.

    Returns:
        The value returned by ``KaldiInterface.get_feature(key, 'bnf')``.
    """
    # normpath drops a trailing separator so basename still yields the speaker
    # name; str() lets callers pass a pathlib.Path.
    spk_dir = os.path.normpath(str(spk_fpath))
    kaldi_dir = os.path.join(spk_dir, 'kaldi')
    speaker = os.path.basename(spk_dir)

    ki = KaldiInterface(wav_scp=os.path.join(kaldi_dir, 'wav.scp'),
                        bnf_scp=os.path.join(kaldi_dir, 'bnf', 'feats.scp'))

    # The features used to be fetched and then discarded; hand them back.
    return ki.get_feature(f'{speaker}_{utterance_id}', 'bnf')

In [8]:
@torch.no_grad()
def translate2code(bnf):
    """Quantize BNF features with the notebook-level ``bnf2code_model``.

    Args:
        bnf: Numpy array of BNF features.

    Returns:
        A ``(quantized, indices)`` tuple of numpy arrays, taken from the two
        values returned by ``bnf2code_model.inference``.
    """
    batched = torch.from_numpy(bnf).unsqueeze(0).to(device)

    # torch.squeeze without a dim removes every size-1 axis, not only the
    # one added just above.
    quantized, code_ids = bnf2code_model.inference(torch.squeeze(batched))

    return quantized.cpu().numpy(), code_ids.cpu().numpy()

In [9]:
def _drop_repeated_frames(frames):
    """Return ``frames`` without any frame that equals its immediate predecessor."""
    keep = np.ones(len(frames), dtype=bool)
    if len(frames) > 1:
        # Compare each frame with the one before it across all non-time axes.
        trailing_axes = tuple(range(1, frames.ndim))
        keep[1:] = ~np.all(frames[1:] == frames[:-1], axis=trailing_axes)
    return frames[keep]


@torch.no_grad()
def convert(src_speaker_fpath,  tgt_speaker, tgt_spk_dvec, tgt_prosody_fpath, utterance_id, output_dir):
    """Synthesize one utterance from stored PPG/BNF features and write it as a wav.

    Args:
        src_speaker_fpath: Path of the ``.npy`` file holding the source
            features, indexed by frame along axis 0.
        tgt_speaker: Name of the sub-directory of ``output_dir`` that
            receives the result.
        tgt_spk_dvec: Numpy speaker embedding passed to the synthesizer.
        tgt_prosody_fpath: Directory containing ``wav/<utterance_id>.wav``,
            whose normalized mel spectrogram is used as prosody reference.
        utterance_id: Utterance name; also the stem of the output file.
        output_dir: Root output directory (created if missing).

    Returns:
        None. The audio is written to
        ``<output_dir>/<tgt_speaker>/<utterance_id>.wav`` at 24 kHz, PCM_16.
    """
    # Load the source features and remove consecutive duplicate frames.
    ppg = _drop_repeated_frames(np.load(str(src_speaker_fpath)))
    ppg = torch.from_numpy(ppg).unsqueeze(0).to(device)

    # Disabled stages: BNF translation followed by re-quantization.
    # ppg_pred, _, att_ws = ppg2ppg_model.inference(torch.squeeze(ppg), inference_args)
    # ppg_pred = ppg_pred.cpu().numpy()
    # ppg_pred, _ = translate2code(ppg_pred)
    # ppg = torch.from_numpy(ppg_pred).unsqueeze(0).to(device)

    tgt_spk_dvec = torch.from_numpy(tgt_spk_dvec).unsqueeze(0).to(device)

    # Prosody reference: normalized mel spectrogram of the matching recording.
    prosody_wav_fpath = f"{tgt_prosody_fpath}/wav/{utterance_id}.wav"
    prosody_vec, _ = compute_mel(prosody_wav_fpath)
    prosody_vec = bin_level_min_max_norm(prosody_vec)
    prosody_vec = torch.from_numpy(prosody_vec).unsqueeze(0).to(device)

    # NOTE(review): torch.squeeze without a dim removes every size-1 axis, so
    # an input left with a single frame would also lose its time axis — confirm
    # the synthesizer tolerates that.
    mel_pred, _ = ppg2mel_model.inference(
        torch.squeeze(ppg),
        spemb=torch.squeeze(tgt_spk_dvec),
        prosody_vec=torch.squeeze(prosody_vec),
    )
    mel_pred = mel_pred.unsqueeze(0)

    # The vocoder is fed (1, 80, frames): view as 80-wide frames, then swap axes.
    y = hifigan_model(mel_pred.view(1, -1, 80).transpose(1, 2))

    output_dir = os.path.join(output_dir, tgt_speaker)
    os.makedirs(output_dir, exist_ok=True)

    wav_fname = f"{output_dir}/{utterance_id}.wav"

    sf.write(wav_fname, y.squeeze().cpu().numpy(), 24000, "PCM_16")

In [9]:
# Synthesize the selected ARCTIC utterances for each speaker, using the
# speaker's precomputed mean embedding and the speaker's own recording as
# prosody reference.
vq_cluster = 'vq128'

# utterance_ids = ['arctic_a00'+str(i) for i in range(10,40)] + ['arctic_b05'+str(i) for i in range(10, 40)]+ ['arctic_b050'+str(i) for i in range(0, 10)]
speakers = ['NJS', 'TXHC', 'YKWK', 'ZHAA']
utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539', 'arctic_a0018']

basepath_bnf_vq = '/mnt/data1/waris/repo/vq-bnf/translation-test'
basepath_wav = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc'
output_path = '/mnt/nvme-data1/waris/repo/vq-bnf-translator/synthesis_output/VQ128_DQ'
speaker_dvec_path = '/mnt/nvme-data1/waris/preprocessed_data/avg_spk_embed'


for speaker in speakers:
    # The speaker directory is also the prosody source, so build it once per speaker.
    tgt_speaker_fpath = os.path.join(basepath_wav, speaker)
    prosody_fpath = tgt_speaker_fpath
    tgt_spk_dvec = np.load(f'{speaker_dvec_path}/{speaker}/embeds_50_mean.npy').astype('float32')

    for utterance_id in utterance_ids:
        src_speaker_fpath = Path(f'{basepath_bnf_vq}/{vq_cluster}/ppgs/ppg-{speaker}-{utterance_id}.npy')
        if not src_speaker_fpath.exists():
            # Report the gap instead of skipping silently, so an incomplete
            # output directory is not mistaken for a full run.
            print(f'[skip] missing PPG file: {src_speaker_fpath}')
            continue

        convert(src_speaker_fpath, speaker, tgt_spk_dvec, prosody_fpath, utterance_id, output_path)

: 

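## Custom data

Run the same synthesis on custom recordings (speaker `WARS`), computing the speaker embedding from each utterance's own wav file instead of loading a precomputed mean embedding.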

In [10]:
# Synthesize the custom recordings. The speaker embedding is computed from
# each utterance's own wav file rather than loaded from a precomputed mean.
vq_cluster = 'vq128'

utterance_ids = ['custom_a0'+str(i) for i in range(1,10)]
speakers = ['WARS']

basepath_bnf_vq = '/mnt/nvme-data1/waris/datasets/claro/WARS/vq-bnf-128'
basepath_wav = '/mnt/nvme-data1/waris/datasets/claro'
output_path = '/mnt/nvme-data1/waris/repo/vq-bnf-translator/synthesis_output/VQ128_DQ_CD_Rec'
# speaker_dvec_path = '/mnt/nvme-data1/waris/preprocessed_data/avg_spk_embed'


for speaker in speakers:
    # The speaker directory holds the wavs used for both the embedding and
    # the prosody reference, so build it once per speaker.
    tgt_speaker_fpath = os.path.join(basepath_wav, speaker)
    prosody_fpath = tgt_speaker_fpath
    # tgt_spk_dvec = np.load(f'{speaker_dvec_path}/{speaker}/embeds_50_mean.npy').astype('float32')

    for utterance_id in utterance_ids:
        src_speaker_fpath = Path(f'{basepath_bnf_vq}/ppgs/{utterance_id}.npy')
        if not src_speaker_fpath.exists():
            # Report the gap instead of skipping silently, so an incomplete
            # output directory is not mistaken for a full run.
            print(f'[skip] missing PPG file: {src_speaker_fpath}')
            continue

        tgt_spk_dvec = compute_spk_dvec(os.path.join(tgt_speaker_fpath, "wav", utterance_id+".wav"))

        convert(src_speaker_fpath, speaker, tgt_spk_dvec, prosody_fpath, utterance_id, output_path)