In [None]:
import os
import argparse
import torch
import librosa
import time
from scipy.io.wavfile import write
from tqdm import tqdm
import soundfile as sf
import utils
import time
from models import HuBERT_NeuralDec_VITS
from mel_processing import mel_spectrogram_torch
import logging

from speaker_encoder.voice_encoder import SpeakerEncoder
logging.getLogger('numba').setLevel(logging.WARNING)

In [None]:
class Parameters:
    """Run configuration for this notebook.

    Stands in for a command-line argument namespace so the remaining cells can
    read their settings as ``args.<name>``.
    """

    def __init__(self):
        # Hyper-parameter JSON read via utils.get_hparams_from_file.
        self.hpfile = "logs/neuralvc/config.json"
        # Generator checkpoint handed to utils.load_checkpoint.
        self.ptfile = "logs/neuralvc/G_990000.pth"
        # Label for the model variant; not read by any cell visible here.
        self.model_name = "hubert-neuraldec-vits"
        # Root directory under which converted audio is written.
        self.outdir = "output/temp"
        # Not read by any cell visible here.
        self.use_timestamp = False


# Single shared configuration instance used by every later cell.
args = Parameters()

In [None]:
# Create the output directory if it is missing. exist_ok=True replaces the
# separate exists() check, which could race with another process creating
# the directory in between.
os.makedirs(args.outdir, exist_ok=True)

In [None]:
# Make sure the output directory exists and read the hyper-parameters that
# describe the checkpoint being loaded.
os.makedirs(args.outdir, exist_ok=True)
hps = utils.get_hparams_from_file(args.hpfile)

print("Loading model...")
# The two positional sizes are derived from the config: filter_length // 2 + 1
# (presumably the number of spectrogram frequency bins - confirm against the
# model definition) and the training segment length expressed in hop-sized frames.
net_g = HuBERT_NeuralDec_VITS(
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model)
# Inference mode; assigning to `_` keeps the returned module out of the cell output.
_ = net_g.eval()

print("Loading checkpoint...")
# No optimizer is passed (None): only the generator weights are needed here.
_ = utils.load_checkpoint(args.ptfile, net_g, None, True)

print("Loading hubert...")
# Fetched through torch.hub from the bshall/hubert repository (branch "main").
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft").eval()

# Always (re)bind smodel so that re-running this cell with a config where
# use_spk is false cannot leave a stale encoder from an earlier run in scope.
smodel = None
if hps.model.use_spk:
    print("Loading speaker encoder...")
    smodel = SpeakerEncoder('speaker_encoder/ckpt/pretrained_bak_5805000.pt')
print("ok")

In [None]:
from tqdm import tqdm

def convert(src_list, tgt):
    """Convert each source utterance to the voice of one target speaker.

    All files are written to ``<args.outdir>/<target stem>/``:

    * ``tgt_<target stem>.wav`` - the target reference, resampled to
      ``hps.data.sampling_rate`` (saved before silence trimming);
    * ``src_<source stem>.wav`` - each source utterance, resampled likewise;
    * ``<source stem>-<target stem>.wav`` - each converted utterance.

    Relies on the notebook globals ``hps``, ``args``, ``smodel``, ``hubert``
    and ``net_g`` set up by the earlier cells.

    Args:
        src_list: Iterable of paths to the source audio files.
        tgt: Path to the target speaker's reference audio file.

    Raises:
        RuntimeError: If ``hps.model.use_spk`` is false, because the speaker
            encoder this function needs is only loaded when it is true.
    """
    if not hps.model.use_spk:
        raise RuntimeError(
            "convert() needs the speaker encoder `smodel`, which is only "
            "loaded when hps.model.use_spk is true"
        )

    sample_rate = hps.data.sampling_rate

    # Strip only the final extension. Cutting at the first dot would map
    # "a.1.wav" and "a.2.wav" to the same stem and overwrite their outputs.
    tgtname = os.path.splitext(os.path.basename(tgt))[0]
    wav_tgt, _ = librosa.load(tgt, sr=sample_rate)

    # One sub-directory per target speaker; computed once, reused in the loop.
    out_dir = os.path.join(args.outdir, tgtname)
    os.makedirs(out_dir, exist_ok=True)
    sf.write(os.path.join(out_dir, f"tgt_{tgtname}.wav"), wav_tgt, sample_rate)

    # The speaker embedding is computed on the silence-trimmed reference.
    wav_tgt, _ = librosa.effects.trim(wav_tgt, top_db=20)
    g_tgt = torch.from_numpy(smodel.embed_utterance(wav_tgt)).unsqueeze(0)

    for src in tqdm(src_list):
        srcname = os.path.splitext(os.path.basename(src))[0]
        title = f"{srcname}-{tgtname}"

        wav_src, _ = librosa.load(src, sr=sample_rate)
        sf.write(os.path.join(out_dir, f"src_{srcname}.wav"), wav_src, sample_rate)

        with torch.no_grad():
            # Two leading singleton axes are added before unit extraction.
            units = hubert.units(torch.from_numpy(wav_src).unsqueeze(0).unsqueeze(0))
            # Axes 1 and 2 are swapped before decoding - presumably into a
            # channels-first layout expected by net_g.infer; TODO confirm.
            audio = net_g.infer(units.transpose(1, 2), g=g_tgt)
            audio = audio[0][0].data.cpu().float().numpy()

        write(os.path.join(out_dir, f"{title}.wav"), sample_rate, audio)

In [None]:
# Test
import time

# Directory holding the demo utterances. It is machine-specific: edit this one
# line to point the demo at your own copy of the data.
TEST_DATA_DIR = "/mnt/hd/cma/zzy/dataset/test"

# Reference utterance of the target speaker.
tgt1 = f"{TEST_DATA_DIR}/M_5105_28233_000016_000001.wav"

# Source utterances to be converted to the target voice.
src_list1 = [f"{TEST_DATA_DIR}/F_3575_170457_000032_000001.wav"]

convert(src_list1, tgt1)