In [1]:
import torch, torchaudio
from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
import sys
sys.path.insert(0,'/home/nis/tianyi.tan/hubert/')

from hubert.model import HubertSoft

def load_hubert(checkpoint_path=None, rank=0, device='cuda'):
    """Build a ``HubertSoft`` model and load its weights from a checkpoint file.

    The checkpoint may be either a dict that stores the weights under the
    ``'hubert'`` key or a bare state dict. A leading ``"module."`` prefix on
    the parameter names is stripped before loading.

    Args:
        checkpoint_path: Path of a file readable by ``torch.load``. Required,
            despite the ``None`` default.
        rank: Device index the freshly built model is first moved to when
            ``device == 'cuda'``; the model is moved to ``device`` afterwards.
        device: Device the model is moved to before it is returned.

    Returns:
        The ``HubertSoft`` instance, in eval mode.

    Raises:
        AssertionError: If ``checkpoint_path`` is ``None``.
    """
    print("### load_hubert", checkpoint_path, device)
    assert checkpoint_path is not None, "checkpoint_path must be provided"
    print("### loading checkpoint from: ", checkpoint_path)
    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    # Deserialize onto the CPU so a checkpoint saved on a GPU can still be read
    # on a machine without that GPU; load_state_dict copies the values into the
    # model's own parameters, wherever those live.
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    hubert = HubertSoft().to(device) if device != 'cuda' else HubertSoft().to(rank)

    # Use the nested weights when present; otherwise treat the file as a bare
    # state dict. (Indexing checkpoint['hubert'] directly raised KeyError for
    # bare state dicts, so the intended fallback could never be reached.)
    if isinstance(checkpoint, dict) and checkpoint.get('hubert') is not None:
        checkpoint = checkpoint['hubert']
    consume_prefix_in_state_dict_if_present(checkpoint, "module.")

    hubert.load_state_dict(checkpoint, strict=True)
    hubert.eval().to(device)
    return hubert

def load_hifigan(checkpoint_path="./hifigan/g_00205000", device='cuda',
                 config_path="/home/nis/tianyi.tan/hifigan/hifigan/config.json"):
    """Build a HiFi-GAN ``Generator`` and load its weights.

    Args:
        checkpoint_path: Local file path, or a URL starting with ``"http"``,
            of a checkpoint that stores the generator weights under
            ``ckpt['generator']['model']``.
        device: Device the vocoder is moved to before it is returned.
        config_path: Path of the JSON file used to configure the generator.
            Defaults to the location that was previously hard-coded.

    Returns:
        The generator in eval mode, with weight norm removed, on ``device``.
    """
    import sys
    sys.path.insert(0,'/home/nis/tianyi.tan/wesper-demo')
    import hifigan
    import json
    with open(config_path, "r") as f:
        config = hifigan.AttrDict(json.load(f))
    vocoder = hifigan.Generator(config)
    print("### HiFI-GAN ckpt", checkpoint_path)
    # Always deserialize onto the CPU: the generator is still on the CPU when
    # the weights are copied in, and it is moved to `device` once at the end.
    # This also lets a checkpoint saved on a GPU be read on any machine.
    # NOTE(review): both loaders unpickle the file -- only load trusted checkpoints.
    cpu = torch.device('cpu')
    if checkpoint_path.startswith("http"):
        ckpt = torch.hub.load_state_dict_from_url(checkpoint_path, map_location=cpu)
    else:
        ckpt = torch.load(checkpoint_path, map_location=cpu)
    checkpoint = ckpt['generator']['model']
    # Strip a leading "module." from the parameter names, if present.
    consume_prefix_in_state_dict_if_present(checkpoint, "module.")
    vocoder.load_state_dict(checkpoint)
    vocoder.eval()
    vocoder.remove_weight_norm()
    vocoder.to(device)

    return vocoder

def load_acoustic(checkpoint_path=None, rank=0, device='cuda'):
    """Build an ``AcousticModel(discrete=False)`` and load its weights.

    The checkpoint may be either a dict that stores the weights under the
    ``'acoustic-model'`` key or a bare state dict. A leading ``"module."``
    prefix on the parameter names is stripped before loading.

    Args:
        checkpoint_path: Path of a file readable by ``torch.load``. Required,
            despite the ``None`` default.
        rank: Accepted for signature parity with ``load_hubert``; not used here.
        device: Device the model is created on and returned on.

    Returns:
        The ``AcousticModel`` instance, in eval mode.

    Raises:
        AssertionError: If ``checkpoint_path`` is ``None``.
    """
    import sys
    sys.path.insert(0,'/home/nis/tianyi.tan/.cache/torch/hub/bshall_acoustic-model_main')
    from acoustic import AcousticModel

    print("### load_acoustic", checkpoint_path, device)
    assert checkpoint_path is not None, "checkpoint_path must be provided"
    print("### loading checkpoint from: ", checkpoint_path)
    # NOTE(review): torch.load unpickles the file -- only load trusted checkpoints.
    # Deserialize onto the CPU so a checkpoint saved on a GPU can still be read
    # on a machine without that GPU; load_state_dict copies the values into the
    # model's own parameters, wherever those live.
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    acoustic = AcousticModel(discrete=False).to(device)

    # Use the nested weights when present; otherwise treat the file as a bare
    # state dict. (Indexing checkpoint['acoustic-model'] directly raised
    # KeyError for bare state dicts, so the fallback could never be reached.)
    if isinstance(checkpoint, dict) and checkpoint.get('acoustic-model') is not None:
        checkpoint = checkpoint['acoustic-model']
    consume_prefix_in_state_dict_if_present(checkpoint, "module.")

    acoustic.load_state_dict(checkpoint)
    acoustic.eval().to(device)
    return acoustic

In [2]:
# Imported up front so a missing dependency fails before any model is loaded.
import soundfile as sf

# Sample rate (Hz) required of the input file; the output is written at the same rate.
SAMPLE_RATE = 16000

# Load the content encoder (either hubert_soft or hubert_discrete)
hubert = load_hubert("/home/nis/tianyi.tan/.cache/torch/hub/checkpoints/model-layer12-450000.pt")
#hubert = load_hubert("/data/hdd0/tianyi.tan/hubert/model-best.pt") #g0
#hubert = load_hubert("/data/ssd0/tianyi.tan/ckpt-9M/model-best.pt") #g5
#hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda()


# Load the acoustic model (either hubert_soft or hubert_discrete)
#acoustic = load_acoustic("/home/nis/tianyi.tan/.cache/torch/hub/checkpoints/hubert-soft-0321fd7e.pt")
#acoustic = load_acoustic("/data/hdd0/tianyi.tan/acoustic-model/model-best.pt") #g0
acoustic = load_acoustic("/data/ssd0/tianyi.tan/ckpt-acoustic-model/model-best.pt") #g5
#acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True).cuda()

# Load the vocoder (either hifigan_hubert_soft or hifigan_hubert_discrete)
#hifigan = load_hifigan("/home/nis/tianyi.tan/.cache/torch/hub/checkpoints/g_00205000")
hifigan = load_hifigan("/data/ssd0/tianyi.tan/ckpt-voc-128/model-best.pt")
#hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True).cuda()

# Load the source audio
#source, sr = torchaudio.load("sample_whisper.wav")
source, sr = torchaudio.load("s000u053w.WAV")
assert sr == SAMPLE_RATE, f"expected {SAMPLE_RATE} Hz audio, got {sr} Hz"
# Add a leading dimension and move to the GPU.
# NOTE(review): presumably the encoder expects (batch, 1, samples), i.e. mono input -- confirm.
source = source.unsqueeze(0).cuda()

# Convert to the target speaker
with torch.inference_mode():
    # Extract speech units
    units = hubert.units(source)
    # Generate target spectrogram
    mel = acoustic.generate(units).transpose(1, 2)
    # Generate audio waveform
    target = hifigan(mel)

# A single squeeze() already drops every size-1 dimension; hand soundfile a
# NumPy array on the CPU rather than a torch tensor.
sf.write('my_out2.wav', target.squeeze().cpu().numpy(), SAMPLE_RATE)

### load_hubert /home/nis/tianyi.tan/.cache/torch/hub/checkpoints/model-layer12-450000.pt cuda
### loading checkpoint from:  /home/nis/tianyi.tan/.cache/torch/hub/checkpoints/model-layer12-450000.pt
### load_acoustic /data/ssd0/tianyi.tan/ckpt-acoustic-model/model-best.pt cuda
### loading checkpoint from:  /data/ssd0/tianyi.tan/ckpt-acoustic-model/model-best.pt
### HiFI-GAN ckpt /data/ssd0/tianyi.tan/ckpt-voc-128/model-best.pt
Removing weight norm...
