# StarGANv2-VC Demo (Danish to English VC)

### Utils

In [1]:

# load packages
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
import os

from Utils.ASR.models import ASRCNN
from Utils.JDC.model import JDCNet
from models import Generator, MappingNetwork, StyleEncoder

import IPython.display as ipd

%matplotlib inline

In [2]:
# Source: http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is18/en_speaker_used.txt
# Source: https://github.com/jjery2243542/voice_conversion

# Binary speaker setup
# speakers = {'synthetic': 0, 'august': 1}

# Multi speaker setup
speakers = {'p240': 1, 'p232': 2, 'p236': 3, 'p259': 5, 'p256': 6, 'p273': 7, 'synthetic': 8, 'august': 9}

to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
mean, std = -4, 4

def preprocess(wave):
    wave_tensor = torch.from_numpy(wave).float()
    mel_tensor = to_mel(wave_tensor)
    mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std
    return mel_tensor

def build_model(model_params={}):
    args = Munch(model_params)
    generator = Generator(args.dim_in, args.style_dim, args.max_conv_dim, w_hpf=args.w_hpf, F0_channel=args.F0_channel)
    mapping_network = MappingNetwork(args.latent_dim, args.style_dim, args.num_domains, hidden_dim=args.max_conv_dim)
    style_encoder = StyleEncoder(args.dim_in, args.style_dim, args.num_domains, args.max_conv_dim)
    
    nets_ema = Munch(generator=generator,
                     mapping_network=mapping_network,
                     style_encoder=style_encoder)

    return nets_ema

def compute_style(path, speaker):
    wave, sr = librosa.load(path, sr=24000)
    audio, index = librosa.effects.trim(wave, top_db=30)
    if sr != 24000:
        wave = librosa.resample(wave, sr, 24000)
    mel_tensor = preprocess(wave).to('cuda')

    with torch.no_grad():
        label = torch.LongTensor([speakers[speaker]])
        ref = starganv2.style_encoder(mel_tensor.unsqueeze(1), label)
    
    return ref, label

### Load models

In [3]:
# load F0 model

F0_model = JDCNet(num_class=1, seq_len=192)
params = torch.load("Utils/JDC/bst.t7")['net']
F0_model.load_state_dict(params)
_ = F0_model.eval()
F0_model = F0_model.to('cuda')

In [4]:
# load vocoder
from parallel_wavegan.utils import load_model
vocoder = load_model("Vocoder/checkpoint-400000steps.pkl").to('cuda').eval()
vocoder.remove_weight_norm()
_ = vocoder.eval()

In [5]:
source_speaker = 'synthetic'
source_wav_path = f'Data/Babelfish/{source_speaker}/train/1.wav'
audio, source_sr = librosa.load(source_wav_path, sr=24000)
audio = audio / np.max(np.abs(audio))
audio.dtype = np.float32
source_audio = preprocess(audio).to('cuda:0')

In [27]:
# load starganv2
model = 'Babelfish-Split-Multi1'
model_version = '00480'
model_path = f'Models/{model}/epoch_{model_version}.pth'

with open(f'Models/{model}/babelfish_split_multi_config.yml') as f:
    starganv2_config = yaml.safe_load(f)
starganv2 = build_model(model_params=starganv2_config["model_params"])
params = torch.load(model_path, map_location='cpu')
params = params['model_ema']
_ = [starganv2[key].load_state_dict(params[key]) for key in starganv2]
_ = [starganv2[key].eval() for key in starganv2]
starganv2.style_encoder = starganv2.style_encoder.to('cuda')
starganv2.mapping_network = starganv2.mapping_network.to('cuda')
starganv2.generator = starganv2.generator.to('cuda')

FileNotFoundError: [Errno 2] No such file or directory: 'Models/Babelfish-Split-Multi1/epoch_00480.pth'

### Conversion

In [28]:
with torch.no_grad():
    target_speaker = 'august'
    target_path = f'Data/Babelfish_Split_Multi/{target_speaker}/130.wav'
    target, _ = compute_style(target_path, target_speaker)
    f0_feat = F0_model.get_feature_GAN(source_audio.unsqueeze(1))
    out = starganv2.generator(source_audio.unsqueeze(1), target, F0=f0_feat)
    c = out.transpose(-1, -2).squeeze().to('cuda')
    y_out = vocoder.inference(c)
    y_out = y_out.view(-1).cpu()
    wave = y_out.numpy()
    print('Converted: %s' % target_speaker)
    display(ipd.Audio(wave, rate=24000))
    print('Target: %s' % target_speaker)
    display(ipd.Audio(target_path, rate=24000))
    print('Source:')
    display(ipd.Audio(source_wav_path, rate=24000))

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

In [None]:
from speechbrain.pretrained import SpectralMaskEnhancement

denoise_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
)

def enhance(noisy):
    wave_tensor = torch.from_numpy(noisy).float().unsqueeze(0)
    # Add relative length tensor
    enhanced = denoise_model.enhance_batch(wave_tensor, lengths=torch.tensor([1.]))
    return enhanced

In [29]:
print('Converted: %s' % target_speaker)
display(ipd.Audio(enhance(wave), rate=24000))
print('Target: %s' % target_speaker)
display(ipd.Audio(target_path, rate=24000))
print('Source:')
display(ipd.Audio(source_wav_path, rate=24000))

Converted: august


Target: august


Source:
