# Zero-Shot Voice Conversion

## Import libraries and load pretrained models

In [1]:
# Set CUDA_VISIBLE_DEVICES=0 in the kernel's environment so that CUDA-based
# libraries loaded by later cells see only GPU 0.
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [3]:
from synthesizer.inference import Synthesizer
from synthesizer.kaldi_interface import KaldiInterface
from encoder import inference as encoder
from vocoder import inference as vocoder
import numpy as np
import librosa
from pathlib import Path
from utils.argutils import print_args
import random
import IPython.display as ipd
from synthesizer.hparams import hparams


# Locations of the pretrained speaker-encoder checkpoint, the vocoder
# checkpoint, and the synthesizer model directory.
# NOTE(review): these are absolute paths on the original author's machine;
# they must be adjusted before the notebook can run anywhere else.
encoder_speaker_weights = Path("/home/grads/q/quamer.waris/projects/Accentron/pretrained_model/pretrained/encoder/saved_models/pretrained.pt")
vocoder_weights = Path("/home/grads/q/quamer.waris/projects/Accentron/pretrained_model/pretrained/vocoder/saved_models/pretrained/pretrained.pt")
syn_dir = Path("/mnt/data1/waris/model_outputs/accentron/parallel/logs-Accetron_train_parallel/taco_pretrained")

# Load the encoder and vocoder weights into their modules and build the
# module-level `synthesizer` object that synthesize() relies on.
encoder.load_model(encoder_speaker_weights)
synthesizer = Synthesizer(syn_dir)
vocoder.load_model(vocoder_weights)
# `hparams` is used exactly as imported from synthesizer.hparams; the parse()
# call below is intentionally left disabled.
#hparams = hparams.parse()

Loaded encoder "pretrained.pt" trained to step 1564501
Loaded encoder "encoder_accent.pt" trained to step 90001
Found synthesizer "Accetron_train_parallel" trained to step 204001
Found synthesizer "translator_train" trained to step 294001
Building Wave-RNN
Trainable Parameters: 4.481M
Loading model weights at /home/grads/q/quamer.waris/projects/Accentron/pretrained_model/pretrained/vocoder/saved_models/pretrained/pretrained.pt


In [4]:
def synthesize(bnf, embed):
    """Convert one utterance from its features and a speaker embedding.

    Feeds a single-item batch to the module-level ``synthesizer`` to obtain a
    spectrogram, renders that spectrogram to audio with the module-level
    ``vocoder``, and appends ``synthesizer.sample_rate`` zero-valued samples
    to the end of the result (one second of silence, assuming the waveform is
    at that sample rate).

    Args:
        bnf: Features of the source utterance, forwarded unchanged to
            ``synthesizer.synthesize_spectrograms``.
        embed: Speaker embedding forwarded alongside ``bnf``.

    Returns:
        The vocoder output followed by the trailing zero padding.
    """
    spectrograms = synthesizer.synthesize_spectrograms([bnf], [embed])
    waveform = vocoder.infer_waveform(spectrograms[0])
    # (0, n): nothing is prepended, n samples are appended; constant mode
    # fills with zeros by default.
    trailing_pad = (0, synthesizer.sample_rate)
    return np.pad(waveform, trailing_pad, mode="constant")

In [7]:
def generate_speaker_embed(tgt_utterance_path):
    """Compute a speaker embedding from a single target-speaker recording.

    The audio file is loaded at ``hparams.sample_rate``, passed through
    ``encoder.preprocess_wav``, and then embedded with
    ``encoder.embed_utterance``.

    Args:
        tgt_utterance_path: Path of the target speaker's audio file, handed
            directly to ``librosa.load``.

    Returns:
        The value returned by ``encoder.embed_utterance`` for the
        preprocessed waveform.
    """
    # Pass the sample rate by keyword: librosa >= 0.10 made every argument
    # after the path keyword-only, so the positional form raises TypeError
    # there. The keyword form works on older releases as well.
    wav, _ = librosa.load(tgt_utterance_path, sr=hparams.sample_rate)
    wav = encoder.preprocess_wav(wav)
    embed_speaker = encoder.embed_utterance(wav)

    return embed_speaker

## Generate bottleneck features (BNF) for the L1 reference utterance

In [None]:
import os
# Source (reference) speaker and the utterance whose BNF is looked up.
src_speaker = 'BDL'
utterance_id = 'arctic_b0539'
# Path to kaldi directory of the speaker.
kaldi_dir = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data/BDL/kaldi'
# os.path.join already returns str for str inputs, so no extra conversion is
# needed before handing the scp paths to KaldiInterface.
ki = KaldiInterface(
    wav_scp=os.path.join(kaldi_dir, 'wav.scp'),
    bnf_scp=os.path.join(kaldi_dir, 'bnf/feats.scp'),
)
# The feature key has the form "<speaker>_<utterance id>".
bnf = ki.get_feature(src_speaker + '_' + utterance_id, 'bnf')

In [None]:
# Recording of the target speaker whose voice the conversion should take on.
# NOTE(review): absolute path on the original author's machine.
tgt_utterance_path = '/mnt/data1/waris/datasets/UEDIN_mandarin_bi_data_2010/downsampled_22kHz/Mandarin_mini_testset/MF1_ENG_0001_1.wav'

# Speaker embedding computed from that single utterance.
embed_speaker = generate_speaker_embed(tgt_utterance_path)

In [16]:
# Convert the source utterance's BNF using the target speaker's embedding.
synthesis_wav = synthesize(bnf, embed_speaker)
# Last expression of the cell, so the notebook renders an inline audio player.
# NOTE(review): playback uses hparams.sample_rate while synthesize() pads with
# synthesizer.sample_rate — confirm the two values agree.
ipd.Audio(synthesis_wav, rate=hparams.sample_rate)

In [13]:
from scipy.io import wavfile

# Directory and file name for the converted utterance.
# NOTE(review): absolute path on the original author's machine.
output_dir = '/home/grads/q/quamer.waris/projects/ac-vc/synthesis_output/parallel_report_xxx'
# exist_ok=True creates the directory tree only when it is missing, replacing
# the separate isdir() check and the check-then-create race it carried.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'man_man_f1_222.wav')
# NOTE(review): wavfile.write stores the array using its own dtype — confirm
# that synthesis_wav's dtype is one the intended audio players can read.
wavfile.write(output_file, hparams.sample_rate, synthesis_wav)
