In [1]:
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_VISIBLE_DEVICES=1


In [2]:
import time
import sys
import os
import argparse
import torch
import numpy as np
import glob
from pathlib import Path
from tqdm import tqdm
# from conformer_ppg_model.build_ppg_model import load_ppg_model
# from src.mel_decoder_mol_encAddlf0 import MelDecoderMOL
# from src.mel_decoder_lsa import MelDecoderLSA
# from src.rnn_ppg2mel import BiRnnPpg2MelModel
# from src.transformer_bnftomel import Transformer
from src.audio_utils import MAX_WAV_VALUE, load_wav, mel_spectrogram, normalize
import pyworld
import librosa
import resampy
import soundfile as sf
# from src.transformer_bnftomel import Transformer
from utils.f0_utils import get_cont_lf0
from utils.load_yaml import HpsYaml

from vocoders.hifigan_model import load_hifigan_generator

from speaker_encoder.voice_encoder import SpeakerEncoder
from speaker_encoder.audio import preprocess_wav
from src import build_model
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
import matplotlib.pyplot as plt
import librosa.display
from data_objects.kaldi_interface import KaldiInterface

In [3]:

def compute_spk_dvec(
    wav_path, weights_fpath="speaker_encoder/ckpt/pretrained_bak_5805000.pt",
):
    fpath = Path(wav_path)
    wav = preprocess_wav(fpath)
    # print('wac-shape',wav.shape)
    encoder = SpeakerEncoder(weights_fpath)
    spk_dvec = encoder.embed_utterance(wav)
    #print(spk_dvec)
    return spk_dvec

def compute_spk_dvec1(
    mel, weights_fpath="speaker_encoder/ckpt/pretrained_bak_5805000.pt",
):
    wav = preprocess_wav(mel)
    encoder = SpeakerEncoder(weights_fpath)
    spk_dvec = encoder.embed_utterance(wav)
    #print(spk_dvec)
    return spk_dvec

def compute_f0(wav, sr=16000, frame_period=10.0):
    wav = wav.astype(np.float64)
    f0, timeaxis = pyworld.harvest(
        wav, sr, frame_period=frame_period, f0_floor=20.0, f0_ceil=600.0)
    return f0


def compute_mean_std(lf0):
    nonzero_indices = np.nonzero(lf0)
    mean = np.mean(lf0[nonzero_indices])
    std = np.std(lf0[nonzero_indices])
    return mean, std 


def f02lf0(f0):
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    return lf0


def get_converted_lf0uv(
    wav, 
    lf0_mean_trg, 
    lf0_std_trg,
    convert=True,
):
    f0_src = compute_f0(wav)
    if not convert:
        uv, cont_lf0 = get_cont_lf0(f0_src)
        lf0_uv = np.concatenate([cont_lf0[:, np.newaxis], uv[:, np.newaxis]], axis=1)
        return lf0_uv

    lf0_src = f02lf0(f0_src)
    lf0_mean_src, lf0_std_src = compute_mean_std(lf0_src)
    
    lf0_vc = lf0_src.copy()
    lf0_vc[lf0_src > 0.0] = (lf0_src[lf0_src > 0.0] - lf0_mean_src) / lf0_std_src * lf0_std_trg + lf0_mean_trg
    f0_vc = lf0_vc.copy()
    f0_vc[lf0_src > 0.0] = np.exp(lf0_vc[lf0_src > 0.0])
    
    uv, cont_lf0_vc = get_cont_lf0(f0_vc)
    lf0_uv = np.concatenate([cont_lf0_vc[:, np.newaxis], uv[:, np.newaxis]], axis=1)
    return lf0_uv


def build_ppg2mel_model(model_config, model_file, device):
    model_class = build_model(model_config["model_name"])
    ppg2mel_model = model_class(
        **model_config["model"]
    ).to(device)
    ckpt = torch.load(model_file, map_location=device)
    ppg2mel_model.load_state_dict(ckpt["model"])
    ppg2mel_model.eval()
    return ppg2mel_model

def build_transf_model(model_config, model_file, device):
    model_class = build_model(model_config["model_name"])
    ppg2mel_model = model_class(
        model_config["model"]
    ).to(device)
    ckpt = torch.load(model_file, map_location=device)
    ppg2mel_model.load_state_dict(ckpt["model"])
    ppg2mel_model.eval()
    return ppg2mel_model

def get_bnfs(spk_id, utterance_id, kaldi_dir):
    ki = KaldiInterface(wav_scp=str(os.path.join(kaldi_dir, 'wav.scp')),
                        bnf_scp=str(os.path.join(kaldi_dir, 'bnf/feats.scp')))
    bnf = ki.get_feature('_'.join([spk_id, utterance_id]), 'bnf')
    return bnf

In [4]:
def compute_mel(wav_path):
    audio, sr = load_wav(wav_path)
    lwav = len(audio)
    if sr != 24000:
        audio = resampy.resample(audio, sr, 24000)
    audio = audio / MAX_WAV_VALUE
    audio = normalize(audio) * 0.95
    audio = torch.FloatTensor(audio).unsqueeze(0)
    melspec = mel_spectrogram(
        audio,
        n_fft=1024,
        num_mels=80,
        sampling_rate=24000,
        hop_size=240,
        win_size=1024,
        fmin=0,
        fmax=8000,
    )
    return melspec.squeeze(0).numpy().T, lwav

def bin_level_min_max_norm(melspec):
    # frequency bin level min-max normalization to [-4, 4]
    mel_min=-12.0
    mel_max=2.5
    mel = (melspec - mel_min) / (mel_max - mel_min) * 8.0 - 4.0
    return np.clip(mel, -4., 4.)

In [5]:
# Build models
print("PPG2Mel-model, Vocoder-model...")
device = 'cuda'

# ppg_model = load_ppg_model(
#     './conformer_ppg_model/en_conformer_ctc_att/config.yaml', 
#     './conformer_ppg_model/en_conformer_ctc_att/24epoch.pth',
#     device,
# )

ppg2mel_model_train_config = Path('/mnt/data1/waris/repo/vc-vq-subset/conf/transformer_vc_vq256_prosody_ecapa.yaml')
ppg2mel_config = HpsYaml(ppg2mel_model_train_config) 
ppg2mel_model_file = Path('/mnt/nvme-data1/waris/model_checkpoints/vc-vq/transformer-vc-vq256-all-prosody-ecapa-rd/best_loss_step_990000.pth')

ppg2mel_model = build_transf_model(ppg2mel_config, ppg2mel_model_file, device) 
hifigan_model = load_hifigan_generator(device)

PPG2Mel-model, Vocoder-model...


In [6]:
# @torch.no_grad()
# def convert(src_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir):

#     #Computer target speaker dvec
#     tgt_speaker = os.path.basename(tgt_speaker_fpath)
#     tgt_wav_path = f"{tgt_speaker_fpath}/wav/{utterance_id}.wav"
#     tgt_spk_dvec = compute_spk_dvec(tgt_wav_path)
#     tgt_spk_dvec = torch.from_numpy(tgt_spk_dvec).unsqueeze(0).to(device)
    
#     # # Compute prosody representation
#     # prosody_speaker = os.path.basename(tgt_prosody_fpath)
#     # # prosody_speaker_kaldi_dir = os.path.join(tgt_prosody_fpath, 'kaldi')
#     # # prosody_vec = get_bnfs(prosody_speaker, utterance_id, prosody_speaker_kaldi_dir)
#     # prosody_wav_fpath = f"{tgt_prosody_fpath}/wav/{utterance_id}.wav"
#     # prosody_vec, _ = compute_mel(prosody_wav_fpath)
#     # prosody_vec = bin_level_min_max_norm(prosody_vec)
#     # prosody_vec = torch.from_numpy(prosody_vec).unsqueeze(0).to(device)
    
#     #Compute PPG
#     src_speaker = os.path.basename(src_speaker_fpath)
#     src_speaker_kaldi_dir = os.path.join(src_speaker_fpath, 'kaldi')
#     src_wav_path = f"{src_speaker_fpath}/wav/{utterance_id}.wav"
#     # src_wav, _ = librosa.load(src_wav_path, sr=16000)
#     # src_wav_tensor = torch.from_numpy(src_wav).unsqueeze(0).float().to(device)
#     # src_wav_lengths = torch.LongTensor([len(src_wav)]).to(device)
#     #ppg = ppg_model(src_wav_tensor, src_wav_lengths)
#     ppg = get_bnfs(src_speaker, utterance_id, src_speaker_kaldi_dir)
#     ppg = torch.from_numpy(ppg).unsqueeze(0).to(device)

#     # ppg_path = "/mnt/data1/waris/model_preprocessing/FAC-RR-Test/BDL"
#     # ppg = np.load(f"{ppg_path}/{utterance_id}.ling_feat.npy")
#     # ppg = torch.from_numpy(ppg).unsqueeze(0).to(device)

#     # #Compute lf0_uv
#     # ref_wav, _ = librosa.load(tgt_wav_path, sr=16000)
#     # ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
#     # lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
#     # lf0_uv = torch.from_numpy(lf0_uv).unsqueeze(0).to(device)
#     # min_len = min(ppg.shape[1], prosody_vec.shape[1])
#     # min_len = ppg.shape[1]

#     # ppg = ppg[:, :min_len]
#     # # lf0_uv = lf0_uv[:min_len]
#     # prosody_vec = prosody_vec[:, :min_len]
    
#     #x, logf0_uv=None, spemb=None, prosody_vec=None
#     mel_pred, att_ws = ppg2mel_model.inference(torch.squeeze(ppg), torch.squeeze(tgt_spk_dvec))#, torch.squeeze(prosody_vec))
#     mel_pred = mel_pred.unsqueeze(0)
    
#     y = hifigan_model(mel_pred.view(1, -1, 80).transpose(1, 2))

#     step = os.path.basename(ppg2mel_model_file)[:-4].split("_")[-1]
#     output_dir = os.path.join(output_dir, 'Step_'+step, tgt_speaker)
#     os.makedirs(output_dir, exist_ok=True)

#     wav_fname = f"{output_dir}/{utterance_id}.wav"

#     sf.write(wav_fname, y.squeeze().cpu().numpy(), 24000, "PCM_16")

In [7]:
@torch.no_grad()
def convert(src_speaker_fpath, tgt_speaker_fpath, tgt_prosody_fpath, utterance_id, output_dir, dr=True):

    #Computer target speaker dvec
    tgt_speaker = os.path.basename(tgt_speaker_fpath)
    tgt_wav_path = f"{tgt_speaker_fpath}/wav/{utterance_id}.wav"
    tgt_spk_dvec = compute_spk_dvec(tgt_wav_path)
    tgt_spk_dvec = torch.from_numpy(tgt_spk_dvec).unsqueeze(0).to(device)
    
    # Compute prosody representation
    prosody_speaker = os.path.basename(tgt_prosody_fpath)
    prosody_wav_fpath = f"{tgt_prosody_fpath}/wav/{utterance_id}.wav"
    prosody_vec, _ = compute_mel(prosody_wav_fpath)
    prosody_vec = bin_level_min_max_norm(prosody_vec)
    prosody_vec = torch.from_numpy(prosody_vec).unsqueeze(0).to(device)

    #Compute PPG
    ppg = np.load(f"{src_speaker_fpath}")

    #remove repeatations of same frames
    if dr:
        selection = np.ones(len(ppg), dtype=bool)
        for idx in range(len(ppg)-1):
            if np.array_equal(ppg[idx], ppg[idx+1]):
                selection[idx+1] = False

        ppg = ppg[selection]

    #print(ppg.shape)
    ppg = torch.from_numpy(ppg).unsqueeze(0).to(device)

    mel_pred, att_ws = ppg2mel_model.inference(torch.squeeze(ppg), spemb=torch.squeeze(tgt_spk_dvec), prosody_vec=torch.squeeze(prosody_vec))
    mel_pred = mel_pred.unsqueeze(0)
    
    y = hifigan_model(mel_pred.view(1, -1, 80).transpose(1, 2))

    output_dir = os.path.join(output_dir, tgt_speaker)
    os.makedirs(output_dir, exist_ok=True)

    wav_fname = f"{output_dir}/{prosody_speaker}_{utterance_id}.wav"

    sf.write(wav_fname, y.squeeze().cpu().numpy(), 24000, "PCM_16")

@torch.no_grad()
def convert(src_speaker_fpath,  tgt_speaker, tgt_spk_dvec, tgt_prosody_fpath, utterance_id, output_dir, dr=True):

    #Computer target speaker dvec
    # tgt_speaker = os.path.basename(tgt_speaker_fpath)
    # tgt_wav_path = f"{tgt_speaker_fpath}/wav/{utterance_id}.wav"
    # tgt_spk_dvec = compute_spk_dvec(tgt_wav_path)
    tgt_spk_dvec = torch.from_numpy(tgt_spk_dvec).unsqueeze(0).to(device)
    
    # Compute prosody representation
    prosody_speaker = os.path.basename(tgt_prosody_fpath)
    prosody_wav_fpath = f"{tgt_prosody_fpath}/wav/{utterance_id}.wav"
    prosody_vec, _ = compute_mel(prosody_wav_fpath)
    prosody_vec = bin_level_min_max_norm(prosody_vec)
    prosody_vec = torch.from_numpy(prosody_vec).unsqueeze(0).to(device)

    #Compute PPG
    ppg = np.load(f"{src_speaker_fpath}")

    #remove repeatations of same frames
    if dr:
        selection = np.ones(len(ppg), dtype=bool)
        for idx in range(len(ppg)-1):
            if np.array_equal(ppg[idx], ppg[idx+1]):
                selection[idx+1] = False

        ppg = ppg[selection]

    #print(ppg.shape)
    ppg = torch.from_numpy(ppg).unsqueeze(0).to(device)

    mel_pred, att_ws = ppg2mel_model.inference(torch.squeeze(ppg), spemb=torch.squeeze(tgt_spk_dvec), prosody_vec=torch.squeeze(prosody_vec))
    mel_pred = mel_pred.unsqueeze(0)
    
    y = hifigan_model(mel_pred.view(1, -1, 80).transpose(1, 2))

    output_dir = os.path.join(output_dir, tgt_speaker)
    os.makedirs(output_dir, exist_ok=True)

    wav_fname = f"{output_dir}/{prosody_speaker}_{utterance_id}.wav"

    sf.write(wav_fname, y.squeeze().cpu().numpy(), 24000, "PCM_16")

Reconstruction

In [8]:
vq_cluster = 'vq256'
# cmu_speakers = ['BDL', 'SLT', 'CLB', 'RMS']
# utterance_ids = ['arctic_a00'+str(i) for i in range(10,31)]

speakers = ['BDL', 'NJS', 'TXHC', 'YKWK', 'ZHAA']
utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']

basepath_bnf_vq = '/mnt/data1/waris/repo/vq-bnf/translation-test'
basepath_bnf_org = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train/SV2TTS/translator/ppgs'
basepath_wav = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc'
output_dir = '/mnt/data1/waris/repo/vc-vq-subset/synthesis_output/reconstruction/vq256_ecapa_esd/1M'

for speaker in speakers:
    tgt_speaker_fpath = os.path.join(basepath_wav, speaker)

    for utterance_id in utterance_ids:
        src_speaker_fpath = Path(f'{basepath_bnf_vq}/{vq_cluster}/ppgs/ppg-{speaker}-{utterance_id}.npy')
        # src_speaker_fpath = Path(f'{basepath_bnf_org}/ppg-{speaker}-{utterance_id}.npy')
        if not src_speaker_fpath.exists():
            continue
        # convert(src_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir)

        # prosody_fpath = os.path.join(basepath_wav, "BDL")
        # convert(src_speaker_fpath, tgt_speaker_fpath, prosody_fpath, utterance_id, output_dir)

        prosody_fpath = os.path.join(basepath_wav, speaker)
        convert(src_speaker_fpath, tgt_speaker_fpath, prosody_fpath, utterance_id, output_dir)

Voice conversion

In [12]:
utterance_ids = ['arctic_a000'+str(i) for i in range(1,10)] + \
    ['arctic_a00'+str(i) for i in range(10,100)] + \
    ['arctic_a0'+str(i) for i in range(100,500)] + \
    ['arctic_b000'+str(i) for i in range(1,10)] + \
    ['arctic_b00'+str(i) for i in range(10,100)] + \
    ['arctic_b0'+str(i) for i in range(100,500)]


wav_path = "/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc"

speakers = ['NJS', 'ZHAA', 'TXHC', 'YKWK']

final_list = {}
for speaker in speakers:
    final_list[speaker] = {}
    duration_list = []
    for utterance in utterance_ids:
        nns_wav_fpath = Path(f'{wav_path}/{speaker}/wav/{utterance}.wav')
        bdl_wav_fpath = Path(f'{wav_path}/BDL/wav/{utterance}.wav')

        if nns_wav_fpath.exists() and bdl_wav_fpath.exists():
            
            snd_ns, sr = librosa.load(bdl_wav_fpath)
            dur_ns = int(librosa.get_duration(y=snd_ns, sr=sr)*1000)
            
            snd_nns, sr = librosa.load(nns_wav_fpath)
            dur_nns = int(librosa.get_duration(y=snd_nns, sr=sr)*1000)

            d2 = abs(dur_ns - dur_nns)
            duration_list.append((utterance, d2))
        
    duration_list.sort(key=lambda a: a[1])
    final_list[speaker]['top'] = [duration_list[i][0] for i in range(10)]
    final_list[speaker]['bottom'] = [duration_list[-i][0] for i in range(1, 11)]

In [14]:
# np.save('/mnt/data1/waris/repo/vc-vq-subset/extreme_utterances.npy', final_list) 

In [8]:
final_list = np.load('/mnt/data1/waris/repo/vc-vq-subset/extreme_utterances.npy', allow_pickle='TRUE').item()

In [9]:
vq_cluster = 'vq256'
# cmu_speakers = ['BDL', 'SLT', 'CLB', 'RMS']
# utterance_ids = ['arctic_a00'+str(i) for i in range(10,31)]

# utterance_ids = ['arctic_a0158', 'arctic_a0329',
#  'arctic_a0192',
#  'arctic_a0346',
#  'arctic_a0207',
#  'arctic_a0484',
#  'arctic_a0098',
#  'arctic_a0399',
#  'arctic_a0389',
#  'arctic_a0240',
#  'arctic_b0407', 
#  'arctic_a0023', 
#  'arctic_b0403', 
#  'arctic_b0480', 
#  'arctic_b0253', 
#  'arctic_b0204', 
#  'arctic_b0397', 
#  'arctic_a0139', 
#  'arctic_a0002', 
#  'arctic_b0460']

# utterance_ids = ['arctic_a00'+str(i) for i in range(10,40)] + ['arctic_b05'+str(i) for i in range(10, 40)]+ ['arctic_b050'+str(i) for i in range(0, 10)]
speakers = ['NJS', 'TXHC', 'YKWK', 'ZHAA']
# utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']

basepath_bnf_vq = '/mnt/data1/waris/repo/vq-bnf/translation-test'
basepath_bnf_org = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train/SV2TTS/translator/ppgs'
basepath_wav = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc'
output_path = '/mnt/data1/waris/repo/vc-vq-subset/synthesis_output_IS23_extreme/Prosody/VQ256_RD'
dr = True
speaker_dvec_path = '/mnt/nvme-data1/waris/preprocessed_data/avg_spk_embed'


for speaker in speakers:
    tgt_speaker_fpath = os.path.join(basepath_wav, speaker)
    tgt_spk_dvec = np.load(f'{speaker_dvec_path}/{speaker}/embeds_50_mean.npy').astype('float32')

    for utterance_id in final_list[speaker]['top']:
        src_speaker_fpath = Path(f'{basepath_bnf_vq}/{vq_cluster}/ppgs/ppg-BDL-{utterance_id}.npy')
        # src_speaker_fpath = Path(f'{basepath_bnf_org}/ppg-{speaker}-{utterance_id}.npy')
        if not src_speaker_fpath.exists():
            continue
        # convert(src_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir)
        output_dir = output_path + '/TOP'
        
        prosody_fpath = os.path.join(basepath_wav, "BDL")
        # convert(src_speaker_fpath, tgt_speaker_fpath, prosody_fpath, utterance_id, output_dir, dr=dr)
        convert(src_speaker_fpath, speaker, tgt_spk_dvec, prosody_fpath, utterance_id, output_dir, dr=dr)

        prosody_fpath = os.path.join(basepath_wav, speaker)
        # convert(src_speaker_fpath, tgt_speaker_fpath, prosody_fpath, utterance_id, output_dir, dr=dr)
        convert(src_speaker_fpath, speaker, tgt_spk_dvec, prosody_fpath, utterance_id, output_dir, dr=dr)

    for utterance_id in final_list[speaker]['bottom']:
        src_speaker_fpath = Path(f'{basepath_bnf_vq}/{vq_cluster}/ppgs/ppg-BDL-{utterance_id}.npy')
        # src_speaker_fpath = Path(f'{basepath_bnf_org}/ppg-{speaker}-{utterance_id}.npy')
        if not src_speaker_fpath.exists():
            continue
        # convert(src_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir)
        output_dir = output_path + '/BOTTOM'
        
        prosody_fpath = os.path.join(basepath_wav, "BDL")
        # convert(src_speaker_fpath, tgt_speaker_fpath, prosody_fpath, utterance_id, output_dir, dr=dr)
        convert(src_speaker_fpath, speaker, tgt_spk_dvec, prosody_fpath, utterance_id, output_dir, dr=dr)

        prosody_fpath = os.path.join(basepath_wav, speaker)
        # convert(src_speaker_fpath, tgt_speaker_fpath, prosody_fpath, utterance_id, output_dir, dr=dr)
        convert(src_speaker_fpath, speaker, tgt_spk_dvec, prosody_fpath, utterance_id, output_dir, dr=dr)

In [None]:
vq_cluster = 'vq256'
# cmu_speakers = ['BDL', 'SLT', 'CLB', 'RMS']
# utterance_ids = ['arctic_a00'+str(i) for i in range(10,31)]

# utterance_ids = ['arctic_a0158', 'arctic_a0329',
#  'arctic_a0192',
#  'arctic_a0346',
#  'arctic_a0207',
#  'arctic_a0484',
#  'arctic_a0098',
#  'arctic_a0399',
#  'arctic_a0389',
#  'arctic_a0240',
#  'arctic_b0407', 
#  'arctic_a0023', 
#  'arctic_b0403', 
#  'arctic_b0480', 
#  'arctic_b0253', 
#  'arctic_b0204', 
#  'arctic_b0397', 
#  'arctic_a0139', 
#  'arctic_a0002', 
#  'arctic_b0460']

utterance_ids = ['arctic_a00'+str(i) for i in range(10,40)] + ['arctic_b05'+str(i) for i in range(10, 40)]+ ['arctic_b050'+str(i) for i in range(0, 10)]
speakers = ['NJS', 'TXHC', 'YKWK', 'ZHAA']
# utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']

basepath_bnf_vq = '/mnt/data1/waris/repo/vq-bnf/translation-test'
basepath_bnf_org = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train/SV2TTS/translator/ppgs'
basepath_wav = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc'
output_path = '/mnt/data1/waris/repo/vc-vq-subset/synthesis_output_IS23_final/Prosody/VQ256_RD'
dr = True
speaker_dvec_path = '/mnt/nvme-data1/waris/preprocessed_data/avg_spk_embed'


for speaker in speakers:
    tgt_speaker_fpath = os.path.join(basepath_wav, speaker)
    tgt_spk_dvec = np.load(f'{speaker_dvec_path}/{speaker}/embeds_50_mean.npy').astype('float32')

    for utterance_id in utterance_ids: #final_list[speaker]['top']:
        src_speaker_fpath = Path(f'{basepath_bnf_vq}/{vq_cluster}/ppgs/ppg-BDL-{utterance_id}.npy')
        # src_speaker_fpath = Path(f'{basepath_bnf_org}/ppg-{speaker}-{utterance_id}.npy')
        if not src_speaker_fpath.exists():
            continue
        # convert(src_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir)
        output_dir = output_path #+ '/TOP'
        
        prosody_fpath = os.path.join(basepath_wav, "BDL")
        # convert(src_speaker_fpath, tgt_speaker_fpath, prosody_fpath, utterance_id, output_dir, dr=dr)
        convert(src_speaker_fpath, speaker, tgt_spk_dvec, prosody_fpath, utterance_id, output_dir, dr=dr)

        prosody_fpath = os.path.join(basepath_wav, speaker)
        # convert(src_speaker_fpath, tgt_speaker_fpath, prosody_fpath, utterance_id, output_dir, dr=dr)
        convert(src_speaker_fpath, speaker, tgt_spk_dvec, prosody_fpath, utterance_id, output_dir, dr=dr)

    # for utterance_id in final_list[speaker]['bottom']:
    #     src_speaker_fpath = Path(f'{basepath_bnf_vq}/{vq_cluster}/ppgs/ppg-BDL-{utterance_id}.npy')
    #     # src_speaker_fpath = Path(f'{basepath_bnf_org}/ppg-{speaker}-{utterance_id}.npy')
    #     if not src_speaker_fpath.exists():
    #         continue
    #     # convert(src_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir)
    #     output_dir = output_path + '/BOTTOM'
        
    #     prosody_fpath = os.path.join(basepath_wav, "BDL")
    #     # convert(src_speaker_fpath, tgt_speaker_fpath, prosody_fpath, utterance_id, output_dir, dr=dr)
    #     convert(src_speaker_fpath, speaker, tgt_spk_dvec, prosody_fpath, utterance_id, output_dir, dr=dr)

    #     prosody_fpath = os.path.join(basepath_wav, speaker)
    #     # convert(src_speaker_fpath, tgt_speaker_fpath, prosody_fpath, utterance_id, output_dir, dr=dr)
    #     convert(src_speaker_fpath, speaker, tgt_spk_dvec, prosody_fpath, utterance_id, output_dir, dr=dr)

VC Analysis

In [8]:
vq_cluster = 'vq64'
cmu_speakers = ['BDL', 'SLT', 'CLB', 'RMS']
utterance_ids = ['arctic_a00'+str(i) for i in range(10,31)]

basepath_bnf_vq = '/mnt/data1/waris/repo/vq-bnf/translation'
basepath_bnf_org = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train/SV2TTS/synthesizer/ppgs'
basepath_wav = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train'
output_dir = '/mnt/data1/waris/repo/vc-vq-subset/synthesis_output/VC/vq64_all_spk_bo/640k/'

for speaker in cmu_speakers:
    tgt_speaker_fpath = os.path.join(basepath_wav, speaker)

    for utterance_id in utterance_ids:
        src_speaker_fpath = Path(f'{basepath_bnf_vq}/{vq_cluster}/ppg-BDL-{utterance_id}.npy')
        # src_speaker_fpath = Path(f'{basepath_bnf_org}/ppg-{speaker}-{utterance_id}.npy')
        if not src_speaker_fpath.exists():
            continue
        convert(src_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir)

TSNE Visualization for VC performance

In [13]:
cmu_speakers = ['BDL', 'SLT', 'CLB', 'RMS']
utterance_ids = ['arctic_a00'+str(i) for i in range(10,30) if i != 18]

speakers_svq = ['SVQ_SLT', 'SVQ_CLB', 'SVQ_RMS']
speakers_novq = ['NoVQ_SLT', 'NoVQ_CLB', 'NoVQ_RMS']
speakers_vq = ['VQ_SLT', 'VQ_CLB', 'VQ_RMS']
basepath = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train'

embed_novq = []
label_novq = []
embed_vq64 = []
label_vq64 = []
embed_all = []
label_all = []
for speaker in cmu_speakers:
    for utterance_id in utterance_ids:
        tgt_speaker_fpath = Path(f"{basepath}/{speaker}/wav/{utterance_id}.wav")
        if not tgt_speaker_fpath.exists():
            continue

        tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
        embed_novq.append(tgt_spk_dvec)
        embed_vq64.append(tgt_spk_dvec)
        embed_all.append(tgt_spk_dvec)
        label_novq.append(speaker)
        label_vq64.append(speaker)
        label_all.append(speaker)

basepath = '/mnt/data1/waris/repo/vc-vq-subset/synthesis_output/VC/novq_510k'

for speaker in cmu_speakers:
    if speaker == 'BDL':
        continue
    for utterance_id in utterance_ids:
        tgt_speaker_fpath = Path(f"{basepath}/{speaker}/{utterance_id}.wav")
        if not tgt_speaker_fpath.exists():
            continue

        tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
        embed_novq.append(tgt_spk_dvec)
        embed_all.append(tgt_spk_dvec)
        label_novq.append("VC_"+speaker)
        label_all.append("NoVQ_"+speaker)

basepath = '/mnt/data1/waris/repo/vc-vq-subset/synthesis_output/VC/vq64_all_spk/710k'

for speaker in cmu_speakers:
    if speaker == 'BDL':
        continue
    for utterance_id in utterance_ids:
        tgt_speaker_fpath = Path(f"{basepath}/{speaker}/{utterance_id}.wav")
        if not tgt_speaker_fpath.exists():
            continue

        tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
        embed_novq.append(tgt_spk_dvec)
        embed_all.append(tgt_spk_dvec)
        label_novq.append("VC_"+speaker)
        # label_all.append("NoVQ_"+speaker)
        label_all.append("SVQ_"+speaker)

basepath = '/mnt/data1/waris/repo/vc-vq-subset/synthesis_output/VC/vq64_all'

for speaker in cmu_speakers:
    if speaker == 'BDL':
        continue
    for utterance_id in utterance_ids:
        tgt_speaker_fpath = Path(f"{basepath}/{speaker}/{utterance_id}.wav")
        if not tgt_speaker_fpath.exists():
            continue

        tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
        embed_vq64.append(tgt_spk_dvec)
        embed_all.append(tgt_spk_dvec)
        label_vq64.append("VC_"+speaker)
        label_all.append("VQ_"+speaker)

In [14]:
from sklearn import (manifold, datasets, decomposition, ensemble, discriminant_analysis, random_projection)
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np

In [14]:
# Computing t-SNE
print("Computing t-SNE embedding - speaker")
tsne_sp = manifold.TSNE(n_components=2, init='pca', random_state=0)
speaker_tsne = tsne_sp.fit_transform(embed_novq)

Computing t-SNE embedding - speaker


In [15]:
markers = ["d" , "o", "^", "P", "o", "^", "P"]
speakers = cmu_speakers + speakers_vc

colors =  mpl.cm.get_cmap('Dark2')(np.arange(7))

plt.figure(figsize=(12,8))
for speaker, c, m in zip(speakers, colors, markers):
    X_speaker_embedding = speaker_tsne[np.where(speaker==np.array(label_novq))]
    print(X_speaker_embedding.shape, speaker)
    plt.scatter(X_speaker_embedding[:,0], X_speaker_embedding[:,1], label=speaker, marker=m, color=c)
    plt.text(X_speaker_embedding[-1,0], X_speaker_embedding[-1,1], speaker, fontsize=16)

plt.legend(fontsize=16)
plt.tight_layout()
plt.savefig("embed_viz/SpeakerEmbeddings_novq_510k.png", format='png')

(21, 2) BDL
(21, 2) SLT
(21, 2) CLB
(21, 2) RMS
(19, 2) VC_SLT
(19, 2) VC_CLB
(19, 2) VC_RMS


In [16]:
# Computing t-SNE
print("Computing t-SNE embedding - speaker")
tsne_sp = manifold.TSNE(n_components=2, init='pca', random_state=0)
speaker_tsne = tsne_sp.fit_transform(embed_vq64)

Computing t-SNE embedding - speaker


In [17]:
markers = ["d" , "o", "^", "P", "o", "^", "P"]
speakers = cmu_speakers + speakers_vc

colors =  mpl.cm.get_cmap('Dark2')(np.arange(7))

plt.figure(figsize=(12,8))
for speaker, c, m in zip(speakers, colors, markers):
    X_speaker_embedding = speaker_tsne[np.where(speaker==np.array(label_vq64))]
    print(X_speaker_embedding.shape, speaker)
    plt.scatter(X_speaker_embedding[:,0], X_speaker_embedding[:,1], label=speaker, marker=m, color=c)
    plt.text(X_speaker_embedding[-1,0], X_speaker_embedding[-1,1], speaker, fontsize=16)

plt.legend(fontsize=16)
plt.tight_layout()
plt.savefig("embed_viz/SpeakerEmbeddings_vq64_300k.png", format='png')

(21, 2) BDL
(21, 2) SLT
(21, 2) CLB
(21, 2) RMS
(19, 2) VC_SLT
(19, 2) VC_CLB
(19, 2) VC_RMS


In [15]:
# Computing t-SNE
print("Computing t-SNE embedding - speaker")
tsne_sp = manifold.TSNE(n_components=2, init='pca', random_state=42)
speaker_tsne = tsne_sp.fit_transform(embed_all)

Computing t-SNE embedding - speaker


In [16]:
markers = ["d" , "o", "^", "P", "o", "^", "P", "o", "^", "P", "o", "^", "P"]
speakers = cmu_speakers + speakers_novq + speakers_vq + speakers_svq

colors =  ['black', 'red', 'red', 'red', 'blue', 'blue', 'blue', 'green', 'green', 'green', 'm', 'm', 'm']

plt.figure(figsize=(12,8))
for speaker, c, m in zip(speakers, colors, markers):
    X_speaker_embedding = speaker_tsne[np.where(speaker==np.array(label_all))]
    print(X_speaker_embedding.shape, speaker)
    plt.scatter(X_speaker_embedding[:,0], X_speaker_embedding[:,1], label=speaker, marker=m, color=c)
    # plt.text(X_speaker_embedding[-1,0], X_speaker_embedding[-1,1], speaker, fontsize=16)

plt.legend(fontsize=12)
plt.tight_layout()
plt.savefig("embed_viz/SpeakerEmbeddings_svq64_720k_vs_vq64_350k_vs_novq.png", format='png')

(19, 2) BDL
(19, 2) SLT
(19, 2) CLB
(19, 2) RMS
(19, 2) NoVQ_SLT
(19, 2) NoVQ_CLB
(19, 2) NoVQ_RMS
(19, 2) VQ_SLT
(19, 2) VQ_CLB
(19, 2) VQ_RMS
(19, 2) SVQ_SLT
(19, 2) SVQ_CLB
(19, 2) SVQ_RMS


In [24]:
embed_org = np.array(embed_all[19:76])
embed_svq = np.array(embed_all[76:133])
embed_vq = np.array(embed_all[133:])

In [28]:
from scipy.spatial import distance

avg_cos = 0
for a,b in zip(embed_org, embed_svq):
    print(distance.cosine(a,b))
    avg_cos = avg_cos + distance.cosine(a,b)

avg_cos/(19*3)

0.13499289751052856
0.17255401611328125
0.18531930446624756
0.25232994556427
0.22357076406478882
0.244778573513031
0.12615728378295898
0.1412842869758606
0.1976436972618103
0.14557218551635742
0.15009206533432007
0.23128026723861694
0.06881076097488403
0.13865739107131958
0.1390005350112915
0.22643709182739258
0.18770748376846313
0.20696061849594116
0.11833471059799194
0.11977988481521606
0.1289922595024109
0.11696809530258179
0.0850449800491333
0.11445426940917969
0.18369239568710327
0.13235366344451904
0.08721566200256348
0.06692373752593994
0.13401246070861816
0.16852843761444092
0.12698489427566528
0.08555388450622559
0.1344478726387024
0.14009618759155273
0.14326703548431396
0.1352495551109314
0.1679006814956665
0.07567143440246582
0.18857800960540771
0.25537246465682983
0.13892507553100586
0.24604099988937378
0.26810604333877563
0.27864307165145874
0.14032042026519775
0.22596555948257446
0.22558796405792236
0.27853691577911377
0.19729459285736084
0.16270416975021362
0.16001933813

0.1681298289382667

In [26]:
avg_cos = 0
for a,b in zip(embed_org, embed_vq):
    avg_cos = avg_cos + distance.cosine(a,b)

avg_cos/19

0.6008362926934895

In [7]:
len(embed_all)

198

In [8]:
import umap

ModuleNotFoundError: No module named 'numba.experimental.structref'

In [10]:


# Configure UMAP hyperparameters
reducer = umap.UMAP(n_components=2, random_state=42)

AttributeError: module 'umap' has no attribute 'UMAP'

In [None]:
speaker_umap = reducer.fit_transform(embed_all)

In [None]:
markers = ["d" , "o", "^", "P", "o", "^", "P", "o", "^", "P"]
speakers = cmu_speakers + speakers_novq + speakers_vq

colors =  ['black', 'red', 'red', 'red', 'blue', 'blue', 'blue', 'green', 'green', 'green']

plt.figure(figsize=(12,8))
for speaker, c, m in zip(speakers, colors, markers):
    X_speaker_embedding = speaker_umap[np.where(speaker==np.array(label_all))]
    print(X_speaker_embedding.shape, speaker)
    plt.scatter(X_speaker_embedding[:,0], X_speaker_embedding[:,1], label=speaker, marker=m, color=c)
    # plt.text(X_speaker_embedding[-1,0], X_speaker_embedding[-1,1], speaker, fontsize=16)

plt.legend(fontsize=12)
plt.tight_layout()
plt.savefig("embed_viz/SpeakerEmbeddings_umap_vq64_vs_novq.png", format='png')

### Generate Synthesis for Unseen Speakers

In [7]:
speakers = ['NJS', 'TXHC', 'YKWK', 'ZHAA']
utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']
#utterance_ids = ['arctic_a00'+str(i) for i in range(10, 30)] + ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']

basepath = '/mnt/data1/waris/datasets/data/arctic_dataset/test_speakers_16k'
output_dir = '/mnt/data1/waris/repo/vc-vq-prosody/synthesis_output/prosody_experiments/vq-dr-80/'

for speaker in speakers:
    src_speaker_fpath = os.path.join(basepath, 'BDL')
    tgt_speaker_fpath = os.path.join(basepath, speaker)

    for utterance_id in utterance_ids:
        convert(src_speaker_fpath, tgt_speaker_fpath, src_speaker_fpath, utterance_id, output_dir)
        convert(src_speaker_fpath, tgt_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir)

In [None]:
speakers = ['NJS', 'TXHC', 'YKWK', 'ZHAA']
#utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']
utterance_ids = ['arctic_a00'+str(i) for i in range(10, 30)] #+ ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']

basepath = '/mnt/data1/waris/datasets/data/arctic_dataset/test_speakers_16k'
output_dir = '/mnt/data1/waris/repo/vc-vq-prosody/synthesis_output/prosody_experiments/vq-dr-80/'

for speaker in speakers:
    src_speaker_fpath = os.path.join(basepath, 'BDL')
    tgt_speaker_fpath = os.path.join(basepath, speaker)

    for utterance_id in utterance_ids:
        convert(src_speaker_fpath, tgt_speaker_fpath, src_speaker_fpath, utterance_id, output_dir)
        convert(src_speaker_fpath, tgt_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir)

Single utterance conversions

In [13]:
utterance_id = "001"
output_dir = '/mnt/data1/waris/repo/vc-vq-prosody/synthesis_output/prosody_experiments/vq-dr/'

#Computer target speaker dvec
tgt_speaker = "p258"
tgt_wav_path = "/mnt/data1/waris/datasets/vctk/wav48_silence_trimmed/p258/wav/p258_001_mic2.wav"
tgt_spk_dvec = compute_spk_dvec(tgt_wav_path)
tgt_spk_dvec = torch.from_numpy(tgt_spk_dvec).unsqueeze(0).to(device)


#Compute PPG #p225
src_speaker = os.path.basename("/mnt/data1/waris/datasets/vctk/wav48_silence_trimmed/p225")
src_speaker_kaldi_dir = os.path.join("/mnt/data1/waris/datasets/vctk/wav48_silence_trimmed/p225", 'kaldi')
src_wav_path = f"/mnt/data1/waris/datasets/vctk/wav48_silence_trimmed/p225/wav/p225_001_mic2.wav"
# src_wav, _ = librosa.load(src_wav_path, sr=16000)
# src_wav_tensor = torch.from_numpy(src_wav).unsqueeze(0).float().to(device)
# src_wav_lengths = torch.LongTensor([len(src_wav)]).to(device)
#ppg = ppg_model(src_wav_tensor, src_wav_lengths)
ppg = get_bnfs("p225", "p225_001_mic2", src_speaker_kaldi_dir)
ppg = torch.from_numpy(ppg).unsqueeze(0).to(device)

# ppg_path = "/mnt/data1/waris/model_preprocessing/FAC-RR-Test/BDL"
# ppg = np.load(f"{ppg_path}/{utterance_id}.ling_feat.npy")
# ppg = torch.from_numpy(ppg).unsqueeze(0).to(device)

# #Compute lf0_uv
# ref_wav, _ = librosa.load(tgt_wav_path, sr=16000)
# ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
# lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
# lf0_uv = torch.from_numpy(lf0_uv).unsqueeze(0).to(device)
# min_len = min(ppg.shape[1], prosody_vec.shape[1])
# min_len = ppg.shape[1]

# ppg = ppg[:, :min_len]
# # lf0_uv = lf0_uv[:min_len]
# prosody_vec = prosody_vec[:, :min_len]

for prosody_spk, wav_fpath in zip(['p225', 'p258'],['/mnt/data1/waris/datasets/vctk/wav48_silence_trimmed/p225/wav/p225_001_mic2.wav', '/mnt/data1/waris/datasets/vctk/wav48_silence_trimmed/p258/wav/p258_001_mic2.wav']):

    # Compute prosody representation
    prosody_speaker = os.path.basename(prosody_spk)
    # prosody_speaker_kaldi_dir = os.path.join(tgt_prosody_fpath, 'kaldi')
    # prosody_vec = get_bnfs(prosody_speaker, utterance_id, prosody_speaker_kaldi_dir)
    prosody_wav_fpath = wav_fpath
    prosody_vec, _ = compute_mel(prosody_wav_fpath)
    prosody_vec = bin_level_min_max_norm(prosody_vec)
    prosody_vec = torch.from_numpy(prosody_vec).unsqueeze(0).to(device)

    with torch.no_grad():
    #x, logf0_uv=None, spemb=None, prosody_vec=None
        mel_pred, att_ws = ppg2mel_model.inference(torch.squeeze(ppg), torch.squeeze(tgt_spk_dvec), torch.squeeze(prosody_vec))
        mel_pred = mel_pred.unsqueeze(0)

        y = hifigan_model(mel_pred.view(1, -1, 80).transpose(1, 2))

        step = os.path.basename(ppg2mel_model_file)[:-4].split("_")[-1]
        output_dir = os.path.join(output_dir, 'Step_'+step, tgt_speaker)
        os.makedirs(output_dir, exist_ok=True)

        wav_fname = f"{output_dir}/{prosody_speaker}_{utterance_id}.wav"

        sf.write(wav_fname, y.squeeze().cpu().numpy(), 24000, "PCM_16")

### Generate Synthesis for Seen Speakers

In [8]:
speakers = ['MBMPS', 'BWC', 'HKK', 'SKA']
utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']
#utterance_ids = ['arctic_a00'+str(i) for i in range(10, 30)] #+ ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']

basepath = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train'
output_dir = '/mnt/data1/waris/repo/vc-vq-prosody/synthesis_output/prosody_experiments/vq-dr/'

for speaker in speakers:
    src_speaker_fpath = os.path.join(basepath, 'BDL')
    tgt_speaker_fpath = os.path.join(basepath, speaker)

    for utterance_id in utterance_ids:
        convert(src_speaker_fpath, tgt_speaker_fpath, src_speaker_fpath, utterance_id, output_dir)
        convert(src_speaker_fpath, tgt_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir)

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train/SKA/wav/arctic_b0537.wav'

### TSNE Visualisation

In [5]:
# #utterance_ids = ['arctic_b05'+str(i) for i in range(21, 40)]
# utterance_ids = ['arctic_a00'+str(i) for i in range(10, 50)]
# speakers = ['NJS', 'TXHC', 'YKWK', 'ZHAA', 'BDL']
# speakers_fac = ['FAC_NJS', 'FAC_TXHC', 'FAC_YKWK', 'FAC_ZHAA']
# basepath = '/mnt/data1/waris/datasets/data/arctic_dataset/test_speakers_16k'

# embed_unseen = []
# label_unseen = []
# for speaker in speakers:
#     for utterance_id in utterance_ids:
#         tgt_speaker_fpath = f"{basepath}/{speaker}/wav/{utterance_id}.wav"
#         tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
#         embed_unseen.append(tgt_spk_dvec)
#         label_unseen.append(speaker)

# basepath = '/mnt/data1/waris/projects/dummy/synthesis_output/prosody_experiments/saln/Step_460000' #'/mnt/data1/waris/projects/dummy/synthesis_output/Step_1380000'
# #utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']
# utterance_ids = ['arctic_a00'+str(i) for i in range(10, 30)]

# for speaker in speakers:
#     if speaker == 'BDL':
#         continue
#     for utterance_id in utterance_ids:
#         tgt_speaker_fpath = f"{basepath}/{speaker}/{speaker}_{utterance_id}.wav"
#         tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
#         embed_unseen.append(tgt_spk_dvec)
#         label_unseen.append("FAC_"+speaker)


utterance_ids = ['arctic_a00'+str(i) for i in range(10, 50)]
speakers = ['NJS', 'TXHC', 'YKWK', 'ZHAA', 'BDL']
speakers_fac = ['FAC_NJS', 'FAC_NJS_L1', 'FAC_TXHC', 'FAC_TXHC_L1', 'FAC_YKWK', 'FAC_YKWK_L1', 'FAC_ZHAA', 'FAC_ZHAA_L1']
basepath = '/mnt/data1/waris/datasets/data/arctic_dataset/test_speakers_16k'

embed_unseen = []
label_unseen = []
for speaker in speakers:
    for utterance_id in utterance_ids:
        tgt_speaker_fpath = f"{basepath}/{speaker}/wav/{utterance_id}.wav"
        tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
        embed_unseen.append(tgt_spk_dvec)
        label_unseen.append(speaker)

basepath = '/mnt/data1/waris/repo/transformer-vc-prosody/synthesis_output/prosody_experiments/confI/Step_820000' #'/mnt/data1/waris/projects/dummy/synthesis_output/Step_1380000'
utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']
#utterance_ids = ['arctic_a00'+str(i) for i in range(10, 30)]

for speaker in speakers:
    if speaker == 'BDL':
        continue
    for utterance_id in utterance_ids:
        tgt_speaker_fpath = f"{basepath}/{speaker}/{speaker}_{utterance_id}.wav"
        tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
        embed_unseen.append(tgt_spk_dvec)
        label_unseen.append("FAC_"+speaker)

        tgt_speaker_fpath = f"{basepath}/{speaker}/BDL_{utterance_id}.wav"
        tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
        embed_unseen.append(tgt_spk_dvec)
        label_unseen.append("FAC_"+speaker+"_L1")

[0.         0.         0.17308295 0.08495504 0.         0.06151969
 0.         0.00348422 0.         0.01598782 0.16120018 0.02397711
 0.08538381 0.         0.00954989 0.04692713 0.08034351 0.09539499
 0.12028189 0.         0.         0.1706251  0.         0.01229028
 0.         0.         0.         0.06027151 0.         0.
 0.00044759 0.         0.         0.         0.         0.19359453
 0.00417533 0.00275733 0.00772364 0.         0.         0.
 0.         0.         0.         0.03690539 0.         0.
 0.04718795 0.17268014 0.         0.         0.         0.
 0.02193573 0.         0.05320492 0.16026908 0.06875139 0.
 0.14004706 0.         0.02261326 0.         0.         0.
 0.09406947 0.10522682 0.         0.         0.         0.00283123
 0.00739008 0.03633346 0.03437466 0.         0.         0.0016782
 0.03522995 0.01660772 0.12308419 0.         0.1521407  0.04140547
 0.03553603 0.01659731 0.         0.08137074 0.         0.11665595
 0.03894931 0.         0.         0.        

In [6]:
from sklearn import (manifold, datasets, decomposition, ensemble, discriminant_analysis, random_projection)
import matplotlib.pyplot as plt

# Computing t-SNE
print("Computing t-SNE embedding - speaker")
tsne_sp = manifold.TSNE(n_components=2, init='pca', random_state=0)
speaker_tsne = tsne_sp.fit_transform(embed_unseen)

Computing t-SNE embedding - speaker


In [7]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

markers = ["d" , "o", "^", "P", "p", "X", "X", "*", "*", "s", "s", "v", "v"]
speakers = speakers + speakers_fac

colors =  mpl.cm.get_cmap('tab20')(np.arange(13))

In [8]:
plt.figure(figsize=(12,8))
for speaker, c, m in zip(speakers, colors, markers):
    X_speaker_embedding = speaker_tsne[np.where(speaker==np.array(label_unseen))]
    print(X_speaker_embedding.shape, speaker)
    plt.scatter(X_speaker_embedding[:,0], X_speaker_embedding[:,1], label=speaker, marker=m, color=c)
    plt.text(X_speaker_embedding[-1,0], X_speaker_embedding[-1,1], speaker, fontsize=16)

plt.legend(fontsize=16)
plt.tight_layout()
plt.savefig("embed_viz/SpeakerEmbeddings_Unseen_82k.png", format='png')

(40, 2) NJS
(40, 2) TXHC
(40, 2) YKWK
(40, 2) ZHAA
(40, 2) BDL
(4, 2) FAC_NJS
(4, 2) FAC_NJS_L1
(4, 2) FAC_TXHC
(4, 2) FAC_TXHC_L1
(4, 2) FAC_YKWK
(4, 2) FAC_YKWK_L1
(4, 2) FAC_ZHAA
(4, 2) FAC_ZHAA_L1


: 

In [None]:
#utterance_ids = ['arctic_b04'+str(i) for i in range(21, 40)]
utterance_ids = ['arctic_a00'+str(i) for i in range(10, 50)]
speakers = ['MBMPS', 'BWC', 'HKK', 'SKA', "BDL"]
speakers_fac = ['FAC_MBMPS', 'FAC_BWC', 'FAC_HKK', 'FAC_SKA']
basepath = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train'

embed_seen = []
label_seen = []
for speaker in speakers:
    for utterance_id in utterance_ids:
        tgt_speaker_fpath = f"{basepath}/{speaker}/wav/{utterance_id}.wav"
        tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
        embed_seen.append(tgt_spk_dvec)
        label_seen.append(speaker)

basepath = '/mnt/data1/waris/repo/transformer-vc-prosody/synthesis_output/prosody_experiments/confI/Step_820000'
#utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']
utterance_ids = ['arctic_a00'+str(i) for i in range(10, 30)]

for speaker in speakers:
    if speaker == 'BDL':
        continue
    for utterance_id in utterance_ids:
        tgt_speaker_fpath = f"{basepath}/{speaker}/{utterance_id}.wav"
        tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
        embed_seen.append(tgt_spk_dvec)
        label_seen.append("FAC_"+speaker)

In [None]:
from sklearn import (manifold, datasets, decomposition, ensemble, discriminant_analysis, random_projection)
import matplotlib.pyplot as plt

# Computing t-SNE
print("Computing t-SNE embedding - speaker")
tsne_sp = manifold.TSNE(n_components=2, init='pca', random_state=0)
speaker_tsne = tsne_sp.fit_transform(embed_seen)

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

markers = ["d" , "o", "^", "P", "p", "X", "*", "s", "v"]
speakers = speakers + speakers_fac

colors =  mpl.cm.get_cmap('tab20')(np.arange(9))

In [None]:
plt.figure(figsize=(12,8))
for speaker, c, m in zip(speakers, colors, markers):
    X_speaker_embedding = speaker_tsne[np.where(speaker==np.array(label_seen))]
    print(X_speaker_embedding.shape, speaker)
    plt.scatter(X_speaker_embedding[:,0], X_speaker_embedding[:,1], label=speaker, marker=m, color=c)
    plt.text(X_speaker_embedding[-1,0], X_speaker_embedding[-1,1], speaker)

plt.legend()
plt.tight_layout()
plt.savefig("embed_viz/SpeakerEmbeddings_Seen_82k.png", format='png')