In [1]:
# Set CUDA_VISIBLE_DEVICES=1 for this kernel. This runs in the first cell, before the
# cell that imports torch, so the setting is in place when the models are moved to 'cuda'.
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_VISIBLE_DEVICES=1


In [2]:
import time
import sys
import os
import argparse
import torch
import numpy as np
import glob
from pathlib import Path
from tqdm import tqdm
import soundfile as sf
from src.transformer_bnftomel import Transformer
from utils.f0_utils import get_cont_lf0
from utils.load_yaml import HpsYaml

from vocoders.hifigan_model import load_hifigan_generator

from speaker_encoder import inference as encoder
from speaker_encoder.audio import preprocess_wav
from data_objects.kaldi_interface import KaldiInterface

from src import build_model
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
import matplotlib.pyplot as plt
import librosa.display

In [3]:
# Load the pretrained speaker-encoder checkpoint once for the whole notebook;
# compute_spk_dvec calls encoder.embed_utterance and therefore relies on this having run.
# NOTE(review): absolute, user-specific path — anyone else must edit it before running.
encoder_speaker_weights = Path("/home/grads/q/quamer.waris/projects/Accentron/pretrained_model/pretrained/encoder/saved_models/pretrained.pt")
encoder.load_model(encoder_speaker_weights)

Loaded encoder "pretrained.pt" trained to step 1564501


In [4]:
def compute_spk_dvec(wav_path):
    """Return the speaker embedding for a single audio file.

    The file is run through ``preprocess_wav`` and the resulting waveform is
    handed to ``encoder.embed_utterance``.

    Args:
        wav_path: Location of the audio file (string or path-like).

    Returns:
        Whatever ``encoder.embed_utterance`` produces for the waveform.
    """
    waveform = preprocess_wav(Path(wav_path))
    return encoder.embed_utterance(waveform)

In [5]:
def build_transf_model(model_config, model_file, device):
    """Instantiate the PPG-to-mel model and restore its weights for inference.

    Args:
        model_config: Config object supporting item access;
            ``model_config["model_name"]`` selects the class through
            ``build_model`` and ``model_config["model"]`` is passed to its
            constructor.
        model_file: Checkpoint readable by ``torch.load``; the weights are
            taken from its ``"model"`` entry.
        device: Device the model is moved to and the checkpoint is mapped onto.

    Returns:
        The model, switched to eval mode, with the checkpoint weights loaded.
    """
    net_cls = build_model(model_config["model_name"])
    net = net_cls(model_config["model"]).to(device)

    checkpoint = torch.load(model_file, map_location=device)
    net.load_state_dict(checkpoint["model"])

    # Inference only: switch off training-mode behaviour.
    net.eval()
    return net

In [6]:
def get_bnfs(spk_id, utterance_id, kaldi_dir):
    """Fetch the 'bnf' feature of one utterance from a Kaldi data directory.

    Args:
        spk_id: Speaker identifier; first part of the lookup key.
        utterance_id: Utterance identifier; second part of the lookup key.
        kaldi_dir: Directory containing ``wav.scp`` and ``bnf/feats.scp``.

    Returns:
        The result of ``KaldiInterface.get_feature`` for the key
        ``"<spk_id>_<utterance_id>"`` and feature type ``'bnf'``.
    """
    wav_scp_path = str(os.path.join(kaldi_dir, 'wav.scp'))
    bnf_scp_path = str(os.path.join(kaldi_dir, 'bnf/feats.scp'))
    reader = KaldiInterface(wav_scp=wav_scp_path, bnf_scp=bnf_scp_path)

    utt_key = '_'.join((spk_id, utterance_id))
    return reader.get_feature(utt_key, 'bnf')

In [7]:
# Training config for the PPG-to-mel transformer. The commented-out path is an
# alternative config kept for reference.
# ppg2mel_model_train_config = Path('/mnt/data1/waris/projects/dummy/config/transformer_vc_ppg2mel_outspkdloss_inp_conct.yaml')
ppg2mel_model_train_config = Path('/mnt/data1/waris/projects/dummy/config/transformer_vc_ppg2mel.yaml')
ppg2mel_config = HpsYaml(ppg2mel_model_train_config) 
# Checkpoint to restore. `convert` derives its "Step_<n>" output sub-folder from the
# last "_"-separated token of this file name, so changing it changes where audio is written.
#ppg2mel_model_file = Path('/mnt/data1/waris/projects/dummy/ckpt/transformer_baseline/best_loss_step_910000.pth')
ppg2mel_model_file = Path('/mnt/data1/waris/projects/dummy/ckpt/transformer_baseline_two/best_loss_step_960000.pth')
# Device shared by both models and by the tensors created inside `convert`.
device = 'cuda'

# PPG-to-mel model with restored weights (build_transf_model leaves it in eval mode).
ppg2mel_model = build_transf_model(ppg2mel_config, ppg2mel_model_file, device) 
# HiFi-GAN generator; `convert` calls it on the predicted mel and writes its output as audio.
hifigan_model = load_hifigan_generator(device)

In [8]:
@torch.no_grad()
def convert(src_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir, sample_rate=24000):
    """Convert one source utterance to a target speaker's voice and save it as a wav.

    Relies on the notebook-level globals ``device``, ``ppg2mel_model``,
    ``hifigan_model`` and ``ppg2mel_model_file``.

    Args:
        src_speaker_fpath: Directory of the source speaker. Its base name is the
            speaker id used in the Kaldi key, and its ``kaldi`` sub-directory
            supplies the 'bnf' features.
        tgt_speaker_fpath: Directory of the target speaker. The speaker embedding
            is computed from ``<tgt_speaker_fpath>/wav/<utterance_id>.wav``.
        utterance_id: Utterance name without extension, e.g. ``arctic_a0010``.
        output_dir: Root output directory. The file is written to
            ``<output_dir>/Step_<step>/<tgt_speaker>/<utterance_id>.wav`` where
            ``<step>`` is the last ``_``-separated token of the checkpoint name.
        sample_rate: Sample rate written to the wav header. Defaults to 24000,
            the value that was previously hard-coded.

    Returns:
        Path of the written wav file.
    """
    # Normalise first so a trailing slash does not yield an empty speaker name.
    tgt_speaker = os.path.basename(os.path.normpath(tgt_speaker_fpath))
    src_speaker = os.path.basename(os.path.normpath(src_speaker_fpath))

    # Target speaker embedding, taken from the target's recording of this utterance.
    tgt_wav_path = f"{tgt_speaker_fpath}/wav/{utterance_id}.wav"
    tgt_spk_dvec = torch.from_numpy(compute_spk_dvec(tgt_wav_path)).to(device)

    # Source content features.
    src_speaker_kaldi_dir = os.path.join(src_speaker_fpath, 'kaldi')
    ppg = get_bnfs(src_speaker, utterance_id, src_speaker_kaldi_dir)
    ppg = torch.from_numpy(ppg).to(device)

    # Pass the features without a blanket squeeze: squeezing every size-1 axis
    # would also drop the time axis of a one-frame utterance.
    # The second return value (named att_ws in the earlier version) is not used here.
    mel_pred, _ = ppg2mel_model.inference(ppg, torch.squeeze(tgt_spk_dvec))

    # Reshape to (1, frames, 80) and swap the last two axes before vocoding.
    y = hifigan_model(mel_pred.view(1, -1, 80).transpose(1, 2))

    # Strip the extension by name rather than by a fixed character count.
    ckpt_stem = os.path.splitext(os.path.basename(ppg2mel_model_file))[0]
    step = ckpt_stem.split("_")[-1]
    output_dir = os.path.join(output_dir, 'Step_' + step, tgt_speaker)
    os.makedirs(output_dir, exist_ok=True)

    wav_fname = f"{output_dir}/{utterance_id}.wav"
    sf.write(wav_fname, y.squeeze().cpu().numpy(), sample_rate, "PCM_16")
    return wav_fname


### Generate Synthesis for Unseen Speakers

In [9]:
# Target speakers for this run; the source speaker is always BDL.
speakers = ['NJS', 'TXHC', 'YKWK', 'ZHAA']
# Alternative utterance set kept for reference:
#utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']
# arctic_a0010 ... arctic_a0029
utterance_ids = [f"arctic_a00{idx}" for idx in range(10, 30)]

basepath = '/mnt/data1/waris/datasets/data/arctic_dataset/test_speakers_16k'
output_dir = '/mnt/data1/waris/projects/dummy/synthesis_output/baseline_two/'

# The source directory does not depend on the target, so build it once.
src_speaker_fpath = os.path.join(basepath, 'BDL')
for speaker in speakers:
    tgt_speaker_fpath = os.path.join(basepath, speaker)
    for utterance_id in utterance_ids:
        convert(src_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir)

### Generate Synthesis for Seen Speakers

In [10]:
# Target speakers for this run; the source speaker is always BDL.
speakers = ['MBMPS', 'BWC', 'HKK', 'SKA']
# Alternative utterance set kept for reference:
#utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']
# arctic_a0010 ... arctic_a0029
utterance_ids = [f"arctic_a00{idx}" for idx in range(10, 30)]

basepath = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train'
# NOTE(review): the unseen-speaker cell writes under '.../synthesis_output/baseline_two/',
# this one writes directly under '.../synthesis_output' — confirm the difference is intended.
output_dir = '/mnt/data1/waris/projects/dummy/synthesis_output'

# The source directory does not depend on the target, so build it once.
src_speaker_fpath = os.path.join(basepath, 'BDL')
for speaker in speakers:
    tgt_speaker_fpath = os.path.join(basepath, speaker)
    for utterance_id in utterance_ids:
        convert(src_speaker_fpath, tgt_speaker_fpath, utterance_id, output_dir)

### t-SNE Visualisation of Speaker Embeddings

In [11]:
# --- Part 1: embeddings of real recordings (all listed speakers, BDL included) ---
# Alternative utterance set kept for reference:
#utterance_ids = ['arctic_b05'+str(i) for i in range(21, 40)]
utterance_ids = [f"arctic_a00{idx}" for idx in range(10, 50)]
speakers = ['NJS', 'TXHC', 'YKWK', 'ZHAA', 'BDL']
speakers_fac = ['FAC_NJS', 'FAC_TXHC', 'FAC_YKWK', 'FAC_ZHAA']
basepath = '/mnt/data1/waris/datasets/data/arctic_dataset/test_speakers_16k'

# Parallel lists: embed_unseen[i] is the embedding whose label is label_unseen[i].
embed_unseen = []
label_unseen = []
for speaker in speakers:
    for utterance_id in utterance_ids:
        tgt_speaker_fpath = f"{basepath}/{speaker}/wav/{utterance_id}.wav"
        tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
        embed_unseen.append(tgt_spk_dvec)
        label_unseen.append(speaker)

# --- Part 2: embeddings of synthesised audio, labelled "FAC_<speaker>" ---
# NOTE(review): this reads 'Step_1380000', while the checkpoint loaded earlier in the
# notebook is step 960000 and the unseen-speaker synthesis cell writes under
# 'synthesis_output/baseline_two/' — confirm this is the intended directory.
basepath = '/mnt/data1/waris/projects/dummy/synthesis_output/Step_1380000'
# Alternative utterance set kept for reference:
#utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']
utterance_ids = [f"arctic_a00{idx}" for idx in range(10, 30)]

for speaker in speakers:
    # BDL is skipped here: only the other speakers are read from the synthesis folder.
    if speaker != 'BDL':
        for utterance_id in utterance_ids:
            tgt_speaker_fpath = f"{basepath}/{speaker}/{utterance_id}.wav"
            tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
            embed_unseen.append(tgt_spk_dvec)
            label_unseen.append("FAC_" + speaker)

In [12]:
from sklearn import (manifold, datasets, decomposition, ensemble, discriminant_analysis, random_projection)
import matplotlib.pyplot as plt

# Computing t-SNE
# Project the collected speaker embeddings to 2-D. random_state is fixed so repeated
# runs on the same embeddings use the same seed; init='pca' selects PCA initialisation.
print("Computing t-SNE embedding - speaker")
tsne_sp = manifold.TSNE(n_components=2, init='pca', random_state=0)
# Row i of speaker_tsne corresponds to embed_unseen[i] (and thus to label_unseen[i]).
speaker_tsne = tsne_sp.fit_transform(embed_unseen)

Computing t-SNE embedding - speaker


In [13]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# One marker per plotted group (real speakers followed by their "FAC_" counterparts).
markers = ["d" , "o", "^", "P", "p", "X", "*", "s", "v"]
# Append the "FAC_" labels only if they are not already present, so re-running this
# cell does not keep growing the list.
speakers = speakers + [name for name in speakers_fac if name not in speakers]

# plt.get_cmap is used because matplotlib.cm.get_cmap is deprecated and has been
# removed from recent Matplotlib releases. Indexing with integers picks the first
# nine entries of the 'tab20' palette, one per marker.
colors = plt.get_cmap('tab20')(np.arange(9))

In [14]:
# Scatter the 2-D t-SNE points, one colour/marker per label, and save the figure.
# Build the label array once instead of on every loop iteration.
labels_arr = np.array(label_unseen)

plt.figure(figsize=(12,8))
for speaker, c, m in zip(speakers, colors, markers):
    X_speaker_embedding = speaker_tsne[labels_arr == speaker]
    print(X_speaker_embedding.shape, speaker)
    plt.scatter(X_speaker_embedding[:,0], X_speaker_embedding[:,1], label=speaker, marker=m, color=c)
    # Annotate the group's last point with its name; skip groups with no points,
    # which would otherwise raise IndexError on the [-1] lookup.
    if len(X_speaker_embedding):
        plt.text(X_speaker_embedding[-1,0], X_speaker_embedding[-1,1], speaker)

plt.legend()
plt.tight_layout()
# savefig does not create missing directories.
os.makedirs("embed_viz", exist_ok=True)
# NOTE(review): the file name says "960k_baseline_two", but the synthesised audio
# embedded for this plot is read from a 'Step_1380000' folder — confirm they match.
plt.savefig("embed_viz/SpeakerEmbeddings_Unseen_960k_baseline_two.png", format='png')

(40, 2) NJS
(40, 2) TXHC
(40, 2) YKWK
(40, 2) ZHAA
(40, 2) BDL
(20, 2) FAC_NJS
(20, 2) FAC_TXHC
(20, 2) FAC_YKWK
(20, 2) FAC_ZHAA


In [16]:
# --- Part 1: embeddings of real recordings (all listed speakers, BDL included) ---
# Alternative utterance set kept for reference:
#utterance_ids = ['arctic_b04'+str(i) for i in range(21, 40)]
utterance_ids = [f"arctic_a00{idx}" for idx in range(10, 50)]
speakers = ['MBMPS', 'BWC', 'HKK', 'SKA', "BDL"]
speakers_fac = ['FAC_MBMPS', 'FAC_BWC', 'FAC_HKK', 'FAC_SKA']
basepath = '/mnt/data1/waris/datasets/data/arctic_dataset/all_data_for_ac_vc_train'

# Parallel lists: embed_seen[i] is the embedding whose label is label_seen[i].
embed_seen = []
label_seen = []
for speaker in speakers:
    for utterance_id in utterance_ids:
        tgt_speaker_fpath = f"{basepath}/{speaker}/wav/{utterance_id}.wav"
        tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
        embed_seen.append(tgt_spk_dvec)
        label_seen.append(speaker)

# --- Part 2: embeddings of synthesised audio, labelled "FAC_<speaker>" ---
# NOTE(review): this reads 'Step_1380000', while the checkpoint loaded earlier in the
# notebook is step 960000 — confirm this is the intended directory.
basepath = '/mnt/data1/waris/projects/dummy/synthesis_output/Step_1380000'
# Alternative utterance set kept for reference:
#utterance_ids = ['arctic_b0534', 'arctic_b0537', 'arctic_b0538', 'arctic_b0539']
utterance_ids = [f"arctic_a00{idx}" for idx in range(10, 30)]

for speaker in speakers:
    # BDL is skipped here: only the other speakers are read from the synthesis folder.
    if speaker != 'BDL':
        for utterance_id in utterance_ids:
            tgt_speaker_fpath = f"{basepath}/{speaker}/{utterance_id}.wav"
            tgt_spk_dvec = compute_spk_dvec(tgt_speaker_fpath)
            embed_seen.append(tgt_spk_dvec)
            label_seen.append("FAC_" + speaker)

In [17]:
from sklearn import (manifold, datasets, decomposition, ensemble, discriminant_analysis, random_projection)
import matplotlib.pyplot as plt

# Computing t-SNE
# Project the collected speaker embeddings to 2-D. random_state is fixed so repeated
# runs on the same embeddings use the same seed; init='pca' selects PCA initialisation.
# Note: this rebinds tsne_sp and speaker_tsne, replacing the values from the earlier t-SNE cell.
print("Computing t-SNE embedding - speaker")
tsne_sp = manifold.TSNE(n_components=2, init='pca', random_state=0)
# Row i of speaker_tsne corresponds to embed_seen[i] (and thus to label_seen[i]).
speaker_tsne = tsne_sp.fit_transform(embed_seen)

Computing t-SNE embedding - speaker


In [18]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np

# One marker per plotted group (real speakers followed by their "FAC_" counterparts).
markers = ["d" , "o", "^", "P", "p", "X", "*", "s", "v"]
# Append the "FAC_" labels only if they are not already present, so re-running this
# cell does not keep growing the list.
speakers = speakers + [name for name in speakers_fac if name not in speakers]

# plt.get_cmap is used because matplotlib.cm.get_cmap is deprecated and has been
# removed from recent Matplotlib releases. Indexing with integers picks the first
# nine entries of the 'tab20' palette, one per marker.
colors = plt.get_cmap('tab20')(np.arange(9))

In [19]:
# Scatter the 2-D t-SNE points, one colour/marker per label, and save the figure.
# Build the label array once instead of on every loop iteration.
labels_arr = np.array(label_seen)

plt.figure(figsize=(12,8))
for speaker, c, m in zip(speakers, colors, markers):
    X_speaker_embedding = speaker_tsne[labels_arr == speaker]
    print(X_speaker_embedding.shape, speaker)
    plt.scatter(X_speaker_embedding[:,0], X_speaker_embedding[:,1], label=speaker, marker=m, color=c)
    # Annotate the group's last point with its name; skip groups with no points,
    # which would otherwise raise IndexError on the [-1] lookup.
    if len(X_speaker_embedding):
        plt.text(X_speaker_embedding[-1,0], X_speaker_embedding[-1,1], speaker)

plt.legend()
plt.tight_layout()
# savefig does not create missing directories.
os.makedirs("embed_viz", exist_ok=True)
plt.savefig("embed_viz/SpeakerEmbeddings_Seen_1380000.png", format='png')

(40, 2) MBMPS
(40, 2) BWC
(40, 2) HKK
(40, 2) SKA
(40, 2) BDL
(20, 2) FAC_MBMPS
(20, 2) FAC_BWC
(20, 2) FAC_HKK
(20, 2) FAC_SKA
