In [None]:
cd..

In [None]:
# load packages
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import random
import yaml
from munch import Munch
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import torchaudio
import librosa
import matplotlib.pyplot as plt
from utils.ASR.models import ASRCNN
from utils.JDC.model import JDCNet
from models import Generator, MappingNetwork, StyleEncoder
import soundfile as sf
import IPython.display as ipd
import pyworld
from tqdm import tqdm
% matplotlib inline

In [None]:
# Speaker IDs, one group of ten per line. The position of an ID in this list
# is what later cells pass to the model as that speaker's integer label.
speakers = (
    "F101 F102 F103 F104 F105 F106 F107 F108 F109 F110".split()
    + "M101 M102 M103 M104 M105 M106 M107 M108 M109 M110".split()
    + "FAF FFS FKM FKN FKS FMS FSU FTK FYM FYN".split()
    + "MAU MHT MMS MMY MNM MSH MTK MTM MTT MXM".split()
)
print(len(speakers))
# Mel-spectrogram transform shared by preprocess().
to_mel = torchaudio.transforms.MelSpectrogram(
    n_mels=80,
    n_fft=2048,
    win_length=1200,
    hop_length=300,
)
# Offset and scale that preprocess() applies to the log-mel values.
mean = -4
std = 4


def preprocess(wave):
    """Convert a waveform into a normalised log-mel tensor.

    Parameters
    ----------
    wave : numpy.ndarray
        Audio samples; converted with ``torch.from_numpy`` and cast to float32.

    Returns
    -------
    torch.Tensor
        ``(log(1e-5 + mel) - mean) / std`` where ``mel`` is the output of the
        module-level ``to_mel`` transform with one extra leading dimension.
    """
    mel = to_mel(torch.from_numpy(wave).float())
    # The 1e-5 floor keeps the logarithm finite where the spectrogram is zero.
    log_mel = torch.log(1e-5 + mel.unsqueeze(0))
    return (log_mel - mean) / std


def build_model(model_params={}):
    """Instantiate the three StarGANv2 networks from a parameter mapping.

    Parameters
    ----------
    model_params : dict
        Must provide ``dim_in``, ``style_dim``, ``max_conv_dim``, ``w_hpf``,
        ``F0_channel``, ``latent_dim`` and ``num_domains``.

    Returns
    -------
    Munch
        With entries ``generator``, ``mapping_network`` and ``style_encoder``.
    """
    cfg = Munch(model_params)

    # Networks are created in the order generator, mapping network, style encoder.
    return Munch(
        generator=Generator(
            cfg.dim_in, cfg.style_dim, cfg.max_conv_dim,
            w_hpf=cfg.w_hpf, F0_channel=cfg.F0_channel,
        ),
        mapping_network=MappingNetwork(
            cfg.latent_dim, cfg.style_dim, cfg.num_domains,
            hidden_dim=cfg.max_conv_dim,
        ),
        style_encoder=StyleEncoder(
            cfg.dim_in, cfg.style_dim, cfg.num_domains, cfg.max_conv_dim,
        ),
    )


def compute_style(model, speaker_dicts):
    """Compute one style embedding per entry of ``speaker_dicts``.

    Parameters
    ----------
    model : object
        Must expose ``mapping_network`` and ``style_encoder`` modules.
    speaker_dicts : dict
        Maps an arbitrary key to a ``(path, speaker)`` pair, where ``speaker``
        is the integer label passed to the networks. If ``path`` is the empty
        string, the embedding is produced by the mapping network from a random
        latent vector; otherwise the audio file at ``path`` is loaded at
        24 kHz and passed through the style encoder.

    Returns
    -------
    dict
        ``key -> (ref, label)`` with ``ref`` the embedding tensor and
        ``label`` the ``LongTensor`` holding the speaker label.
    """

    def _device_of(module):
        # Follow the module's own device; fall back to 'cuda' (the previously
        # hard-coded choice) for a module without parameters.
        first_param = next(module.parameters(), None)
        return first_param.device if first_param is not None else torch.device('cuda')

    reference_embeddings = {}
    for key, (path, speaker) in speaker_dicts.items():
        # Both branches are inference only, so neither should build an autograd
        # graph; without this the mapping-network embedding could not be
        # converted with .numpy() afterwards.
        with torch.no_grad():
            if path == "":
                device = _device_of(model.mapping_network)
                label = torch.LongTensor([speaker]).to(device)
                latent_dim = model.mapping_network.shared[0].in_features
                ref = model.mapping_network(torch.randn(1, latent_dim).to(device), label)
            else:
                print(path)
                device = _device_of(model.style_encoder)
                # librosa.load already resamples to 24 kHz, so no separate
                # resampling step is needed.
                # NOTE(review): an earlier version computed a silence-trimmed
                # copy here but never used it; the embedding is still taken
                # from the untrimmed waveform.
                wave, _ = librosa.load(path, sr=24000)
                mel_tensor = preprocess(wave).to(device)

                # The label is left on the CPU in this branch, as before.
                label = torch.LongTensor([speaker])
                ref = model.style_encoder(mel_tensor.unsqueeze(1), label)
        reference_embeddings[key] = (ref, label)

    return reference_embeddings


## Load models

In [None]:

# load F0 model
F0_model = JDCNet(num_class=1, seq_len=192)
# Deserialize onto the CPU first so loading does not depend on the device the
# checkpoint was saved from; the module is moved to the GPU right after.
f0_checkpoint = torch.load("utils/JDC/bst.t7", map_location='cpu')
F0_model.load_state_dict(f0_checkpoint['net'])
F0_model = F0_model.eval().to('cuda')

# load vocoder
from parallel_wavegan.utils import load_model

vocoder = load_model("Vocoder/checkpoint-400000steps.pkl").to('cuda').eval()
vocoder.remove_weight_norm()

# load starganv2
model_path = 'Models/atr/epoch_00032.pth'

with open('Configs/config.yml') as f:
    starganv2_config = yaml.safe_load(f)
starganv2 = build_model(model_params=starganv2_config["model_params"])
checkpoint = torch.load(model_path, map_location='cpu')
print("Epochs:", checkpoint["epochs"])

# Only the EMA weights are used. Each network is loaded, switched to eval
# mode and moved to the GPU (nn.Module.to moves the module in place).
params = checkpoint['model_ema']
for key, net in starganv2.items():
    net.load_state_dict(params[key])
    net.eval()
    net.to('cuda')

In [None]:
# Speaker subsets used for grouping in the plots below: "prof" speakers are
# labelled as announcers there and "normal" speakers as non-experts.
speakers_female_normal = ['F%d' % n for n in range(101, 111)]
speakers_male_normal = ['M%d' % n for n in range(101, 111)]
speakers_female_prof = ['FAF', 'FFS', 'FKM', 'FKN', 'FKS', 'FMS', 'FSU', 'FTK', 'FYM', 'FYN']
speakers_male_prof = ['MAU', 'MHT', 'MMS', 'MMY', 'MNM', 'MSH', 'MTK', 'MTM', 'MTT', 'MXM']

# Combined groups list the female speakers first, then the male speakers.
speakers_normal = speakers_female_normal + speakers_male_normal
speakers_prof = speakers_female_prof + speakers_male_prof

## Calculate speaker embedding

In [None]:
def scale_db(y, target_dB_FS=-25, eps=1e-6):
    """Return a copy of ``y`` rescaled to a target RMS level.

    Parameters
    ----------
    y : array-like
        Input signal. It is not modified; integer input is converted to
        float64 before scaling.
    target_dB_FS : float
        Desired RMS level in decibels relative to an amplitude of 1.0.
    eps : float
        Added to the measured RMS so an all-zero signal does not cause a
        division by zero.

    Returns
    -------
    numpy.ndarray
        A new array equal to ``y * 10 ** (target_dB_FS / 20) / (rms + eps)``.
    """
    y = np.asarray(y)
    if not np.issubdtype(y.dtype, np.floating):
        # An integer array cannot hold the scaled values.
        y = y.astype(np.float64)
    rms = np.sqrt(np.mean(y ** 2))
    scalar = 10 ** (target_dB_FS / 20) / (rms + eps)
    # Multiply out of place so the caller's array is left untouched.
    return y * scalar

# Reference-based embeddings: every speaker gets a non-empty audio path
# (its "1.wav" file), so compute_style() encodes that file with the style
# encoder rather than sampling from the mapping network.
# The integer label is the speaker's position in `speakers`.
speaker_dicts = {
    s: ("data/ATR_processed/wav24/%s/1.wav" % s, i)
    for i, s in enumerate(speakers)
}
reference_embeddings = compute_style(starganv2, speaker_dicts)

In [None]:
# Speaker keys and their embeddings, both in the insertion order of
# `reference_embeddings`, so row i of `embedding` belongs to label[i].
label = list(reference_embeddings)
embedding = np.array([
    ref.squeeze().cpu().numpy()
    for ref, _ in reference_embeddings.values()
])
print(embedding.shape)

In [None]:
from sklearn.decomposition import PCA

# Project the speaker embeddings onto their first six principal components.
# The ARPACK solver starts from a random vector, so the seed is fixed to make
# the projection (and the saved figure) reproducible from run to run.
pca = PCA(n_components=6, svd_solver='arpack', random_state=0)
emb_pca = pca.fit_transform(embedding)
print(emb_pca.shape)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import matplotlib.transforms as transforms

def confidence_ellipse(x, y, ax, n_std=3.0, facecolor='none', radius_factor=1.7, **kwargs):
    """
    Draw the covariance ellipse of *x* and *y* on *ax* and mark their mean.

    Besides the ellipse patch, a small 'X' marker is scattered at the mean
    of the data.

    Parameters
    ----------
    x, y : array-like, shape (n, )
        Input data.

    ax : matplotlib.axes.Axes
        The axes object to draw the ellipse into.

    n_std : float
        The number of standard deviations to determine the ellipse's radiuses.

    facecolor : color
        Fill colour of the ellipse.

    radius_factor : float
        Multiplier applied to the unit-variance radii ``sqrt(1 + pearson)``
        and ``sqrt(1 - pearson)`` to obtain the ellipse's width and height.
        ``2.0`` gives the usual ``n_std``-sigma ellipse (as in the matplotlib
        gallery recipe); the default ``1.7`` keeps the slightly smaller
        ellipse this function has always drawn.

    **kwargs
        Forwarded to `~matplotlib.patches.Ellipse`

    Returns
    -------
    matplotlib.patches.Ellipse

    Raises
    ------
    ValueError
        If *x* and *y* do not contain the same number of elements.
    """
    # Accept any array-like (lists, tuples, ...), as the docstring promises.
    x = np.asarray(x)
    y = np.asarray(y)
    if x.size != y.size:
        raise ValueError("x and y must be the same size")

    cov = np.cov(x, y)
    pearson = cov[0, 1] / np.sqrt(cov[0, 0] * cov[1, 1])
    # Using a special case to obtain the eigenvalues of this
    # two-dimensional dataset.
    ell_radius_x = np.sqrt(1 + pearson)
    ell_radius_y = np.sqrt(1 - pearson)
    ellipse = Ellipse((0, 0),
                      width=ell_radius_x * radius_factor,
                      height=ell_radius_y * radius_factor,
                      facecolor=facecolor, **kwargs)

    # Calculating the standard deviation of x from
    # the square root of the variance and multiplying
    # with the given number of standard deviations.
    scale_x = np.sqrt(cov[0, 0]) * n_std
    mean_x = np.mean(x)

    # Same for y.
    scale_y = np.sqrt(cov[1, 1]) * n_std
    mean_y = np.mean(y)

    # Mark the centre of the group.
    ax.scatter([mean_x], [mean_y], marker='X', c='C3', edgecolor="None", alpha=0.8, zorder=3, s=8)

    # The unit ellipse is rotated by 45 degrees, stretched to the data's
    # standard deviations and moved to the data's mean.
    transf = transforms.Affine2D() \
        .rotate_deg(45) \
        .scale(scale_x, scale_y) \
        .translate(mean_x, mean_y)

    ellipse.set_transform(transf + ax.transData)
    return ax.add_patch(ellipse)

In [None]:
fig, ax = plt.subplots(figsize=(4, 4), dpi=250)


def _rows_for(names):
    """Return the positions in `label` (rows of `emb_pca`) of the given speakers."""
    return [i for i, name in enumerate(label) if name in names]


def _draw_ellipses(groups):
    """Draw one translucent ellipse over the first two PCA dimensions per (names, colour) pair."""
    for names, colour in groups:
        rows = _rows_for(names)
        confidence_ellipse(emb_pca[rows, 0], emb_pca[rows, 1], ax, facecolor=colour, alpha=0.2)


# Ellipses for the two announcer groups are added before the points.
_draw_ellipses([(speakers_male_prof, 'C0'), (speakers_female_prof, 'C1')])

# Points: marker shape encodes gender, colour encodes announcer vs non-expert.
# The order of this table is also the order of the legend entries.
scatter_groups = [
    (speakers_female_prof, '^', 'C0', 'Female announcer'),
    (speakers_male_prof, 'v', 'C0', 'Male announcer'),
    (speakers_female_normal, '^', 'C1', 'Female non-expert'),
    (speakers_male_normal, 'v', 'C1', 'Male non-expert'),
]
for names, marker, colour, legend_text in scatter_groups:
    rows = _rows_for(names)
    ax.scatter(emb_pca[rows, 0], emb_pca[rows, 1], zorder=3, marker=marker,
               c=colour, edgecolor='none', alpha=0.7, label=legend_text)
    if names is speakers_female_prof:
        # Centroid of the female announcers over all retained PCA dimensions.
        print(np.mean(emb_pca[rows], axis=0))

# Ellipses for the two non-expert groups are added after the points.
_draw_ellipses([(speakers_male_normal, 'C2'), (speakers_female_normal, 'C3')])

ax.set_xlabel("Dimension 0")
ax.set_ylabel("Dimension 1")

# Point out two individual speakers with an arrow starting left of the point.
for name, point in zip(label, emb_pca):
    if name in ("MAU", "M105"):
        ax.annotate(name, xy=[point[0], point[1]], xytext=[point[0] - 3.5, point[1]],
                    ha='center', va='center', arrowprops=dict(arrowstyle="->"))

ax.grid(zorder=5, linestyle='--')
ax.legend(ncol=2, loc='lower center', bbox_to_anchor=(0.5, 1), fontsize=8)
plt.savefig("emb_pca.svg")