### Overview



In [1]:
import os
import importlib
import random
import librosa
import torch
import umap
import umap.plot
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
import torch.nn.functional as F
import torch

import numpy as np
from TTS.utils.generic_utils import load_config
from tqdm import tqdm
from TTS.utils.speakers import save_speaker_mapping, load_speaker_mapping

# you may need to change this depending on your system
os.environ['CUDA_VISIBLE_DEVICES']='0'


def calculate_error(speaker_mapping, max_speakers=9999, samples_per_speaker=1000):
    if speaker_mapping == {}:
        raise ValueError("File Not Found Error, Verify SPEAKER_JSON_PATH")

    speakers_samples = {}
    embeds = []
    labels = []
    colors_id = []
    num_speakers = 0
    keys_mapping = list(speaker_mapping.keys())
    #random.shuffle(keys_mapping)
    for i, sample in enumerate(keys_mapping):
        if speaker_mapping[sample]['name'] in speakers_samples.keys():
            if speakers_samples[speaker_mapping[sample]['name']]['n_samples'] > samples_per_speaker:
                continue
            speakers_samples[speaker_mapping[sample]['name']]['n_samples'] = speakers_samples[speaker_mapping[sample]['name']]['n_samples'] + 1
            aux =  speakers_samples[speaker_mapping[sample]['name']]['samples']
            aux.append(speaker_mapping[sample]['embedding'])
            speakers_samples[speaker_mapping[sample]['name']]['samples'] = aux
        else:
            if num_speakers >= max_speakers: #max speakers
                continue
            speakers_samples[speaker_mapping[sample]['name']] = {'n_samples':1,'samples':[speaker_mapping[sample]['embedding']]}    
            num_speakers += 1


    speakers_samples_keys = list(speakers_samples.keys())
    print(" Num speakers: ", len(speakers_samples_keys))
    for i, sample in enumerate(speakers_samples_keys):
        speaker_embeddings =  speakers_samples[sample]['samples']
        speakers_samples[sample]['centroid'] = np.mean(np.array(speaker_embeddings), axis=0)

    total_incorect = 0
    total_testes = 0
    inte = 0
    for i, sample in enumerate(speakers_samples_keys): # for in all speakers
        inte = 0
        speakers_samples[sample]['n_samples_invade_space'] = 0
        for emb in speakers_samples[sample]['samples']: # for in all embeddings speaker
            #calcule  between dist emb and centroid
            dist_self = F.pairwise_distance(torch.FloatTensor([emb]),torch.FloatTensor(speakers_samples[sample]['centroid'])).detach().cpu().numpy();

            for j in range(len(speakers_samples_keys)): # for in all speakers
                if j == i:
                    continue
                centroid = speakers_samples[speakers_samples_keys[j]]['centroid']
                #calcule the dist between emb and centroid for others speakers   
                dist_other = F.pairwise_distance(torch.FloatTensor([emb]),torch.FloatTensor(centroid)).detach().cpu().numpy()
                #print(dist_other, np.array(emb).shape,np.array(centroid).shape )    
                if dist_self > dist_other:
                        speakers_samples[sample]['n_samples_invade_space'] += 1
                inte+=1
        print("Total Speaker Comparisons for, ",sample.replace('\n',''), 'is ', inte)
        print('Number of sample incorrect:',speakers_samples[sample]['n_samples_invade_space'])
        total_testes+=inte
        total_incorect+=speakers_samples[sample]['n_samples_invade_space']


    print("Of ",total_testes, "Samples missed", total_incorect, " error percentage: ",total_incorect*100/total_testes)

In [2]:
# Set constants
ROOT_PATH = '../../' 
SPEAKER_JSON_PATH = os.path.join(ROOT_PATH, 'Speaker_Embeddings/ResNetSE34L/angleproto/speakers-pt-BRSpeech-beta3-Angular Prototypical-ResNetSE34L.json')

#test load speaker json
speaker_mapping = load_speaker_mapping(SPEAKER_JSON_PATH)

calculate_error(speaker_mapping)

 Num speakers:  44
Total Speaker Comparisons for,  AI is  43043
Number of sample incorrect: 1576
Total Speaker Comparisons for,  AN is  5074
Number of sample incorrect: 28
Total Speaker Comparisons for,  AR is  20855
Number of sample incorrect: 125
Total Speaker Comparisons for,  AS is  43043
Number of sample incorrect: 1172
Total Speaker Comparisons for,  BR is  31906
Number of sample incorrect: 195
Total Speaker Comparisons for,  BZ is  11395
Number of sample incorrect: 197
Total Speaker Comparisons for,  CP is  18662
Number of sample incorrect: 51
Total Speaker Comparisons for,  CZ is  7396
Number of sample incorrect: 33
Total Speaker Comparisons for,  DR is  22188
Number of sample incorrect: 180
Total Speaker Comparisons for,  EB is  2236
Number of sample incorrect: 0
Total Speaker Comparisons for,  EC is  8987
Number of sample incorrect: 32
Total Speaker Comparisons for,  ED is  43043
Number of sample incorrect: 989
Total Speaker Comparisons for,  EF is  301
Number of sample incor

In [3]:
# Set constants
ROOT_PATH = '../../'
SPEAKER_JSON_PATH = os.path.join(ROOT_PATH, 'Speaker_Embeddings/ResNetSE34L/angleproto/speakers-en-vctk-Angular Prototypical-ResNetSE34L.json')

#test load speaker json
speaker_mapping = load_speaker_mapping(SPEAKER_JSON_PATH)
calculate_error(speaker_mapping) 

 Num speakers:  108
Total Speaker Comparisons for,  p232 is  44084
Number of sample incorrect: 4063
Total Speaker Comparisons for,  p305 is  45154
Number of sample incorrect: 2470
Total Speaker Comparisons for,  p341 is  43763
Number of sample incorrect: 1753
Total Speaker Comparisons for,  p362 is  45261
Number of sample incorrect: 3920
Total Speaker Comparisons for,  p334 is  45261
Number of sample incorrect: 3123
Total Speaker Comparisons for,  p317 is  45261
Number of sample incorrect: 2314
Total Speaker Comparisons for,  p254 is  42479
Number of sample incorrect: 3008
Total Speaker Comparisons for,  p236 is  52644
Number of sample incorrect: 3454
Total Speaker Comparisons for,  p277 is  49113
Number of sample incorrect: 4199
Total Speaker Comparisons for,  p376 is  31244
Number of sample incorrect: 5761
Total Speaker Comparisons for,  p323 is  45368
Number of sample incorrect: 3047
Total Speaker Comparisons for,  p360 is  45368
Number of sample incorrect: 4392
Total Speaker Compar

Total Speaker Comparisons for,  p345 is  42479
Number of sample incorrect: 4282
Total Speaker Comparisons for,  p269 is  42372
Number of sample incorrect: 4451
Total Speaker Comparisons for,  p302 is  33277
Number of sample incorrect: 5213
Total Speaker Comparisons for,  p243 is  42051
Number of sample incorrect: 2433
Total Speaker Comparisons for,  p241 is  37771
Number of sample incorrect: 3188
Of  4715490 Samples missed 395734  error percentage:  8.392213746609578


In [4]:
## Teste using GE2E-ResNetSE34L (trained on VoxCeleb) for BRspeech and VCTK

In [5]:
# Set constants
ROOT_PATH = '../../'
SPEAKER_JSON_PATH = os.path.join(ROOT_PATH, 'Speaker_Embeddings/ResNetSE34L/ge2e/speakers-pt-GE2E-ResNetSE34L-BRSpeech3-beta3.json')

#test load speaker json
speaker_mapping = load_speaker_mapping(SPEAKER_JSON_PATH)
calculate_error(speaker_mapping)

 Num speakers:  44
Total Speaker Comparisons for,  AI is  43043
Number of sample incorrect: 1418
Total Speaker Comparisons for,  AN is  5074
Number of sample incorrect: 40
Total Speaker Comparisons for,  AR is  20855
Number of sample incorrect: 190
Total Speaker Comparisons for,  AS is  43043
Number of sample incorrect: 1258
Total Speaker Comparisons for,  BR is  31906
Number of sample incorrect: 174
Total Speaker Comparisons for,  BZ is  11395
Number of sample incorrect: 320
Total Speaker Comparisons for,  CP is  18662
Number of sample incorrect: 76
Total Speaker Comparisons for,  CZ is  7396
Number of sample incorrect: 100
Total Speaker Comparisons for,  DR is  22188
Number of sample incorrect: 265
Total Speaker Comparisons for,  EB is  2236
Number of sample incorrect: 16
Total Speaker Comparisons for,  EC is  8987
Number of sample incorrect: 92
Total Speaker Comparisons for,  ED is  43043
Number of sample incorrect: 1317
Total Speaker Comparisons for,  EF is  301
Number of sample in

In [6]:
# Set constants
ROOT_PATH = '../../'
SPEAKER_JSON_PATH = os.path.join(ROOT_PATH, 'Speaker_Embeddings/ResNetSE34L/ge2e/speakers-en-vctk-GE2E-ResNetSE34L.json')

#test load speaker json
speaker_mapping = load_speaker_mapping(SPEAKER_JSON_PATH)
calculate_error(speaker_mapping)

 Num speakers:  108
Total Speaker Comparisons for,  p232 is  44084
Number of sample incorrect: 3534
Total Speaker Comparisons for,  p305 is  45154
Number of sample incorrect: 2291
Total Speaker Comparisons for,  p341 is  43763
Number of sample incorrect: 1998
Total Speaker Comparisons for,  p362 is  45261
Number of sample incorrect: 5101
Total Speaker Comparisons for,  p334 is  45261
Number of sample incorrect: 3345
Total Speaker Comparisons for,  p317 is  45261
Number of sample incorrect: 2318
Total Speaker Comparisons for,  p254 is  42479
Number of sample incorrect: 2961
Total Speaker Comparisons for,  p236 is  52644
Number of sample incorrect: 4534
Total Speaker Comparisons for,  p277 is  49113
Number of sample incorrect: 4542
Total Speaker Comparisons for,  p376 is  31244
Number of sample incorrect: 3606
Total Speaker Comparisons for,  p323 is  45368
Number of sample incorrect: 2791
Total Speaker Comparisons for,  p360 is  45368
Number of sample incorrect: 5163
Total Speaker Compar

Total Speaker Comparisons for,  p345 is  42479
Number of sample incorrect: 3258
Total Speaker Comparisons for,  p269 is  42372
Number of sample incorrect: 3731
Total Speaker Comparisons for,  p302 is  33277
Number of sample incorrect: 2885
Total Speaker Comparisons for,  p243 is  42051
Number of sample incorrect: 2292
Total Speaker Comparisons for,  p241 is  37771
Number of sample incorrect: 3543
Of  4715490 Samples missed 373075  error percentage:  7.911691043772757


In [7]:
## Teste using Speech2Phone (official implementation) for BRspeech and VCTK

In [8]:

# Set constants
ROOT_PATH = '../../'
SPEAKER_JSON_PATH = os.path.join(ROOT_PATH, 'Speaker_Embeddings/Speech2Phone/speakers-Speech2Phone-Brspeech.json')


#test load speaker json
speaker_mapping = load_speaker_mapping(SPEAKER_JSON_PATH)
calculate_error(speaker_mapping)

 Num speakers:  44
Total Speaker Comparisons for,  AI is  43043
Number of sample incorrect: 785
Total Speaker Comparisons for,  AN is  5074
Number of sample incorrect: 52
Total Speaker Comparisons for,  AR is  20855
Number of sample incorrect: 340
Total Speaker Comparisons for,  AS is  43043
Number of sample incorrect: 6194
Total Speaker Comparisons for,  BR is  31906
Number of sample incorrect: 0
Total Speaker Comparisons for,  BZ is  11395
Number of sample incorrect: 821
Total Speaker Comparisons for,  CP is  18662
Number of sample incorrect: 20
Total Speaker Comparisons for,  CZ is  7396
Number of sample incorrect: 15
Total Speaker Comparisons for,  DR is  22188
Number of sample incorrect: 590
Total Speaker Comparisons for,  EB is  2236
Number of sample incorrect: 0
Total Speaker Comparisons for,  EC is  8987
Number of sample incorrect: 13
Total Speaker Comparisons for,  ED is  43043
Number of sample incorrect: 459
Total Speaker Comparisons for,  EF is  301
Number of sample incorrec

In [9]:
# Set constants
ROOT_PATH = '../../'
SPEAKER_JSON_PATH = os.path.join(ROOT_PATH, 'Speaker_Embeddings/Speech2Phone/speakers-Speech2Phone-vctk.json')


#test load speaker json
speaker_mapping = load_speaker_mapping(SPEAKER_JSON_PATH)
calculate_error(speaker_mapping) 

 Num speakers:  108
Total Speaker Comparisons for,  p232 is  44084
Number of sample incorrect: 1788
Total Speaker Comparisons for,  p305 is  45154
Number of sample incorrect: 2735
Total Speaker Comparisons for,  p341 is  43763
Number of sample incorrect: 2226
Total Speaker Comparisons for,  p362 is  45261
Number of sample incorrect: 988
Total Speaker Comparisons for,  p334 is  45261
Number of sample incorrect: 1975
Total Speaker Comparisons for,  p317 is  45261
Number of sample incorrect: 1544
Total Speaker Comparisons for,  p254 is  42479
Number of sample incorrect: 238
Total Speaker Comparisons for,  p236 is  52644
Number of sample incorrect: 817
Total Speaker Comparisons for,  p277 is  49113
Number of sample incorrect: 2346
Total Speaker Comparisons for,  p376 is  31244
Number of sample incorrect: 1354
Total Speaker Comparisons for,  p323 is  45368
Number of sample incorrect: 1547
Total Speaker Comparisons for,  p360 is  45368
Number of sample incorrect: 1307
Total Speaker Compariso

Total Speaker Comparisons for,  p345 is  42479
Number of sample incorrect: 967
Total Speaker Comparisons for,  p269 is  42372
Number of sample incorrect: 459
Total Speaker Comparisons for,  p302 is  33277
Number of sample incorrect: 660
Total Speaker Comparisons for,  p243 is  42051
Number of sample incorrect: 1600
Total Speaker Comparisons for,  p241 is  37771
Number of sample incorrect: 343
Of  4715490 Samples missed 159734  error percentage:  3.387431634888421


In [10]:
## Teste using GE2E mozilla implementation (trained on LibriTTS) for BRspeech and VCTK

In [11]:

# Set constants
ROOT_PATH = '../../'
SPEAKER_JSON_PATH = os.path.join(ROOT_PATH, 'Speaker_Embeddings/Mozilla-ge2e/speakers-pt-GE2E-trained-libritts-BRSpeech-beta3.json')


#test load speaker json
speaker_mapping = load_speaker_mapping(SPEAKER_JSON_PATH)
calculate_error(speaker_mapping) 

 Num speakers:  44
Total Speaker Comparisons for,  AI is  43043
Number of sample incorrect: 592
Total Speaker Comparisons for,  AN is  5074
Number of sample incorrect: 4
Total Speaker Comparisons for,  AR is  20855
Number of sample incorrect: 51
Total Speaker Comparisons for,  AS is  43043
Number of sample incorrect: 166
Total Speaker Comparisons for,  BR is  31906
Number of sample incorrect: 28
Total Speaker Comparisons for,  BZ is  11395
Number of sample incorrect: 415
Total Speaker Comparisons for,  CP is  18662
Number of sample incorrect: 73
Total Speaker Comparisons for,  CZ is  7396
Number of sample incorrect: 7
Total Speaker Comparisons for,  DR is  22188
Number of sample incorrect: 10
Total Speaker Comparisons for,  EB is  2236
Number of sample incorrect: 1
Total Speaker Comparisons for,  EC is  8987
Number of sample incorrect: 11
Total Speaker Comparisons for,  ED is  43043
Number of sample incorrect: 569
Total Speaker Comparisons for,  EF is  301
Number of sample incorrect: 0

In [12]:
    
# Set constants
ROOT_PATH = '../../'
SPEAKER_JSON_PATH = os.path.join(ROOT_PATH, 'Speaker_Embeddings/Mozilla-ge2e/speakers-vctk-en.json')


#test load speaker json
speaker_mapping = load_speaker_mapping(SPEAKER_JSON_PATH)
calculate_error(speaker_mapping)

 Num speakers:  108
Total Speaker Comparisons for,  p262 is  42158
Number of sample incorrect: 391
Total Speaker Comparisons for,  p270 is  49434
Number of sample incorrect: 187
Total Speaker Comparisons for,  p236 is  52644
Number of sample incorrect: 688
Total Speaker Comparisons for,  p250 is  51467
Number of sample incorrect: 1104
Total Speaker Comparisons for,  p376 is  31244
Number of sample incorrect: 61
Total Speaker Comparisons for,  p283 is  50183
Number of sample incorrect: 1277
Total Speaker Comparisons for,  p278 is  43763
Number of sample incorrect: 262
Total Speaker Comparisons for,  p317 is  45261
Number of sample incorrect: 67
Total Speaker Comparisons for,  p260 is  38092
Number of sample incorrect: 279
Total Speaker Comparisons for,  p232 is  44084
Number of sample incorrect: 270
Total Speaker Comparisons for,  p339 is  45261
Number of sample incorrect: 1081
Total Speaker Comparisons for,  p257 is  46438
Number of sample incorrect: 1954
Total Speaker Comparisons for,

Total Speaker Comparisons for,  p351 is  45261
Number of sample incorrect: 574
Total Speaker Comparisons for,  p265 is  37236
Number of sample incorrect: 170
Total Speaker Comparisons for,  p275 is  44512
Number of sample incorrect: 658
Total Speaker Comparisons for,  p258 is  44298
Number of sample incorrect: 662
Of  4715490 Samples missed 53860  error percentage:  1.1421930700733116


In [None]:
## Teste using GE2E CorentinJ implementation (trained on voxceleb1 voxceleb2 e librispeech) for BRspeech and VCTK

In [13]:

# Set constants
ROOT_PATH = '../../'
SPEAKER_JSON_PATH = os.path.join(ROOT_PATH, 'Speaker_Embeddings/ge2e-CorentinJ/BRSpeech/speakers.json')


#test load speaker json
speaker_mapping = load_speaker_mapping(SPEAKER_JSON_PATH)
calculate_error(speaker_mapping) 

 Num speakers:  44
Total Speaker Comparisons for,  AI is  43043
Number of sample incorrect: 2
Total Speaker Comparisons for,  AN is  5074
Number of sample incorrect: 0
Total Speaker Comparisons for,  AR is  20855
Number of sample incorrect: 7
Total Speaker Comparisons for,  AS is  43043
Number of sample incorrect: 1
Total Speaker Comparisons for,  BR is  31906
Number of sample incorrect: 1
Total Speaker Comparisons for,  BZ is  11395
Number of sample incorrect: 41
Total Speaker Comparisons for,  CP is  18662
Number of sample incorrect: 0
Total Speaker Comparisons for,  CZ is  7396
Number of sample incorrect: 0
Total Speaker Comparisons for,  DR is  22188
Number of sample incorrect: 34
Total Speaker Comparisons for,  EB is  2236
Number of sample incorrect: 0
Total Speaker Comparisons for,  EC is  8987
Number of sample incorrect: 2
Total Speaker Comparisons for,  ED is  43043
Number of sample incorrect: 1
Total Speaker Comparisons for,  EF is  301
Number of sample incorrect: 0
Total Spea

In [14]:

# Set constants
ROOT_PATH = '../../'
SPEAKER_JSON_PATH = os.path.join(ROOT_PATH, 'Speaker_Embeddings/ge2e-CorentinJ/VCTK/speakers.json')


#test load speaker json
speaker_mapping = load_speaker_mapping(SPEAKER_JSON_PATH)
calculate_error(speaker_mapping) 

 Num speakers:  108
Total Speaker Comparisons for,  p232 is  44084
Number of sample incorrect: 11
Total Speaker Comparisons for,  p305 is  45154
Number of sample incorrect: 2
Total Speaker Comparisons for,  p341 is  43763
Number of sample incorrect: 75
Total Speaker Comparisons for,  p362 is  45261
Number of sample incorrect: 2
Total Speaker Comparisons for,  p334 is  45261
Number of sample incorrect: 4
Total Speaker Comparisons for,  p317 is  45261
Number of sample incorrect: 31
Total Speaker Comparisons for,  p254 is  42479
Number of sample incorrect: 6
Total Speaker Comparisons for,  p236 is  52644
Number of sample incorrect: 2
Total Speaker Comparisons for,  p277 is  49113
Number of sample incorrect: 11
Total Speaker Comparisons for,  p376 is  31244
Number of sample incorrect: 0
Total Speaker Comparisons for,  p323 is  45368
Number of sample incorrect: 66
Total Speaker Comparisons for,  p360 is  45368
Number of sample incorrect: 8
Total Speaker Comparisons for,  p298 is  43335
Numb

Total Speaker Comparisons for,  p243 is  42051
Number of sample incorrect: 10
Total Speaker Comparisons for,  p241 is  37771
Number of sample incorrect: 2
Of  4715490 Samples missed 1499  error percentage:  0.03178884909097464
