This is a notebook used to generate the speaker embeddings with the Speech2Phone model for multi-speaker training.

Before running this script please DON'T FORGET: 
- to set the file paths.
- to download the related model files from TTS.
- to download or clone the related repos, linked below.
- to set up the repositories: ```python setup.py install```
- to check out the right commit versions (given next to the model) of TTS.
- to set the right paths in the cell below.

Repositories:
- TTS: https://github.com/mozilla/TTS
- Speech2Phone: https://github.com/Edresson/Speech2Phone

In [None]:
%load_ext autoreload
%autoreload 2
import os
import importlib
import random
import librosa
import torch

import numpy as np
from TTS.utils.generic_utils import load_config
from tqdm import tqdm
from TTS.utils.speakers import save_speaker_mapping, load_speaker_mapping

# Expose only the CUDA device with index 1 to this process.
# You may need to change this value depending on your system.
os.environ['CUDA_VISIBLE_DEVICES']='1'

In [None]:
# Set constants
# ROOT_PATH is the directory that contains config.json; later cells also pass it to
# save_speaker_mapping / load_speaker_mapping.
ROOT_PATH = '../../'
CONFIG_PATH = os.path.join(ROOT_PATH, 'config.json')
# Loaded TTS configuration; the preprocessing cell reads CONFIG.datasets from it.
CONFIG = load_config(CONFIG_PATH)

In [None]:
# Install the Speech2Phone requirements (tensorflow and tflearn are pinned to specific versions)
!pip install pydub tensorflow==1.14.0 tflearn==0.3.2

In [None]:
# Download the Speech2Phone checkpoint archive, unpack it, and move the extracted
# Save-Models/ directory under ./Speech2Phone/, which is where the model-restore cell
# loads the checkpoint from.
# NOTE(review): this cell is not idempotent - when re-run, `mkdir` fails because the
# directory already exists and `mv` would nest Save-Models/ inside the existing
# Speech2Phone/Save-Models/ directory. Clean up before running it a second time.
!wget -O ./saver.zip https://www.dropbox.com/s/b19xt2wu3th9p36/Save-Models-Speaker-Diarization.zip?dl=0
!mkdir Speech2Phone
!unzip saver.zip
!mv  Save-Models/  Speech2Phone/Save-Models/


In [None]:
#Utils for Speech2Phone Preprocessing
from pydub import AudioSegment as audio

def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
    '''
    Measure how much silence there is at the start of `sound`.

    sound is a pydub.AudioSegment (anything sliceable in ms with a dBFS attribute)
    silence_threshold in dB: chunks quieter than this count as silence
    chunk_size in ms: width of the window that is inspected at each step

    Returns the offset in ms of the first chunk that is not silent, or None
    when the scan runs past the end of `sound` without finding one.
    '''
    total_ms = len(sound)
    offset_ms = 0  # ms
    while True:
        window = sound[offset_ms:offset_ms + chunk_size]
        # first chunk that is not below the threshold: the leading silence ends here
        if not window.dBFS < silence_threshold:
            return offset_ms
        # already past the end of the audio and still silent: nothing to keep
        if offset_ms > total_ms:
            return None
        offset_ms += chunk_size

def remove_silence(sound):
    '''
    Strip the silence at both ends of `sound`.

    The leading silence is measured on `sound` itself and the trailing silence
    on its reversed copy. Returns the trimmed slice, or None when no
    non-silent chunk is found at the start of `sound`.
    '''
    lead_ms = detect_leading_silence(sound)
    if lead_ms is None:
        return None
    # leading silence of the reversed audio == trailing silence of the original
    tail_ms = detect_leading_silence(sound.reverse())
    return sound[lead_ms:len(sound) - tail_ms]
    


In [None]:
import tflearn

# Create the model graph so that the saved checkpoint can be restored into it.
# Input: batches of 13 x 216 arrays. The preprocessing cell feeds 13-coefficient MFCCs
# of 5-second segments loaded at 22050 Hz (presumably 216 is the resulting frame
# count - TODO confirm).
encoder = tflearn.input_data(shape=[None, 13,int(216)])
# Two stacked dropout layers; per the original notes the first drops 10% and the second 80%.
encoder = tflearn.dropout(encoder,0.9) #10 % drop - 90% -> 80
encoder = tflearn.dropout(encoder,0.2)# 80 % drop
# Bottleneck layer whose output is used as the speaker embedding.
# NOTE(review): 'crelu' presumably concatenates relu(x) and relu(-x), which would make the
# output wider than the 40 units requested here - confirm the embedding size.
encoder = tflearn.fully_connected(encoder, 40,activation='crelu')
# Linear decoder with 572 outputs, followed by the regression op (adam, mean-square loss).
decoder = tflearn.fully_connected(encoder, int(572), activation='linear')
net = tflearn.regression(decoder, optimizer='adam', learning_rate=0.0007,loss='mean_square', metric=None)#categorical_crossentropy
model = tflearn.DNN(net, tensorboard_verbose=0,tensorboard_dir='tflearn_logs')

# Restore the weights from the checkpoint placed under ./Speech2Phone/Save-Models/ by the download cell.
model.load('./Speech2Phone/Save-Models/Model3-Best-40loc.tflearn')

# Second DNN wrapper that ends at the encoder layer and reuses the session of the restored
# model; its predictions are used to extract the embeddings from the encoder layer.
encoding_model = tflearn.DNN(encoder, session=model.session)# used for extract embedding in encoder layer


In [None]:
# Preprocess dataset: gather the metadata items of every dataset listed in the config.
# Each item is unpacked below as (<ignored>, wave_file_path, speaker_id).
preprocess_module = importlib.import_module('TTS.datasets.preprocess')
meta_data = []
datasets = CONFIG.datasets
for dataset in datasets:
    # the preprocessor function is looked up by the lower-cased dataset name
    preprocessor = getattr(preprocess_module, dataset['name'].lower())
    meta_data += preprocessor(dataset['path'], dataset['meta_file_train'])

meta_data = list(meta_data)

SEGMENT_SECONDS = 5   # 5 seconds is the Speech2Phone input
SAMPLE_RATE = 22050   # sample rate used when re-loading each segment with librosa
AUX_WAV_PATH = 'aux.wav'  # scratch file used to hand a segment from pydub to librosa

# speaker_id -> list with one averaged embedding per usable file
embeddings_dict = {}
for _, wave_file_path, speaker_id in tqdm(meta_data):
    try:
        sound = audio.from_wav(wave_file_path)
    except Exception:
        # unreadable file: skip it (KeyboardInterrupt is deliberately not swallowed)
        continue
    wave = remove_silence(sound)
    if wave is None:
        continue

    duration = int(wave.duration_seconds)
    if duration <= SEGMENT_SECONDS:
        # too short to provide a full Speech2Phone input
        continue

    # one embedding per consecutive, non-overlapping 5-second segment
    file_embeddings = []
    begin = 0
    end = SEGMENT_SECONDS
    while end < duration:
        try:
            segment = wave[begin * 1000:end * 1000]
            # the round trip through a wav file is necessary because pydub and
            # librosa represent the loaded waveform differently
            segment.export(AUX_WAV_PATH, 'wav')
            y, sr = librosa.load(AUX_WAV_PATH, sr=SAMPLE_RATE)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            file_embeddings.append(np.array(encoding_model.predict([mfcc])[0]))
        except Exception:
            # 'deu erro' is Portuguese for "an error occurred"; stop processing this file
            print('deu erro')
            break
        finally:
            # always clean up the scratch file, also when the segment failed
            if os.path.exists(AUX_WAV_PATH):
                os.remove(AUX_WAV_PATH)
        begin += SEGMENT_SECONDS
        end += SEGMENT_SECONDS

    if not file_embeddings:
        # every segment failed: there is nothing to average for this file
        continue

    # Always average over the segment axis so that every file contributes a vector of
    # the same shape, whether it produced one segment or many.
    file_embedding = np.mean(np.array(file_embeddings), axis=0)
    embeddings_dict.setdefault(speaker_id, []).append(file_embedding)


# collapse the per-file embeddings into a single averaged embedding per speaker
for speaker_id in embeddings_dict:
    embeddings_dict[speaker_id] = np.mean(np.array(embeddings_dict[speaker_id]), axis=0)

In [None]:
# Create and export speakers.json, applying an L2 norm to every embedding.
# embeddings_dict maps each speaker id to a single averaged embedding vector, so the key
# itself is used as the speaker name and the whole (flattened) vector as the embedding.
speaker_mapping = {}
for speaker_id, speaker_embedding in embeddings_dict.items():
    # flatten first so that both (D,) and (1, D) shaped embeddings are accepted
    flat_embedding = np.asarray(speaker_embedding, dtype=np.float32).reshape(-1)
    normalized = torch.nn.functional.normalize(torch.FloatTensor([flat_embedding.tolist()]), p=2, dim=1)
    speaker_mapping[speaker_id] = {'name': speaker_id, 'embedding': normalized.reshape(-1).tolist()}
save_speaker_mapping(ROOT_PATH, speaker_mapping)


In [None]:
# Integrity check: reload the exported mapping and compare it with the in-memory one
speaker_mapping_load = load_speaker_mapping(ROOT_PATH)
assert speaker_mapping == speaker_mapping_load
num_speakers = len(embeddings_dict)
print("The file speakers.json has been exported to ", ROOT_PATH, ' with ', num_speakers, ' speakers')