### **Voice cloning and multispeaker model built using the Coqui TTS API**

In [None]:
!pip install TTS

In [None]:
#after restart don't run it again
!pip install mecab-python3

In [None]:
#after restart don't run it again
!apt-get install mecab mecab-ipadic-utf8 libmecab-dev swig
!pip install mecab-python3==0.7.0
!mecab -h

In [None]:
from TTS.api import TTS

# Running a multi-speaker and multi-lingual model

# Name the model explicitly instead of taking `TTS.list_models()[0]`: the first
# catalogue entry depends on the installed 🐸TTS release, while the rest of this
# notebook assumes YourTTS (languages 'en', 'fr-fr', 'pt-br').
model_name = "tts_models/multilingual/multi-dataset/your_tts"
# Init TTS
tts = TTS(model_name)

**Available Languages and speakers**

In [None]:
# Languages the loaded model can synthesize (heading and list on separate lines)
print("Available Languages:", tts.languages, sep="\n")

# Speaker identifiers exposed by the loaded model
print("Available Speakers:", tts.speakers, sep="\n")

Available Languages:
['en', 'fr-fr', 'pt-br']
Available Speakers:
['female-en-5', 'female-en-5\n', 'female-pt-4\n', 'male-en-2', 'male-en-2\n', 'male-pt-3\n']


In [None]:
from IPython.display import Audio

**Speaker 1**

In [None]:
# ❗ Since this model is multi-speaker and multi-lingual, we must set the target speaker and the language
# Text to speech with a numpy output
wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
# Text to speech to a file
tts.tts_to_file(text="This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0], file_path="speaker1_lag1.wav")

# Play back at the synthesizer's real output rate; a hard-coded 22500 Hz plays the audio at the wrong speed/pitch.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 1.0103602409362793
 > Real-time factor: 0.23639687434166573
 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 1.0466680526733398
 > Real-time factor: 0.24217215471386855


In [None]:
# First speaker, second language: synthesize for playback and save a copy to disk.
wav = tts.tts("C'est un test! C'est aussi un essai !!", speaker=tts.speakers[0], language=tts.languages[1])
tts.tts_to_file(text="C'est un test! C'est aussi un essai !!", speaker=tts.speakers[0], language=tts.languages[1], file_path="speaker1_lag2.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 1.2506873607635498
 > Real-time factor: 0.36001363291984734
 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 0.8144075870513916
 > Real-time factor: 0.23995509341525975


In [None]:
# First speaker, third language: synthesize for playback and save a copy to disk.
wav = tts.tts("Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[0], language=tts.languages[2])
tts.tts_to_file(text="Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[0], language=tts.languages[2], file_path="speaker1_lag3.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 1.876969814300537
 > Real-time factor: 0.3687563485855672
 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 1.2828493118286133
 > Real-time factor: 0.248133329173813


**Speaker 2**

In [None]:
# Second speaker, first language: synthesize for playback and save a copy to disk.
wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[1], language=tts.languages[0])
tts.tts_to_file(text="This is a test! This is also a test!!", speaker=tts.speakers[1], language=tts.languages[0], file_path="speaker2_lag1.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 2.1409945487976074
 > Real-time factor: 0.48993010269968135
 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 1.538017749786377
 > Real-time factor: 0.37530935817139505


In [None]:
# Second speaker, second language: synthesize for playback and save a copy to disk.
wav = tts.tts("C'est un test! C'est aussi un essai !!", speaker=tts.speakers[1], language=tts.languages[1])
tts.tts_to_file(text="C'est un test! C'est aussi un essai !!", speaker=tts.speakers[1], language=tts.languages[1], file_path="speaker2_lag2.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 1.520371437072754
 > Real-time factor: 0.5051067897251674
 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 1.0709590911865234
 > Real-time factor: 0.3615662022911963


In [None]:
# Second speaker, third language: synthesize for playback and save a copy to disk.
wav = tts.tts("Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[1], language=tts.languages[2])
tts.tts_to_file(text="Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[1], language=tts.languages[2], file_path="speaker2_lag3.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 1.5938787460327148
 > Real-time factor: 0.3445479347238899
 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 1.1618969440460205
 > Real-time factor: 0.24606034393181286


**Speaker 3**

In [None]:
# Third speaker, first language: synthesize for playback and save a copy to disk.
wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[2], language=tts.languages[0])
tts.tts_to_file(text="This is a test! This is also a test!!", speaker=tts.speakers[2], language=tts.languages[0], file_path="speaker3_lag1.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 0.9544141292572021
 > Real-time factor: 0.23109300950537584
 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 0.9722483158111572
 > Real-time factor: 0.22833450347843054


In [None]:
# Third speaker, second language: synthesize for playback and save a copy to disk.
wav = tts.tts("C'est un test! C'est aussi un essai !!", speaker=tts.speakers[2], language=tts.languages[1])
tts.tts_to_file(text="C'est un test! C'est aussi un essai !!", speaker=tts.speakers[2], language=tts.languages[1], file_path="speaker3_lag2.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 0.6607158184051514
 > Real-time factor: 0.21719783642509907
 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 0.6900537014007568
 > Real-time factor: 0.2123242158156175


In [None]:
# Third speaker, third language: synthesize for playback and save a copy to disk.
wav = tts.tts("Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[2], language=tts.languages[2])
tts.tts_to_file(text="Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[2], language=tts.languages[2], file_path="speaker3_lag3.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 1.5916450023651123
 > Real-time factor: 0.37101282106412875
 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 1.6286633014678955
 > Real-time factor: 0.3726918309995184


**Speaker 4**

In [None]:
# Fourth speaker, first language: synthesize for playback and save a copy to disk.
wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[3], language=tts.languages[0])
tts.tts_to_file(text="This is a test! This is also a test!!", speaker=tts.speakers[3], language=tts.languages[0], file_path="speaker4_lag1.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 0.9085021018981934
 > Real-time factor: 0.24461553632153835
 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 0.8452527523040771
 > Real-time factor: 0.2275855552784268


In [None]:
# Fourth speaker, second language: synthesize for playback and save a copy to disk.
wav = tts.tts("C'est un test! C'est aussi un essai !!", speaker=tts.speakers[3], language=tts.languages[1])
tts.tts_to_file(text="C'est un test! C'est aussi un essai !!", speaker=tts.speakers[3], language=tts.languages[1], file_path="speaker4_lag2.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 0.5425558090209961
 > Real-time factor: 0.2053579897884164
 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 0.5866727828979492
 > Real-time factor: 0.22071963239200498


In [None]:
# Fourth speaker, third language: synthesize for playback and save a copy to disk.
wav = tts.tts("Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[3], language=tts.languages[2])
tts.tts_to_file(text="Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[3], language=tts.languages[2], file_path="speaker4_lag3.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 0.958493709564209
 > Real-time factor: 0.2366651134726442
 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 0.9162149429321289
 > Real-time factor: 0.22893926609998225


**Speaker 5**

In [None]:
# Fifth speaker, first language: synthesize for playback and save a copy to disk.
wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[4], language=tts.languages[0])
tts.tts_to_file(text="This is a test! This is also a test!!", speaker=tts.speakers[4], language=tts.languages[0], file_path="speaker5_lag1.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 1.732858419418335
 > Real-time factor: 0.4625890067854605
 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 1.6669855117797852
 > Real-time factor: 0.4450041408915604


In [None]:
# Fifth speaker, second language: synthesize for playback and save a copy to disk.
wav = tts.tts("C'est un test! C'est aussi un essai !!", speaker=tts.speakers[4], language=tts.languages[1])
tts.tts_to_file(text="C'est un test! C'est aussi un essai !!", speaker=tts.speakers[4], language=tts.languages[1], file_path="speaker5_lag2.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 1.2095329761505127
 > Real-time factor: 0.4391913493647468
 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 0.910414457321167
 > Real-time factor: 0.34046913138413126


In [None]:
# Fifth speaker, third language: synthesize for playback and save a copy to disk.
wav = tts.tts("Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[4], language=tts.languages[2])
tts.tts_to_file(text="Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[4], language=tts.languages[2], file_path="speaker5_lag3.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 0.9233496189117432
 > Real-time factor: 0.2363926315698267
 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 0.8468883037567139
 > Real-time factor: 0.22511650817562837


**Speaker 6**

In [None]:
# Sixth speaker, first language: synthesize for playback and save a copy to disk.
wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[5], language=tts.languages[0])
tts.tts_to_file(text="This is a test! This is also a test!!", speaker=tts.speakers[5], language=tts.languages[0], file_path="speaker6_lag1.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 0.928227424621582
 > Real-time factor: 0.23010099767515668
 > Text splitted to sentences.
['This is a test!', 'This is also a test!!']
 > Processing time: 1.2021458148956299
 > Real-time factor: 0.29565809515386865


In [None]:
# Sixth speaker, second language: synthesize for playback and save a copy to disk.
wav = tts.tts("C'est un test! C'est aussi un essai !!", speaker=tts.speakers[5], language=tts.languages[1])
tts.tts_to_file(text="C'est un test! C'est aussi un essai !!", speaker=tts.speakers[5], language=tts.languages[1], file_path="speaker6_lag2.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 0.618476152420044
 > Real-time factor: 0.2219943117085585
 > Text splitted to sentences.
["C'est un test!", "C'est aussi un essai !!"]
 > Processing time: 0.5714366436004639
 > Real-time factor: 0.20278092391783673


In [None]:
# Sixth speaker, third language: synthesize for playback and save a copy to disk.
wav = tts.tts("Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[5], language=tts.languages[2])
tts.tts_to_file(text="Isto é um teste! Isso também é um teste!!", speaker=tts.speakers[5], language=tts.languages[2], file_path="speaker6_lag3.wav")
# Use the synthesizer's own output rate instead of a hard-coded 22500 Hz so playback speed/pitch is correct.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 1.089855432510376
 > Real-time factor: 0.23723453036795297
 > Text splitted to sentences.
['Isto é um teste!', 'Isso também é um teste!!']
 > Processing time: 1.0591516494750977
 > Real-time factor: 0.23463705127937476


**Run the cells below to install eSpeak NG, then restart the session**

In [None]:
import locale

# Force UTF-8 as the preferred encoding for this session.
# The replacement keeps the stdlib signature (`do_setlocale=True`) so callers that
# pass the argument, e.g. `locale.getpreferredencoding(False)`, do not raise TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
# run this cell to clear the error and restart the session
!apt-get update
!apt-get install espeak-ng

In [None]:
# Ends the current kernel session. The surrounding notes call for a session restart
# after installing espeak-ng; imports and variables must be re-created afterwards.
exit()

**Single Speaker Model**

In [None]:
import time

In [None]:
# Running a single speaker model
# Re-import after the session restart above: names imported earlier no longer exist in a fresh kernel.
import torch
from TTS.api import TTS

# Init TTS with the target model name.
# Request the GPU only when CUDA is present; a hard-coded gpu=True fails on CPU-only runtimes.
tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=torch.cuda.is_available())
# Run TTS
tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="single_speaker.wav")


 > Downloading model to /root/.local/share/tts/tts_models--de--thorsten--tacotron2-DDC
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Downloading model to /root/.local/share/tts/vocoder_models--de--thorsten--hifigan_v1
 > Model's license - apache 2.0
 > Check https://choosealicense.com/licenses/apache-2.0/ for more info.
 > Using model: tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_line

**Execution time: 64.4503824710846 seconds with GPU (Tacotron2-DDC)**

**Execution time: 84.92006945610046 seconds without GPU (Tacotron2-DDC)**

### **Evaluation**

**To evaluate the cloned voice we use a TDNN (x-vector) speaker-embedding model from the SpeechBrain library. We first extract the embedding of each recording and store it as an array, then compute the cosine similarity between the embedding of the cloned voice and the embedding of the natural voice. The closer the score is to 1, the more similar the cloned voice is to the natural one.**

In [None]:
import locale

# Force UTF-8 as the preferred encoding for this session.
# The replacement keeps the stdlib signature (`do_setlocale=True`) so callers that
# pass the argument, e.g. `locale.getpreferredencoding(False)`, do not raise TypeError.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
!pip install speechbrain

In [None]:
import torchaudio
from speechbrain.pretrained import EncoderClassifier
# Pretrained x-vector speaker encoder; the checkpoint is stored under `savedir`.
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)

In [None]:
# Reference recording of speaker 1.
signal, fs = torchaudio.load('/content/arctic_a0407.wav')
# The x-vector encoder expects 16 kHz single-channel input and `encode_batch` does not
# normalise audio itself: downmix to mono and resample when the file differs.
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)
embeddings_og_1 = classifier.encode_batch(signal)

In [None]:
# Reference recording of speaker 2.
signal, fs = torchaudio.load('/content/college.mp3')
# The x-vector encoder expects 16 kHz single-channel input and `encode_batch` does not
# normalise audio itself: downmix to mono (otherwise each channel is treated as a
# separate batch item) and resample when the file differs.
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)
embeddings_og_2 = classifier.encode_batch(signal)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

**Voice Cloning into different language**

In [None]:
# Example voice cloning with YourTTS in English, French and Portuguese
import torch

# Request the GPU only when CUDA is present; a hard-coded gpu=True fails on CPU-only runtimes.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=torch.cuda.is_available())
# Same reference speaker for all three languages; one output file per language.
tts.tts_to_file(text="This is voice cloning.", speaker_wav="/content/arctic_a0407.wav", language="en", file_path="sample1_english.wav")
tts.tts_to_file(text="C'est le clonage de la voix.", speaker_wav="/content/arctic_a0407.wav", language="fr-fr", file_path="sample1_french.wav")
tts.tts_to_file(text="Isso é clonagem de voz.", speaker_wav="/content/arctic_a0407.wav", language="pt-br", file_path="sample1_portuguese.wav")

 > Downloading model to /root/.local/share/tts/tts_models--multilingual--multi-dataset--your_tts
 > Model's license - CC BY-NC-ND 4.0
 > Check https://creativecommons.org/licenses/by-nc-nd/4.0/ for more info.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model fully restored. 

**Evaluation time of loading multilingual model with gpu 18.482767343521118**

**Evaluation time of loading multilingual model with gpu 21.874202251434326**

In [None]:
# Load the English clone written by the YourTTS cell; the notebook never creates '1output.wav'.
signal, fs = torchaudio.load('/content/sample1_english.wav')
# The x-vector encoder expects 16 kHz mono input: downmix and resample when needed.
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)
embeddings = classifier.encode_batch(signal)

In [None]:
# Cosine similarity between the clone's embedding and reference speaker 1.
similarity_score = cosine_similarity(
    embeddings[0],
    embeddings_og_1[0],
)
print("Cosine Similarity Score:", similarity_score[0, 0])

Cosine Similarity Score: 0.8349373


In [None]:
# Load the French clone written by the YourTTS cell; the notebook never creates '2output.wav'.
signal, fs = torchaudio.load('/content/sample1_french.wav')
# The x-vector encoder expects 16 kHz mono input: downmix and resample when needed.
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)
embeddings = classifier.encode_batch(signal)

In [None]:
# Cosine similarity between the clone's embedding and reference speaker 1.
similarity_score = cosine_similarity(
    embeddings[0],
    embeddings_og_1[0],
)
print("Cosine Similarity Score:", similarity_score[0, 0])

Cosine Similarity Score: 0.80558157


In [None]:
# Load the Portuguese clone written by the YourTTS cell; the notebook never creates '3output.wav'.
signal, fs = torchaudio.load('/content/sample1_portuguese.wav')
# The x-vector encoder expects 16 kHz mono input: downmix and resample when needed.
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)
embeddings = classifier.encode_batch(signal)

In [None]:
# Cosine similarity between the clone's embedding and reference speaker 1.
similarity_score = cosine_similarity(
    embeddings[0],
    embeddings_og_1[0],
)
print("Cosine Similarity Score:", similarity_score[0, 0])

Cosine Similarity Score: 0.85979676


In [None]:
# Voice cloning with YourTTS for the second reference speaker (English, French, Portuguese).
import torch

# Request the GPU only when CUDA is present; a hard-coded gpu=True fails on CPU-only runtimes.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=torch.cuda.is_available())
tts.tts_to_file(text="This is voice cloning.", speaker_wav="/content/college.mp3", language="en", file_path="sample2_english.wav")
tts.tts_to_file(text="C'est le clonage de la voix.", speaker_wav="/content/college.mp3", language="fr-fr", file_path="sample2_french.wav")
tts.tts_to_file(text="Isso é clonagem de voz.", speaker_wav="/content/college.mp3", language="pt-br", file_path="sample2_portuguese.wav")

 > tts_models/multilingual/multi-dataset/your_tts is already downloaded.
 > Using model: vits
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > log_func:np.log10
 | > min_level_db:0
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:None
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:None
 | > symmetric_norm:None
 | > mel_fmin:0
 | > mel_fmax:None
 | > pitch_fmin:None
 | > pitch_fmax:None
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:1.0
 | > clip_norm:True
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:10
 | > hop_length:256
 | > win_length:1024
 > Model fully restored. 
 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > log_func:np.log10
 | > min_level_db:-

'sample2_portuguese.wav'

In [None]:
# Load the English clone written by the YourTTS cell; the notebook never creates '4output.wav'.
signal, fs = torchaudio.load('/content/sample2_english.wav')
# The x-vector encoder expects 16 kHz mono input: downmix and resample when needed.
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)
embeddings = classifier.encode_batch(signal)

In [None]:
# Cosine similarity between the clone's embedding and reference speaker 2.
similarity_score = cosine_similarity(
    embeddings[0],
    embeddings_og_2[0],
)
print("Cosine Similarity Score:", similarity_score[0, 0])

Cosine Similarity Score: 0.74992824


In [None]:
# Load the French clone written by the YourTTS cell; the notebook never creates '5output.wav'.
signal, fs = torchaudio.load('/content/sample2_french.wav')
# The x-vector encoder expects 16 kHz mono input: downmix and resample when needed.
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)
embeddings = classifier.encode_batch(signal)

In [None]:
# Cosine similarity between the clone's embedding and reference speaker 2.
similarity_score = cosine_similarity(
    embeddings[0],
    embeddings_og_2[0],
)
print("Cosine Similarity Score:", similarity_score[0, 0])

Cosine Similarity Score: 0.7625568


In [None]:
# Load the Portuguese clone written by the YourTTS cell; the notebook never creates '6output.wav'.
signal, fs = torchaudio.load('/content/sample2_portuguese.wav')
# The x-vector encoder expects 16 kHz mono input: downmix and resample when needed.
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)
embeddings = classifier.encode_batch(signal)

In [None]:
# Cosine similarity between the clone's embedding and reference speaker 2.
similarity_score = cosine_similarity(
    embeddings[0],
    embeddings_og_2[0],
)
print("Cosine Similarity Score:", similarity_score[0, 0])

Cosine Similarity Score: 0.8049873


**Source to Target Conversion**

In [None]:
import os

# Show the working directory, i.e. where relative output paths are written.
working_dir = os.getcwd()
print(working_dir)

/content


In [None]:
# Example voice conversion converting speaker of the `source_wav` to the speaker of the `target_wav`
import torch

# Request the GPU only when CUDA is present; a hard-coded gpu=True fails on CPU-only runtimes.
tts = TTS(model_name="voice_conversion_models/multilingual/vctk/freevc24", progress_bar=False, gpu=torch.cuda.is_available())
tts.voice_conversion_to_file(source_wav="/content/arctic_a0407.wav", target_wav="/content/college.mp3", file_path="7output.wav")

 > Downloading model to /root/.local/share/tts/voice_conversion_models--multilingual--vctk--freevc24
 > Model's license - MIT
 > Check https://choosealicense.com/licenses/mit/ for more info.
 > Using model: freevc
 > Loading pretrained speaker encoder model ...
Loaded the voice encoder model on cpu in 2.15 seconds.
 > Downloading WavLM model to /root/.local/share/tts/wavlm/WavLM-Large.pt ...
Execution_time 80.75335383415222


**Execution time for freevc24 with gpu is 79.87649631500244**

**Execution time for freevc24 without gpu is 80.75335383415222**

**Voice cloning using Tacotron2-DDC**

In [None]:
# Load the German single-speaker Tacotron2-DDC model (default device settings).
tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC")

 > tts_models/de/thorsten/tacotron2-DDC is already downloaded.
 > vocoder_models/de/thorsten/hifigan_v1 is already downloaded.
 > Using model: tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:None
 | > pitch_fmin:0.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 2
 > Vocoder Model: hifigan
 > Setting up Audio 

In [None]:

# German prompt sentence. The cloning cells below pass the same sentence as a
# literal and do not read this variable.
Input_Text="Wie sage ich auf Italienisch, dass ich dich liebe?"

In [None]:
# Synthesize the sentence, then convert it to the voice in `speaker_wav`
# (the recorded log shows the freevc24 voice-conversion model being loaded for this step).
tts.tts_with_vc_to_file(
    text="Wie sage ich auf Italienisch, dass ich dich liebe?",
    speaker_wav="/content/arctic_a0407.wav",
    file_path="sample1_tactron2ddc_cloning.wav",
)

 > Text splitted to sentences.
['Wie sage ich auf Italienisch, dass ich dich liebe?']
 > Processing time: 2.71496319770813
 > Real-time factor: 0.8889424226280629
 > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.
 > Using model: freevc
 > Loading pretrained speaker encoder model ...
Loaded the voice encoder model on cuda in 10.14 seconds.


In [None]:
# Load the clone written by the Tacotron2-DDC cell; the notebook never creates '8output.wav'.
signal, fs = torchaudio.load('/content/sample1_tactron2ddc_cloning.wav')
# The x-vector encoder expects 16 kHz mono input: downmix and resample when the file's rate differs.
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)
embeddings = classifier.encode_batch(signal)

In [None]:
# Cosine similarity between the clone's embedding and reference speaker 1.
similarity_score = cosine_similarity(
    embeddings[0],
    embeddings_og_1[0],
)
print("Cosine Similarity Score:", similarity_score[0, 0])

Cosine Similarity Score: 0.8399157


In [None]:
# Synthesize the sentence, then convert it to the voice of the second reference speaker.
tts.tts_with_vc_to_file(
    text="Wie sage ich auf Italienisch, dass ich dich liebe?",
    speaker_wav="/content/college.mp3",
    file_path="sample2_tactron2ddc_cloning.wav",
)

 > Text splitted to sentences.
['Wie sage ich auf Italienisch, dass ich dich liebe?']
 > Processing time: 3.256521224975586
 > Real-time factor: 1.0662611815560654


In [None]:
# Load the clone written by the Tacotron2-DDC cell; the notebook never creates '9output.wav'.
signal, fs = torchaudio.load('/content/sample2_tactron2ddc_cloning.wav')
# The x-vector encoder expects 16 kHz mono input: downmix and resample when the file's rate differs.
signal = signal.mean(dim=0, keepdim=True)
if fs != 16000:
    signal = torchaudio.functional.resample(signal, fs, 16000)
embeddings = classifier.encode_batch(signal)

In [None]:
# Cosine similarity between the clone's embedding and reference speaker 2.
similarity_score = cosine_similarity(
    embeddings[0],
    embeddings_og_2[0],
)
print("Cosine Similarity Score:", similarity_score[0, 0])

Cosine Similarity Score: 0.9216579
