Text-to-Speech (TTS) Synthesizer sub-module

* Receives processed text from the NLP Engine.
* Gathers requirements from the User Personalization Module.
* Generates audio output that mimics natural speech, taking into account user-specific settings adjusted for clarity and ease of understanding.
* Outputs the audio directly to the user interface, facilitating real-time interaction with the system.


In [2]:
pip install gtts


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
from gtts import gTTS
import os

# Write the generated audio to the current user's Desktop when that folder
# exists. A machine-specific absolute path would raise FileNotFoundError on
# every other computer; if there is no Desktop folder the working directory
# is left unchanged. Edit this path to store the output somewhere else.
_output_dir = os.path.expanduser("~/Desktop")
if os.path.isdir(_output_dir):
    os.chdir(_output_dir)

def text_to_speech(text, filename="output.mp3", lang='en', slow=False):
    """Convert text to speech, save it as an MP3 file, and play it back.

    Args:
        text: The text to synthesize.
        filename: Path of the MP3 file to write (relative paths resolve
            against the current working directory).
        lang: Language code forwarded to gTTS, e.g. 'en' or 'zh-CN'.
        slow: Forwarded to gTTS as its ``slow`` flag.

    Playback uses the ``afplay`` command. If that command is not installed,
    the file is still saved and a notice is printed instead of raising.
    """
    import subprocess
    import sys

    tts = gTTS(text=text, lang=lang, slow=slow)
    tts.save(filename)

    # Pass the filename as a separate argument (no shell) so names that
    # contain spaces or shell metacharacters are played correctly and
    # cannot be interpreted as extra commands.
    try:
        subprocess.run(["afplay", filename], check=False)
    except FileNotFoundError:
        # Keep playback best-effort, as it was with os.system().
        print(
            f"afplay not found; saved {filename} without playing it.",
            file=sys.stderr,
        )

# Example usage: synthesize the same English sentence slowly and at normal
# speed, then a Chinese sentence.
if __name__ == "__main__":
    english_demo = "Hello World! Welcome to our demonstration of text to speech. This is one simple example of easiest TTS. "
    # Each entry is (text, output file, language code, slow flag).
    demos = (
        (english_demo, "greeting.mp3", 'en', True),
        (english_demo, "greeting2.mp3", 'en', False),
        ("大家好呀！这是中文版测试，你今天过的怎么样? ", "chinese_greeting.mp3", 'zh-CN', False),
    )
    for demo_text, demo_file, demo_lang, demo_slow in demos:
        text_to_speech(demo_text, demo_file, lang=demo_lang, slow=demo_slow)




Using AI models: 
1. Fast
2. High Quality
3. With emotions

Find Dataset: 
    Publicly Available Emotional Speech Dataset (ESD) for Speech Synthesis and Voice Conversion
       Dataset contains 10 English speakers with 5 emotional states (neutral, happy, angry, sad and surprise).

Preprocess data and do some training
    Record the necessary information and normalize the data
    Sample Method:
        1) extracts phones and features from the file
        2) computes and saves:
            2.1 pitch
            2.2 mel-spectrogram
            2.3 energy
            2.4 durations
        3) writes out the data and paths
            :param basename: str, filename without extension
            :param tg_path: Path, path to .TextGrid file
            :param wav_path: Path, path to .wav file
            :param txt_path: Path, path to .txt file

Implement the model and train it on the data
    FastSpeech2
    
Check the results of the generated data



In [None]:
 def __getitem__(self, idx: int) -> dict:
        """Assemble the sample stored at position ``idx``.

        Looks up the speaker, file and emotion ids for ``idx``, maps the
        space-separated symbols of the matching text entry (with its first
        and last character dropped) through ``self.phones_mapping``, and
        loads the saved ``mel``, ``pitch``, ``energy`` and ``duration``
        arrays from ``<preprocessed_path>/<feature>/<basename>.npy``.
        When ``self.n_egemap_features`` is positive, the first
        ``n_egemap_features`` entries of the ``egemap`` array are loaded as
        well; otherwise ``egemap_feature`` is ``None``.

        :param idx: int, position of the sample
        :return: dict with keys ``id``, ``speaker``, ``emotion``, ``mel``,
            ``pitch``, ``energy``, ``duration``, ``text``, ``egemap_feature``
        :raises AssertionError: if the duration and phone arrays differ in shape
        """
        speaker_id = self.speaker_id[idx]
        file_id = self.file_id[idx]
        emotion_id = self.emotion_id[idx]

        symbols = self.text[idx][1:-1].split(" ")
        phone = np.array([self.phones_mapping[symbol] for symbol in symbols])
        basename = f"{speaker_id}_{file_id}_{emotion_id}"

        def load_feature(kind):
            # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
            # objects; only safe if the preprocessed files are trusted.
            feature_file = Path(self.preprocessed_path) / kind / f"{basename}.npy"
            return np.load(str(feature_file), allow_pickle=True)

        egemap_feature = None
        if self.n_egemap_features > 0:
            egemap_feature = load_feature("egemap")[: self.n_egemap_features]

        # Loaded one after another, in this order.
        mel, pitch, energy, duration = (
            load_feature(kind) for kind in ("mel", "pitch", "energy", "duration")
        )

        # One duration entry is expected per phone.
        assert duration.shape == phone.shape, (
            f"Duration and phone shapes do not match. Phone shape {phone.shape}, "
            f"duration: {duration.shape} for sample: {basename}."
        )

        return {
            "id": basename,
            "speaker": speaker_id,
            "emotion": emotion_id,
            "mel": mel,
            "pitch": pitch,
            "energy": energy,
            "duration": duration,
            "text": phone,
            "egemap_feature": egemap_feature,
        }

In [None]:
class TrainConfig:
    """Settings used when preprocessing the dataset for training.

    All path attributes hold ``pathlib.Path`` objects so that their values
    match their annotations and can be combined with the ``/`` operator.
    """

    # Set Preprocess Parameters
    n_threads: int = 16  # n_threads to parallel process utterance
    include_empty_intervals: bool = (
        True  # if True silence will be loaded from .TextGrid
    )

    # Audio and mel-spectrogram settings.
    # NOTE(review): units are not stated here; presumably Hz for the mel band
    # limits and sample rate, and samples for the hop/STFT/window lengths.
    mel_fmin: int = 0
    mel_fmax: int = 8000
    hop_length: int = 192
    stft_length: int = 768
    sample_rate: int = 16000
    window_length: int = 768
    n_mel_channels: int = 80

    # Locations of the raw dataset, the validation/test id lists and the
    # folder holding the preprocessed output.
    raw_data_path: Path = Path("/app/data/data/ssw_esd")
    val_ids_path: Path = Path("/app/data/val_ids.txt")
    test_ids_path: Path = Path("/app/data/test_ids.txt")
    preprocessed_data_path: Path = Path("/app/data/preprocessed")


In [None]:
 # Read the wav file and trim it to the [start, end) interval.
        # Index [0] keeps the first item returned by torchaudio.load and
        # squeeze(0) drops its leading axis (assumes single-channel audio — TODO confirm).
        wav = torchaudio.load(wav_path)[0].cpu().numpy().squeeze(0)
        # start/end are multiplied by the sample rate to get sample indices
        # (presumably they are times in seconds — TODO confirm).
        wav = wav[
            int(self.config.sample_rate * start) : int(self.config.sample_rate * end)
        ].astype(np.float32)
        # basename is expected to contain exactly three "_"-separated parts.
        speaker_idx, filename_idx, emotion_idx = basename.split("_")


# Compute the pitch contour of the trimmed audio
        pitch, t = pw.dio( # Distributed Inline-filter Operation (DIO) algorithm.
            wav.astype(np.float64),
            self.config.sample_rate,
            # NOTE(review): pyworld documents frame_period in milliseconds; the
            # "* 1000" suggests hop_in_ms actually holds seconds — confirm.
            frame_period=self.config.hop_in_ms * 1000,
        )
        # Refine the DIO pitch estimate with StoneMask
        pitch = pw.stonemask(wav.astype(np.float64), pitch, t, self.config.sample_rate)


In [None]:
class Speech(nn.Module):
    """Speech model with an encoder, decoder, variance adaptor and embeddings.

    Holds learned emotion and speaker embedding tables plus flags, read from
    the config, that decide how those embeddings condition the encoder.
    """

    def __init__(self, config):
        """Build the sub-modules and read the conditioning flags from ``config``.

        ``config`` must provide: ``n_emotions``, ``emotion_emb_hidden_size``,
        ``transformer_decoder_hidden``, ``n_mel_channels``, ``n_speakers``,
        ``speaker_emb_hidden_size``, ``conditional_cross_attention``,
        ``conditional_layer_norm`` and ``stack_speaker_with_emotion_embedding``.
        """
        super().__init__()

        # Sub-modules are created in this fixed order; changing it would
        # change parameter registration and initialization order.
        self.encoder = Encoder(config)
        self.decoder = Decoder(config)
        self.n_emotions = config.n_emotions

        self.variance_adaptor = VarianceAdaptor(config)

        self.emotion_emb = nn.Embedding(config.n_emotions, config.emotion_emb_hidden_size)
        self.mel_linear = nn.Linear(config.transformer_decoder_hidden, config.n_mel_channels)
        # The speaker table has one more row than config.n_speakers.
        self.speaker_emb = nn.Embedding(config.n_speakers + 1, config.speaker_emb_hidden_size)

        # Advanced Emotion Conditioning
        self.conditional_cross_attention = config.conditional_cross_attention
        self.conditional_layer_norm_usage = config.conditional_layer_norm
        self.stack_speaker_with_emotion_embedding = config.stack_speaker_with_emotion_embedding

    def forward(self, device, batch_dict) -> dict:
        """Encode a batch and return predictions and attention maps.

        :param device: device that the batch tensors are moved to
        :param batch_dict: dict read for the keys ``"emotions"``,
            ``"speakers"``, ``"texts"`` and ``"text_lens"``
        :return: dict of variance predictions, the emotion embedding and the
            encoder/decoder attention

        NOTE(review): ``src_masks``, ``var_adaptor_output`` and
        ``decoder_attention`` are used below but never assigned in this
        method, and ``self.variance_adaptor``, ``self.decoder`` and
        ``self.mel_linear`` are never called. Part of the method appears to
        be missing; as written it raises NameError.
        """
        # Look up the emotion and speaker embeddings from their ids.
        emotion_embedding = self.emotion_emb(batch_dict["emotions"].to(device))
        speaker_embedding = self.speaker_emb(batch_dict["speakers"].to(device))

        # The encoder receives the emotion embedding only when conditional
        # cross-attention or conditional layer norm is enabled.
        uses_conditioning = (
            self.conditional_cross_attention or self.conditional_layer_norm_usage
        )
        conditioning = emotion_embedding if uses_conditioning else None
        encoder_output, encoder_attention = self.encoder(
            batch_dict["texts"].to(device),
            src_masks.to(device),
            speaker_emotion_embedding=conditioning,
        )

        # When the embeddings are not stacked, repeat each of them along the
        # sequence axis and add them to the encoder output.
        if not self.stack_speaker_with_emotion_embedding:
            max_src_len = torch.max(batch_dict["text_lens"]).item()
            speaker_term = speaker_embedding.unsqueeze(1).expand(-1, max_src_len, -1)
            emotion_term = emotion_embedding.unsqueeze(1).expand(-1, max_src_len, -1)
            encoder_output = encoder_output + speaker_term + emotion_term

        return {
            "predicted_pitch": var_adaptor_output["pitch_prediction"],
            "predicted_energy": var_adaptor_output["energy_prediction"],
            "predicted_egemap": var_adaptor_output["egemap_prediction"],
            "predicted_log_durations": var_adaptor_output["log_duration_prediction"],
            "predicted_durations_rounded": var_adaptor_output["duration_rounded"],
            "emotion_embedding": emotion_embedding,
            "encoder_attention": encoder_attention,
            "decoder_attention": decoder_attention,
        }