# **Generate data for model training**

#### The Silero text-to-speech model from PyTorch Hub is used as the pre-trained model

<a href="https://pytorch.org/hub/snakers4_silero-models_tts/">Follow this link for more information about the model</a>

In [1]:
# first, install the libraries needed to download and run the pre-trained model
!pip install -q torchaudio omegaconf
!pip install -q logmmse

# install the libraries used for data augmentation
!pip install -q audiomentations
!pip install -q cylimiter

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/79.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m71.7/79.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/117.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.5/127.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  

In [8]:
# import necessary libraries to generate voice command
import torch
import numpy as np
from logmmse import logmmse
from omegaconf import OmegaConf
from IPython.display import Audio
import librosa as lb
from librosa import feature as lbf
import gdown

# import necessary libraries for augmentation
import numpy as np
import os
from random import choice
# import functions for data augmentation and create object for transformation
from audiomentations import Compose, AddGaussianNoise, AirAbsorption, Limiter, RepeatPart, TanhDistortion, TimeMask, TimeStretch

In [9]:
# Load the pre-trained Silero TTS model from PyTorch Hub into the variable 'model'.
# torch.hub.load is pointed at the 'snakers4/silero-models' repository and returns
# the model together with an example text.
language = 'ru'
model_id = 'v4_ru'  # passed to the hub entry point as its `speaker` argument
device = torch.device('cpu')  # the model is placed on the CPU

model, example_text = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                     model='silero_tts',
                                     language=language,
                                     speaker=model_id)
model.to(device)

Downloading: "https://github.com/snakers4/silero-models/zipball/master" to /root/.cache/torch/hub/master.zip
100%|██████████| 38.2M/38.2M [00:04<00:00, 8.84MB/s]


In [10]:
# create a function to generate voice command
def commandGenerator(size, command, model, speaker='random', sample_rate=24000):
    """
    Synthesize the same voice command several times and stack the results.

    Every row of the returned tensor holds one synthesized utterance that was
    enhanced with logmmse and centred in the row, with zeros on both sides.

    size: (rows, length) of the final generated tensor; `rows` utterances are
          synthesized and each one is padded to `length` samples
    command: text version of the voice command
    model: object exposing `apply_tts(text=..., speaker=..., sample_rate=...)`
    speaker: voice of the speaker, forwarded to `model.apply_tts`
    sample_rate: sample rate forwarded to `model.apply_tts` and to logmmse
    return: tensor of shape `size` filled with the generated utterances
    raises ValueError: if `size` does not unpack into exactly two values, or
          if a synthesized utterance is longer than `length` and therefore
          cannot be padded into a row
    """
    # unpacking fails fast when `size` is not a (rows, length) pair
    rows, length = size

    # every row starts as silence; utterances are written into the middle
    data = torch.zeros(size)

    for i in range(rows):
        # generate a sample
        audio = model.apply_tts(text=command,
                                speaker=speaker,
                                sample_rate=sample_rate)
        # enhance synthesis with logmmse
        audio = torch.tensor(logmmse(np.array(audio), sample_rate,
                                     output_file=None, initial_noise=1,
                                     window_size=160, noise_threshold=0.15))

        audio_len = audio.size(0)
        if audio_len > length:
            # a negative padding length would otherwise surface as an obscure
            # torch.zeros error; report the real cause instead
            raise ValueError(
                f"generated sample has {audio_len} values but only {length} "
                f"fit into a row; increase size[1]"
            )

        # leading padding is the floor half of the free space, so any odd
        # leftover value ends up after the utterance
        start = (length - audio_len) // 2
        data[i, start:start + audio_len] = audio

    return data

In [11]:
def augmentation(source, key, size=2500, sample_rate=24000, max_failures=1000):
    """
    Build an augmented dataset by distorting randomly chosen rows of `source`.

    Rows are drawn with replacement and passed through a random combination
    of audiomentations transforms until `size` samples have been collected.
    Attempts on which the pipeline raises are retried with a newly drawn row.

    source: 2D tensor of generated data, one utterance per row
    key: key of the `commands_list` dictionary; the first element of the
         stored tuple is used as the label of every returned sample
    size: number of augmented samples to produce
    sample_rate: sample rate handed to the augmentation pipeline
    max_failures: number of consecutive failed attempts tolerated before
         giving up
    return: data_x, data_y - lists of augmented samples and of their labels
    raises KeyError: if `key` is not present in `commands_list`
    raises ValueError: if samples are requested but `source` has no rows
    raises RuntimeError: if the pipeline fails `max_failures` times in a row
    """
    # resolve the label once; an unknown key fails before any work is done
    label = commands_list[key][0]

    n_rows = source.size(0)
    if size > 0 and n_rows == 0:
        # without this guard the random row index could never be drawn
        raise ValueError("source contains no samples to augment")

    # the pipeline is built once and reused for every attempt
    augment = Compose([
        AddGaussianNoise(min_amplitude=0.01, max_amplitude=0.04, p=.5),
        AirAbsorption(min_distance=300.0, max_distance=700.0, p=.5),
        Limiter(min_threshold_db=-40.0, max_threshold_db=-0.0, threshold_mode="relative_to_signal_peak", p=.5),
        TanhDistortion(min_distortion=0.01, max_distortion=0.7, p=.5),
        TimeMask(min_band_part=0.1, max_band_part=0.2, fade=True, p=.5),
        TimeStretch(min_rate=0.7, max_rate=1.5, leave_length_unchanged=True, p=.5),
    ])

    data_x, data_y = [], []
    failures = 0
    # `<` instead of `!=` so that a negative size cannot loop forever
    while len(data_x) < size:
        row = source[np.random.randint(0, n_rows)]
        try:
            sample = augment(row, sample_rate)
        except Exception as err:
            # failed attempts are retried, but KeyboardInterrupt is no longer
            # swallowed and a pipeline that always fails cannot hang forever
            failures += 1
            if failures >= max_failures:
                raise RuntimeError(
                    f"augmentation failed {failures} times in a row"
                ) from err
            continue

        failures = 0
        data_x.append(sample)
        data_y.append(label)

    return data_x, data_y

In [12]:
# auxiliary lookup tables used to generate the train dataset
rows = 100

# size tuple (rows, length) requested from the generator for every command
commands_size = {
    name: (rows, length)
    for name, length in (
        ('forward', 60000),
        ('backward', 60000),
        ('stop', 50000),
        ('left', 73000),
        ('right', 73000),
    )
}

# numeric label and spoken phrase for every command
commands_list = {
    'forward': (0, 'иди вперёд'),
    'backward': (1, 'иди назад'),
    'stop': (2, 'остановись'),
    'left': (3, 'поворачивай влево'),
    'right': (4, 'поворачивай вправо'),
}

# accumulators that collect samples and labels right after an augmentation
X_data = []
Y_data = []

In [13]:
# build the train dataset: synthesize every command with "commandGenerator",
# augment the result and collect samples and labels in X_data / Y_data
for key, shape in commands_size.items():
    label, phrase = commands_list[key]
    print(shape, phrase, label)

    source = commandGenerator(shape, phrase, model, speaker='random', sample_rate=24000)
    data_x, data_y = augmentation(source, key)
    print(len(data_x), len(data_y), data_y[0], data_y[-1])

    X_data.extend(data_x)
    Y_data.extend(data_y)

(100, 60000) иди вперёд 0
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new voice
Generated new 

In [14]:
# Sanity-check that samples and labels have the same length, then save both lists
print(len(X_data), len(Y_data))

# The labels are written first, then the samples. Both objects are the plain Python
# lists filled above. The trailing tuple expression is what the notebook displays
# as (None, None).
torch.save(Y_data, 'tensor_y.pt'), torch.save(X_data, 'tensor_x.pt')

12500 12500


(None, None)

In [15]:
# Spot-check one generated sample: play it and show its label
a = 10000
# `display` renders the audio player; the second tuple element is the label stored
# at the same index
display(Audio(X_data[a], rate=24000)), Y_data[a]

(None, 4)

In [None]:
# Define the downloader's link for the file on yandex disk cloud storage
# NOTE(review): this looks like a signed, time-limited downloader URL; presumably it
# expires and has to be regenerated before the cell is re-run - TODO confirm
file_link = 'https://downloader.disk.yandex.ru/disk/c2478dcdbedf0f7c4d9a2f4472918f881e3ccee1082808f4c7b6f6213469f482/65553f37/fKqInKw3d7bLFOeFnMGnhNZsoTigsCIpYapgTpsof6zYW45Nj5YUVxW_mxlG5Ock8SFQlGtGuWGgj4_L4UpnQLSKSGLNlve4hFbmF5cZiRyr8npumZHI4midPdWhecNq?uid=1130000058358976&filename=tensor_x.pt&disposition=attachment&hash=&limit=0&content_type=application%2Fzip&owner_uid=1130000058358976&fsize=1730339726&hid=d766bf1215d7ba8783d3ccbb6c7af7b9&media_type=compressed&tknv=v2&etag=b6b0a0d438adc2e7bd67362f1f334177'
# Download the file using gdown and store it under output_path
output_path = '/content/commands_x'
gdown.download(file_link, output_path, quiet=True)

# Load the object stored in the downloaded file (the link names it tensor_x.pt)
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only run this on a file from a trusted source
data_x = torch.load(output_path)

# Verify the loaded data by printing the number of items
print(len(data_x))

In [None]:
# Define the downloader's link for the file on yandex disk cloud storage
# NOTE(review): this looks like a signed, time-limited downloader URL; presumably it
# expires and has to be regenerated before the cell is re-run - TODO confirm
file_link = 'https://downloader.disk.yandex.ru/disk/7ca8dc8dcb1a6ab171c99d76f3e60f801fc1e54b1989caa27e820f13c1491781/6555403b/fKqInKw3d7bLFOeFnMGnhNa4Akt7dfMo_GA1DeFMnTLRCRRoGbMbtdXURS8aTmjm58eZyPCGfAklXGXo-gyqn5Tzk9p7Ju-nKA7_IdpEqfur8npumZHI4midPdWhecNq?uid=1130000058358976&filename=tensor_y.pt&disposition=attachment&hash=&limit=0&content_type=application%2Fzip&owner_uid=1130000058358976&fsize=25892&hid=2f4f6ab50d7ee2b8f33766e2c6786868&media_type=compressed&tknv=v2&etag=b1a729b6a797352739bfad624cf1cfe2'
# Download the file using gdown and store it under output_path
output_path = '/content/commands_y'
gdown.download(file_link, output_path, quiet=True)

# Load the object stored in the downloaded file (the link names it tensor_y.pt)
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only run this on a file from a trusted source
data_y = torch.load(output_path)

# Verify the loaded data by printing the number of items
print(len(data_y))

12500


In [None]:
# Spot-check one loaded sample: play it and show its label
a = 2500
# `display` renders the audio player; the second tuple element is the label stored
# at the same index
display(Audio(data_x[a], rate=24000)), data_y[a]

(None, 1)