# IP8: Creation of Labelled Data
The RNN will be trained on spectrograms of the audio data from the created corpora. Computing these spectrograms is computationally expensive and requires a lot of time. To speed up the iterations when training the RNN and get feedback faster, the input data (the spectrograms) is therefore pre-computed and stored on disk. Also, the labels (the information about speech pauses) need to be encoded in a suitable format. This notebook describes how this is done.

Before we start, define a path to an empty directory with enough free storage where the labelled data can be stored:

In [None]:
# Root directory for this notebook: the corpus directories and the directories for the
# labelled data are all resolved relative to this path. Adjust it to your machine.
target_root = r'E:/'

As usual, let's do the imports and some helper functions before we start.

In [None]:
%matplotlib inline
# %matplotlib notebook

import random
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

import numpy as np
from IPython.display import HTML, Audio
import ipywidgets as widgets
from IPython.display import HTML, Audio
import plotly.graph_objs as go

from create_labelled_data import create_X_Y
from util.corpus_util import *
from util.audio_util import *
from util.webrtc_util import *

import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd

# Math
import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import librosa

from sklearn.decomposition import PCA

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
from IPython.display import HTML, Audio
import librosa.display

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd

# root directories of the two corpora (sub-directories of target_root)
rl_corpus_root, ls_corpus_root = (
    os.path.join(target_root, corpus_dir)
    for corpus_dir in ('readylingua-corpus', 'librispeech-corpus')
)

# defaults shared by all plots in this notebook
default_figsize = (12, 5)
default_facecolor = 'white'
# no font size is set here (a 'size' entry of 12 was tried and is left out)
default_font = dict(family='serif', weight='normal')
plt.rc('font', **default_font)

def show_labelled_data(corpus_entry):
    """Visualize a whole corpus entry.

    Displays a heading with name and id, the number of speech/pause segments and an audio player,
    then draws a figure with the spectrogram (upper plot) and the raw wave (lower plot). The aligned
    speech segments (green), unaligned speech segments (yellow) and pause segments (red) are
    overlaid on the raw wave only.

    :param corpus_entry: the corpus entry to visualize
    :return: tuple (ax_spec, ax_wave) as returned by the two plot helpers
    """
    display(HTML(f'<h3>{corpus_entry.name} (id={corpus_entry.id})</h3>'))
    n_speech, n_pause = len(corpus_entry.speech_segments), len(corpus_entry.pause_segments)
    display(HTML(f'{n_speech} speech segments, {n_pause} pause segments'))

    # audio player for the whole entry
    audio, rate = corpus_entry.audio, corpus_entry.rate
    display(Audio(data=audio, rate=rate))

    fig = plt.figure(figsize=default_figsize, facecolor=default_facecolor)

    # lower plot: raw wave (the subplot is created directly before it is drawn on)
    ax_wave = fig.add_subplot(212)
    ax_wave = show_wave(audio, rate, ax_wave, f'Raw wave of {corpus_entry.audio_file} with speech pauses')

    # upper plot: spectrogram with 20ms windows and 10ms steps
    window_size_ms, step_size_ms = 20, 10
    window_size = ms_to_frames(window_size_ms, rate)
    step_size = ms_to_frames(step_size_ms, rate)
    ax_spec = fig.add_subplot(211)
    title = 'Spectrogram of ' + corpus_entry.audio_file
    spec = corpus_entry.spectrogram(window_size=window_size_ms, step_size=step_size_ms)
    ax_spec = show_spectrogram(spec, rate, step_size, ax_spec, title=title, scale=None)

    # segment overlays as (color, legend label, segments)
    overlays = [
        ('green', 'speech segments', corpus_entry.speech_segments),
        ('yellow', 'speech segments unaligned', corpus_entry.speech_segments_unaligned),
        ('red', 'pause segments', corpus_entry.pause_segments),
    ]

    # boundaries are rescaled from frames to seconds before plotting
    rescaled = [calculate_boundaries(segments) / corpus_entry.rate for _, _, segments in overlays]
    for (color, _, _), boundaries in zip(overlays, rescaled):
        show_segments(ax_wave, boundaries, color=color)

    legend_handles = [mpatches.Patch(color=color, alpha=0.6, label=label) for color, label, _ in overlays]
    ax_wave.legend(handles=legend_handles, bbox_to_anchor=(0, -0.5, 1., -0.4), loc=3, mode='expand',
                   borderaxespad=0, ncol=3)

    return ax_spec, ax_wave

def show_wave(audio, sample_rate, ax=None, title=None):
    """Plot the raw wave of an audio signal.

    :param audio: the audio signal (numpy array, converted to float for plotting)
    :param sample_rate: sampling rate of the audio signal
    :param ax: axes to draw on; if omitted, a new figure with the default size is created
    :param title: optional title of the plot
    :return: the axes containing the plot
    """
    if ax is None:
        plt.figure(figsize=default_figsize, facecolor=default_facecolor)
    else:
        # waveplot is called without an axes argument and draws on the current axes, so the
        # requested axes must be made current first (it was silently ignored before)
        plt.sca(ax)

    p = librosa.display.waveplot(audio.astype(float), sample_rate)
    ax = p.axes
    ax.set_ylabel('Amplitude')
    if title:
        ax.set_title(title)
    plt.tight_layout()
    return ax

def show_spectrogram(spec, sample_rate, step_size, ax=None, title=None, scale='db'):
    """Plot a spectrogram with time on the x-axis and frequency (Hz) on the y-axis.

    :param spec: the spectrogram data
    :param sample_rate: sampling rate of the underlying audio signal
    :param step_size: step size in frames (passed to librosa as hop length)
    :param ax: axes to draw on; if omitted, a new figure with the default size is created
    :param title: optional title of the plot
    :param scale: if 'db', a colorbar with dB labels is added
    :return: whatever librosa.display.specshow returns for the plot
    """
    if ax is None:
        plt.figure(figsize=default_figsize, facecolor=default_facecolor)
    else:
        # specshow is called without an axes argument and draws on the current axes, so the
        # requested axes must be made current first (it was silently ignored before)
        plt.sca(ax)

    ax = librosa.display.specshow(spec, sr=sample_rate, hop_length=step_size,
                                  x_axis='time', y_axis='hz', cmap='viridis')
    if scale == 'db':
        plt.colorbar(format='%+2.0f dB')
    if title:
        plt.title(title)
    plt.tight_layout()
    return ax
    
def show_spectrogram_3d(spec, window_size=320, step_size=160, sample_rate=16000, title=None):
    """Render a spectrogram as an interactive 3D surface (time x frequency x value).

    The axis values are derived from the shape of the spectrogram, so the function no longer
    depends on a notebook-level ``freqs`` variable being defined by another cell.

    :param spec: 2D spectrogram; assumed to be of shape (frequencies, time steps) with linearly
                 spaced frequencies between 0 and sample_rate/2 -- TODO confirm for other inputs
    :param window_size: window size in frames (used to place the first time step)
    :param step_size: step size in frames (distance between two time steps)
    :param sample_rate: sampling rate of the underlying audio signal
    :param title: optional title of the plot
    """
    n_freqs, n_steps = spec.shape[-2], spec.shape[-1]

    # y-axis: equidistant frequencies from 0 Hz up to half the sampling rate
    freqs = np.linspace(0, sample_rate / 2, n_freqs)
    # x-axis: one time stamp (in seconds) per window; as in the original calculation the first
    # value is placed at half a window -- NOTE(review): assumes un-padded windows, confirm
    times = (window_size / 2 + step_size * np.arange(n_steps)) / float(sample_rate)

    # the axis values are attached to the surface; plotly's axis `range` only takes [min, max]
    data = [go.Surface(x=times, y=freqs, z=spec)]
    layout = go.Layout(
        title=title,
        scene=dict(
            xaxis=dict(title='Time'),
            yaxis=dict(title='Frequencies'),
            zaxis=dict(title='Log amplitude'),
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)


def show_segments(ax, boundaries, ymin=0, ymax=1, color='red'):
    """Highlight segments on an axes as numbered, semi-transparent vertical spans.

    :param ax: the axes to draw on
    :param boundaries: iterable of (start, end) pairs in x-axis units
    :param ymin: lower end of the spans (passed to axvspan)
    :param ymax: upper end of the spans (passed to axvspan)
    :param color: color of the spans
    """
    label_font = {'family': 'sans-serif', 'size': 15, 'color': 'white'}
    for number, (start_frame, end_frame) in enumerate(boundaries, start=1):
        ax.axvspan(start_frame, end_frame, ymin=ymin, ymax=ymax, color=color, alpha=0.5)

        # the running number goes near the bottom edge for spans starting at the bottom,
        # otherwise near the top edge of the plot
        bottom, top = ax.get_ylim()
        if ymin == 0:
            label_y = bottom + 0.01 * (top - bottom)
        else:
            label_y = top - 0.05 * (top - bottom)
        label_x = start_frame + (end_frame - start_frame) / 2
        ax.text(label_x, label_y, str(number), horizontalalignment='center', fontdict=label_font)

def calculate_boundaries(segments):
    """Collect the start and end frames of segments in a 2D array.

    The segments are traversed only once, so one-shot iterables (e.g. generators) produce
    correctly paired values too. The result always has two columns, even for empty input.

    :param segments: iterable of objects with the attributes ``start_frame`` and ``end_frame``
    :return: numpy array of shape (number of segments, 2) with one (start_frame, end_frame) row
             per segment
    """
    pairs = [(seg.start_frame, seg.end_frame) for seg in segments]
    # reshape only matters for empty input, which would otherwise yield a 1D array
    return np.array(pairs).reshape(-1, 2)

def on_create_data_rl_button_click(sender):
    """Click handler: create the labelled data for the ReadyLingua corpus.

    The data is written to the 'readylingua-data' directory below target_root.

    :param sender: the widget that triggered the event (unused)
    """
    rl_target_root = os.path.join(target_root, 'readylingua-data')
    # use the ReadyLingua corpus here (the LibriSpeech corpus was passed by mistake before)
    create_X_Y(rl_corpus, rl_target_root)
    
def on_create_data_ls_button_click(sender):
    """Click handler: create the labelled data for the LibriSpeech corpus.

    The data is written to the 'librispeech-data' directory below target_root.

    :param sender: the widget that triggered the event (unused)
    """
    create_X_Y(ls_corpus, os.path.join(target_root, 'librispeech-data'))
    
# UI elements: one button per corpus, both using the same layout
layout = widgets.Layout(width='250px', height='50px')

create_data_rl_btn = widgets.Button(
    description="Create labelled data for ReadyLingua",
    button_style='info',
    layout=layout,
    icon='download',
)
create_data_rl_btn.on_click(on_create_data_rl_button_click)

create_data_ls_btn = widgets.Button(
    description="Create labelled data for LibriSpeech",
    button_style='info',
    layout=layout,
    icon='download',
)
create_data_ls_btn.on_click(on_create_data_ls_button_click)

After having created the corpora from raw data we can now start creating labelled data (spectrograms and labels) for the RNN. This data is stored as numpy arrays whose dimensions partially depend on the proposed network architecture. 

The RNN is trained on audio data (sequence of frames) and will output whether a specific section in the audio signal is speech or pause (sequence of labels). Because both the input and the output is a sequence, it is a sequence-to-sequence model with a **many-to-many** architecture. This means we have the following values to consider:

* $T_x$: Number of sequence tokens in an individual sample. This value may be different for each sample!
* $T_y$: Number of sequence tokens in the output. This value is always the same for each sample but may be different from $T_x$

In the following sections the following variable names are used to denote the two components of the labelled data:

* `X`: The actual data, i.e. the spectrograms. One spectrogram is created per corpus entry and saved to disk. The saved data consists of three components:
  * `freqs`: The frequencies used in the spectrogram (array of shape $(161, 1)$)
  * `times`: The time steps used in the spectrogram (array of shape $(T_x, 1)$)
  * `spec`: The spectrogram data (array of shape $(T_x, 161)$)
  `freqs` and `times` are only needed to plot the spectrogram in a Cartesian coordinate system, where the time steps are plotted along the x-axis and the frequencies along the y-axis. For training, only the `spec` part is needed.
* `Y`: The labels, i.e. the information about speech- or pause segments. The labels are encoded as 1-dimensional binary vectors of shape $(1, T_y)$. A speech segment will be encoded as a sequence of zeroes and a pause segment as a sequence of ones. Pause sections may contain some signal (e.g. background noise) but no spoken text from the transcript.

Let's load the created corpora to make them available to this notebook.

In [None]:
# load both corpora from their root directories (defined in the setup cell)
rl_corpus = load_corpus(rl_corpus_root)
ls_corpus = load_corpus(ls_corpus_root)

## Train/Dev/Test split
The labelled data is split into subsets for training (_train-set_), parameter tuning (_dev-set_) and model evaluation (_test-set_). Since the corpora were created from different sources of raw data, they vary in size and probability distribution (number of languages, homogeneity of the recording quality, ratio of male vs. female speakers, presence of distortions like reverb or overdrive, and many more). Since the starting point for the creation of the corpus was so variable, different approaches were taken to split the corpus up into train-, dev- and test-set.

### ReadyLingua corpus
The raw data exhibits a high variance with respect to relevant features (recording quality, length of samples, presence of distortion, ...). Since the corpus is rather small there may be only one sample for a specific feature value (e.g. only one recording with reverb). Therefore to keep things simple the split into train-, dev- and test-set was done with a 80/10/10-rule without closer examination of the underlying data. This might not result in an optimal split since it would be possible for example that all the female speakers will be put in one subset.

Improvements could be made by manually assigning each sample to a specific set by carefully inspecting the relevant features. The corpus could also be extended by creating synthesized data, e.g. creating samples with reverb from the original samples. Because the LibriSpeech corpus looks much more promising at the moment, this time was not invested.

### LibriSpeech corpus
The LibriSpeech raw data is already split into train-, dev- and test-set. Each chapter is read by a different speaker. Each speaker is only contained in one of the subsets. Efforts have been made to keep the different sets within the same probability distributions (regarding to accents, ratio of male/female speakers, ...). The information about the subset has been preserved when creating the corpora from raw data. To leverage the efforts made by the LibriSpeech project, the corresponding labelled data will be kept in the same subset.

---

You can explore the size of the subsets for each corpus by executing the cell below to see the number of samples (corpus entries) in each subset.

In [None]:
# number of samples per subset for the LibriSpeech corpus
ls_train, ls_dev, ls_test = ls_corpus.train_dev_test_split()
print(f'LibriSpeech corpus ({len(ls_corpus)} samples): '
      f'#train-samples: {len(ls_train)}, #dev-samples: {len(ls_dev)}, #test-samples: {len(ls_test)}')

# number of samples per subset for the ReadyLingua corpus
rl_train, rl_dev, rl_test = rl_corpus.train_dev_test_split()
print(f'ReadyLingua corpus ({len(rl_corpus)} samples): '
      f'#train-samples: {len(rl_train)}, #dev-samples: {len(rl_dev)}, #test-samples: {len(rl_test)}')

##  Feature extraction from audio signal

In order to train an RNN, each sample needs to be converted into some sort of sequence of features. In this case the samples are the audio files from the corpus entries that were converted to wave files (`*.wav`) and downsampled to 16kHz (mono). This chapter describes different ways of extracting features from the audio signal that can be used for training.

### Raw waves
As the name suggests the wave files contain the audio signal as a raw wave, which is just a series of discrete sample values. Because we used a sampling rate of 16kHz we get 16'000 sample values per second. A sample value corresponds to the _amplitude_ of the waveform at the given time step. These values can be stored in a 1-dimensional Numpy array and plotted in two dimensions (time vs. amplitude).

For example consider the raw wave for a random speech segment. Feel free to change the first line to visualize the raw wave for a specific corpus entry.

In [None]:
corpus_entry = rl_corpus['20161124weeklyaddressthanksgiving']
speech_segment = corpus_entry.speech_segments[49]

# uncomment the following lines for random corpus entry and/or speech segment
# corpus_entry = random.choice(rl_corpus)
# speech_segment = random.choice(corpus_entry.speech_segments)

print(f'number of sampling points: {speech_segment.audio.shape[0]}, sampling rate: {speech_segment.rate}')
print(f'transcript: {speech_segment.transcript}')

display(Audio(data=speech_segment.audio, rate=speech_segment.rate))
title = f'Raw wave of speech segments in {corpus_entry.id}.wav'

# Draw into an axes of the figure created here. Calling show_wave() without an axes makes it
# open a second figure, which left this one behind as an empty plot.
fig = plt.figure(figsize=default_figsize, facecolor=default_facecolor)
show_wave(speech_segment.audio, speech_segment.rate, ax=fig.add_subplot(111), title=title)

### From raw waves to spectrograms

Although already a sequence, training on the raw wave would not be very useful since we would only have one feature (the amplitude) per time step. However, an audio signal is just a bunch of overlaying frequencies of different phases and amplitudes. For a given time slot (_window_), the raw signal can be decomposed into its underlying frequencies using Fourier Transformation, yielding the amplitude of each frequency. 
These values can be stored in a 1-D array of shape $(f \times 1)$, whereas $f$ denotes the number of frequencies.

Since we will be using spectrograms as input values `X` to train an RNN, $T_x$ denotes the number of windows that can be calculated from the audio signal. Hence all the windows together form a matrix of shape ($f \times T_x$) where each entry corresponds to the _magnitude_ of frequency $f$ in window $T_x$. Such a matrix is called a **spectrogram**. A spectrogram can be visualized by color-coding the values. Consider the following spectrogram derived from the raw wave above.

In [None]:
# magnitude spectrogram of the speech segment, plotted with a step size of 10ms and without colorbar
mag_spec = mag_specgram(speech_segment.audio, speech_segment.rate)
show_spectrogram(
    mag_spec,
    sample_rate=speech_segment.rate,
    step_size=ms_to_frames(10, speech_segment.rate),
    scale=None,
)

Such a spectrogram can now be calculated for every speech segment. The following table contains all relevant parameters:

| Symbol | Variable in code | Value | Description |
|---|---|---|---|
| $n$ | `num_values` | - | number of discrete sampling values in audio signal |
| $r$ | `sample_rate` | - | sampling rate of audio signal |
| $w_{ms}$ | `window_size_ms` | 20 | Window length in ms |
| $w$ | `window_size` | 320 | Window length in frames |
| $s_{ms}$ | `step_size_ms` | 10 | Step length in ms |
| $s$ | `step_size` | 160 | Step length in frames $(s = \frac{r \cdot s_{ms}}{1000})$ |

Note that the window and step length in frames unit can be derived from their values in milliseconds by calculating $w = \frac{r \cdot w_{ms}}{1000}$ or $s = \frac{r \cdot s_{ms}}{1000}$ respectively.

To calculate the spectrogram for an audio signal, a sliding window of size $w$ is moved over the sample values with step size $s$. Note that the step size is usually smaller than the window size which means the windows will overlap to a certain degree. For any given audio signal $x$ the number of windows $T_x$ can be calculated by dividing the number of sampling values by the size of the overlap:

$$
T_x = \left\lfloor \frac{n}{(w-s)} + 1 \right\rfloor
$$

The flooring is needed because the window size might not match up exactly with the number of sample values, resulting in fractional values for $T_x$. Since we will use the windows of the spectrogram as input to an RNN, $T_x$ corresponds to the number of training samples. Therefore only whole numbers make sense.

According to the [Nyquist theorem](https://en.wikipedia.org/wiki/Nyquist_rate) the sampling frequency of an audio signal must be at least twice the highest frequency contained in the signal in order to be able to reconstruct the original signal from the discrete sampling values. Since our sampling rate is 16kHz this means the maximum frequency that can be reproduced is 8kHz. Therefore the frequencies in our spectrogram are all in the range $0..\frac{r}{2} = 0..8000$ Hz. This interval can be divided into equally sized sections. Including the borders of these sections this gives us $f$ equidistant sampling frequencies. The value for $f$ can be calculated as follows:

$$
f = \frac{w}{2} + 1
$$

Note that we add 1 at the end because the borders (lowest and the highest frequency) are both included.

Since the frequency band of the spectrogram will be spaced equally, the distance between two sample frequencies is $\frac{r}{2\cdot (f - 1)}$. This means that frequency phase $F_i$ in the frequency band can be calculated as follows:

$$
F_i = i \cdot \frac{r}{2 \cdot (f - 1)}
$$

**Example**:

For this project all audio signals were re-sampled with a sampling rate $r=16000$. To calculate the spectrogram we use a sliding window of $w_{ms}=20ms$ length and a step size of $s_{ms}=10ms$. In frame units this gives us the values $w=\frac{16000 \cdot 20 ms}{1000 ms} = 320$ and $s=\frac{16000 \cdot 10 ms}{1000 ms} = 160$.

As stated above the frequencies all lie in the interval $[0..8000]$. This band is now divided into sections giving us $f = \frac{320}{2} + 1 = 161$ sample frequencies, where the distance between two neighbouring frequencies is $\Delta f = \frac{16000}{2 \cdot (161 - 1)} = 50 Hz$. The $i$-th sample frequency can therefore be calculated as $F_i = i \cdot 50$. The frequencies in the spectrogram are then:

    [0, 50, 100, 150, ... , 7950, 8000]

The raw wave for the example speech sequence above consists of $n = 9760$ sample values. Using a window size of $w=320$ frames and a step size of $s=160$ frames we arrive at a value of $T_x = \left\lfloor \frac{9760}{(320-160)} + 1 \right\rfloor = 62$ training samples.

We can verify this for the above spectrogram:

In [None]:
# window and step size in ms and converted to frames
window_size_ms, step_size_ms = 20, 10
num_vals = speech_segment.audio.shape[0]
window_size = ms_to_frames(window_size_ms, speech_segment.rate)
step_size = ms_to_frames(step_size_ms, speech_segment.rate)

print(f'n = {num_vals}\t(number of sample values)')
print(f'r = {speech_segment.rate}\t(sample rate)')
print(f'w_ms = {window_size_ms}\t(window size in ms)\t\t ==> w = {window_size}\t\t(window size in frames)')
print(f's_ms = {step_size_ms}\t(step size in ms)\t\t ==> s = {step_size}\t\t(step size in frames)', end='\n\n')

# the spectrogram has one row per frequency and one column per window
f, T_x = mag_spec.shape
print(f'spec.shape = (f, T_x) = ({f}, {T_x})', end='\n\n')

# the frequencies are spaced equally between 0 and half the sample rate
delta_f = int(speech_segment.rate / (2 * (f - 1)))
print(f'delta_f = {delta_f}\t(difference between sample frequencies in Hz)', end='\n\n')

freqs = np.arange(0, f * delta_f, delta_f)
print('Frequencies (y-Axis):')
print(freqs)

#### Power spectrograms

Above spectrogram visualizes the magnitude of the frequencies. Because of the way people hear, we recalculate the values to decibel (dB) units. To do this we take the logarithm of the squared amplitude and get so-called power spectrograms. The logarithmic scale of decibels corresponds with how humans perceive loudness: To double the perceived volume of a sound you would need to put 8 times more energy in it.

We can visualize the results by plotting the log-scaled dB-values along the two axes (time and frequency):

In [None]:
# power spectrogram of the speech segment, shown as 3D surface
pow_spec = pow_specgram(speech_segment.audio, speech_segment.rate)

title = f'3D spectrogram of speech segments in {corpus_entry.id}.wav'
show_spectrogram_3d(
    pow_spec,
    window_size=window_size,
    step_size=step_size,
    sample_rate=speech_segment.rate,
    title=title,
)

We can further reduce the above 3D-plot by one dimension by flattening it along the z-axis (magnitude). We don't lose any information because the third dimension (dB value) is color-coded. From this plot we clearly see that most of the high-valued entries of the spectrogram all lie within a frequency range of approximately 300-3400 Hz, which corresponds to the usable voice frequency band [used e.g. in telephony](https://en.wikipedia.org/wiki/Voice_frequency).

In [None]:
# same power spectrogram as 2D plot (with the default dB colorbar)
show_spectrogram(pow_spec, sample_rate=speech_segment.rate, step_size=step_size)

#### Mel power spectrograms

In the power spectrogram above the features were extracted by putting the values on the Hertz scale. This scale is logarithmic, i.e. a value of 2000Hz is considered twice as high as a value of 1000 Hz, which in turn is twice as high as a value of 500 Hz. In other words: doubling a Hertz value corresponds to setting the pitch of a tone an octave higher. Consider the following examples for reference:

| 500Hz | 1000Hz | 2000Hz
|---|---|---
|<audio src="../assets/500Hz.wav" style="width: 150px;" controls preload></audio>|<audio src="../assets/1000Hz.wav" style="width: 150px;" controls preload></audio>|<audio src="../assets/2000Hz.wav" style="width: 150px;" controls preload></audio>

Alternatively we could also put the values on the [Mel scale](https://en.wikipedia.org/wiki/Mel_scale) which is logarithmic too but based on psycho-acoustic findings about how pitches of equal distance are perceived by humans. It turns out that until approximately 500Hz the Mel scale corresponds roughly with the Hertz scale, whereas for sounds above 500Hz the intervals between two sounds must increase in order to be perceived as the same distance from one another. Thus the Mel scale is more discriminative for sounds on low frequencies and less discriminative for sounds at high frequencies. As a result, four octaves on the hertz scale above 500 Hz are judged to comprise about two octaves on the mel scale. The reference point is set at 1000Hz, which corresponds to 1000MEL. 

Consider the following plot which maps values on the Hertz-scale to their counterparts on the Mel-scale:

<figure>
    <img src="../assets/mel-scale.svg" alt="Mel scale">
    <center><figcaption>_Source: Wikipedia_</figcaption></center>
</figure>

There are various formulas to convert Hz to MEL. One possible choice is the one from Douglas O'Shaughnessy (1987, _Speech communication: human and machine_):

$$
m = 2595 \log_{10}\left( 1 + \frac{f}{700} \right) = 1127 \ln \left( 1 + \frac{f}{700} \right)
$$

Using this formula we arrive at the following values for the above values on the Hertz scale:

| ~607MEL = 500Hz | 1000MEL = 1000Hz| ~1521MEL = 2000Hz
|---|---|---
|<audio src="../assets/607Hz.wav" style="width: 150px;" controls preload></audio>|<audio src="../assets/1000Hz.wav" style="width: 150px;" controls preload></audio>|<audio src="../assets/1521Hz.wav" style="width: 150px;" controls preload></audio>

We can now use the Mel scale instead of the Hertz scale to create a **Mel power spectrogram** as an alternative to the "normal" spectrogram. To do this, each bin in the spectrogram is divided into chunks of different sizes. For lower frequencies, the chunks are smaller because the human ear is able to discern more subtle changes in frequency in low-frequency areas (i.e. the Mel scale is more discriminative here). For higher frequencies the chunks become larger.

Note that the number of features is usually smaller than when calculating the spectrograms

In [None]:
# Mel spectrogram with 40 Mel bands for the speech segment
mel_spec = mel_specgram(speech_segment.audio, speech_segment.rate, n_mels=40)
title = f'Mel-spectrogram of speech segment in {corpus_entry.id}.wav'
show_spectrogram(mel_spec, speech_segment.rate, step_size, title=title)

### Alternative: MFCC

As an alternative to Spectrograms we could use Mel Frequency Cepstral Coefficients (MFCC) as features.

Such a spectrogram is now created as a matrix `x` for every single corpus entry. A label vector `y` is also created for each corpus entry. This leaves us with two files for each entry. Since the spectrograms can become quite big, separate files are created for each entry. The files share a common naming pattern to identify their type (spectrogram or label), subset membership (train-, dev- or test-set) and corresponding corpus entry (ID of the corpus entry):

`{id}.{X|Y}.{train|dev|test}.npy`

For example the following files will be created for a corpus entry in the dev-set with ID `1234`:

```
1234.X.dev.npy
1234.Y.dev.npy
```

### Precomputing the features

To speed up the training the spectrograms can be pre-computed and stored on disk. Creating the labelled data might take some time (around 15 minutes for the ReadyLingua corpus up to several hours for the LibriSpeech corpus). Click the button below to start computing the spectrograms and label vectors.

In [None]:
# show the two "create labelled data" buttons next to each other
display(widgets.HBox([create_data_rl_btn, create_data_ls_btn]))

## Exploring the labelled data

After the labelled data has been created, we can explore an entire corpus entry by visualizing its spectrogram together with the segmentation information. Execute the following cell to explore a random sample from the ReadyLingua corpus. In contrast to the spectrograms above this will calculate the spectrogram for the entire entry (and not just for a single speech segment). Additionally, the segmentation information is overlaid.

In [None]:
# the corpus entry that is visualized below
corpus_entry = rl_corpus['news170524']

# uncomment one of the two lines below to get a random / the first entry
# corpus_entry = random.choice(rl_corpus)
# corpus_entry = rl_corpus[0]

# shows heading, audio player, spectrogram and raw wave with the segments overlaid
show_labelled_data(corpus_entry)

### Voice Activity Detection

The above plots show the segmentation of the audio signal into speech and pause segments using the ground truth derived from the metadata that was provided together with the raw data. However, instead of having to rely on such metadata being present, we could try out detecting speech pauses automatically using a VAD (Voice Activity Detection) algorithm. A VAD algorithm that is able to detect speech pauses with reasonable accuracy would free us from the task of detecting them ourselves (by training an RNN e.g.).

#### WebRTC

[WebRTC](https://webrtc.org/) is a free, open project that provides browsers and mobile applications with Real-Time Communications (RTC) capabilities via simple APIs. The WebRTC components have been optimized to best serve this purpose. There is also a VAD component, whose functionality has been [ported to Python by John Wiseman](https://github.com/wiseman/py-webrtcvad). It uses C code under the hood and is therefore very performant.

Execute the cell below to compare the pause segments detected by WebRTC together with the pause segments from the metadata.

In [None]:
def calculate_boundaries_webrtc(corpus_entry, aggressiveness=3):
    """Calculate the boundaries of the voiced segments detected by WebRTC-VAD in a corpus entry.

    :param corpus_entry: the corpus entry to split into voiced segments
    :param aggressiveness: aggressiveness of the VAD (passed on to split_segments)
    :return: tuple (boundaries, voiced_segments); boundaries is a numpy array with one
             (start, end) row per voiced segment, voiced_segments are the detected segments
             (each a sequence of frames)
    """
    voiced_segments, _ = split_segments(corpus_entry, aggressiveness=aggressiveness)

    # a segment starts with its first frame and ends when its last frame is over
    boundaries = [
        (frames[0].timestamp, frames[-1].timestamp + frames[-1].duration)
        for frames in voiced_segments
    ]

    # NOTE(review): the factor 2 is kept unchanged; presumably it corrects how the frame
    # timestamps are derived -- confirm against split_segments
    return 2 * np.array(boundaries), voiced_segments

# corpus entry to inspect (swap the active line to use a random / the first entry instead)
# corpus_entry = random.choice(rl_corpus)
corpus_entry = rl_corpus['news170524']
# corpus_entry = rl_corpus[0]

audio, rate = corpus_entry.audio, corpus_entry.rate
display(Audio(data=audio, rate=rate))

# speech boundaries from raw data, rescaled from frames to seconds
original_boundaries = calculate_boundaries(corpus_entry.speech_segments) / rate

# speech boundaries from WebRTC
webrtc_boundaries, voiced_segments = calculate_boundaries_webrtc(corpus_entry)

title = f'Raw wave of {corpus_entry.audio_file}'
ax_wave = show_wave(audio, rate, title=title)

# lower half of the plot: segments from raw data, upper half: segments from WebRTC
show_segments(ax_wave, original_boundaries, ymax=0.5, color='green')
show_segments(ax_wave, webrtc_boundaries, ymin=0.5, color='blue')

# legend entries (both describe speech segments, despite the variable names)
pause_segments_original = mpatches.Patch(
    color='green', alpha=0.6, label=f'original speech segments ({len(original_boundaries)})')
pause_segments_webrtc = mpatches.Patch(
    color='blue', alpha=0.6, label=f'speech segments detected by WebRTC ({len(webrtc_boundaries)})')
ax_wave.legend(
    handles=[pause_segments_original, pause_segments_webrtc],
    bbox_to_anchor=(0, -0.2, 1., -0.1),
    loc=3,
    mode='expand',
    borderaxespad=0,
    ncol=2,
)

You can also listen to speech segments detected by WebRTC:

In [None]:
import itertools

def play_webrtc_sample(webrtc_sample):
    """Display an audio player for a voiced segment detected by WebRTC.

    The audio of all frames is concatenated; the notebook-level ``rate`` is used for playback.

    :param webrtc_sample: iterable of frames, each providing its audio as ``frame.audio``
    """
    frame_audio = [frame.audio for frame in webrtc_sample]
    display(Audio(data=np.concatenate(frame_audio), rate=rate))
    
# Play at most the first ten voiced segments. A plain loop over islice() does not fail for
# entries with fewer than ten segments and does not echo a list of None values as cell output.
for sample in itertools.islice(voiced_segments, 10):
    play_webrtc_sample(sample)

#### WebRTC vs. manual segmentation

We can calculate how much the speech pauses automatically detected by WebRTC coincide with the speech pauses from raw data, which were manually defined. To do this we can compare different metrics of the two results:

* **Precision**: Percentage of audio frames classified as "speech" by WebRTC that were also manually classified as "speech"
* **Recall**: Percentage of manually classified "speech" frames that were also detected by WebRTC
* **Difference**: Difference between the number of speech segments detected by WebRTC and manual segmentation. A negative value means WebRTC detected fewer speech segments. A positive value means WebRTC detected more speech segments. A value of zero means both methods produced the same number of (but not necessarily the same) speech segments.

These metrics can be calculated for a corpus entry or the whole corpus. Precision and Recall can be further combined to a single value by calculating its **F-Score**:

$$ F = 2 \cdot \frac{P \cdot R}{P+R} $$

The first two metrics have to be taken with a grain of salt though, because they depend on the definition of a speech pause, which is highly subjective. WebRTC provides a parameter which controls the "aggressiveness" of speech detection (values between 0 and 3). A higher value means higher aggressiveness, which results in a higher probability for a frame being classified as "speech" and therefore in more speech segments.

In [None]:
from operator import itemgetter

def getOverlap(a, b):
    """Return the length of the overlap of two intervals, or 0 if they do not overlap.

    :param a: first interval as (start, end)
    :param b: second interval as (start, end)
    :return: length of the common part of both intervals (never negative)
    """
    latest_start = max(a[0], b[0])
    earliest_end = min(a[1], b[1])
    return max(0, earliest_end - latest_start)

def calc_intersection(a, b):
    """Yield the intersections between two collections of intervals.

    Both collections are sorted by start value. Each interval of ``a`` is then compared with each
    interval of ``b`` and the common part is yielded as (start, end) whenever the two overlap by
    more than zero. Intervals that merely touch (end of one equals start of the other) do not
    count as overlapping.

    The common part is calculated from the interval borders directly instead of materializing
    every frame index in a set, which yields the same values without memory and run time
    proportional to the segment lengths.

    :param a: iterable of (start, end) intervals
    :param b: iterable of (start, end) intervals
    :return: generator over the (start, end) intersections, ordered by the intervals of ``a``
             and then by the intervals of ``b``
    """
    a = sorted(a, key=itemgetter(0))
    b = sorted(b, key=itemgetter(0))
    for start_a, end_a in a:
        for start_b, end_b in b:
            start, end = max(start_a, start_b), min(end_a, end_b)
            if end - start > 0:
                yield start, end

def precision_recall(corpus_entry, aggressiveness):
    """Compare the manual segmentation of a corpus entry with the WebRTC segmentation.

    :param corpus_entry: corpus entry providing ``speech_segments`` and ``rate``
    :param aggressiveness: aggressiveness value passed on to ``calculate_boundaries_webrtc``
    :return: tuple (precision, recall, num_diff) where num_diff is the number of
             WebRTC boundaries minus the number of manual boundaries
    """
    def count_frames(boundaries):
        # start and end are both counted, hence the + 1
        return sum(len(range(first, last + 1)) for first, last in boundaries)

    manual = calculate_boundaries(corpus_entry.speech_segments)
    detected, _ = calculate_boundaries_webrtc(corpus_entry, aggressiveness=aggressiveness)
    # convert to frames
    detected = (detected * corpus_entry.rate).astype(int)

    n_common = count_frames(calc_intersection(manual, detected))
    n_manual = count_frames(manual)
    n_detected = count_frames(detected)

    # the small constant keeps the divisions defined when a frame count is zero
    precision = n_common / (n_detected + 1e-3)
    recall = n_common / (n_manual + 1e-3)

    return precision, recall, len(detected) - len(manual)

# Evaluate the current corpus entry once for each of the four aggressiveness values
for aggressiveness in range(4):
    print(f'measuring precision/recall for WebRTC-VAD with aggressiveness={aggressiveness}')
    precision, recall, num_diff = precision_recall(corpus_entry, aggressiveness)
    print(
        f'precision is: {precision}',
        f'recall is: {recall}',
        f'difference: {num_diff}',
        sep='\n',
    )

We can further examine to what degree the speech pauses detected by WebRTC overlap with the speech pauses from the raw data for a whole corpus. We do this by iterating over the whole corpus and performing the above calculations for each corpus entry. The results for precision and recall can be averaged to get an idea of how well WebRTC generally performs. The results for the difference must be inspected more closely because the negative and positive values might cancel each other out, yielding an overall difference of zero, which is not correct since we are interested in the average difference of produced speech segments. We therefore differentiate three values for the difference:

* **Absolute Difference**: Average of the absolute values of the differences over all corpus entries
* **Negative Difference**: Average of the negative values of the differences over all corpus entries (corpus entries where WebRTC produced fewer speech segments than a human)
* **Positive Difference**: Average of the positive values of the differences over all corpus entries (corpus entries where WebRTC produced more speech segments than a human)

In [None]:
from tqdm import tqdm
from tabulate import tabulate
from util.log_util import print_to_file_and_console

def compare_corpus(corpus, corpus_root, aggressiveness):
    """Average the results of ``precision_recall`` over all entries of a corpus.

    :param corpus: sized iterable of corpus entries
    :param corpus_root: not used by this function
    :param aggressiveness: aggressiveness value passed on to ``precision_recall``
    :return: tuple (avg_precision, avg_recall, avg_diff_abs, avg_diff_neg, avg_diff_pos);
             every sum is divided by the total number of corpus entries
    """
    n_entries = len(corpus)
    per_entry = (precision_recall(corpus_entry, aggressiveness) for corpus_entry in corpus)
    # tqdm shows the progress while the generator is consumed
    results = list(tqdm(per_entry, total=n_entries))

    precisions = [p for p, _, _ in results]
    recalls = [r for _, r, _ in results]
    diffs = [d for _, _, d in results]

    return (
        sum(precisions) / n_entries,
        sum(recalls) / n_entries,
        sum(abs(d) for d in diffs) / n_entries,
        sum(d for d in diffs if d < 0) / n_entries,
        sum(d for d in diffs if d > 0) / n_entries,
    )

def create_corpus_stats(corpus):
    """Compare automatic and manual VAD for a corpus and write the results to disk.

    ``compare_corpus`` is run for the aggressiveness values 0 to 3. The averaged
    results are collected column-wise in a dictionary, printed as a table and
    written to the file ``corpus.stats`` inside ``corpus.root_path`` (an
    existing file is removed first).

    :param corpus: corpus providing ``name`` and ``root_path``
    :return: dictionary mapping each column name to a list with one value per
             aggressiveness value
    """
    print(f'Comparing automatic/manual VAD for {corpus.name} corpus')
    stats = {'Aggressiveness': [0,1,2,3], 'Precision': [], 'Recall': [], 'Difference (absolute)': [], 'Difference (negative)': [], 'Difference (positive)': []}
    for aggressiveness in stats['Aggressiveness']:
        print(f'precision/recall with aggressiveness={aggressiveness}\n')
        # evaluate the corpus that was passed in; using the global rl_corpus here
        # would report ReadyLingua results for every corpus
        avg_precision, avg_recall, avg_diff_abs, avg_diff_neg, avg_diff_pos = compare_corpus(corpus, corpus.root_path, aggressiveness)
        stats['Precision'].append(avg_precision)
        stats['Recall'].append(avg_recall)
        stats['Difference (absolute)'].append(avg_diff_abs)
        stats['Difference (negative)'].append(avg_diff_neg)
        stats['Difference (positive)'].append(avg_diff_pos)

    stats_file = os.path.join(corpus.root_path, 'corpus.stats')
    if os.path.exists(stats_file):
        os.remove(stats_file)
    print(f'Writing results to {stats_file}')
    # presumably mirrors everything printed to the file until closed -- see util.log_util
    f = print_to_file_and_console(stats_file)
    try:
        print(tabulate(stats, headers='keys'))
    finally:
        # close the handle even if formatting or printing the table fails
        f.close()
    return stats

def plot_stats(stats, title=None):
    """Plot the corpus statistics against the aggressiveness.

    Precision and recall are drawn on the left y-axis, the three difference
    values on a second y-axis that shares the x-axis.

    :param stats: dictionary with the keys 'Aggressiveness', 'Precision', 'Recall',
                  'Difference (absolute)', 'Difference (negative)' and 'Difference (positive)'
    :param title: optional title of the plot
    """
    levels = stats['Aggressiveness']

    def draw(axis, key, colour):
        # the key of a series is also used as its legend label
        line, = axis.plot(levels, np.array(stats[key]), color=colour, label=key)
        return line

    fig, ax_scores = plt.subplots(figsize=default_figsize, facecolor=default_facecolor)
    if title:
        ax_scores.set_title(title)
    ax_scores.set_xticks(levels)
    ax_scores.set_xlabel('aggressiveness')
    ax_scores.set_ylabel('precision/recall')
    handles = [draw(ax_scores, key, colour)
               for key, colour in (('Precision', 'r'), ('Recall', 'g'))]

    # second y-axis for the differences
    ax_diffs = ax_scores.twinx()
    ax_diffs.set_ylabel('difference')
    handles += [draw(ax_diffs, key, colour)
                for key, colour in (('Difference (absolute)', 'b'),
                                    ('Difference (negative)', 'm'),
                                    ('Difference (positive)', 'y'))]

    plt.legend(handles=handles, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    fig.tight_layout()
    plt.show()
    
# Compute the statistics for the ReadyLingua corpus and plot them.
# NOTE: the whole corpus is processed once per aggressiveness value, which takes some time.
title = f'Comparison of automatic/manual VAD for {rl_corpus.name} corpus'
plot_stats(create_corpus_stats(rl_corpus), title=title)

# Same comparison for the LibriSpeech corpus (currently disabled):
# title = f'Comparison of automatic/manual VAD for {ls_corpus.name} corpus'
# plot_stats(create_corpus_stats(ls_corpus))

##### Results and interpretation

The above cell compares the manual and automatic segmentation by calculating the average precision, average recall and average difference in the number of speech segments created. The comparison has been made for each corpus and for all levels of aggressiveness. Since this process takes some time, the following figures and tables show the result of a previous run. The best results are marked green.

###### Avg. Precision
<table>
  <tr>
    <th>Corpus</th>
    <th colspan="4">Aggressiveness</th>
  </tr>
  <tr>
    <th></th>
    <th>0</th>
    <th>1</th>
    <th>2</th>
    <th>3</th>
  </tr>
  <tr>
    <td>ReadyLingua</td>
    <td>.849</td>
    <td>.850</td>
    <td>.873</td>
    <td style="background-color: lightgreen;">.901</td>
  </tr>
  <tr>
    <td>LibriSpeech</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
  </tr>
</table>

###### Avg. Recall
<table>
  <tr>
    <th>Corpus</th>
    <th colspan="4">Aggressiveness</th>
  </tr>
  <tr>
    <th></th>
    <th>0</th>
    <th>1</th>
    <th>2</th>
    <th>3</th>
  </tr>
  <tr>
    <td>ReadyLingua</td>
    <td>.988</td>
    <td>.987</td>
    <td>.982</td>
    <td style="background-color: lightgreen;">.970</td>
  </tr>
  <tr>
    <td>LibriSpeech</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>    
  </tr>
</table>

###### F-Score
<table>
  <tr>
    <th>Corpus</th>
    <th colspan="4">Aggressiveness</th>
  </tr>
  <tr>
    <th></th>
    <th>0</th>
    <th>1</th>
    <th>2</th>
    <th>3</th>
  </tr>
  <tr>
    <td>ReadyLingua</td>
    <td>.456</td>
    <td>.457</td>
    <td>.462</td>
    <td style="background-color: lightgreen;">.467</td>
  </tr>
  <tr>
    <td>LibriSpeech</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
  </tr>
</table>

###### Differences in number of speech segments
<table>
  <tr>
    <th>Corpus</th>
    <th colspan="4">Avg. Difference (abs)</th>
    <th colspan="4">Avg. Difference (neg)</th>
    <th colspan="4">Avg. Difference (pos)</th>
  </tr>
  <tr>
    <th></th>
    <th>0</th>
    <th>1</th>
    <th>2</th>
    <th>3</th>
    <th>0</th>
    <th>1</th>
    <th>2</th>
    <th>3</th>
    <th>0</th>
    <th>1</th>
    <th>2</th>
    <th>3</th>
  </tr>
  <tr>
    <td>ReadyLingua</td>
    <td>30</td>
    <td>29</td>
    <td>20</td>
    <td style="background-color: lightgreen;">17</td>
    <td>-29</td>
    <td>-18</td>
    <td>-16</td>
    <td style="background-color: lightgreen;">-6</td>
    <td style="background-color: lightgreen;">1</td>
    <td style="background-color: lightgreen;">1</td>
    <td>4</td>
    <td>11</td>
  </tr>
  <tr>
    <td>LibriSpeech</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
    <td>tbd</td>
  </tr>
</table>

###### ReadyLingua corpus

The following plot visualizes the results for the ReadyLingua corpus. We can clearly observe that the precision increases quite significantly with increasing aggressiveness. At the same time, recall decreases, but not at the same rate. At its highest setting for aggressiveness, WebRTC is able to detect speech segments with an F-Score of 0.467, which corresponds to values for Precision and Recall of over 90%.

The average difference in the number of speech segments also approaches zero with increasing aggressiveness. The average difference for corpus entries where WebRTC produces fewer speech segments than a human is only -6, meaning that when WebRTC produces fewer speech segments than a human, there are on average 6 fewer speech segments than a human would produce. Again, this is valid for the highest aggressiveness. On the other hand, the average difference for corpus entries where WebRTC produces more segments than a human starts to increase with increasing aggressiveness. However, the average of the absolute values of the difference is still lowest with a value of 3 for the aggressiveness. We can conclude that generally WebRTC will produce more speech segments with increasing aggressiveness.

Generally speaking the performance of WebRTC-VAD can be considered very good, yielding results near-par to human performance when set to highest aggressiveness. The conclusion is to leave the aggressiveness of WebRTC-VAD at its highest setting (`3`).

![WebRTC VAD vs. manual speech segmentation](../assets/webrtc_vs_manual_rl.png)

###### LibriSpeech corpus

tbd.

## Label extraction from transcripts

Because our approach is to train an RNN that can produce a transcript for a given audio signal (ASR), we need to derive the target labels for each speech sequence. We can derive those labels from the transcript, which is included for each speech sequence in the corpora. However, because an RNN only works with numerical values, we must find a way to convert the representation from alphanumeric to numeric. Additionally, since transcripts often contain unwanted characters and to reduce the number of possible target labels, some preprocessing needs to be done. This chapter documents how both are done.

### Normalizing the transcript

In order to facilitate learning, the set of possible characters in the transcript is limited. This is expected to speed up the learning process and improve the quality of the result. We do this by normalizing the transcript. Normalization involves the following steps:

1. remove leading and trailing whitespaces (trimming)
2. remove multiple subsequent occurrences of whitespace within the transcript
3. replace accented characters with characters from the alphabet (e.g. _é_/_è_/_ê_/...->e, _ß_->ss, etc...)
4. removing non-alphanumeric characters (removes punctuation)
5. make everything lowercase

You can edit/execute the cell below with your own examples to see the result of normalization.

In [None]:
from util.string_util import *
# Sample inputs: accented characters with a hyphen, a German sharp s, and surplus whitespace
samples = [ 'Crème-brûlée', 'Außerirdische', ' foo    bar   ']
for sample in samples:
    # print each raw sample next to its normalized form
    # (normalize is presumably provided by the star import from util.string_util)
    print(f'{sample} ==> {normalize(sample)}')

### Tokenizing the transcript

In order to arrive at the target labels for a speech sequence, its normalized transcript needs to be tokenized first. By tokenizing we mean splitting the transcription into words and then into characters. The tokens are the characters of the transcription, whereas a special token `<space>` is used between the characters of two words.

The mapping of audio to text is actually a classification problem: Parts of the audio signal are mapped each to a specific character (i.e. _token_). Since an RNN resp. TensorFlow only works with numeric data, we need to encode the tokens to put them on an ordinal scale. The following table shows how the encoding is done:

| **Token**    | `<space>` | `a` | `b` | `c` | ... | `z`  |
|--------------|:---------:|:---:|:---:|:---:|:---:|:----:|
| **Encoding** | `0`       | `1` | `2` | `3` | ... | `26` |

The following table shows how an example transcript is converted to its encoded form:

| **Original transcript** | The quick, brown fox jumps over the lazy dog!  |
|-------------------------|------------------------------------------------|
| **Normalized transcript** | the quick brown fox jumps over the lazy dog |
| **Tokenized transcript** | `['t', 'h', 'e', '<space>', 'q', 'u', 'i', 'c', 'k', '<space>', 'b', 'r', 'o', 'w', 'n', ...]` |
| **Encoded transcript** | `[ 20, 8, 5, 0, 17, 21, 9, 3, 11, 0, 2, 18, 15, 23, 14, ...]` |

#### The issue with numbers

Numbers in transcripts pose a special problem, since their pronunciation differs fundamentally from their lexical representation, if written with digits (which is usually the case). Consider the number `8`, which is represented textually by the digit `'8'` and is pronounced as _'eight'_. In this case, the actual sequence of characters (`'e', 'i', 'g', 'h', 't'`) is replaced by a single character `'8'` and can therefore not be approximated like ordinary words.

The problem becomes even harder since compound numbers are sometimes pronounced differently than their individual parts would be pronounced. Consider the number `13` which is pronounced `'thirteen'` (and not `'onethree'`!). This becomes especially important in languages like German which swap the order of tens and units (e.g. `'21'` is pronounced as `'one-and-twenty'`).

Since numbers are a problem of their own we want to limit their influence on the training process by training the RNN only on transcripts without numbers. We can filter those out by using the corpus entry as a function and pass in the `numeric=False` argument to get only those speech segments whose transcripts do not contain numbers:

## Conclusion

In this chapter we have seen how to transform the standardized data from the corpora to create labelled data that can be used to train an RNN. We have seen that the features `X` can be extracted from the audio signal of a speech segment in different ways (power spectrogram or MFCC) and on different scales (Hertz and Mel scale). We have also seen that the labels can easily be created by normalizing the transcript of a speech segment and mapping each character to an ordinal value.

Both `X` and `Y` make up the labelled data which can be further subdivided into three different sets (train-, dev- and test-set) used to train, validate and evaluate an RNN.