# IP8: Creation of Labelled Data
The RNN will be trained on spectrograms of the audio data from the created corpora. Since computing the spectrograms is computationally expensive and requires a lot of time, the input data (the spectrograms) are pre-computed and stored on disk. This speeds up the iterations when training the RNN and gives faster feedback. Also, the labels (the information about speech pauses) need to be encoded in a suitable format. This notebook describes how this is done.

Before we start, define a path to an empty directory with enough free storage where the labelled data can be stored:

In [None]:
# Root directory under which the corpora and the labelled data are looked up / written.
# Adjust this to a local directory with enough free storage.
target_root = r'E:/'

As usual, let's do the imports and some helper functions before we start.

In [None]:
%matplotlib inline
# %matplotlib notebook

import random
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import HTML, Audio
import ipywidgets as widgets
from IPython.display import HTML, Audio
import plotly.graph_objs as go

from create_labelled_data import create_X_Y
from util.corpus_util import *
from util.audio_util import *
from util.webrtc_util import *

import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd

# Math
import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import librosa

from sklearn.decomposition import PCA

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
from IPython.display import HTML, Audio
import librosa.display

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd

# Folders below target_root holding the corpora (input of this notebook) ...
rl_corpus_root, ls_corpus_root = (
    os.path.join(target_root, folder)
    for folder in ('readylingua-corpus', 'librispeech-corpus')
)

# ... and the labelled data that is derived from them
rl_data_root, ls_data_root = (
    os.path.join(target_root, folder)
    for folder in ('readylingua-data', 'librispeech-data')
)

# Shared appearance of all matplotlib figures in this notebook
default_figsize = (12, 4)
default_facecolor = 'white'
# no 'size' entry: the font size is not overridden here
default_font = dict(family='serif', weight='normal')

plt.rc('font', **default_font)

def show_labelled_data(corpus_entry, data_root):
    """Show a corpus entry as audio player, spectrogram and raw wave with its pauses marked.

    :param corpus_entry: corpus entry to visualize; its name, id, audio, rate, audio_file,
        speech/pause segments, labels and spectrogram() are read
    :param data_root: accepted for interface compatibility; not read by this function
    :return: tuple (ax_spec, ax_wave) with the axes of the spectrogram and of the raw wave
    """
    # textual summary and audio player
    header = f'<h3>{corpus_entry.name} (id={corpus_entry.id})</h3>'
    display(HTML(header))
    summary = f'{len(corpus_entry.speech_segments)} speech segments, {len(corpus_entry.pause_segments)} pause segments'
    display(HTML(summary))
    display(Audio(data=corpus_entry.audio, rate=corpus_entry.rate))

    figure = plt.figure(figsize=default_figsize, facecolor=default_facecolor)

    # upper panel: spectrogram
    spec_axes = figure.add_subplot(211)
    spec_title = 'Spectrogram of ' + corpus_entry.audio_file
    freqs, times, spec = corpus_entry.spectrogram()
    ax_spec, extent = show_spectrogram(freqs, times, spec, spec_axes, spec_title)

    # lower panel: raw wave
    wave_axes = figure.add_subplot(212)
    wave_title = f'Raw wave of {corpus_entry.audio_file} with speech pauses'
    ax_wave = show_wave(corpus_entry.audio, corpus_entry.rate, wave_axes, wave_title)

    # The pause boundaries are positions in the audio signal. The wave plot uses them directly,
    # the spectrogram needs them rescaled to the width of its x-axis.
    left, right, _, _ = extent
    pause_frames = calculate_pause_boundaries_from_ground_truth(corpus_entry)
    x_span = right - left
    show_pause_segments(ax_spec, x_span * pause_frames / len(corpus_entry.audio))
    show_pause_segments(ax_wave, pause_frames)

    return ax_spec, ax_wave

def show_spectrogram(freqs, times, spec, ax=None, title=None):
    """Plot a spectrogram as a colour-coded image.

    :param freqs: array of the frequencies of the spectrogram (y-axis, in Hz)
    :param times: array of the time steps of the spectrogram (x-axis, in seconds)
    :param spec: spectrogram values; printed as shape (f, T_x)
    :param ax: axes to draw into; a new figure with a single axes is created if omitted
    :param title: optional title of the plot
    :return: tuple (axes, extent) where extent is [t_min, t_max, f_min, f_max]
    """
    if ax is None:
        fig = plt.figure(figsize=default_figsize, facecolor=default_facecolor)
        ax = fig.add_subplot(111)

    extent = [times.min(), times.max(), freqs.min(), freqs.max()]

    print(f'spec.shape = (f, T_x) = {spec.shape}')
    # draw on the requested axes instead of whatever axes happens to be current
    ax.imshow(spec, aspect='auto', origin='lower', extent=extent)

    if title:
        ax.set_title(title)

    ax.set_xlim(times.min(), times.max())
    ax.set_yticks(freqs[::16])
    # roughly ten ticks on the x-axis; the step must never become 0 (short signals with less
    # than ten time steps would otherwise raise "slice step cannot be zero")
    tick_step = max(1, len(times) // 10)
    ax.set_xticks(times[::tick_step])

    ax.set_xlabel('Seconds')
    ax.set_ylabel('Freqs in Hz')

    ax.figure.tight_layout()

    return ax, extent
        
# def show_spectrogram(spec, sample_rate, step_size, scale=None, title=None):
#     y_axis = scale if scale else 'hz'
#     ax = librosa.display.specshow(spec, sr=sample_rate, hop_length=step_size, 
#                                   cmap='viridis', x_axis='time', y_axis=y_axis)
    
#     if scale:
#         plt.colorbar(format='%+2.0f dB')
#     if title:
#         plt.title(title)
#     plt.tight_layout()    
#     return ax
    
def show_spectrogram_3d(spec, window_size=320, step_size=160, sample_rate=16000, title=None):
    """Render a spectrogram as an interactive 3D surface (time x frequency x amplitude).

    Both axes are derived from the spectrogram itself, so the function does not depend on
    variables defined in other notebook cells.

    :param spec: 2-D spectrogram; rows are taken as frequencies, columns as windows
    :param window_size: window length in frames
    :param step_size: step length in frames
    :param sample_rate: sampling rate of the underlying audio signal in Hz
    :param title: optional title of the plot
    """
    num_freqs, num_windows = spec.shape[0], spec.shape[-1]

    # Centre of each window in seconds. The distance between two windows is kept as
    # (window_size - step_size), like in the formula this function used before.
    # NOTE(review): confirm this matches the hop used when the spectrogram was computed.
    hop = window_size - step_size
    times = (window_size / 2 + np.arange(num_windows) * hop) / float(sample_rate)

    # equidistant frequencies from 0 Hz up to the Nyquist frequency (sample_rate / 2)
    freqs = np.linspace(0, sample_rate / 2, num_freqs)

    data = [go.Surface(x=times, y=freqs, z=spec)]
    layout = go.Layout(
        title=title,
        # axis ranges are left to plotly: they follow from the x/y values of the surface
        scene=dict(
            xaxis=dict(title='Time'),
            yaxis=dict(title='Frequencies'),
            zaxis=dict(title='Log amplitude'),
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    py.iplot(fig)
    
# def show_wave(audio, sample_rate, title=None):
#     p = librosa.display.waveplot(audio.astype(float), sample_rate)
#     ax = p.axes
#     ax.set_ylabel('Amplitude')
#     plt.title(title)
#     plt.tight_layout()
#     return ax

def show_wave(audio, sample_rate, ax=None, title=None):
    """Plot an audio signal as raw wave (amplitude per audio frame).

    :param audio: sequence with the amplitude values of the signal
    :param sample_rate: not used for plotting (the x-axis is in frames, not seconds)
    :param ax: axes to draw into; a new figure with new axes is created if omitted
    :param title: optional title of the plot
    :return: the axes containing the plot
    """
    if not ax:
        plt.figure(figsize=default_figsize, facecolor=default_facecolor)
        ax = plt.axes()

    num_frames = len(audio)
    ax.set_xlim(0, num_frames)
    if title:
        ax.set_title(title)

    ax.set_ylabel('Amplitude')
    ax.set_xlabel('Audio frames')

    # one x-position per amplitude value, spread over the whole x-range
    frame_positions = np.linspace(0, num_frames, num_frames)
    ax.plot(frame_positions, audio)

    plt.tight_layout()
    return ax
    
def show_pause_segments(ax, boundaries, ymin=0, ymax=1, color='red'):
    """Highlight every (start, end) pair of ``boundaries`` as a vertical span on ``ax``.

    :param ax: axes offering ``axvspan``
    :param boundaries: iterable of (start, end) pairs in x-axis units of ``ax``
    :param ymin: lower end of the span, passed on to ``axvspan``
    :param ymax: upper end of the span, passed on to ``axvspan``
    :param color: fill colour of the spans
    """
    span_style = dict(ymin=ymin, ymax=ymax, color=color, alpha=0.8)
    for span_start, span_end in boundaries:
        ax.axvspan(span_start, span_end, **span_style)
    
def calculate_pause_boundaries_from_ground_truth(corpus_entry):
    """Calculate the pause boundaries of a corpus entry as positions in its audio signal.

    The label vector of the entry is flattened and scanned for groups of consecutive ones
    (the labels are expected to be binary). The start/end index of each group is then
    rescaled from the length of the label vector to the length of the audio signal.

    :param corpus_entry: object with the attributes ``audio`` (the audio signal) and
        ``labels`` (the label vector, e.g. of shape (1, T_y))
    :return: integer numpy array of shape (number of groups, 2); each row is [start, end]
    """
    x = corpus_entry.audio

    # pause boundaries as binary vector: [0,0,1,1,1,0,...]
    y = np.ravel(corpus_entry.labels)

    # Padding with a zero on both sides guarantees that every group of ones has a rising and
    # a falling edge, so the number of edges is always even.
    edges = np.flatnonzero(np.diff(np.r_[0, y, 0]) != 0)

    # The falling edge is found one position behind the last 1 of a group: subtracting 1 from
    # the second column yields the index of the last 1: [[2,4], ...]
    boundaries = edges.reshape(-1, 2) - [0, 1]

    # same relative positions, but in the audio signal instead of the label vector
    boundaries = len(x) * boundaries / len(y)

    # no fractional indices
    return boundaries.astype(int)

def calculate_boundaries_from_webrtc(segments, audio, sample_rate):
    """Convert segments of WebRTC frames into an array of (start, end) positions.

    :param segments: iterable of non-empty frame sequences; the ``timestamp`` of the first
        frame and ``timestamp`` + ``duration`` of the last frame delimit a segment
    :param audio: not used
    :param sample_rate: factor the timestamps are multiplied with
    :return: integer numpy array with one (start, end) row per segment
    """
    spans = [
        (
            segment[0].timestamp * sample_rate,
            (segment[-1].timestamp + segment[-1].duration) * sample_rate,
        )
        for segment in segments
    ]
    # NOTE(review): the values are doubled after truncation to int; the reason for the
    # factor 2 is not visible in this file - confirm against util.webrtc_util
    return np.array(spans).astype(int) * 2
    
def on_create_data_rl_button_click(sender):
    """Click handler: create the labelled data for the ReadyLingua corpus.

    :param sender: the clicked widget (not used)
    """
    rl_target_root = os.path.join(target_root, 'readylingua-data')
    # must be the ReadyLingua corpus: passing ls_corpus here would write LibriSpeech data
    # into the ReadyLingua folder
    create_X_Y(rl_corpus, rl_target_root)
    
def on_create_data_ls_button_click(sender):
    """Click handler: create the labelled data for the LibriSpeech corpus.

    :param sender: the clicked widget (not used)
    """
    create_X_Y(ls_corpus, os.path.join(target_root, 'librispeech-data'))
    
# UI elements: one button per corpus, both sharing the same size
layout = widgets.Layout(width='250px', height='50px')
create_data_rl_btn = widgets.Button(
    description="Create labelled data for ReadyLingua",
    button_style='info',
    layout=layout,
    icon='download',
)
create_data_ls_btn = widgets.Button(
    description="Create labelled data for LibriSpeech",
    button_style='info',
    layout=layout,
    icon='download',
)

# connect each button with its click handler
create_data_rl_btn.on_click(on_create_data_rl_button_click)
create_data_ls_btn.on_click(on_create_data_ls_button_click)

After having created the corpora from raw data we can now start creating labelled data (spectrograms and labels) for the RNN. This data is stored as numpy arrays whose dimensions partially depend on the proposed network architecture. 

The RNN is trained on audio data (sequence of frames) and will output whether a specific section in the audio signal is speech or pause (sequence of labels). Because both the input and the output is a sequence, it is a sequence-to-sequence model with a **many-to-many** architecture. This means we have the following values to consider:

* $T_x$: Number of sequence tokens in an individual sample. This value may be different for each sample!
* $T_y$: Number of sequence tokens in the output. This value is always the same for each sample but may be different from $T_x$

In the following sections the following variable names are used to denote the two components of the labelled data:

* `X`: The actual data, i.e. the spectrograms. One spectrogram is created per corpus entry and saved to disk. The saved data consists of three components:
  * `freqs`: The frequencies used in the spectrogram (array of shape $(161, 1)$)
  * `times`: The time steps used in the spectrogram (array of shape $(T_x, 1)$)
  * `spec`: The spectrogram data (array of shape $(T_x, 161)$)
  `freqs` and `times` are only needed to plot the spectrogram in a Cartesian coordinate system, where the time steps will be plotted along the x-axis and the frequencies along the y-axis. For training, only the `spec` part is needed.
* `Y`: The labels, i.e. the information about speech- or pause segments. The labels are encoded as 1-dimensional binary vectors of shape $(1, T_y)$. A speech segment will be encoded as a sequence of zeroes and a pause segment as a sequence of ones. Pause sections may contain some signal (e.g. background noise) but no spoken text from the transcript.

Let's load the created corpora to make them available to this notebook.

In [None]:
# Load both corpora from the folders defined at the top of the notebook.
# The resulting globals are used by all following cells and by the button click handlers.
rl_corpus = load_corpus(rl_corpus_root)
ls_corpus = load_corpus(ls_corpus_root)

## Train/Dev/Test split
The labelled data is split into subsets for training (_train-set_), parameter tuning (_dev-set_) and model evaluation (_test-set_). Since the corpora were created from different sources of raw data, they vary in size and probability distribution (number of languages, homogeneity of the recording quality, ratio of male vs. female speakers, presence of distortions like reverb or overdrive, and many more). Since the starting point for the creation of the corpus was so variable, different approaches were taken to split the corpus up into train-, dev- and test-set.

### ReadyLingua corpus
The raw data exhibits a high variance with respect to relevant features (recording quality, length of samples, presence of distortion, ...). Since the corpus is rather small there may be only one sample for a specific feature value (e.g. only one recording with reverb). Therefore to keep things simple the split into train-, dev- and test-set was done with a 80/10/10-rule without closer examination of the underlying data. This might not result in an optimal split since it would be possible for example that all the female speakers will be put in one subset.

Improvements could be made by manually assigning each sample to a specific set by carefully inspecting the relevant features. The corpus could also be extended by creating synthesized data, e.g. creating samples with reverb from the original samples. Because the LibriSpeech corpus looks much more promising at the moment, this time was not invested.

### LibriSpeech corpus
The LibriSpeech raw data is already split into train-, dev- and test-set. Each chapter is read by a different speaker. Each speaker is only contained in one of the subsets. Efforts have been made to keep the different sets within the same probability distributions (regarding to accents, ratio of male/female speakers, ...). The information about the subset has been preserved when creating the corpora from raw data. To leverage the efforts made by the LibriSpeech project, the corresponding labelled data will be kept in the same subset.

---

You can explore the size of the subsets for each corpus by executing the cell below to see the number of samples (corpus entries) in each subset.

In [None]:
# Split each corpus into its train-, dev- and test-set and print the number of entries per subset
ls_train, ls_dev, ls_test = ls_corpus.train_dev_test_split()
print(f'LibriSpeech corpus ({len(ls_corpus)} samples): #train-samples: {len(ls_train)}, #dev-samples: {len(ls_dev)}, #test-samples: {len(ls_test)}')

rl_train, rl_dev, rl_test = rl_corpus.train_dev_test_split()
print(f'ReadyLingua corpus ({len(rl_corpus)} samples): #train-samples: {len(rl_train)}, #dev-samples: {len(rl_dev)}, #test-samples: {len(rl_test)}')

##  Feature extraction

In order to train an RNN, each sample needs to be converted into some sort of sequence. In this case the samples are the audio files from the corpus entries that were converted to wave files (`*.wav`) and downsampled to 16kHz (mono).

### Raw waves
As the name suggests the wave files contain the audio signal as a raw wave, which is just a series of discrete sample values. Because we used a sampling rate of 16kHz we get 16'000 sample values per second. A sample value corresponds to the amplitude of the waveform at the given time step. These values can be stored in a 1-dimensional Numpy array and plotted in two dimension (time vs. amplitude).

For example, consider the raw wave of a speech segment. Feel free to change the first lines of the cell below to visualize the raw wave of a different (or a random) corpus entry.

In [None]:
# Fixed example: one speech segment of one ReadyLingua corpus entry
corpus_entry = rl_corpus['20161124weeklyaddressthanksgiving']
speech_segment = corpus_entry.speech_segments[49]

# corpus_entry = random.choice(rl_corpus) # uncomment for random corpus entry
# speech_segment = random.choice(corpus_entry.speech_segments) # uncomment for random speech segment

print(f'number of sampling points: {speech_segment.audio.shape[0]}, sampling rate: {speech_segment.rate}')
print(f'transcript: {speech_segment.transcript}')

display(Audio(data=speech_segment.audio, rate=speech_segment.rate))
title = f'Raw wave of speech segments in {corpus_entry.id}.wav'

fig = plt.figure(figsize=default_figsize, facecolor=default_facecolor)
# Draw into the figure created above. Without an explicit axes show_wave() opens a second
# figure and the one created here stays behind as an empty plot.
show_wave(speech_segment.audio, speech_segment.rate, ax=fig.add_subplot(111), title=title)

### From raw waves to spectrograms

Although already a sequence, training on the raw wave would not be very useful since we would only have one feature (the amplitude) per time step. However, an audio signal is just a bunch of overlaying frequencies of different phases and amplitudes. For a given time slot (_window_), the raw signal can be decomposed into its underlying frequencies using Fourier Transformation, yielding the amplitude of each frequency. 
These values can be stored in a 1-D array of shape $(f \times 1)$, whereas $f$ denotes the number of frequencies.

Since we will be using spectrograms as input values `X` to train an RNN, $T_x$ denotes the number of windows that can be calculated from the audio signal. Hence all the windows together form a matrix of shape ($f \times T_x$) where each entry corresponds to the amplitude of frequency $f$ in window $T_x$. Such a matrix is called a **spectrogram**. A spectrogram can be visualized by color-coding the values. Consider the following spectrogram derived from the raw wave above.

In [None]:
# Spectrogram of the speech segment selected above; freqs, times and spec stay available
# as globals for the following cells
freqs, times, spec = log_specgram(speech_segment.audio, speech_segment.rate)
show_spectrogram(freqs, times, spec)

Note that the values have been put on a logarithmic scale. Such a spectrogram can now be calculated for every speech segment. The following table contains all relevant parameters:

| Symbol | Variable in code | Value | Description |
|---|---|---|---|
| $n$ | `num_values` | - | number of discrete sampling values in audio signal |
| $r$ | `sample_rate` | - | sampling rate of audio signal |
| $w_{ms}$ | `window_size_ms` | 20 | Window length in ms |
| $w$ | `window_size` | 320 | Window length in frames |
| $s_{ms}$ | `step_size_ms` | 10 | Step length in ms |
| $s$ | `step_size` | 160 | Step length in frames $(s = \frac{r \cdot s_{ms}}{1000})$ |

Note that the window and step length in frames unit can be derived from their values in milliseconds by calculating $w = \frac{r \cdot w_{ms}}{1000}$ or $s = \frac{r \cdot s_{ms}}{1000}$ respectively.

To calculate the spectrogram for an audio signal, a sliding window of size $w$ is moved over the sample values with step size $s$. Note that the step size is usually smaller than the window size which means the windows will overlap to a certain degree. For any given audio signal $x$ the number of windows $T_x$ can be calculated by dividing the number of sampling values by the size of the overlap:

$$
T_x = \left\lfloor \frac{n}{(w-s)} + 1 \right\rfloor
$$

The flooring is needed because the window size might not match up exactly with the number of sample values, resulting in fractional values for $T_x$. Since we will use the windows of the spectrogram as input to an RNN, $T_x$ corresponds to the number of training samples. Therefore only whole numbers make sense.

According to the [Nyquist theorem](https://en.wikipedia.org/wiki/Nyquist_rate) the sampling frequency of an audio signal must be (at least) twice the highest frequency contained in the signal in order to be able to reconstruct the original signal from the discrete sampling values. Since our sampling rate is 16kHz this means the maximum frequency that can be reproduced is 8kHz. Therefore the frequencies in our spectrogram are all in the range $0..\frac{r}{2} = 0..8000$ Hz. This interval can be divided into equally sized sections. Including the borders of these sections this gives us $f$ equidistant sampling frequencies. The value for $f$ can be calculated as follows:

$$
f = \frac{w}{2} + 1
$$

Note that we add 1 at the end because the borders (lowest and the highest frequency) are both included.

Since the frequency band of the spectrogram will be spaced equally, the distance between two sample frequencies is $\frac{r}{2\cdot (f - 1)}$. This means that frequency phase $F_i$ in the frequency band can be calculated as follows:

$$
F_i = i \cdot \frac{r}{2 \cdot (f - 1)}
$$

**Example**:

For this project all audio signals were re-sampled with a sampling rate $r=16000$. To calculate the spectrogram we use a sliding window of $w_{ms}=20ms$ length and a step size of $s_{ms}=10ms$. In frame units this gives us the values $w=\frac{16000 \cdot 20 ms}{1000 ms} = 320$ and $s=\frac{16000 \cdot 10 ms}{1000 ms} = 160$.

As stated above the frequencies all lie in the interval $[0..8000]$. This band is now divided into sections giving us $f = \frac{320}{2} + 1 = 161$ sample frequencies, whereas the distance between each frequency is $\Delta f = \frac{16000}{2 \cdot (161 - 1)} = 50 Hz$. The $i$-th sample frequency can therefore be calculated as $F_i = i \cdot 50$. The frequencies in the spectrogram are then:

    [0, 50, 100, 150, ... , 7950, 8000]

The raw wave for the example speech sequence above consists of $n = 9760$ sample values. Using a window size of $w=320$ frames and a step size of $s=160$ frames we arrive at a value of $T_x = \left\lfloor \frac{9760}{(320-160)} + 1 \right\rfloor = 62$ training samples.

We can verify this for the above spectrogram:

In [None]:
# Parameters of the spectrogram as listed in the table above
window_size_ms = 20
step_size_ms = 10
num_vals = speech_segment.audio.shape[0]

print(f'n = {num_vals}\t(number of sample values)')
print(f'r = {speech_segment.rate}\t(sample rate)')
print(f'w_ms = {window_size_ms}\t(window size in ms)')
print(f's_ms = {step_size_ms}\t(step size in ms)')
print()

# the same sizes in frame units
window_size = ms_to_frames(window_size_ms, speech_segment.rate)
step_size = ms_to_frames(step_size_ms, speech_segment.rate)

print(f'w = {window_size}\t\t(window size in frames)')
print(f's = {step_size}\t\t(step size in frames)')
print()

# dimensions of the spectrogram calculated in the cell above
f, T_x = spec.shape
print(f'spec.shape = (f, T_x) = ({f}, {T_x})')
print()

delta_f = int(speech_segment.rate / (2 * (f - 1)))
print(f'delta_f = {delta_f}\t(difference between sample frequencies in Hz)')
print()

# f equidistant frequencies starting at 0 Hz (this replaces the freqs from the cell above)
freqs = np.arange(0, f * delta_f, delta_f)
print('Frequencies (y-Axis):')
print(freqs)

#### Power spectrograms

We can measure the power spectrum of a spectrogram by putting the values on a logarithmic scale (decibel units). We can visualize the results by plotting the DB values along the two axes (time and frequency):

In [None]:
# spec_log = pow_specgram(speech_segment.audio, window_size, step_size)
# 3D view of the spectrogram calculated above (spec), using the window/step sizes from the previous cell
title = f'3D spectrogram of speech segments in {corpus_entry.id}.wav'
show_spectrogram_3d(spec, window_size=window_size, step_size=step_size, sample_rate=speech_segment.rate, title=title)

We can reduce the above 3D-plot by one dimension by flattening it along the z-axis (amplitude). We don't lose any information because the third dimension (dB value) is color-encoded.

In [None]:
# 2D view of the same spectrogram; freqs, times, spec and title are the globals set by the cells above
show_spectrogram(freqs, times, spec, title=title)

#### Mel power spectrograms

Alternatively, we can calculate the features on the Mel-Scale.

Note that the number of features is usually smaller than when calculating the spectrograms

In [None]:
title=f'Mel-spectrogram of speech segment in {corpus_entry.id}.wav'
mel_spec = mel_specgram(speech_segment.audio, speech_segment.rate, window_size=window_size, step_size=step_size, n_mels=120)

# show_spectrogram() takes (freqs, times, spec) and knows nothing about a mel scale; calling it
# with scale/sample_rate/step_size raises a TypeError. The mel spectrogram is therefore plotted
# with librosa directly.
# NOTE(review): assumes mel_spec is a single 2-D array in dB units - confirm against util.audio_util
plt.figure(figsize=default_figsize, facecolor=default_facecolor)
librosa.display.specshow(mel_spec, sr=speech_segment.rate, hop_length=step_size,
                         cmap='viridis', x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title(title)
plt.tight_layout()

### Alternative: MFCC

As an alternative to Spectrograms we could use Mel Frequency Cepstral Coefficients (MFCC) as features.

Such a spectrogram is now created as a matrix `x` for every single corpus entry. A label vector `y` is also created for each corpus entry. This leaves us with two files for each entry. Since the spectrograms can become quite big, separate files are created for each entry. The files share a common naming pattern to identify their type (spectrogram or label), subset membership (train-, dev- or test-set) and corresponding corpus entry (ID of the corpus entry):

`{id}.{X|Y}.{train|dev|test}.npy`

For example the following files will be created for a corpus entry in the dev-set with ID `1234`:

```
1234.X.dev.npy
1234.Y.dev.npy
```

### Precomputing the features

To speed up the training the spectrograms can be pre-computed and stored on disk. Creating the labelled data might take some time (around 15 minutes for the ReadyLingua corpus up to several hours for the LibriSpeech corpus). Click the button below to start computing the spectrograms and label vectors.

In [None]:
# show both "create labelled data" buttons side by side
display(widgets.HBox([create_data_rl_btn, create_data_ls_btn]))

## Exploring the labelled data

After the labelled data has been created, we can explore an entire corpus entry by visualizing its spectrogram together with the segmentation information. Execute the following cell to explore a sample from the ReadyLingua corpus. In contrast to the spectrograms above, this will calculate the spectrogram for the entire entry, not just for a single speech segment.

In [None]:
# corpus_entry = random.choice(rl_corpus)
# corpus_entry = rl_corpus[0]
# fixed example entry; use one of the lines above to pick a different one
corpus_entry = rl_corpus['news170524']
show_labelled_data(corpus_entry, rl_data_root)

Alternatively, we can detect speech pauses using a VAD (Voice Activity Detection) algorithm like the one from [WebRTC](https://webrtc.org/):

In [None]:
# corpus_entry = random.choice(rl_corpus)

# audio player for the corpus entry selected in the previous cell
display(Audio(data=corpus_entry.audio, rate=corpus_entry.rate))

# pause boundaries from raw data
original_boundaries = calculate_pause_boundaries_from_ground_truth(corpus_entry)

# pause boundaries from WebRTC
voiced_segments, unvoiced_segments = split_segments(corpus_entry)
webrtc_boundaries = calculate_boundaries_from_webrtc(unvoiced_segments, corpus_entry.audio, corpus_entry.rate)

title = f'Raw wave of {corpus_entry.audio_file} with {len(original_boundaries)} original speech pauses (red) and {len(webrtc_boundaries)} speech pauses detected by WebRTC (green)'
ax_wave = show_wave(corpus_entry.audio, corpus_entry.rate, title=title)
# original pauses fill the lower half of the plot (default colour red),
# the pauses detected by WebRTC the upper half (green)
show_pause_segments(ax_wave, original_boundaries, ymax=0.5)
show_pause_segments(ax_wave, webrtc_boundaries, ymin=0.5, color='green')

We can now calculate how much the speech pauses detected by WebRTC coincide with the speech pauses from raw data:

In [None]:
def calculate_overlaps(original_boundaries, webrtc_boundaries):
    """Find the overlaps between two sets of (start, end) intervals.

    All intervals are merged and visited in ascending order of their start. An interval
    overlaps if it starts before the end of the furthest-reaching interval seen so far.

    :param original_boundaries: array-like of (start, end) pairs (may be empty)
    :param webrtc_boundaries: array-like of (start, end) pairs (may be empty)
    :return: list of (overlap_frames, overlap_ratio) tuples, one per detected overlap.
        overlap_ratio is the overlap relative to the length of the earlier interval
        (0.0 if that interval has length zero).
    """
    # reshape(-1, 2) also accepts empty input of shape (0,), which could not be concatenated
    # with an (n, 2) array otherwise
    merged = np.concatenate((
        np.asarray(original_boundaries).reshape(-1, 2),
        np.asarray(webrtc_boundaries).reshape(-1, 2),
    ))

    # Sort whole (start, end) pairs. Sorting the array along axis 0 would sort the start and
    # the end column independently of each other and thereby tear the pairs apart.
    intervals = sorted((int(start), int(end)) for start, end in merged)

    overlaps = []
    prev_start, prev_end = -1, -1
    for start, end in intervals:
        if start <= prev_end:  # we have an overlap
            # number of frames that overlap
            overlap_frames = min(end, prev_end) - start

            # degree of overlap relative to the earlier interval
            segment_frames = prev_end - prev_start
            overlap_ratio = overlap_frames / segment_frames if segment_frames else 0.0

            overlaps.append((overlap_frames, overlap_ratio))

        # Keep comparing against the interval that reaches furthest. A short interval nested
        # in a long one must not hide the long one from the intervals that follow.
        if end > prev_end:
            prev_start, prev_end = start, end

    return overlaps

def compare_corpus_entry(corpus_entry, data_root):
    """Compare the pauses of a corpus entry with the pauses detected by WebRTC.

    :param corpus_entry: corpus entry to examine
    :param data_root: accepted for interface compatibility; not read by this function
    :return: dict with the keys 'raw' and 'webrtc' (each a dict with '#pauses' and
        '#pause_frames'), '#overlaps', '#overlap_frames' and 'avg_overlap'
    """
    # pauses from the labels of the corpus entry
    original_boundaries = calculate_pause_boundaries_from_ground_truth(corpus_entry)

    # pauses detected by WebRTC
    voiced_segments, unvoiced_segments = split_segments(corpus_entry)
    webrtc_boundaries = calculate_boundaries_from_webrtc(unvoiced_segments, corpus_entry.audio, corpus_entry.rate)

    overlaps = calculate_overlaps(original_boundaries, webrtc_boundaries)
    overlap_ratios = [overlap_ratio for _, overlap_ratio in overlaps]

    entry_stats = {}

    entry_stats['raw'] = {}
    entry_stats['raw']['#pauses'] = len(corpus_entry.pause_segments)
    entry_stats['raw']['#pause_frames'] = sum(end_frame - start_frame for start_frame, end_frame in original_boundaries)

    entry_stats['webrtc'] = {}
    entry_stats['webrtc']['#pauses'] = len(unvoiced_segments)
    entry_stats['webrtc']['#pause_frames'] = sum(end_frame - start_frame for start_frame, end_frame in webrtc_boundaries)

    entry_stats['#overlaps'] = len(overlaps)
    entry_stats['#overlap_frames'] = np.sum([overlap_frames for overlap_frames, _ in overlaps])
    # The mean of an empty list is NaN (plus a RuntimeWarning) and would turn every average
    # calculated from it into NaN as well: no overlaps means an average overlap of 0.
    entry_stats['avg_overlap'] = np.mean(overlap_ratios) if overlap_ratios else 0.0

    return entry_stats
    
# Statistics for the corpus entry selected in the cells above
entry_stats = compare_corpus_entry(corpus_entry, rl_data_root)

# pauses according to the corpus entry itself
num_pauses_raw = entry_stats['raw']['#pauses']
sum_pauses_raw = entry_stats['raw']['#pause_frames']
print(f'Length of all {num_pauses_raw} speech pauses in corpus entry (ground truth): {sum_pauses_raw}')

# pauses according to WebRTC
num_pauses_webrtc = entry_stats['webrtc']['#pauses']
sum_pauses_webrtc = entry_stats['webrtc']['#pause_frames']
print(f'Length of all {num_pauses_webrtc} speech pauses in corpus entry (WebRTC): {sum_pauses_webrtc}')

# overlaps between both, absolute and relative to the ground truth
# NOTE(review): the percentages divide by the ground-truth values; an entry without any
# pause (num_pauses_raw == 0) raises a ZeroDivisionError here
num_overlaps = entry_stats['#overlaps']
sum_overlaps = entry_stats['#overlap_frames']
print(f'WebRTC overlaps with original pauses {num_overlaps}/{num_pauses_raw} times ({100*num_overlaps/num_pauses_raw:.2f}%)')
print(f'WebRTC overlaps with original pauses in {sum_overlaps}/{sum_pauses_raw} frames ({100*sum_overlaps/sum_pauses_raw:.2f}%)')

avg_overlap = entry_stats['avg_overlap']
print(f'WebRTC pauses coincide with original pauses with {avg_overlap}')

We can further examine to what degree the speech pauses detected by WebRTC overlap with the speech pauses from the raw data for a whole corpus:

In [None]:
from tqdm import tqdm

def compare_corpus(corpus, data_root):
    """Compare the pauses of all entries of a corpus with the pauses detected by WebRTC.

    The statistics of the individual entries are summed up, printed and returned.

    :param corpus: iterable of corpus entries
    :param data_root: passed on unchanged to compare_corpus_entry
    :return: dict with the keys 'raw' and 'webrtc' (each a dict with '#pauses' and
        '#pause_frames'), '#overlaps', '#overlap_frames' and 'avg_overlap'
    """
    def percent(part, whole):
        # a corpus without pauses must not abort the report with a division by zero
        return 100 * part / whole if whole else 0.0

    def total(*keys):
        # sum of one (possibly nested) statistic over all entries
        values = entry_stats
        for key in keys:
            values = [value[key] for value in values]
        return sum(values)

    entry_stats = [compare_corpus_entry(corpus_entry, data_root)
                   for corpus_entry in tqdm(corpus, unit=' entries')]

    corpus_stats = {}

    corpus_stats['raw'] = {}
    corpus_stats['raw']['#pauses'] = total('raw', '#pauses')
    corpus_stats['raw']['#pause_frames'] = total('raw', '#pause_frames')

    corpus_stats['webrtc'] = {}
    corpus_stats['webrtc']['#pauses'] = total('webrtc', '#pauses')
    corpus_stats['webrtc']['#pause_frames'] = total('webrtc', '#pause_frames')

    corpus_stats['#overlaps'] = total('#overlaps')
    corpus_stats['#overlap_frames'] = total('#overlap_frames')

    # Entries without any overlap may report NaN as average. They count as 0 so that a
    # single such entry does not turn the average of the whole corpus into NaN.
    avg_overlaps = [entry_stat['avg_overlap'] for entry_stat in entry_stats]
    avg_overlaps = [value if np.isfinite(value) else 0.0 for value in avg_overlaps]
    corpus_stats['avg_overlap'] = np.mean(avg_overlaps) if avg_overlaps else 0.0

    num_pauses_raw = corpus_stats['raw']['#pauses']
    sum_pauses_raw = corpus_stats['raw']['#pause_frames']
    print(f'Length of all {num_pauses_raw} speech pauses in corpus (ground truth): {sum_pauses_raw}')

    num_pauses_webrtc = corpus_stats['webrtc']['#pauses']
    sum_pauses_webrtc = corpus_stats['webrtc']['#pause_frames']
    print(f'Length of all {num_pauses_webrtc} speech pauses in corpus (WebRTC): {sum_pauses_webrtc}')

    num_overlaps = corpus_stats['#overlaps']
    sum_overlaps = corpus_stats['#overlap_frames']

    print(f'WebRTC overlaps with original pauses {num_overlaps}/{num_pauses_raw} times ({percent(num_overlaps, num_pauses_raw):.2f}%)')
    print(f'WebRTC overlaps with original pauses in {sum_overlaps}/{sum_pauses_raw} frames ({percent(sum_overlaps, sum_pauses_raw):.2f}%)')

    return corpus_stats

# Run the comparison for the ReadyLingua corpus; swap the comment to examine LibriSpeech instead
corpus_stats = compare_corpus(rl_corpus, rl_data_root)
# corpus_stats = compare_corpus(ls_corpus, ls_data_root)