In [None]:
"""
Imports and some helper functions. You don't need to change anything in here!
"""
from tqdm import tqdm
import requests
import math
import os
import shutil

import tarfile
from corpus_util import *
from random import randint
import readylingua_corpus
import librispeech_corpus

import ipywidgets as widgets

from IPython.display import display, Audio, HTML

# Placeholder values: each of them is reassigned by a configuration cell further down in the notebook.
root_path = r'E:/'                              # path to root directory with enough free storage
rl_source_path = r'D:\corpus\readylingua-raw'   # path to directory where raw ReadyLingua data is stored
ls_source_path = r'D:\corpus\librispeech-raw'    # path to directory where LibriSpeech files are/will be downloaded (replaced by the resolved download directory once files are downloaded)

# Default names/paths that normally do not need to be customized.
# NOTE: both corpus file paths are built from root_path as it is set at this point; they do not
# follow a later reassignment of root_path automatically. The "create corpus" button handlers
# overwrite them with the value returned by the respective create_corpus() call.
rl_target_dir = 'readylingua-corpus'      # name of target directory for ReadyLingua corpus files (default value)
ls_target_dir = 'librispeech-corpus'      # name of target directory for LibriSpeech corpus files (default value)
rl_corpus_file = os.path.join(root_path, rl_target_dir, 'readylingua.corpus')  # default path to ReadyLingua corpus file (reassigned after the corpus has been created)
ls_corpus_file = os.path.join(root_path, ls_target_dir, 'librispeech.corpus')  # default path to LibriSpeech corpus file (reassigned after the corpus has been created)

def select_entry(corpus, ix=None):
    """
    Return one entry of the corpus.

    :param corpus: indexable collection of corpus entries (must support len() and [])
    :param ix: index of the entry to return; if None, an index is drawn at random
    :return: the entry at position ix
    """
    if ix is None:
        # randint is inclusive on both ends, hence the -1
        ix = randint(0, len(corpus) - 1)
    return corpus[ix]

def select_alignment(corpus_entry, ix=None):
    """
    Return one alignment of a corpus entry.

    :param corpus_entry: object exposing an indexable `alignments` attribute
    :param ix: index of the alignment to return; if None, an index is drawn at random
    :return: the alignment at position ix
    """
    candidates = corpus_entry.alignments
    if ix is None:
        # randint is inclusive on both ends, hence the -1
        ix = randint(0, len(candidates) - 1)
    return candidates[ix]

def display_random_entry(corpus, ix_entry=None, ix_alignment=None):
    """
    Render one corpus entry and one of its alignments in the notebook output.

    For the entry, a title, an audio player and a collapsed accordion holding the
    transcript are shown; for the alignment, a title with its frame range, an audio
    player for its audio data and its text.

    :param corpus: corpus to pick the entry from
    :param ix_entry: index of the entry to show (random entry if None)
    :param ix_alignment: index of the alignment within the entry (random alignment if None)
    """
    entry = select_entry(corpus, ix_entry)
    aligned = select_alignment(entry, ix_alignment)

    # widgets describing the whole corpus entry
    entry_header = HTML(f'<strong>Sample corpus entry: {entry.name}</strong> ({entry.original_path})')
    entry_player = Audio(entry.audio_file)
    transcript_panel = widgets.Accordion(
        children=[widgets.HTML(f'<pre>{entry.transcript}</pre>')],
        selected_index=None,  # start collapsed
    )
    transcript_panel.set_title(0, 'Transcript')

    # widgets describing the selected alignment
    alignment_header = HTML(f'<strong>Sample alignment</strong> (start_frame={aligned.start_frame}, end_frame={aligned.end_frame})')
    alignment_player = Audio(data=aligned.audio, rate=16000.0)
    alignment_body = HTML(f'<pre>{aligned.text}</pre>')

    for item in (entry_header, entry_player, transcript_panel,
                 alignment_header, alignment_player, alignment_body):
        display(item)
    
def download_file(url, target_dir):
    """
    Download a gzipped tar archive and extract it into target_dir.

    The archive is streamed to a temporary file (`download.tmp` under root_path) while a
    progress bar is shown. Only after the download has completed and its size has been
    verified is an existing target_dir removed and replaced by the extracted content.
    The temporary file is removed in any case.

    :param url: URL of the .tar.gz archive
    :param target_dir: directory the archive is extracted to (an existing directory is replaced)
    :raises requests.HTTPError: if the server answers with an error status
    :raises IOError: if fewer/more bytes were received than announced by the server
    """
    block_size = 1024
    chunk_size = 32 * block_size
    tmp_file = os.path.join(root_path, 'download.tmp')

    try:
        with requests.get(url, stream=True) as r:
            # fail early instead of saving (and later trying to extract) an error page
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            wrote = 0
            # total=None lets tqdm show a plain counter if the server did not announce a size
            with open(tmp_file, 'wb') as f, \
                    tqdm(total=total_size or None, unit='B', unit_divisor=block_size, unit_scale=True) as pbar:
                for data in r.iter_content(chunk_size):
                    wrote += len(data)
                    f.write(data)
                    pbar.update(len(data))

        if total_size != 0 and wrote != total_size:
            # do not extract a truncated archive: it would leave a partially filled target_dir behind
            raise IOError(f'Download of {url} incomplete: received {wrote} of {total_size} bytes')

        # replace old content only now that a complete archive is available
        if os.path.exists(target_dir):
            shutil.rmtree(target_dir)

        print('Extracting data...')
        # NOTE(review): extractall() trusts the member paths inside the archive; only use with trusted sources
        with tarfile.open(tmp_file, "r:gz") as tar:
            tar.extractall(target_dir)
        print(f'... done! File downloaded and extracted to: {target_dir}')
    finally:
        # never leave the (potentially huge) temporary file behind, not even on failure
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    
def on_download_ls_button_click(sender):
    """
    Click handler of the download button: fetch the raw LibriSpeech data unless it is already there.

    An absolute ls_source_path is used as-is, a relative one is resolved against root_path.
    The resolved directory is stored back into the global ls_source_path, both when the data
    is freshly downloaded and when an existing, non-empty directory is found.

    :param sender: the button that triggered the event (unused)
    """
    global ls_source_path
    if os.path.isabs(ls_source_path):
        # path is absolute --> use this path as target directory for the download
        target_dir = ls_source_path
    else:
        # path is relative --> use the subdirectory under root_path; made absolute so that
        # the stored value is not resolved against root_path a second time on the next click
        target_dir = os.path.abspath(os.path.join(root_path, ls_source_path))

    # check the resolved directory (the one the download would go to), not the raw setting
    if os.path.isdir(target_dir) and os.listdir(target_dir):
        print(f'Directory {target_dir} exists and is not empty. Assuming LibriSpeech data was already downloaded there.')
        ls_source_path = target_dir
        return

    print('Downloading LibriSpeech data... Get lunch or something!')
    print('Download 1/2: Audio data')
    download_file('http://www.openslr.org/resources/12/dev-clean.tar.gz', os.path.join(target_dir, 'librispeech-audio'))
    print('Download 2/2: Text data')
    download_file('http://www.openslr.org/resources/12/original-books.tar.gz', os.path.join(target_dir, 'librispeech-books'))
    ls_source_path = target_dir
    
def on_create_rl_button_click(sender):
    """
    Click handler of the "Create ReadyLingua Corpus" button.

    Builds the corpus from the raw data and stores the value returned by
    readylingua_corpus.create_corpus() in the global rl_corpus_file.

    :param sender: the button that triggered the event (unused)
    """
    global rl_corpus_file
    print('Creating ReadyLingua corpus... Get a coffee or something!')
    # an absolute source path is taken as-is, a relative one is anchored at root_path
    raw_dir = rl_source_path if os.path.isabs(rl_source_path) else os.path.join(root_path, rl_source_path)
    corpus_dir = os.path.join(root_path, rl_target_dir)
    print(f'source_root={raw_dir}, target_root={corpus_dir}')
    rl_corpus_file = readylingua_corpus.create_corpus(source_root=raw_dir, target_root=corpus_dir)
    print(f'Done! Corpus file created in {rl_corpus_file}')
    
def on_create_ls_button_click(sender):
    """
    Click handler of the "Create LibriSpeech Corpus" button.

    Builds the corpus from the raw data found in ls_source_path (used as-is, i.e. without
    resolving it against root_path) and stores the value returned by
    librispeech_corpus.create_corpus() in the global ls_corpus_file.

    :param sender: the button that triggered the event (unused)
    """
    global ls_corpus_file
    print('Creating LibriSpeech corpus... Go to bed or something!')
    raw_dir = ls_source_path
    corpus_dir = os.path.join(root_path, ls_target_dir)
    print(f'source_root={raw_dir}, target_root={corpus_dir}')
    ls_corpus_file = librispeech_corpus.create_corpus(source_root=raw_dir, target_root=corpus_dir)
    print(f'Done! Corpus file created in {ls_corpus_file}')
    
# all buttons share the same fixed size
layout = widgets.Layout(width='250px', height='50px')

# button that fetches the raw LibriSpeech data
download_ls_button = widgets.Button(
    description='Download LibriSpeech Data',
    button_style='info',
    layout=layout,
    icon='download',
)
download_ls_button.on_click(on_download_ls_button_click)

# button that builds the ReadyLingua corpus (tooltip shows the expected duration)
create_rl_button = widgets.Button(
    description='Create ReadyLingua Corpus',
    button_style='warning',
    layout=layout,
    icon='book',
    tooltip='~5 minutes',
)
create_rl_button.on_click(on_create_rl_button_click)

# button that builds the LibriSpeech corpus (tooltip shows the expected duration)
create_ls_button = widgets.Button(
    description='Create LibriSpeech Corpus',
    button_style='warning',
    layout=layout,
    icon='book',
    tooltip='~5 hours',
)
create_ls_button.on_click(on_create_ls_button_click)

# IP8
This IPython notebook documents and visualizes some crucial steps made during the progress of the project. It should help the reader understand how and why decisions were made as well as illustrate some important concepts with examples.

## Prerequisites
This project was built using Python 3.6 and Anaconda 3. Please install the packages listed in `requirements.txt`. Additionally, you need the following tools and resources:

* [FFMPEG](http://www.ffmpeg.org/): for the conversion and/or resampling of audio files
* ReadyLingua raw data: You need to get the raw files somehow and store them on your machine.

### Set root directory
This project uses several corpora as training data. The corpora need to be created and trained, which requires approximately 350GB of free storage on the harddisk with the currently included corpora. Note: Final storage use might be lower since some of the memory is only used temporarily.

In [None]:
root_path = r'E:\\' # specify a root directory with at least 350GB of free storage

# The default corpus file paths were derived from the placeholder root_path in the helper cell.
# Recompute them so that they follow the root directory chosen here.
rl_corpus_file = os.path.join(root_path, rl_target_dir, 'readylingua.corpus')
ls_corpus_file = os.path.join(root_path, ls_target_dir, 'librispeech.corpus')

## Creation of Corpora
Every Neural Network needs training data. The RNN used in this project is no exception. Since this project is about Forced Alignment (FA), training data consisted of pre-aligned audio and transcript data. This training data was derived from the following resources:

* ReadyLingua
* LibriSpeech
* ... (additional Corpora tbd.)

Those corpora contain alignment information which were extracted and stored in **corpus_entries**. Those corpus entries can be created in this notebook.

### Corpus entries
In order to allow data from all sources for training, it had to be converted to a common format. Since (to my knowledge) there is not a standardized format for FA, I had to define one myself. Therefore I went for the following structure for a single corpus entry:

```JSON
// definition of the corpus
corpus = [corpus_entry]

// definition of an individual corpus entry
corpus_entry = 
{
    'audio': [byte],                 // bytes from the audio file
    'transcript': string,            // raw (unaligned) text 
    'speech-pauses': [speech_pause], // segmentation of the audio file into speech and pause segments
    'alignment': [alignment]         // alignment of bits of the unaligned text with the audio
}

// definition of a speech or pause segment
speech_pause = 
{
    'id': string,                    // some unique identifier
    'start': int,                    // start frame of the segment
    'end': int,                      // end frame of the speech pause
    'class': string                  // 'speech' for a speech segment, 'pause' for a pause segment
}

// definition of an alignment
alignment = 
{
    'text': string,                  // text that is being spoken in the audio
    'start': int,                    // start frame in the audio file (when the text starts)
    'end': int                       // end frame in the audio file (when the text stops)
}
```



### Create ReadyLingua Corpus
ReadyLingua (RL) provides alignment data distributed over several files:

* `*.wav`: Audio file containing the speech
* `*.txt`: UTF-8 encoded (unaligned) transcript
* `* - Segmentation.xml`: file containing the definition of speech and pause segments
```XML
<Segmentation>
    <SelectionExtension>0</SelectionExtension>
    <Segments>
	<Segment id="1" start="83790" end="122598" class="Speech" uid="5" />
	...
    </Segments>
    <Segmenter SegmenterType="SICore.AudioSegmentation.EnergyThresholding">
        <MaxSpeechSegmentExtension>50</MaxSpeechSegmentExtension>
        <Length>-1</Length>
        <Energies>
            <Value id="1" value="0" />
            ...
        </Energies>
        <OriginalSegments>
            <Segment id="1" start="83790" end="100548" class="Speech" uid="2" />
            ...
        </OriginalSegments>
        <EnergyPeak>3569753</EnergyPeak>
        <StepSize>441</StepSize>
        <ITL>146139</ITL>
        <ITU>730695</ITU>
        <LastUid>2048</LastUid>
        <MinPauseDuration>200</MinPauseDuration>
        <MinSpeechDuration>150</MinSpeechDuration>
        <BeginOfSilence>1546255</BeginOfSilence>
        <SilenceLength>100</SilenceLength>
        <ThresholdCorrectionFactor>1</ThresholdCorrectionFactor>
    </Segmenter>
</Segmentation>
```
* `* - Index.xml`: file containing the actual alignments of text to audio
```XML
<XMLIndexFile>
    <Version>2.0.0</Version>
    <SamplingRate>44100</SamplingRate>
    <NumberOfIndices>91</NumberOfIndices>
    <TextAudioIndex>
        <TextStartPos>0</TextStartPos>
        <TextEndPos>36</TextEndPos>
        <AudioStartPos>952101</AudioStartPos>
        <AudioEndPos>1062000</AudioEndPos>
        <SpeakerKey>-1</SpeakerKey>
    </TextAudioIndex>
    ...
</XMLIndexFile>    
```
* `* - Project.xml`: Project file binding the different files together for a corpus entry (note: this file is optional, i.e. there may be no project file for a corpus entry)

Corpus entries are organized in a folder hierarchy. There is a fileset for each corpus entry. Usually, the files for a specific corpus entry reside in a leaf directory (i.e. a directory without further subdirectories). If there is a project file, this file is used to locate the files needed to create the corpus entry.

Audio data is provided as Wave files with a sampling rate of 44.1 kHz (stereo). Because most ASR corpora provide their recordings with a sampling rate of 16 kHz, the files were downsampled and the alignment information was adjusted accordingly. The raw transcription is integrated as-is. The XML files are parsed to extract the alignment data. Alignment, textual and downsampled audio data are merged into a corpus entry as described above. 

#### Step 1: Define location of raw data
The pre-aligned ReadyLingua data is not publicly available so you must define where you store the files on your harddisk. Specify the folder below to indicate where the files are stored on your disk. You can enter a relative or an absolute path. **A relative path is assumed to be relative to the previously defined root directory!**

**Don't forget to execute the cell to apply the changes!**

In [None]:
# Location of the raw ReadyLingua files. A relative path is resolved against root_path when the corpus is created.
rl_source_path = r'D:\corpus\readylingua-raw' # set this to the (absolute or relative) path to where the ReadyLingua files are stored.

#### Step 2: Create corpus entries
We need to extract the alignments from the segmentation information of the raw data. For this, the raw data needs to be converted to corpus entries. This process takes a few minutes, so this is a good time to have a coffee break.

In [None]:
# Show the button; clicking it runs on_create_rl_button_click, which builds the ReadyLingua corpus.
display(create_rl_button)

#### Step 3: Explore corpus
To see if everything worked as expected let's check out a sample alignment. You can execute the cell below to show a random alignment from a random corpus entry. You can execute the cell several times to see different samples.

In [None]:
# rl_corpus_file holds the default path under root_path until the "Create ReadyLingua Corpus"
# button handler replaces it with the path returned by create_corpus().
# load_corpus is presumably provided by the star import from corpus_util -- TODO confirm
rl_corpus = load_corpus(rl_corpus_file)
# no indices given --> a random entry and a random alignment of that entry are shown
display_random_entry(rl_corpus)

### Create LibriSpeech Corpus
[LibriSpeech](http://www.openslr.org/12/) is an open-source corpus for Automatic Speech Recognition (ASR). It contains recordings of LibriVox' public domain audio books and their transcriptions made by volunteers. The data is evenly distributed in terms of gender, recording length, accent, etc. The corpus is split into training-, dev- and test-set (`train-*.tar.gz`, `dev-*.tar.gz` and `test-*.tar.gz`). However, those sets only contain the transcript as a set of segments and an audio file for each segment. They do not contain any temporal information which is needed for alignment.

Luckily, there is also the `original-mp3.tar.gz` archive for download which contains the original LibriVox mp3 files (from which the corpus was created) along with the alignment information. Alignment is made on utterance level, i.e. the transcript is split up into segments whereas each segment corresponds to an utterance. Segments were derived by allowing splitting on every silence interval longer than 300ms. 

The data is organized into subdirectories of the following format:

    ./LibriSpeech/mp3/{speaker_id}/{chapter_id}/

There is one subdirectory containing all the information about a recording. For this project the following files are important:

- `{chapter_id}.mp3`: The audio file containing the recording. The audio is mono with a bitrate of 128 kB/s and a sampling rate of 44.1 kHz.
- `{speaker_id}-{chapter_id}.seg.txt`: Text file containing temporal information about the segments (one segment per line). The time is indicated in seconds.
Example:
```
14-208_0000 25.16 40.51
```
- `{speaker_id}-{chapter_id}.trans.txt`: Text file containing the transcriptions of the segments (one segment per line). The transcription is all uppercase and does not contain any punctuation.
```
14-208_0000 CHAPTER ELEVEN THE MORROW BROUGHT A VERY SOBER LOOKING MORNING THE SUN MAKING ONLY A FEW EFFORTS...
```

In order to create the corpus, these files had to be parsed and the audio was converted and downsampled to a 16kHz Wave-file.
Information about the Speakers, Chapters and Books were extracted from the respective files (`SPEAKERS.TXT`, `CHAPTERS.TXT` and `BOOKS.TXT`).

#### Step 1: Download raw data
To create the LibriSpeech corpus you first need to download the raw data. The files are over 80GB and need to be extracted, so this might take a while... Alternatively, if you have already downloaded the data, you can specify the path to the directory, where you have unpacked the files.

In [None]:
# change this value as follows:
# - if you specify an absolute path, this path will be used as source directory for the raw files
# - if you specify a relative path, a subdirectory under the root directory is used as source directory for the raw files
# In any case, if the directory does not exist or is empty, the files will be downloaded to this path
# when the download button below is clicked
ls_source_path = r'D:\corpus\librispeech-raw'
display(download_ls_button)

#### Step 2: Create corpus
We need to extract the alignments from the segmentation information of the raw data. For this, the downloaded data needs to be converted to corpus entries. This process takes several hours, so you might want to do this just before knocking-off time.

In [None]:
# Show the button; clicking it runs on_create_ls_button_click, which builds the LibriSpeech corpus.
display(create_ls_button)

#### Step 3: Explore corpus
To see if everything worked as expected let's check out a sample alignment. You can execute the cell below to show a random alignment from a random corpus entry. You can execute the cell several times to see different samples.

In [None]:
# Load the LibriSpeech corpus (ls_corpus_file, not the ReadyLingua corpus file) and show a
# random alignment of a random entry.
ls_corpus = load_corpus(ls_corpus_file)
display_random_entry(ls_corpus)