# Sample Dataset

This notebook allows for sampling a dataset through different lenses.

In [None]:
import re
import sys

# Prepend the directory two levels up to the module search path so the later
# `from src import ...` cells resolve (presumably that directory is the repository root).
search_root = '../../'
sys.path.insert(0, search_root)

In [None]:
from IPython.display import FileLink
from IPython.display import Audio
from IPython.display import Markdown

def display_rows(example, **kwargs):
    """ Render a single dataset row in the notebook output.

    Args:
        example: Row to show; its `text` and `audio_path` attributes are read. (The original
            docstring calls this a `TextSpeechRow`; the type is not visible from this notebook.)
        **kwargs: Extra values rendered above the row. Each keyword becomes a bold label with
            underscores replaced by spaces and title-cased, followed by its value.
    """
    def show_markdown(source):
        display(Markdown(source))

    for name in kwargs:
        label = name.replace('_', ' ').title()
        show_markdown('**%s:** %s' % (label, kwargs[name]))

    show_markdown('**Text:** "' + example.text + '"')

    # Offer the audio both as a link to the file and as an inline player.
    audio_path = example.audio_path
    display(FileLink(audio_path))
    display(Audio(str(audio_path)))

    # Horizontal rule separating this row from the next one.
    show_markdown('\n\n ___')
    # No-argument call kept from the original cell; it passes no object to `display`.
    display()

In [None]:
import itertools

from src import datasets
from src import hparams

# Configure hyperparameters first; this runs before any dataset loader is called.
hparams.set_hparams()

# Dataset loaders whose rows are pooled, listed in concatenation order.
loaders = (
    datasets.hilary_speech_dataset,
    datasets.beth_speech_dataset,
    datasets.heather_speech_dataset,
    datasets.susan_speech_dataset,
    datasets.sam_speech_dataset,
    datasets.frank_speech_dataset,
    datasets.adrienne_speech_dataset,
    datasets.alicia_speech_dataset,
    datasets.george_speech_dataset,
    datasets.megan_speech_dataset,
    datasets.elise_speech_dataset,
    datasets.hanuman_speech_dataset,
    datasets.jack_speech_dataset,
    datasets.mark_speech_dataset,
    datasets.steven_speech_dataset,
    datasets.lj_speech_dataset,
    datasets.m_ailabs_en_us_speech_dataset,
    datasets.beth_custom_speech_dataset,
)
# Call every loader up front, then flatten the results into one list of rows.
dataset = list(itertools.chain(*[load() for load in loaders]))

# Filters from `hparams`, applied one after another in this order via `datasets.filter_`.
row_filters = (
    hparams._filter_audio_path_not_found,
    hparams._filter_no_text,
    hparams._filter_elliot_miller,
    hparams._filter_no_numbers,
    hparams._filter_books,
)
for row_filter in row_filters:
    dataset = datasets.filter_(row_filter, dataset)

# Last expression of the cell, so the notebook echoes the remaining row count.
'Training rows: %s' % len(dataset)

## Character Count

In [None]:
from collections import Counter

# Tally every character across the text of every row.
characters = Counter(char for row in dataset for char in row.text)

# Most frequent first, one `"<character>" <count>` line per distinct character.
for char, total in characters.most_common():
    print('"%s" %s' % (char, total))

## Sort By Text Length

In [None]:
from src.audio import get_num_seconds
num_samples = 20
# Rows with the shortest text first; the slice keeps at most `num_samples` of them.
samples = sorted(dataset, key=lambda e: len(e.text))[:num_samples]
# `display_rows` is called only for its rendering side effect, so iterate with a plain loop
# instead of building a list of return values.
for row in samples:
    display_rows(row, audio_length=get_num_seconds(row.audio_path))

## Sort By Audio Length

In [None]:
from src.audio import get_num_seconds
# One `get_num_seconds` result per row, index-aligned with `dataset` so later cells can
# `zip` the two together.
audio_lengths = [get_num_seconds(row.audio_path) for row in dataset]

In [None]:
num_samples = 20
# Pair each row with its precomputed audio length and keep the `num_samples` smallest lengths.
samples = sorted(zip(dataset, audio_lengths), key=lambda pair: pair[1])[:num_samples]
# `display_rows` is called only for its rendering side effect, so iterate with a plain loop.
for row, length in samples:
    display_rows(row, audio_length=length)

## Sort by Seconds Per Character

In [None]:
num_samples = 20
# Smallest ratio of audio length to text length first.
# NOTE(review): divides by `len(row.text)`; assumes rows with empty text were already
# removed by the dataset filters — confirm.
samples = sorted(
    zip(dataset, audio_lengths),
    key=lambda pair: pair[1] / len(pair[0].text),
)[:num_samples]
# `display_rows` is called only for its rendering side effect, so iterate with a plain loop.
for row, length in samples:
    display_rows(row, audio_length=length, seconds_per_character=length / len(row.text))

## Sort by Seconds Per Phoneme

In [None]:
import os
import spacy
from tqdm import tqdm
from multiprocessing.pool import ThreadPool
from src.spectrogram_model.input_encoder import _grapheme_to_phoneme_perserve_punctuation

# Load the small English spaCy model with the `parser` and `ner` components disabled.
# NOTE(review): `nlp` is not referenced anywhere else in the visible notebook — confirm
# whether anything still depends on it.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

def get_phones(text):
    """ Return the `'|'`-separated pieces that the phoneme helper produces for `text`.

    The whitespace-stripped text is passed to `_grapheme_to_phoneme_perserve_punctuation` with
    `'|'` as the separator; the stripped result is then split on that same separator.

    Args:
        text (str): Text to convert.

    Returns:
        list of str: One entry per `'|'`-delimited piece of the helper's output.
    """
    delimiter = '|'
    phonemes = _grapheme_to_phoneme_perserve_punctuation(text.strip(), separator=delimiter)
    return phonemes.strip().split(delimiter)

# Texts to convert, in dataset order. `imap` yields results in input order, so `phones[i]`
# corresponds to `dataset[i]`.
graphemes = [row.text for row in dataset]
with ThreadPool(os.cpu_count()) as pool:
    converted = pool.imap(get_phones, graphemes, chunksize=128)
    # `total` lets tqdm report progress against the known number of texts.
    phones = list(tqdm(converted, total=len(graphemes)))

In [None]:
num_samples = 250
# Largest ratio of audio length to phoneme count first (`reverse=True`).
samples = sorted(
    zip(dataset, audio_lengths, phones),
    key=lambda triple: triple[1] / len(triple[2]),
    reverse=True,
)[:num_samples]
# `display_rows` is called only for its rendering side effect, so iterate with a plain loop.
for row, length, row_phones in samples:
    # The ratio divides by the number of phonemes, so it is labelled per phoneme
    # (it was previously shown under a "Seconds Per Character" label).
    display_rows(
        row,
        audio_length=length,
        seconds_per_phoneme=length / len(row_phones),
        phonemes=row_phones,
    )

## Sort by Characters Per Phone

In [None]:
num_samples = 1000
# The key is phones per character, sorted ascending. That is the same ordering as descending
# characters per phone, so the rows shown first have the most characters per phone.
# NOTE(review): divides by `len(row.text)`; assumes rows with empty text were already
# removed by the dataset filters — confirm.
samples = sorted(
    zip(dataset, phones),
    key=lambda pair: len(pair[1]) / len(pair[0].text),
)[:num_samples]
# `display_rows` is called only for its rendering side effect, so iterate with a plain loop.
for row, row_phones in samples:
    display_rows(row, characters_per_phone=len(row.text) / len(row_phones))

## Random Sample

In [None]:
import random
num_samples = 20
# `random.sample` raises ValueError when asked for more items than the population holds,
# so cap the request at the number of rows left after filtering.
samples = random.sample(dataset, min(num_samples, len(dataset)))
# `display_rows` is called only for its rendering side effect, so iterate with a plain loop.
for row in samples:
    display_rows(row)