# Sample from Spectrogram Feature Distribution

This notebook helps us sample audio clips at various points of a spectrogram feature distribution. These feature distributions should broadly represent properties such as loudness, pitch, speed, pausing, and signal-to-noise ratio (SNR).

In [28]:
import sys

# Setup the "PYTHONPATH" so that packages living two directories up can be imported.
# The membership check keeps the cell idempotent: re-running it does not stack
# duplicate entries onto ``sys.path``.
PROJECT_ROOT = '../../'
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

In [42]:
from src.datasets import m_ailabs_speech_dataset
from src.datasets import compute_spectrograms
from src.hparams import set_hparams
from random import sample 

set_hparams()
train, dev = m_ailabs_speech_dataset(directory='../../data/M-AILABS')
# Inspect at most 150 randomly chosen dev examples. ``random.sample`` raises
# ``ValueError`` when asked for more items than the population holds, so the
# request is capped at the size of the dev split.
dev = sample(dev, min(150, len(dev)))
# ``on_disk=False`` is passed through to ``compute_spectrograms`` unchanged.
dev = compute_spectrograms(dev, on_disk=False)

INFO:src.datasets.m_ailabs:Loading M-AILABS speech dataset
INFO:src.datasets.m_ailabs:Normalizing text and audio...
 24%|██▍       | 1406/5817 [00:39<01:08, 64.73it/s]
  0%|          | 0/46297 [00:00<?, ?it/s][A
  2%|▏         | 792/46297 [00:00<00:05, 7734.55it/s][A
  3%|▎         | 1616/46297 [00:00<00:05, 7842.63it/s][A

  7%|▋         | 3268/46297 [00:00<00:05, 8014.13it/s][A
  9%|▉         | 4128/46297 [00:00<00:05, 8116.42it/s][A
 11%|█         | 4953/46297 [00:00<00:05, 8146.42it/s][A
 12%|█▏        | 5737/46297 [00:00<00:05, 8051.27it/s][A
 14%|█▍        | 6654/46297 [00:00<00:04, 8302.93it/s][A
 16%|█▌        | 7482/46297 [00:00<00:04, 8261.67it/s][A
 18%|█▊        | 8279/46297 [00:01<00:04, 8138.74it/s][A
 20%|█▉        | 9073/46297 [00:01<00:09, 3885.22it/s][A
 21%|██▏       | 9938/46297 [00:01<00:07, 4647.75it/s][A
 23%|██▎       | 10864/46297 [00:01<00:06, 5462.77it/s][A
 25%|██▌       | 11764/46297 [00:01<00:05, 6191.77it/s][A
 27%|██▋       | 12692/46297 [0

In [43]:
# Learn more: https://en.wikipedia.org/wiki/Speech_tempo
# Learn more: https://pdfs.semanticscholar.org/6ca5/9d11df13a78bf58d866f8712890d3220bc84.pdf
# TODO: Consider extracting phonemes directly from the audio instead of the text.

# Install
# $ python3 -m pip install git+https://github.com/bootphon/phonemizer@master
# $ sudo apt-get install espeak
import phonemizer 
from phonemizer.phonemize import phonemize 
import re

# Phones are delimited by '-', words by '|', and syllables are not marked at all.
seperator = phonemizer.separator.Separator(word='|', syllable='', phone='-')


def textToPhonemes(text, language):
    """
    Transcribe graphemes into phonemes using the ``espeak`` backend.

    Args:
        text: The text to transcribe.
        language: Language identifier forwarded to ``phonemize`` (e.g. ``'en'``).

    Returns:
        The value returned by ``phonemize``. Separators are not stripped
        (``strip=False``) and a single job is used (``njobs=1``).
    """
    return phonemize(
        text,
        language=language,
        backend='espeak',
        separator=seperator,
        strip=False,
        njobs=1,
    )

# Test case: run ``textToPhonemes`` on two short English sentences. As the last
# expression of the cell, the returned phoneme string is shown as the cell output.
textToPhonemes('Hello World. Hi There.', 'en')

INFO:root:initializing backend espeak-1.48.03


'h-ə-l-əʊ-|w-ɜː-l-d-|\nh-aɪ-|ð-eə-|'

In [66]:
import torch
# Install:
# $ python3 -m pip install git+https://github.com/csteinmetz1/pyloudnorm
import pyloudnorm 
import librosa
# Install:
# $ python3 -m pip install pyreaper
import pyreaper
import numpy

from tqdm import tqdm_notebook

# Clips with fewer samples than this are skipped below as too short for ``pyloudnorm``.
MIN_SIGNAL_SAMPLES = 10000


def _median_or_nan(values):
    """ Median of ``values``, or ``nan`` (without a numpy warning) when ``values`` is empty. """
    return numpy.median(values) if len(values) > 0 else float('nan')


signals = []
for row in tqdm_notebook(dev):
    # Compute signal features across all the examples in ``dev``.
    signal, sample_rate = librosa.core.load(str(row.audio_path), sr=None)

    if signal.shape[0] < MIN_SIGNAL_SAMPLES:
        print('Too short for ``pyloudnorm``, skipping row:', row)
        continue

    # ``textToPhonemes`` marks phones with '-' and words with '|', so counting the
    # marks yields the denominators of the per-phoneme and per-word rates.
    phonemes = textToPhonemes(row.text, 'en')
    num_characters = len(row.text)
    num_phonemes = phonemes.count('-')
    num_words = phonemes.count('|')
    if num_characters == 0 or num_phonemes == 0 or num_words == 0:
        # Without this guard the rates below raise ``ZeroDivisionError`` and abort
        # the whole loop; skip the row instead, like the too-short case above.
        print('No characters, phonemes or words to normalize by, skipping row:', row)
        continue

    # Learn more:
    # https://www.tcelectronic.com/brand/tcelectronic/loudness-explained#googtrans(en|en)
    meter = pyloudnorm.Meter(sample_rate)
    loudness = meter.integrated_loudness(signal)

    padded_signal = row.spectrogram_audio  # Used to align f0 with spectrogram
    # Scale to the int16 range before handing the audio to ``pyreaper``. Clipping
    # first makes any out-of-range sample saturate instead of wrapping around in the
    # cast. NOTE(review): assumes the audio is nominally within [-1, 1] - confirm.
    pcm = numpy.clip((padded_signal * 32767).numpy(), -32768, 32767).astype(numpy.int16)
    _, _, _, f0, _ = pyreaper.reaper(pcm, sample_rate)

    # Compute ``signal_to_noise_ratio``: compare the median energy of frames with a
    # detected pitch (``f0 > 0``) against the median energy of frames without one.
    num_f0_frames = f0.shape[0]
    last_spectrogram_frame = row.spectrogram.shape[0] - 1
    f0_span = max(num_f0_frames - 1, 1)  # Avoids 0 / 0 if only a single f0 frame exists
    magnitude_spectrogram = numpy.exp(row.spectrogram.numpy())
    # Energy of the spectrogram frame nearest to each f0 frame; computed once and
    # shared by both medians below.
    frame_energy = [
        numpy.sum(magnitude_spectrogram[int(round((i / f0_span) * last_spectrogram_frame))])
        for i in range(num_f0_frames)
    ]
    silence_energy = _median_or_nan([energy for energy, val in zip(frame_energy, f0) if val <= 0])
    other_energy = _median_or_nan([energy for energy, val in zip(frame_energy, f0) if val > 0])
    signal_to_noise_ratio = (other_energy - silence_energy) / silence_energy

    signals.append({
        'audio_path': row.audio_path,
        'loudness': loudness,
        'samples-per-character': signal.shape[0] / num_characters,
        'samples-per-phoneme': signal.shape[0] / num_phonemes,
        'samples-per-word': signal.shape[0] / num_words,
        'f0': _median_or_nan(f0[f0 > 0]),  # Median pitch over frames with f0 > 0
        'silence': numpy.where(f0 <= 0)[0].shape[0] / num_f0_frames,  # Fraction of frames with f0 <= 0
        'signal_to_noise_ratio': signal_to_noise_ratio,  # Signal to noise ratio
    })

HBox(children=(IntProgress(value=0, max=150), HTML(value='')))

INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend espeak-1.48.03
INFO:root:initializing backend esp

In [None]:
from IPython.display import Audio, display
from random import sample 
import os

# Display features to the user in a sorted order (ascending ``samples-per-phoneme``);
# helping visualize the feature distribution. Each entry is followed by an audio player.
for row in sorted(signals, key=lambda r: r['samples-per-phoneme']):
    for feature in ('audio_path', 'loudness', 'samples-per-character', 'samples-per-phoneme',
                    'samples-per-word', 'f0', 'silence', 'signal_to_noise_ratio'):
        print(feature + ':', row[feature])
    display(Audio(str(row['audio_path'])))