In [None]:
import os
import sys

import matplotlib.pyplot as plt
%matplotlib inline
import nengo
import numpy as np
import pandas as pd
import seaborn as sns
import soundfile as sf
from IPython.display import Audio
from nengo.utils.testing import Timer

import phd
from phd import timit

# TIMIT

We will use the [TIMIT](https://catalog.ldc.upenn.edu/LDC93S1) corpus.
[NLTK](http://www.nltk.org/) includes a small subset of the corpus,
which is useful for instruction;
however, for training you really want access to the full corpus.

### Notes

* Using TIMIT in PyLearn2
  * https://ift6266h14.wordpress.com/experimenting/
  * https://github.com/jfsantos/ift6266h14/blob/master/old/timit_full.py
  * http://vdumoulin.github.io/articles/timit-part-2/
  * https://jpraymond.wordpress.com/2014/02/21/using-the-new-an-improved-pylearn2-timit-dataset/
  * https://github.com/vdumoulin/research/blob/master/code/pylearn2/datasets/timit.py
  * https://github.com/jfsantos/ift6266h14/tree/master/old/pylearn2_timit

### Possibly useful Python packages

* [`PySoundFile`](https://github.com/bastibe/PySoundFile) (reads NIST Sphere, hopefully)
* [`PySoundCard`](https://github.com/bastibe/PySoundCard)
* [`audio` and related tools](https://github.com/boisgera?tab=repositories) (psychoacoustics?)

# Incorporating with Nengo

Basically, we want to use TIMIT
to generate evaluation points
and phoneme targets,
which we will use to solve for
appropriate decoding weights
for the ensembles that represent
acoustic features.

In [None]:
%%javascript
// Adds a 'kill and run-first' toolbar button: it restarts the kernel and,
// once the kernel is ready again, executes the cell at index 0 and then
// re-selects the cell that was selected before.
// The length check skips registration when an element with the id
// 'kill-run-first' is already present under the toolbar.
if($(IPython.toolbar.selector.concat(' > #kill-run-first')).length == 0){
  IPython.toolbar.add_buttons_group([
    {
      'label'   : 'kill and run-first',
      'icon'    : 'fa fa-angle-double-down',
      'callback': function(){
        IPython.notebook.kernel.restart();
        // One-shot handler: fires on the next 'kernel_ready.Kernel' event only
        $(IPython.events).one('kernel_ready.Kernel', function(){
          // Remember the current selection so it can be restored afterwards
          var idx = IPython.notebook.get_selected_index();
          IPython.notebook.select(0);
          IPython.notebook.execute_cell();
          IPython.notebook.select(idx);
        });
      }
    }
  ], 'kill-run-first');
}

In [None]:
# Let's work with a single utterance first
# Get the utterance and the data associated with it
utt = timit.Utterance(
    corpus='TRAIN',
    region=1,
    sex='M',
    spkr_id='DAC0',
    sent_type='A',
    sent_number=1,
)

# sf.read returns the samples together with the file's sampling rate
data, fs = sf.read(utt.wav)
dt = 1. / fs  # seconds per sample
plt.plot(np.arange(data.size) * dt, data)
plt.xlabel('Time (s)')
# A single pre-formatted argument prints the same text on Python 2 and 3
print("%s %s" % (data.shape, data.dtype))

In [None]:
# Play the utterance inline; the samples are passed as a flat 1-D array
Audio(data=data.ravel(), rate=fs)

Phone transcriptions are (fortunately!) available
in `*.phn` files. Here's an example.

In [None]:
# Dump the raw transcription via the shell; IPython interpolates {utt.phn}
!cat {utt.phn}

## Phonemes in TIMIT

### Consonants

#### Stops

| Symbol | Example word    | Possible phonetic transcription |
|--------|-----------------|---------------------------------|
| b      |    bee          |    BCL B iy                     |
| d      |    day          |    DCL D ey                     |
| g      |    gay          |    GCL G ey                     |
| p      |    pea          |    PCL P iy                     |
| t      |    tea          |    TCL T iy                     |
| k      |    key          |    KCL K iy                     |
| dx     |    muddy, dirty |    m ah DX iy, dcl d er DX iy   |
| q      |    bat          |    bcl b ae Q                   |

####  Affricates

| Symbol | Example word | Possible phonetic transcription |
|--------|--------------|---------------------------------|
| jh     |    joke      |    DCL JH ow kcl k              |
| ch     |    choke     |    TCL CH ow kcl k              |

####  Fricatives

| Symbol | Example word | Possible phonetic transcription |
|--------|--------------|---------------------------------|
| s      |    sea       |    S iy                         |
| sh     |    she       |    SH iy                        |
| z      |    zone      |    Z ow n                       |
| zh     |    azure     |    ae ZH er                     |
| f      |    fin       |    F ih n                       |
| th     |    thin      |    TH ih n                      |
| v      |    van       |    V ae n                       |
| dh     |    then      |    DH eh n                      |

#### Nasals

| Symbol | Example word  | Possible phonetic transcription |
|--------|---------------|---------------------------------|
| m      |    mom        |    M aa M                       |
| n      |    noon       |    N uw N                       |
| ng     |    sing       |    s ih NG                      |
| em     |    bottom     |    b aa tcl t EM                |
| en     |    button     |    b ah q EN                    |
| eng    |    washington |    w aa sh ENG tcl t ax n       |
| nx     |    winner     |    w ih NX axr                  |

#### Semivowels and glides

| Symbol | Example word | Possible phonetic transcription |
|--------|--------------|---------------------------------|
| l      |    lay       |    L ey                         |
| r      |    ray       |    R ey                         |
| w      |    way       |    W ey                         |
| y      |    yacht     |    Y aa tcl t                   |
| hh     |    hay       |    HH ey                        |
| hv     |    ahead     |    ax HV eh dcl d               |
| el     |    bottle    |    bcl b aa tcl t EL            |

###  Vowels

| Symbol | Example word | Possible phonetic transcription  |
|--------|--------------|----------------------------------|
| iy     |    beet      |    bcl b IY tcl t                |
| ih     |    bit       |    bcl b IH tcl t                |
| eh     |    bet       |    bcl b EH tcl t                |
| ey     |    bait      |    bcl b EY tcl t                |
| ae     |    bat       |    bcl b AE tcl t                |
| aa     |    bott      |    bcl b AA tcl t                |
| aw     |    bout      |    bcl b AW tcl t                |
| ay     |    bite      |    bcl b AY tcl t                |
| ah     |    but       |    bcl b AH tcl t                |
| ao     |    bought    |    bcl b AO tcl t                |
| oy     |    boy       |    bcl b OY                      |
| ow     |    boat      |    bcl b OW tcl t                |
| uh     |    book      |    bcl b UH kcl k                |
| uw     |    boot      |    bcl b UW tcl t                |
| ux     |    toot      |    tcl t UX tcl t                |
| er     |    bird      |    bcl b ER dcl d                |
| ax     |    about     |    AX bcl b aw tcl t             |
| ix     |    debit     |    dcl d eh bcl b IX tcl t       |
| axr    |    butter    |    bcl b ah dx AXR               |
| ax-h   |    suspect   |    s AX-H s pcl p eh kcl k tcl t |

### Others

| Symbol | Description                          |
|--------|--------------------------------------|
| pau    | pause                                |
| epi    | epenthetic silence                   |
| h#     | begin/end marker (non-speech events) |
| 1      | primary stress marker                |
| 2      | secondary stress marker              |

In [None]:
from phd.timit import consonants, closures, vowels, ignores
# Parenthesised single-argument print behaves identically on Python 2 and 3
print(consonants)

Let's parse a `.phn` file into a string of phonemes
and their corresponding audio slices.
We'll split these into separate vowel
and consonant lists.

In [None]:
from collections import defaultdict
# Phoneme label -> list of audio slices, kept separately for consonants
# and vowels
cons = defaultdict(list)
vows = defaultdict(list)

with open(utt.phn, 'r') as phnfile:
    for line in phnfile:
        # Each line holds three whitespace-separated fields:
        # start index, end index, phoneme label
        start, end, phn = line.split()
        start, end = int(start), int(end)

        # Labels listed in `ignores` contribute no audio
        if phn in ignores:
            continue
        # Labels found in `closures` are replaced by the label they map to
        if phn in closures:
            phn = closures[phn]

        # start/end index directly into the sample array; np.array() makes
        # an independent copy of the slice
        dataslice = np.array(data[start:end])
        if phn in consonants:
            cons[phn].append(dataslice)
        elif phn in vowels:
            vows[phn].append(dataslice)
        else:
            # Fail loudly on any label that is in none of the tables
            raise ValueError("Unrecognized phoneme: '%s'" % phn)

In [None]:
# Let's look at all of the speech samples for a random vowel
import random
vow_phn = random.choice(list(vows))
print(vow_phn)
speech = np.concatenate(vows[vow_phn])
# Plot the signal once, against time in seconds. Drawing it a second time
# against the raw sample index on the same axes would stretch the x-range
# to thousands and squash the time-based curve against the left edge.
dt = 1. / fs
plt.plot(np.arange(speech.size) * dt, speech)

In [None]:
# Let's repeat this to get something we can listen to

# Root directory of the corpus, i.e. the 'timit' folder below `extract_path`.
# NOTE(review): `extract_path` is not assigned in any cell visible in this
# notebook -- it must already exist in the kernel; confirm where it is set.
timit_root = os.path.join(extract_path, 'timit')

def timit_path(corpus, region, sex, spkr_id, sent_type, sent_number):
    """Build the extension-less path of one utterance below ``timit_root``.

    The layout is
    ``<timit_root>/<corpus>/DR<region>/<sex><spkr_id>/S<sent_type><sent_number>``.
    No file extension is appended; callers add it themselves.
    """
    region_dir = "DR%d" % region
    speaker_dir = "%s%s" % (sex, spkr_id)
    sentence = "S%s%d" % (sent_type, sent_number)
    parts = (timit_root, corpus, region_dir, speaker_dir, sentence)
    return os.path.join(*parts)

def add_utterance(tpath, cons, vows):
    """Append the phoneme audio slices of one utterance to ``cons``/``vows``.

    ``<tpath>.WAV`` supplies the samples and ``<tpath>.PHN`` the segmentation,
    one ``start end label`` triple per line. Labels in ``ignores`` are
    skipped, labels in ``closures`` are replaced by the label they map to,
    and the slice ``[start:end]`` is then appended to ``cons[label]`` or
    ``vows[label]``.

    Raises
    ------
    ValueError
        If a label is neither a consonant nor a vowel.
    """
    # The sampling rate returned by sf.read is not needed here
    samples, _ = sf.read("%s.WAV" % tpath)
    with open("%s.PHN" % tpath, 'r') as transcript:
        for entry in transcript:
            start_txt, end_txt, label = entry.split()
            begin = int(start_txt)
            stop = int(end_txt)

            if label in ignores:
                continue
            label = closures[label] if label in closures else label

            # Pick the destination table first, then store a copy of the slice
            if label in consonants:
                bucket = cons
            elif label in vowels:
                bucket = vows
            else:
                raise ValueError("Unrecognized phoneme: '%s'" % label)
            bucket[label].append(np.array(samples[begin:stop]))

cons = defaultdict(list)
vows = defaultdict(list)

region = 1
sex = 'M'
spkr_id = 'CPM0'

# (sent_type, sent_number) pairs are kept together so the two can never get
# out of step: zip() over two parallel lists of different lengths silently
# truncates to the shorter one and would drop an utterance.
sentences = [
    ('A', 1), ('A', 2),
    ('I', 564), ('I', 1194), ('I', 1824),
    ('X', 24), ('X', 114), ('X', 204), ('X', 294), ('X', 384),
]
for sent_type, sent_number in sentences:
    tpath = timit_path('TRAIN', region, sex, spkr_id, sent_type, sent_number)
    add_utterance(tpath, cons=cons, vows=vows)

# Let's hear all the 'ow' phonemes
phn = np.concatenate(vows['ow']).ravel()
print(phn.shape)
# NOTE(review): `fs` is whatever an earlier cell left in the kernel;
# add_utterance() does not hand back the rate of the files it reads.
Audio(data=phn, rate=fs)  # Dunno why this only works 10% of the time...

In [None]:
from phd.sounds import ArrayProcess

# Final step: transform cons and vows into eval_points and targets
def phn2nengo(model, probe, phonemes, samples):
    """Simulate ``model`` on each phoneme's audio to build training data.

    For every phoneme, all of its clips in ``samples`` are concatenated,
    installed as ``model.auditory_filter.sound_process`` and simulated;
    the data recorded by ``probe`` becomes that phoneme's evaluation points.
    The model's original sound process is restored afterwards, even if a
    simulation fails.

    Parameters
    ----------
    model : object exposing ``auditory_filter.sound_process``
        Passed unchanged to ``nengo.Simulator``.
    probe : key into ``sim.data``
        Whatever should be read back after each run.
    phonemes : sequence
        Phoneme labels; the position of a label is its row in the targets.
    samples : mapping of label -> list of arrays
        Audio clips per phoneme. Labels with no clips are skipped (their
        target row stays all-zero).

    Returns
    -------
    eval_points : ndarray
        Probe data of all simulated phonemes, stacked along axis 0.
    targets : ndarray, shape ``(len(phonemes), total_audio_samples)``
        Row ``i`` is 1 over the samples belonging to ``phonemes[i]`` and 0
        elsewhere.

    NOTE(review): targets have one column per audio sample, whereas the
    number of eval-point rows is whatever the probe recorded; the caller has
    to reconcile the two lengths.
    """
    orig_sound = model.auditory_filter.sound_process
    # `fs` is read from the enclosing notebook namespace
    dt = 1. / fs

    eval_points = []
    targets = []
    try:
        for i, phoneme in enumerate(phonemes):
            clips = samples.get(phoneme, [])
            if len(clips) == 0:
                # Nothing recorded for this phoneme; np.concatenate would
                # raise on an empty list.
                continue
            sound = np.concatenate(clips).ravel()
            target = np.zeros((len(phonemes), sound.size))
            # Mark this phoneme as active for the whole duration of its audio
            target[i] = 1.
            model.auditory_filter.sound_process = ArrayProcess(sound)
            # The simulation step is half the audio sample period
            sim = nengo.Simulator(model, dt=dt*.5)
            sim.run(dt * sound.size)
            # TODO: optionally pool the probe output before storing it
            eval_points.append(sim.data[probe])
            targets.append(target)
    finally:
        model.auditory_filter.sound_process = orig_sound

    # Targets are (phoneme, sample), so successive phonemes extend the sample
    # axis; eval points are stacked along their first axis.
    return np.concatenate(eval_points), np.concatenate(targets, axis=1)

# fs = 20000.
freqs = phd.filters.erbspace(20, 10000, 64)
sound = phd.sounds.WavFile('speech.wav')
aud_filter = phd.filters.gammatone(freqs)
cons_delay = 0.075
vowel_delay = 0.03
# Note: no integrator here

model = phd.SpeechRecognition()
model.add_periphery(freqs, sound, aud_filter, fs=fs, middle_ear=True)
model.add_derivative(n_neurons=30, delay=cons_delay)
model.add_derivative(n_neurons=30, delay=vowel_delay)

with model:
    # TODO: put all the info into one probe
    pass

with model:
    # NOTE(review): `vowel` is not defined anywhere in this notebook and
    # `cons` is the defaultdict of audio clips, not a model object. These
    # two lines presumably should probe the derivative stages added above;
    # confirm which objects to probe before running this cell.
    vowel_p = nengo.Probe(vowel, synapse=0.01, sample_every=0.001)
    cons_p = nengo.Probe(cons, synapse=0.01, sample_every=0.001)

# Pass the probes created just above (they are named vowel_p / cons_p)
vowel_ep, vowel_targets = phn2nengo(model, vowel_p, vowels, vows)
cons_ep, cons_targets = phn2nengo(model, cons_p, consonants, cons)

_, vow_detect = model.add_phoneme_detector(15, vowel_ep, vowel_targets, [vowel_delay])
_, cons_detect = model.add_phoneme_detector(15, cons_ep, cons_targets, [cons_delay])

In [None]:
# This should give us all audio clips of a particular phoneme...
training = timit.TrainingData(None, [0.01], timit.consonants + timit.vowels)
with Timer() as t:
    audio = training.generate_audio()
print("Took %s seconds" % t.duration)

# Accumulate into `n_bytes`; calling it `bytes` would shadow the builtin for
# the rest of the kernel session.
n_bytes = sys.getsizeof(audio)
for phn in audio:
    clips = audio[phn]
    n_bytes += sys.getsizeof(clips)
    # getsizeof() on a container does not follow references, so the sample
    # buffers have to be added explicitly.
    # assumes audio[phn] is a list of arrays (a later cell feeds it to
    # np.concatenate) -- TODO confirm
    n_bytes += sum(np.asarray(clip).nbytes for clip in clips)
print("Takes up %f MB of memory" % (float(n_bytes) / 1e6))

In [None]:
training = timit.TrainingData(None, [0.01], ['b', 'd', 't'])
with Timer() as t:
    audio = training.generate_audio()
print("Took %s seconds" % t.duration)
with Timer() as t:
    targets = training.generate_targets(audio)
print("Took %s seconds" % t.duration)

# One flat signal: the clips of each phoneme, phonemes in sorted order
all_audio = np.concatenate(
    [np.concatenate(audio[phoneme]).ravel() for phoneme in sorted(audio)])
print(all_audio.shape)

# Show every 16th audio sample of the first 5000 next to the matching stretch
# of targets. Floor division keeps the slice bound an integer on Python 3 too.
plt.plot(all_audio[:5000:16])
plt.plot(targets.T[:5000 // 16])

In [None]:
# Build a Sermo model with execution=False and configure its auditory periphery
model = phd.Sermo(execution=False)
periphery = model.recognition.periphery
periphery.fs = 20000
periphery.freqs = phd.filters.erbspace(20, 10000, 64)
periphery.sound_process = phd.processes.WavFile('speech.wav')
periphery.auditory_filter = phd.filters.gammatone(periphery.freqs)
# Two derivative stages with different delays, both set to the 'TrippFF'
# class with no extra arguments
fast_deriv = model.recognition.add_derivative(delay=0.01)
fast_deriv.klass = 'TrippFF'
fast_deriv.args = {}
slow_deriv = model.recognition.add_derivative(delay=0.1)
slow_deriv.klass = 'TrippFF'
slow_deriv.args = {}

# Small trial run: a single phoneme ('ae') with max_simtime=0.5
training = timit.TrainingData(model, [0.01], ['ae'], max_simtime=0.5)
print("Generated: %s" % training.generated)
print(training.cache_file())
timit.TrainingData.clear_cache()
# get() is attempted right after clearing the cache; any exception is printed
# rather than aborting the cell (presumably to show that get() needs
# generate() to have run first)
try:
    training.get()
except Exception as e:
    print(e)
training.generate()

In [None]:
# Fetch the generated data (presumably eval points and targets) and display
# the shapes of both arrays
ep, t = training.get()
ep.shape, t.shape

In [None]:
from phd.plots import cochleogram

freqs = model.recognition.periphery.freqs
dims = freqs.size
# One time stamp per row of `ep`, spaced by the training sample period
time = np.arange(ep.shape[0]) * training.sample_every
# Only the columns from index `dims` onwards are plotted
# NOTE(review): presumably the first `dims` columns belong to another
# feature block -- confirm the column layout of `ep`
cochleogram(ep[:, dims:], time, freqs)
plt.figure(figsize=(8, 2))
# Targets are transposed so that the first axis lines up with `time`
plt.plot(time, t.T)

In [None]:
# It seems to work; let's generate training data for vowels and consonants.
# For now, we'll use the fast derivative for consonants,
# slow for vowels.
# NOTE(review): the model setup below repeats the one in the earlier 'ae'
# cell line for line; a shared helper would keep the two from drifting.
model = phd.Sermo(execution=False)
periphery = model.recognition.periphery
periphery.fs = 20000
periphery.freqs = phd.filters.erbspace(20, 10000, 64)
periphery.sound_process = phd.processes.WavFile('speech.wav')
periphery.auditory_filter = phd.filters.gammatone(periphery.freqs)
fast_deriv = model.recognition.add_derivative(delay=0.01)
fast_deriv.klass = 'TrippFF'
fast_deriv.args = {}
slow_deriv = model.recognition.add_derivative(delay=0.1)
slow_deriv.klass = 'TrippFF'
slow_deriv.args = {}

# The second argument matches the delay of the derivative each set uses:
# 0.01 (fast) for consonants, 0.1 (slow) for vowels
train_cons = timit.TrainingData(model, [0.01], timit.consonants)
train_vow = timit.TrainingData(model, [0.1], timit.vowels)
train_cons.generate()
train_vow.generate()

In [None]:
# These are kind of huge amounts of data...
from phd import config
# Shell out to `du -h` to list the size of every entry in the cache
# directory; IPython interpolates {config.cache_dir}
!du -h {config.cache_dir}/*

In [None]:
# So let's do some testing to see if any of our solvers can handle it
# Based on nengo.tests.test_solvers
from nengo import solvers

def get_rate_function(n_neurons, dims, neuron_type=nengo.LIF, rng=None):
    """Return a callable that maps inputs to firing rates of random neurons.

    A population of ``n_neurons`` neurons of ``neuron_type`` is created and
    its gain and bias are derived from two uniform draws: one in [200, 400)
    and one in [-1, 1) (presumably max firing rates and intercepts -- confirm
    against ``gain_bias``).

    Parameters
    ----------
    n_neurons : int
        Number of neurons in the population.
    dims : int
        Unused; kept so the signature mirrors ``get_encoders``.
    neuron_type : callable
        Called as ``neuron_type(n_neurons)`` to build the population.
    rng : numpy.random.RandomState or None
        Source of randomness; the global ``np.random`` is used when None.

    Returns
    -------
    callable
        ``rates(x)`` evaluating ``neurons.rates(x, gain, bias)``.
    """
    if rng is None:
        # The declared default would otherwise fail on `rng.uniform`
        rng = np.random
    neurons = neuron_type(n_neurons)
    gain, bias = neurons.gain_bias(
        rng.uniform(200, 400, n_neurons), rng.uniform(-1, 1, n_neurons))

    def rates(x):
        return neurons.rates(x, gain, bias)

    return rates


def get_encoders(n_neurons, dims, rng=None):
    """Draw ``n_neurons`` encoders from the surface of a hypersphere.

    The sample for ``(n_neurons, dims)`` is returned transposed, so that
    ``np.dot(points, encoders)`` projects points onto every neuron.
    """
    sphere_surface = nengo.dists.UniformHypersphere(surface=True)
    encoders = sphere_surface.sample(n_neurons, dims, rng=rng)
    return encoders.T


def test_decoder_solver(solver, eval_points, targets):
    rng = np.random.RandomState(10)

    dims = eval_points.shape[1]
    n_neurons = 30 * dims
    rates = get_rate_function(n_neurons, dims, rng=rng)
    E = get_encoders(n_neurons, dims, rng=rng)

    Atargets = rates(np.dot(eval_points, E))
    D, info = solver(Atargets, targets, rng=rng)

    # est = np.dot(Atargets, D)
    # rel_rmse = rms(est - targets) / rms(targets)
    print solver
    print "  Time: %s" % info['time']
    print "  Mean RMSE: %s" % np.mean(info['rmses'])


# We'll use the consonants for now
eval_points, targets = train_cons.get()

# Things are a little off... that's fine for now, but we should TODO fix that
# Trim both arrays to the number of samples they have in common, so the
# solver gets matching lengths whichever of the two is longer.
n_samples = min(eval_points.shape[0], targets.shape[1])
eval_points = eval_points[:n_samples]
targets = targets[:, :n_samples]

print("eval_points.shape=%s" % (eval_points.shape,))
print("targets.shape=%s" % (targets.shape,))

# Sub-solver for the two stages of LstsqDrop.
# NOTE(review): cholesky is chosen to match the other L2 entries below --
# confirm this is the sub-solver the recorded results were produced with.
subsolver = solvers.cholesky

# The list gets its own name so the imported `solvers` module is not
# rebound, which would break this cell on a second run.
decoder_solvers = [
    solvers.Lstsq(rcond=0.01),
    solvers.LstsqNoise(noise=0.1, solver=solvers.cholesky),
    solvers.LstsqMultNoise(noise=0.1, solver=solvers.cholesky),
    solvers.LstsqL2(reg=0.1, solver=solvers.cholesky),
    solvers.LstsqL2nz(reg=0.1, solver=solvers.cholesky),
    # solvers.LstsqL1(l1=1e-4, l2=1e-6),  # Way too slow... accurate though
    solvers.LstsqDrop(drop=0.25,
                      solver1=solvers.LstsqL2nz(reg=0.1, solver=subsolver),
                      solver2=solvers.LstsqL2nz(reg=0.01, solver=subsolver)),
    solvers.Nnls(),
    solvers.NnlsL2(reg=0.1),
    solvers.NnlsL2nz(reg=0.1),
    solvers.LstsqNoise(noise=0.1, solver=solvers.randomized_svd),
    solvers.LstsqMultNoise(noise=0.1, solver=solvers.randomized_svd),
    solvers.LstsqL2(reg=0.1, solver=solvers.randomized_svd),
]

for solver in decoder_solvers:
    test_decoder_solver(solver, eval_points, targets.T)

In [None]:
# Last cell takes too long, so I just copied the results here
res = [
    # solver, time, mean_rmse
    ('Lstsq', 154.704920053, 0.141213372439),
    ('LstsqNoise (cholesky)', 51.0423779488, 0.130832001688),
    ('LstsqMultNoise (cholesky)', 50.0834109783, 0.11170648566),
    ('LstsqL2 (cholesky)', 22.3267519474, 0.123292167674),
    ('LstsqL2nz (cholesky)', 25.1613600254, 0.11741118202),
    ('LstsqL1', 18306.552321, 0.121022449449),
    ('LstsqDrop', 638.563055038, 0.105056237968),
    ('Nnls', 3678.89870405, 0.154833377335),
    ('NnlsL2', 73.7885270119, 0.157904333119),
    ('NnlsL2nz', 75.2918219566, 0.165642470283),
    ('LstsqNoise (randomized_svd)', 32.4525079727, 0.155343124338),
    ('LstsqMultNoise (randomized_svd)', 33.2236630917, 0.154465728835),
    ('LstsqL2 (randomized_svd)', 6.79836702347, 0.154354227613),
]

# Stored as `solver_results` so the audio samples held in `data` by the
# earlier cells are not replaced by a DataFrame.
solver_results = pd.DataFrame(res, columns=['solver', 'time', 'mean_rmse'])

plt.figure()
sns.barplot(y='solver', x='time', data=solver_results)
plt.ylabel('')
plt.xlabel('Time')
# The axis is cut at 200 so the fast solvers stay readable; the bars of
# LstsqL1, LstsqDrop and Nnls run past the right edge.
plt.xlim(right=200)
plt.figure()
sns.barplot(y='solver', x='mean_rmse', data=solver_results)
plt.ylabel('')
plt.xlabel('Mean RMSE')