In [None]:
# Common imports
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import nengo
import nengo.utils.numpy as npext
# import nengo_ocl
import nengo_gui.ipython

import phd

# Plot defaults for the whole notebook: an 8x5 default figure size, then the
# seaborn 'white' and 'ticks' styles applied in that order.
matplotlib.rc('figure', figsize=(8, 5))
for _style in ('white', 'ticks'):
    sns.set_style(_style)

In [None]:
%%javascript
// Add a "kill and run-first" toolbar button group, unless the toolbar already
// has a direct child element with the id 'kill-run-first'.
if($(IPython.toolbar.selector.concat(' > #kill-run-first')).length == 0){
  IPython.toolbar.add_buttons_group([
    {
      'label'   : 'kill and run-first',
      'icon'    : 'fa fa-angle-double-down',
      'callback': function(){
        // Restart the kernel, then react exactly once (.one) to the
        // 'kernel_ready.Kernel' event.
        IPython.notebook.kernel.restart();
        $(IPython.events).one('kernel_ready.Kernel', function(){
          // Remember the current selection, execute the first cell (index 0),
          // then put the selection back where it was.
          var idx = IPython.notebook.get_selected_index();
          IPython.notebook.select(0);
          IPython.notebook.execute_cell();
          IPython.notebook.select(idx);
        });
      }
    }
  ], 'kill-run-first');
}

In [None]:
# Sampling rate and the matching time step (1 / fs) used for the sound
# preview below. NOTE(review): units are presumably Hz and seconds — confirm.
fs = 25000.
dt = 1. / fs

def plot_sound(process, t, dt):
    """Plot what a sound process produces over a run of length ``t``.

    ``process`` must provide ``trange(t, dt=dt)`` (used for the x values) and
    ``run(t, dt=dt)`` (used for the y values). The trace is drawn in a new
    figure whose right x-limit is set to ``t``, and seaborn's ``despine`` is
    applied afterwards.
    """
    plt.figure()
    times = process.trange(t, dt=dt)
    samples = process.run(t, dt=dt)
    plt.plot(times, samples)
    plt.xlim(right=t)
    sns.despine()

# Preview the speech recording with the dt defined above; the commented-out
# calls swap in the other sound sources for a shorter (0.1) run.
plot_sound(phd.sounds.WavFile('speech.wav'), 0.667, dt)
# plot_sound(phd.sounds.WhiteNoise(), 0.1, dt)
# plot_sound(phd.sounds.Tone(250), 0.1, dt)

#  Recognition system

## Auditory periphery

This section makes heavy use of [Brian hears](http://www.briansimulator.org/docs/hears.html),
but other periphery models should also be investigated.

In [None]:
# Periphery-only model driven by white noise.
# NOTE(review): erbspace(20, 10000, 64) presumably yields 64 ERB-spaced
# frequencies between 20 and 10000 Hz — confirm against phd.filters.
fs = 20000.
freqs = phd.filters.erbspace(20, 10000, 64)
sound = phd.sounds.WhiteNoise()
aud_filter = phd.filters.gammatone(freqs)

model = phd.SpeechRecognition()
model.add_periphery(freqs, sound, aud_filter, fs=fs)

# Unfiltered (synapse=None) probes on model.ihc, on the input of model.an,
# and on whatever model.an.add_neuron_output() returns.
with model:
    ihc_p = nengo.Probe(model.ihc, synapse=None)
    an_in_p = nengo.Probe(model.an.input, synapse=None)
    an_p = nengo.Probe(model.an.add_neuron_output(), synapse=None)

In [None]:
from nengo.utils.matplotlib import rasterplot

# The printed dt is derived from the highest filter frequency; the simulator
# itself is stepped at half of that value.
dt = 1. / freqs.max()
print("dt=%.5f" % dt)
sim = nengo.Simulator(model, dt=dt * .5)
sim.run(0.1)

# One cochleogram figure for each of the two continuous-valued probes.
for _probe in (ihc_p, an_in_p):
    plt.figure()
    phd.plots.cochleogram(sim.data[_probe], sim.trange(), freqs)

# Raster plot of the neuron-output probe, with the y axis spanning
# n_neurons * n_ensembles of model.an.
plt.figure()
rasterplot(sim.trange(), sim.data[an_p])
plt.ylim(0, model.an.n_neurons * model.an.n_ensembles)

In [None]:
# Total neuron count, summed over every ensemble in model.all_ensembles.
print(sum(ens.n_neurons for ens in model.all_ensembles))

## Preprocessing layer

In [None]:
# Preprocessing model: the periphery (now with middle_ear=True) fed by
# speech.wav, plus one derivative layer and one integrator layer.
fs = 20000.
freqs = phd.filters.erbspace(20, 10000, 64)
sound = phd.sounds.WavFile('speech.wav')
aud_filter = phd.filters.gammatone(freqs)

model = phd.SpeechRecognition()
model.add_periphery(freqs, sound, aud_filter, fs=fs, middle_ear=True)
model.add_derivative(n_neurons=30, delay=0.01, tau_highpass=0.05)
model.add_integrator(n_neurons=20, tau=0.2)

# All probes are sampled every 0.001. model.derivatives and model.integrators
# are looked up with the same delay / tau values passed to the add_* calls
# above (they appear to be keyed by those values — confirm in phd).
with model:
    ihc_p = nengo.Probe(model.ihc, synapse=None, sample_every=0.001)
    an_p = nengo.Probe(model.an.output, synapse=0.01, sample_every=0.001)
    d_p = nengo.Probe(model.derivatives[0.01], synapse=0.01, sample_every=0.001)
    i_p = nengo.Probe(model.integrators[0.2], synapse=0.01, sample_every=0.001)

In [None]:
# Simulate at half of 1 / freqs.max() for 0.667 (the same duration used when
# previewing speech.wav earlier in the notebook).
dt = 1. / freqs.max()
sim = nengo.Simulator(model, dt=dt*.5)
sim.run(0.667)

# 2x2 grid of cochleograms: ihc_p, an_p, d_p, i_p. sim.trange(0.001) matches
# the sample_every=0.001 used when the probes were created.
plt.figure(figsize=(10, 10))
plt.subplot(2, 2, 1)
phd.plots.cochleogram(sim.data[ihc_p], sim.trange(0.001), freqs)
plt.subplot(2, 2, 2)
phd.plots.cochleogram(sim.data[an_p], sim.trange(0.001), freqs)
plt.subplot(2, 2, 3)
phd.plots.cochleogram(sim.data[d_p], sim.trange(0.001), freqs)
plt.subplot(2, 2, 4)
phd.plots.cochleogram(sim.data[i_p], sim.trange(0.001), freqs)

In [None]:
# Total neuron count, summed over every ensemble in model.all_ensembles.
print(sum(ens.n_neurons for ens in model.all_ensembles))

## Feature layer

### No hierarchy

In [None]:
def _sum_adjacent_columns(data, pool, n_groups):
    """Sum each run of ``pool`` adjacent columns of ``data`` into one column."""
    pooled = np.zeros((data.shape[0], n_groups))
    for group in range(n_groups):
        pooled[:, group] = np.sum(data[:, group*pool:(group+1)*pool], axis=1)
    return pooled


def get_eval_points(model, pool=None):
    """Simulate ``model`` once and collect evaluation points for the detectors.

    Three temporary probes are added to ``model``: one on ``model.an.output``
    and one on each of ``model.derivatives[cons_delay]`` and
    ``model.derivatives[vowel_delay]``. The model is then simulated for 0.667
    with a step of half ``1 / freqs.max()``.

    The temporary probes are always removed from ``model.probes`` before this
    function exits, including when probe creation or the simulation raises,
    so a failed call does not leave extra probes attached to the model.

    This function reads the notebook-level names ``freqs``, ``cons_delay`` and
    ``vowel_delay``; they must be defined before it is called.

    Parameters
    ----------
    model
        Object usable as ``with model:`` that exposes ``an.output``,
        ``derivatives`` and a ``probes`` list.
    pool : int, optional
        When given, every run of ``pool`` adjacent columns is summed into a
        single column. The number of pooled columns is computed from the vowel
        data and reused for the consonant data; trailing columns that do not
        fill a whole group are dropped.

    Returns
    -------
    vowel, consonant, t
        ``vowel`` is the ``an`` data stacked column-wise with the
        ``vowel_delay`` derivative data, ``consonant`` the same with the
        ``cons_delay`` derivative data (both pooled if requested), and ``t``
        is ``sim.trange(0.001)``.
    """
    temp_probes = []
    try:
        with model:
            an_p = nengo.Probe(model.an.output, synapse=0.01, sample_every=0.001)
            temp_probes.append(an_p)
            c_p = nengo.Probe(model.derivatives[cons_delay], synapse=0.01, sample_every=0.001)
            temp_probes.append(c_p)
            v_p = nengo.Probe(model.derivatives[vowel_delay], synapse=0.01, sample_every=0.001)
            temp_probes.append(v_p)
        dt = 1. / freqs.max()
        sim = nengo.Simulator(model, dt=dt*.5)
        sim.run(0.667)
        vowel = np.hstack([sim.data[an_p], sim.data[v_p]])
        consonant = np.hstack([sim.data[an_p], sim.data[c_p]])
        times = sim.trange(0.001)
    finally:
        # Only probes that were successfully created are tracked, so each one
        # is known to be in model.probes.
        for probe in temp_probes:
            model.probes.remove(probe)

    if pool is not None:
        # The group count comes from the vowel data and is shared by both
        # arrays, as in the original implementation.
        n_groups = vowel.shape[1] // pool
        vowel = _sum_adjacent_columns(vowel, pool, n_groups)
        consonant = _sum_adjacent_columns(consonant, pool, n_groups)
    return vowel, consonant, times

# hack for now: let's just manually specify
# phonemes: e n schwa r j i z
# vowel: 0.05 e 0.145 schwa 0.27 i end
# consonant: 0.11 n 0.2 r 0.263 j 0.5 z
def vowel_targets(t):
    """Build one-hot vowel targets (columns: e, schwa, i) for times ``t``.

    ``t`` is a 1-D array of times. The result has shape ``(t.size, 3)``; a row
    is all zeros before 0.05, and otherwise has a single 1 in the column of
    the vowel whose hand-specified window contains that time.
    """
    # (onset, offset) per column; offset None means "until the end".
    windows = ((0.05, 0.145), (0.145, 0.27), (0.27, None))
    targets = np.zeros((t.size, len(windows)))
    for column, (onset, offset) in enumerate(windows):
        active = t >= onset
        if offset is not None:
            active = active & (t < offset)
        targets[active, column] = 1
    return targets

def cons_targets(t):
    """Build one-hot consonant targets (columns: n, r, j, z) for times ``t``.

    ``t`` is a 1-D array of times. The result has shape ``(t.size, 4)``; a row
    is all zeros before 0.11, and otherwise has a single 1 in the column of
    the consonant whose hand-specified window contains that time.
    """
    # (onset, offset) per column; offset None means "until the end".
    windows = ((0.11, 0.2), (0.2, 0.263), (0.263, 0.5), (0.5, None))
    targets = np.zeros((t.size, len(windows)))
    for column, (onset, offset) in enumerate(windows):
        active = t >= onset
        if offset is not None:
            active = active & (t < offset)
        targets[active, column] = 1
    return targets

In [None]:
# Flat (non-hierarchical) phoneme detection on speech.wav. cons_delay and
# vowel_delay are also read as globals by get_eval_points.
fs = 20000.
freqs = phd.filters.erbspace(20, 10000, 64)
sound = phd.sounds.WavFile('speech.wav')
aud_filter = phd.filters.gammatone(freqs)
cons_delay = 0.075
vowel_delay = 0.03
# Note: no integrator here

model = phd.SpeechRecognition()
model.add_periphery(freqs, sound, aud_filter, fs=fs, middle_ear=True)
model.add_derivative(n_neurons=30, delay=cons_delay)
model.add_derivative(n_neurons=30, delay=vowel_delay)

# Run the model once to gather evaluation points, then pair them with the
# hand-specified target functions when adding each detector.
vowel_ep, cons_ep, t = get_eval_points(model)
_, vowel = model.add_phoneme_detector(15, vowel_ep, vowel_targets(t), [vowel_delay])
_, cons = model.add_phoneme_detector(15, cons_ep, cons_targets(t), [cons_delay])

# Probe the second value returned by each add_phoneme_detector call, filtered
# with synapse=0.01 and sampled every 0.001.
with model:
    vowel_p = nengo.Probe(vowel, synapse=0.01, sample_every=0.001)
    cons_p = nengo.Probe(cons, synapse=0.01, sample_every=0.001)

In [None]:
# Simulate at half of 1 / freqs.max(); t holds the sample times at the
# 0.001 spacing used by the probes.
dt = 1. / freqs.max()
sim = nengo.Simulator(model, dt=dt * .5)
sim.run(0.667)
t = sim.trange(0.001)

# Two stacked panels: vowel detector outputs on top, consonant outputs below.
plt.figure(figsize=(10, 10))
_panels = (
    (vowel_p, ["e", "schwa", "i"]),
    (cons_p, ["n", "r", "j", "z"]),
)
for _row, (_probe, _labels) in enumerate(_panels, start=1):
    plt.subplot(2, 1, _row)
    plt.plot(t, sim.data[_probe])
    plt.xlim(right=t[-1])
    plt.legend(_labels)
    sns.despine()

In [None]:
# Total neuron count, summed over every ensemble in model.all_ensembles.
print(sum(ens.n_neurons for ens in model.all_ensembles))

### With hierarchy

In [None]:
# Hierarchical phoneme detection on speech.wav. Same setup as the flat
# version, except that evaluation points are pooled in groups of `pool`
# columns and the detectors are built with add_hierarchical_detector.
# cons_delay and vowel_delay are also read as globals by get_eval_points.
fs = 20000.
freqs = phd.filters.erbspace(20, 10000, 64)
sound = phd.sounds.WavFile('speech.wav')
aud_filter = phd.filters.gammatone(freqs)
cons_delay = 0.075
vowel_delay = 0.03
pool = 4

model = phd.SpeechRecognition()
model.add_periphery(freqs, sound, aud_filter, fs=fs, middle_ear=True)
model.add_derivative(n_neurons=30, delay=cons_delay)
model.add_derivative(n_neurons=30, delay=vowel_delay)

# The same pool value is passed to both get_eval_points and the detectors.
vowel_ep, cons_ep, t = get_eval_points(model, pool=pool)
_, vowel = model.add_hierarchical_detector(15, vowel_ep, vowel_targets(t), [vowel_delay], pool=pool)
_, cons = model.add_hierarchical_detector(15, cons_ep, cons_targets(t), [cons_delay], pool=pool)

# Probe the second value returned by each add_hierarchical_detector call,
# filtered with synapse=0.01 and sampled every 0.001.
with model:
    vowel_p = nengo.Probe(vowel, synapse=0.01, sample_every=0.001)
    cons_p = nengo.Probe(cons, synapse=0.01, sample_every=0.001)

In [None]:
# Simulate at half of 1 / freqs.max(); t holds the sample times at the
# 0.001 spacing used by the probes.
dt = 1. / freqs.max()
sim = nengo.Simulator(model, dt=dt * .5)
sim.run(0.667)
t = sim.trange(0.001)

# Two stacked panels: vowel detector outputs on top, consonant outputs below.
plt.figure(figsize=(10, 10))
_panels = (
    (vowel_p, ["e", "schwa", "i"]),
    (cons_p, ["n", "r", "j", "z"]),
)
for _row, (_probe, _labels) in enumerate(_panels, start=1):
    plt.subplot(2, 1, _row)
    plt.plot(t, sim.data[_probe])
    plt.xlim(right=t[-1])
    plt.legend(_labels)
    sns.despine()

In [None]:
# Total neuron count, summed over every ensemble in model.all_ensembles.
print(sum(ens.n_neurons for ens in model.all_ensembles))