In [None]:
# This cell contains some imports that will be used
# in the remainder of the notebook.

import phd

import os
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import nengo
import numpy as np
from scipy import stats
from nengo.utils.matplotlib import rasterplot
from nengo.utils.stdlib import Timer
from matplotlib.ticker import ScalarFormatter
from IPython.display import Audio, display, SVG

# Some plotting niceties
phd.plots.setup(figsize=(5, 3.5))

# Ensure TIMIT is extracted
timit_path = "~/phd_data/timit"
timit = phd.timit.TIMIT(timit_path)
try:
    timit.untar(os.path.expanduser("~/Dropbox/LDC93S1.tgz"))
except Exception as e:
    # Extraction stays best-effort (the notebook keeps running), but the
    # reason is reported so a missing corpus is not mistaken for a later bug.
    print("Skipping TIMIT extraction: %s" % e)

# Some general params
utt = 'deadline'  # word whose samples are extracted from the corpus
spkr = 'RRE0'  # speaker id used to filter the corpus files

# Feature representations

Here, we extract both
Mel-frequency Cepstral Coefficients (MFCCs)
and Neural Cepstral Coefficients (NCCs)
from a short audio sample from the TIMIT corpus.

First, we must get the speech sample
from the TIMIT corpus.
We get a relatively interesting word
that will demonstrate the changes
in MFCCs and NCCs over time.

In [None]:
# Restrict the corpus to a single speaker and pull out the chosen word.
timit = phd.timit.TIMIT(timit_path)
timit.filefilt.spkr_id = spkr
samples = timit.word_samples([utt])
model = phd.sermo.AuditoryFeatures()
# Use the first returned sample of the utterance as the model's audio input.
model.audio = samples[utt][0]
# Last expression of the cell: renders an audio player for the sample.
Audio(data=model.audio.ravel(), rate=phd.timit.TIMIT.fs)

Next, we will extract MFCCs
using 25 ms audio frames
advancing by 10 ms per timestep.
The top plot is the raw MFCC,
which has a wide range (see the color bar).
We therefore z-score the MFCC to normalize it
to a reasonable range in the bottom plot.

In [None]:
# Advance the MFCC frame by 10 ms per timestep.
model.mfcc.dt = 0.01
x = model.mfcc()
# Number of MFCC frames; reused later to shorten the NCCs to the same length.
n_frames = x.shape[0]

# Plot
# Same data in both panels: raw on top, z-scored on the bottom.
f, ax1, ax2 = phd.plots.plot_trajs(x, x, zscore=(False, True))
f.suptitle("Features for utterance of '%s'" % utt, fontsize='large')
ax1.set_ylabel("MFCC")
ax2.set_ylabel("MFCC (z-scored)")
ax2.set_xlabel("Frame")
# Keep the top 3% of the figure free for the suptitle.
f.tight_layout(rect=[0, 0, 1, 0.97])
phd.plots.savefig(f, 'results', 'ncc-mfcc')

In order to compare MFCCs and NCCs visually,
we also extract MFCCs by advancing by 1 ms per timestep,
which matches the natural timestep used by Nengo
and therefore used for NCCs.

In [None]:
# Advance the MFCC frame by 1 ms per timestep to match the simulation timestep.
# Note: this overwrites `x` from the 10 ms cell; `n_frames` is left untouched.
model.mfcc.dt = 0.001
x = model.mfcc()

# Plot
# Same data in both panels: raw on top, z-scored on the bottom.
f, ax1, ax2 = phd.plots.plot_trajs(x, x, zscore=(False, True))
f.suptitle("Features for utterance of '%s'" % utt, fontsize='large')
ax1.set_ylabel("MFCC")
ax2.set_ylabel("MFCC (z-scored)")
ax2.set_xlabel("Frame")
# Keep the top 3% of the figure free for the suptitle.
f.tight_layout(rect=[0, 0, 1, 0.97])
phd.plots.savefig(f, 'results', 'ncc-mfcc-long')

Next, we extract NCCs using a 1 ms simulation timestep,
which is the norm for neural simulations in Nengo.
Like for MFCCs, we show both the raw NCC
and the z-scored NCC.
In the case of NCCs, the raw result
is already in a reasonable range (approximately -1 to 1)
so z-scoring is not necessary,
but is shown for visual comparison.

In [None]:
# Build the network with a fixed seed and probe each stage of the pipeline.
net = model.build(nengo.Network(seed=0))
with net:
    # Periphery probes are unfiltered (synapse=None); they are plotted
    # later in the "peek into the neural implementation" cell.
    ihc_p = nengo.Probe(net.periphery.ihc, synapse=None)
    an_in_p = nengo.Probe(net.periphery.an.input, synapse=None)
    an_p = nengo.Probe(net.periphery.an.add_neuron_output(), synapse=None)
    # The feature output (the NCCs) is probed through a 0.01 synapse.
    c_p = nengo.Probe(net.output, synapse=0.01)
sim = nengo.Simulator(net, dt=0.001)
sim.run(model.t_audio)

# Plot
t = sim.trange()
# One tick every 100 samples; all ticks but the first are moved back by one
# sample (presumably so the labels fall on round times -- confirm).
t_ix = np.arange(0, t.size, 100)
t_ix[1:] -= 1
f, ax1, ax2 = phd.plots.plot_trajs(sim.data[c_p], sim.data[c_p], zscore=(False, True))
f.suptitle("Features for utterance of '%s'" % utt, fontsize='large')
ax1.set_ylabel("NCC")
ax2.set_ylabel("NCC (z-scored)")
# Tick positions must be set before the labels; otherwise the time labels
# are attached to whatever default ticks the axis happens to have.
# NOTE(review): assumes the x axis of plot_trajs is in sample indices, as in
# the shortened-NCC cell -- confirm.
ax2.set_xticks(t_ix)
ax2.set_xticklabels(np.round(t[t_ix], 3))
ax2.set_xlabel("Time (s)")
# Keep the top 3% of the figure free for the suptitle.
f.tight_layout(rect=[0, 0, 1, 0.97])
phd.plots.savefig(f, 'results', 'ncc-ncc')

In order to better compare NCCs and MFCCs visually,
we can shorten the NCC to have the same length
as the MFCC generated by advancing the frame 10 ms
per timestep.

In [None]:
# Shorten the NCC trajectory to the number of frames of the 10 ms MFCC.
ncc = phd.experiments.shorten(sim.data[c_p], n_frames=n_frames)

# Plot
# Time axis with one entry per shortened frame, and a tick every 10 frames.
t, t_ix = sim.trange(model.t_audio / n_frames), np.arange(0, n_frames, 10)
# All ticks but the first are moved back by one frame.
t_ix[1:] -= 1
f, ax1, ax2 = phd.plots.plot_trajs(ncc, ncc, zscore=(False, True))
f.suptitle("Features for utterance of '%s'" % utt, fontsize='large')
ax1.set_ylabel("NCC")
ax2.set_ylabel("NCC (z-scored)")
# Tick positions are frame indices; labels are the matching times in seconds.
ax2.set_xticks(t_ix)
ax2.set_xticklabels(np.round(t[t_ix], 3))
ax2.set_xlabel("Time (s)")
# Keep the top 3% of the figure free for the suptitle.
f.tight_layout(rect=[0, 0, 1, 0.97])
phd.plots.savefig(f, 'results', 'ncc-ncc-short')

We can now compare the MFCCs and shortened NCCs visually.
In general, they change qualitatively at around the same times,
though the NCC is somewhat delayed since it processes
the audio online rather than with ideal discrete audio frames.

In [None]:
# Load the two figures saved by earlier cells and show them one after another.
# Note: `ncc` is rebound here from the shortened feature array to an SVG object.
mfcc = SVG(filename='../plots/results/ncc-mfcc.svg')
ncc = SVG(filename='../plots/results/ncc-ncc-short.svg')
display(mfcc, ncc)

# NCC implementation

The NCCs plotted above are the results of decoding
the activity from neurons representing
the cepstral coefficients.
The cepstral coefficients are computed
through connection weights between neurons representing
the auditory filter outputs,
which are similar to the afferent neurons
of the auditory nerve.

In [None]:
# Display the pre-drawn diagram of the NCC network.
SVG(filename='../figures/implementation/ncc-network.svg')

We can look more closely at the output of the auditory filters
(i.e., the inner hair cell activity),
the activity across the synapse between
the inner hair cell and the auditory nerve neurons,
and the spiking activity of the auditory nerve neurons.

In [None]:
# A peek into the neural implementation
# Three stacked panels built from the probes of the most recent simulation.
fig = plt.figure(figsize=(6, 10))

# Panel 1: unfiltered inner hair cell probe.
ax = plt.subplot(3, 1, 1)
phd.plots.cochleogram(sim.data[ihc_p], sim.trange(), model.freqs, ax=ax, cbar=False)
ax.set_title("Inner hair cell activity")
ax.set_xticks(())  # no x ticks; only the bottom panel carries the time axis

# Panel 2: input to the auditory nerve ensembles.
ax = plt.subplot(3, 1, 2)
phd.plots.cochleogram(sim.data[an_in_p], sim.trange(), model.freqs, ax=ax, cbar=False)
ax.set_title("IHC-AN synaptic activity")
ax.set_xticks(())

# Panel 3: spike raster; rasterplot gets no `ax`, so it is expected to draw
# on the current axes created by plt.subplot just above.
ax = plt.subplot(3, 1, 3)
rasterplot(sim.trange(), sim.data[an_p])
ax.set_title("Spiking AN neural activity")
# One raster row per neuron across all auditory nerve ensembles.
ax.set_ylim(0, net.periphery.an.n_neurons * net.periphery.an.n_ensembles)
ax.set_ylabel("Neuron")
ax.set_xlabel("Time (s)")
sns.despine()

fig.tight_layout()
phd.plots.savefig(fig, 'results', 'ncc-periphery')

In this example,
32 auditory filters are used with 8 neurons per auditory filter.
Analogous networks with optimized parameters
using fewer neurons per filter
can achieve similar performance.

# Derivatives

As shown above, we can also compute the temporal derivative
of the NCC in a neural network,
which is a common augmentation to the MFCC feature vector.
Below, we plot MFCCs and NCCs with their first and second
temporal derivatives appended to the feature vector.

In [None]:
# Rebuild the model from the same speech sample, this time with derivatives.
timit = phd.timit.TIMIT(timit_path)
timit.filefilt.spkr_id = spkr
samples = timit.word_samples([utt])
model = phd.sermo.AuditoryFeatures()
# Use the first returned sample of the utterance as the model's audio input.
model.audio = samples[utt][0]
model.add_derivative('IntermediateDeriv')  # First derivative
model.add_derivative('FeedforwardDeriv')  # Second derivative
# Last expression of the cell: renders an audio player for the sample.
Audio(data=model.audio.ravel(), rate=phd.timit.TIMIT.fs)

In [None]:
# Get MFCCs
# 10 ms frame advance, computed on the model that has derivatives added.
model.mfcc.dt = 0.01
x = model.mfcc()

# Plot
# Same data in both panels: raw on top, z-scored on the bottom.
f, ax1, ax2 = phd.plots.plot_trajs(x, x, zscore=(False, True))
f.suptitle("Features for utterance of '%s'" % utt, fontsize='large')
ax1.set_ylabel("MFCC")
ax2.set_ylabel("MFCC (z-scored)")
ax2.set_xlabel("Frame")
# Keep the top 3% of the figure free for the suptitle.
f.tight_layout(rect=[0, 0, 1, 0.97])
phd.plots.savefig(f, 'results', 'ncc-mfcc-derivs')

In [None]:
# Get NCCs
# Same build/probe/simulate sequence as the earlier NCC cell, but on the
# model that has derivatives added.
net = model.build(nengo.Network(seed=0))
with net:
    # Periphery probes are unfiltered (synapse=None).
    ihc_p = nengo.Probe(net.periphery.ihc, synapse=None)
    an_in_p = nengo.Probe(net.periphery.an.input, synapse=None)
    an_p = nengo.Probe(net.periphery.an.add_neuron_output(), synapse=None)
    # The feature output is probed through a 0.01 synapse.
    c_p = nengo.Probe(net.output, synapse=0.01)
sim = nengo.Simulator(net, dt=0.001)
sim.run(model.t_audio)

# Plot
t = sim.trange()
# One tick every 100 samples; all ticks but the first are moved back by one
# sample, matching the tick layout of the earlier NCC plot.
t_ix = np.arange(0, t.size, 100)
t_ix[1:] -= 1
f, ax1, ax2 = phd.plots.plot_trajs(sim.data[c_p], sim.data[c_p], zscore=(False, True))
f.suptitle("Features for utterance of '%s'" % utt, fontsize='large')
ax1.set_ylabel("NCC")
ax2.set_ylabel("NCC (z-scored)")
# Passing the whole trange as tick labels would label the default ticks with
# the first few timesteps; set positions and matching times explicitly.
# NOTE(review): assumes the x axis of plot_trajs is in sample indices -- confirm.
ax2.set_xticks(t_ix)
ax2.set_xticklabels(np.round(t[t_ix], 3))
ax2.set_xlabel("Time (s)")
# Keep the top 3% of the figure free for the suptitle.
f.tight_layout(rect=[0, 0, 1, 0.97])
phd.plots.savefig(f, 'results', 'ncc-ncc-derivs')

Now let's compare them.

In [None]:
# Load the two derivative figures saved above and show them one after another.
mfcc = SVG(filename='../plots/results/ncc-mfcc-derivs.svg')
ncc = SVG(filename='../plots/results/ncc-ncc-derivs.svg')
display(mfcc, ncc)

# Classification experiment

In order to compare MFCCs and NCCs
in terms of how speech-related sounds
are separated in MFCC and NCC vector spaces,
we classify feature vectors corresponding to
pre-segmented speech samples using
linear support vector machines.
The metric reported in the end
is classification correctness
(i.e., the number of correctly predicted labels divided by
the total number of samples).

The following cell shows how to run a short experiment
using only a small subset of the TIMIT corpus.

In [None]:
# Not a full experiment, just showing how testing / training works
# with a small data set.
model = phd.sermo.AuditoryFeatures()
expt = phd.experiments.AuditoryFeaturesExperiment(
    model, phones=phd.timit.TIMIT.consonants)
# Fixed seed for the experiment.
expt.seed = 20
# Narrow the corpus with file filters to keep the run small.
expt.timit.filefilt.region = 8
expt.timit.filefilt.sex = 'F'
expt.timit.filefilt.sent_type = 'I'
# run() returns a key that is used to load the stored result.
key = expt.run()
res = phd.experiments.AuditoryFeaturesResult.load(key)
print("==== Summary ====")
print("MFCC training acc: %f" % res.mfcc_train_acc)
print("NCC training acc: %f" % res.ncc_train_acc)
print("MFCC testing acc: %f" % res.mfcc_test_acc)
print("NCC testing acc: %f" % res.ncc_test_acc)

# Plotting experimental results

I ran these experiments with a larger subset
of the TIMIT corpus, varying several parameters.
For each parameter, I ran 10 experiments;
for all conditions I show both
a violin plot and a bar plot
summarizing the same data.

### Varying the phones used as input

Conditions:

- Only vowel phones
- Only consonant phones
- All phones (including silence)

Results summary:

- Consonants are most difficult to classify and have least separation between MFCC and NCC

We therefore only look at consonants in later results.

In [None]:
# Keyword arguments shared by every plotting call in this cell.
pargs = {'columns': [], 'vary': 'phones', 'filter_by': [], 'hue_order': ["vowels", "consonants", "all"]}

def fix_label(l):
    """Rename the entries of legend `l`, in order, for the phones plots.

    Entries are taken from ``l.get_texts()`` and relabelled one by one;
    renaming stops as soon as either the entries or the labels run out.
    """
    entries = iter(l.get_texts())
    for pretty in ("Vowels", "Consonants", "All"):
        entry = next(entries, None)
        if entry is None:
            # Fewer legend entries than labels: nothing left to rename.
            break
        entry.set_text(pretty)

# Absolute accuracy; only the second returned figure (`b`) is styled and saved.
v, b = phd.plots.ncc_accuracy(relative=False, **pargs)
ax = b.get_axes()[0]
ax.set_ylabel("Classification correctness")
# Legend anchor is given in figure coordinates (bbox_transform=b.transFigure).
l = ax.legend(title="", bbox_to_anchor=(0.55, 0.9), bbox_transform=b.transFigure)
fix_label(l)
phd.plots.savefig(b, 'results', 'ncc-phones-acc-b')

# Relative accuracy, same treatment.
v, b = phd.plots.ncc_accuracy(relative=True, **pargs)
ax = b.get_axes()[0]
ax.set_ylabel("Relative classification correctness")
l = ax.legend(title="")
fix_label(l)
phd.plots.savefig(b, 'results', 'ncc-phones-racc-b')

# Timing figure for the same conditions.
t = phd.plots.ncc_time(**pargs)
ax = t.get_axes()[0]
l = ax.legend(loc='upper left', title="")
fix_label(l)
phd.plots.savefig(t, 'results', 'ncc-phones-time')

### Varying whether the feature is z-scored

Conditions:

- z-scored
- not z-scored

Results summary:

- Variance goes way down for MFCCs when z-scored
- NCCs are slightly better without z-scoring

We therefore z-score MFCCs and not NCCs in future experiments.

In [None]:
# Accuracy with and without z-scoring; the figures are shown but not saved here.
v, b = phd.plots.ncc_accuracy(['zscore'], 'zscore', hue_order=['False', 'True'], relative=False)

### Varying the number of derivatives used

Conditions:

- no derivatives
- first derivative
- first and second derivatives

Results summary:

- The first derivative improves MFCC accuracy significantly and has little effect on NCC accuracy.

We therefore use 1 derivative for future experiments.

In [None]:
# Accuracy for 0, 1 and 2 derivatives; only the second figure (`b`) is saved.
v, b = phd.plots.ncc_accuracy(relative=False,
                              columns=['derivatives'],
                              vary='derivatives',
                              hue_order=['0', '1', '2'])
ax = b.get_axes()[0]
# Legend anchor is given in figure coordinates (bbox_transform=b.transFigure).
ax.legend(title="# of derivatives", bbox_to_anchor=(0.57, 0.9), bbox_transform=b.transFigure)
phd.plots.savefig(b, 'results', 'ncc-derivatives-acc-b')

### Varying the temporal derivative model

Conditions:

- Feedforward derivative model
- Intermediate ensemble derivative model

Results summary:

- The intermediate derivative model is significantly more accurate.

We therefore only use the intermediate derivative model in future experiments.

In [None]:
# Accuracy for the two derivative models; the result is not assigned, so the
# return value is the cell's output, and nothing is saved to disk here.
phd.plots.ncc_accuracy(['derivtype'], 'derivtype',
                       hue_order=['FeedforwardDeriv', 'IntermediateDeriv'])

### Varying the number of neurons per auditory filter

Conditions:

- 1, 2, 4, 8, 16, or 32 neurons per filter

Results summary:

- Plateaus around 8 neurons per filter.

We therefore use 8 neurons per filter in future experiments.

In [None]:
# Accuracy as a function of the number of neurons per periphery ensemble.
t = phd.plots.ncc_tsaccuracy(['periphery'], 'periphery')
ax = t.get_axes()[0]
ax.legend(loc='best', title="")
ax.set_xlabel("Neurons per periphery ensemble")
# Conditions are powers of two, so use a log axis with explicit ticks and
# plain-number tick labels (ScalarFormatter) for those ticks.
ax.set_xscale('log')
ax.set_xticks((1, 2, 4, 8, 16, 32))
ax.xaxis.set_major_formatter(ScalarFormatter())
t.tight_layout()
phd.plots.savefig(t, 'results', 'ncc-periphery-acc-t')

### Varying the number of neurons per feature vector dimension

Conditions:

- 1, 8, 12, 16, 32, 64 neurons

Results summary:

- Accuracy improves steadily as more neurons are added
- Simulation speed is drastically slower for more than 12 neurons

We therefore use 12 neurons per feature vector dimension in future experiments.

In [None]:
# Accuracy as a function of the number of neurons per feature ensemble.
t = phd.plots.ncc_tsaccuracy(['feature'], 'feature')
ax = t.get_axes()[0]
ax.legend(loc='best', title="")
ax.set_xlabel("Neurons per feature ensemble")
t.tight_layout()
phd.plots.savefig(t, 'results', 'ncc-feature-acc-t')

# Timing figure for the same conditions; the third argument lists the
# condition values as strings (presumably the hue order -- confirm against
# phd.plots.ncc_time).
t = phd.plots.ncc_time(['feature'], 'feature', [str(x) for x in [1, 8, 12, 16, 32, 64]])
ax = t.get_axes()[0]
ax.legend(loc='best', title="# of feature neurons")
phd.plots.savefig(t, 'results', 'ncc-feature-time')

### Varying the frame advance for MFCC

Conditions:

- 10 ms, 5 ms, 1 ms

Results summary:

- Best performance at 10 ms

We therefore continue to use 10 ms frame advance in future experiments.

In [None]:
# Accuracy for the three MFCC frame advances; hue values are the dt values
# written as six-decimal strings.
v, b = phd.plots.ncc_accuracy(
    relative=False, columns=['dt'], vary='dt', hue_order=["0.010000", "0.005000", "0.001000"])
ax = b.get_axes()[0]
l = ax.legend(title="MFCC dt", loc='best')
# Replace the raw dt strings in the legend with readable millisecond labels.
# Note: the loop variable rebinds the notebook-level name `t`.
for t, label in zip(l.get_texts(), ["10 ms", "5 ms", "1 ms"]):
    t.set_text(label)
phd.plots.savefig(b, 'results', 'ncc-dt-acc-b')

### Varying the auditory periphery model used

Conditions:

- Gammatone filter
- Log Gammachirp filter
- Dual Resonance nonlinear filter
- Compressive Gammachirp filter
- Tan Carney auditory model

Results summary:

- Generally, more complicated auditory periphery models performed better
- More complicated auditory periphery models simulated slower
- Gammatone is an outlier: least complex, best performance

In [None]:
# Columns passed to the plotting helpers; also reused by the neuron-type cell.
columns = ['periphmodel', 'adaptive']
# Display order of the periphery models in the plots below.
porder = ['gammatone', 'log_gammachirp', 'dual_resonance', 'compressive_gammachirp', 'tan_carney']

def fix_label(l):
    """Rename the entries of legend `l`, in order, to periphery model names.

    This redefines the helper of the same name from the phones cell.
    Renaming stops as soon as either the entries or the names run out.
    """
    pretty_names = ("Gammatone", "Log Gammachirp", "Dual Resonance",
                    "Compressive GC", "Tan Carney")
    for position, entry in enumerate(l.get_texts()):
        if position >= len(pretty_names):
            # More legend entries than names: leave the rest untouched.
            break
        entry.set_text(pretty_names[position])

# Relative accuracy per periphery model; only the second figure (`b`) is saved.
v, b = phd.plots.ncc_accuracy(columns, 'periphmodel', hue_order=porder, relative=True)
ax = b.get_axes()[0]
l = ax.legend(title="", loc='best')
fix_label(l)
phd.plots.savefig(b, 'results', 'ncc-periphmodel-racc-b')

# Timing figure for the same conditions.
t = phd.plots.ncc_time(columns, 'periphmodel', hue_order=porder)
ax = t.get_axes()[0]
l = ax.legend(title="", loc='best')
fix_label(l)
phd.plots.savefig(t, 'results', 'ncc-periphmodel-time')

### Varying the neuron type used

Conditions:

- Normal LIF neuron
- Adaptive LIF neuron

Results summary:

- No difference

In [None]:
# Relative accuracy for the two neuron types.
# Relies on `columns` defined in the periphery-model cell above.
v, b = phd.plots.ncc_accuracy(columns, 'adaptive', hue_order=['False', 'True'], relative=True)
ax = b.get_axes()[0]
l = ax.legend(title="", loc='upper left')
# Replace the 'False' / 'True' legend entries with neuron type names.
for t, label in zip(l.get_texts(), ["LIF", "Adaptive LIF"]):
    t.set_text(label)
phd.plots.savefig(b, 'results', 'ncc-adaptive-racc-b')

# Scaling

Since this network is designed to mimic the human auditory system,
an important question to ask is whether it will scale
to the size of the human auditory system.
We only use a small number of auditory filters
and neurons associated with those filters.
If we scale up, do we remain within known
neuroanatomical constraints?

In [None]:
def n_neurons(msg, model):
    """Print a per-layer neuron count for `model` and check it against the built network.

    Parameters
    ----------
    msg : str
        Heading printed above the summary.
    model
        Object providing ``build()``, ``freqs``, ``periphery``, ``n_cepstra``,
        ``cepstra`` and ``derivatives`` (e.g. ``phd.sermo.AuditoryFeatures``).

    Raises
    ------
    AssertionError
        If the hand-computed total differs from the number of neurons summed
        over all ensembles of the built network.
    """
    net = model.build()
    # Ground truth: neurons actually present in the built network.
    nn = sum(e.n_neurons for e in net.all_ensembles)
    print("=== %s ===" % msg)
    pneurons = model.freqs.size * model.periphery.neurons_per_freq
    print("Periphery layer: %d freqs x %d neurons_per_freq = %d neurons" % (
        model.freqs.size, model.periphery.neurons_per_freq, pneurons))
    cneurons = model.n_cepstra * model.cepstra.n_neurons
    print("Feature layer: %d cepstra x %d neurons_per_cepstra = %d neurons" % (
        model.n_cepstra, model.cepstra.n_neurons, cneurons))
    for i, deriv in enumerate(model.derivatives):
        # Note! Assumes FeedforwardDeriv!
        # Compute the count once so the printed value and the running total
        # cannot disagree (the factor of 2 was previously missing in the print).
        dneurons = model.n_cepstra * 2 * deriv.n_neurons
        cneurons += dneurons
        print("Derivative %d: %d cepstra x 2 x %d neurons_per_cepstra = %d neurons" % (
            (i+1), model.n_cepstra, deriv.n_neurons, dneurons))

    assert pneurons + cneurons == nn, (
        "computed %d neurons but the built network has %d" % (pneurons + cneurons, nn))
    print("Total: %d neurons" % nn)
    # NOTE(review): 27000 is presumably an assumed neuron density per mm^3 of
    # cortex -- confirm the source of this figure.
    print("%.3f mm^3 of cortex" % (nn / 27000.))
    print("")

# Default model with a single derivative.
model = phd.sermo.AuditoryFeatures()
model.add_derivative()
n_neurons("Default configuration", model)
# Scaled-up model: 3500 filters between 20 and 20000, one derivative.
model = phd.sermo.AuditoryFeatures()
model.freqs = phd.filters.erbspace(20, 20000, 3500)
model.periphery.neurons_per_freq = 20
model.n_cepstra = 20
model.cepstra.n_neurons = 50
model.add_derivative(n_neurons=50)
n_neurons("Conservative estimate", model)
# Larger still: more neurons per filter and per cepstrum, two derivatives.
model = phd.sermo.AuditoryFeatures()
model.freqs = phd.filters.erbspace(20, 20000, 3500)
model.periphery.neurons_per_freq = 40
model.n_cepstra = 40
model.cepstra.n_neurons = 200
model.add_derivative(n_neurons=200)
model.add_derivative(n_neurons=200)
# Trailing semicolon suppresses the cell's output repr.
n_neurons("Generous estimate", model);