In [None]:
import os
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nengo
from nengo.utils.matplotlib import rasterplot
from nengo.utils.stdlib import Timer

import phd

# Notebook-wide plotting defaults: default figure size (10, 6), then
# seaborn's 'white' style followed by 'ticks', applied in that order.
plt.rcParams['figure.figsize'] = (10, 6)
sns.set_style('white')
sns.set_style('ticks')

def img(array):
    """Draw ``array`` as a pseudocolor mesh on the current matplotlib figure.

    The array is transposed before plotting, so its first axis runs along
    x and its second axis along y. The upper y limit and right x limit are
    set to the array extent, a colorbar is added, seaborn's ``despine`` is
    applied and the layout is tightened.
    """
    plt.pcolormesh(array.T)
    # Extents are read after the mesh is drawn: first axis -> x, second -> y.
    n_x, n_y = array.shape[0], array.shape[1]
    plt.ylim(top=n_y)
    plt.xlim(right=n_x)
    plt.colorbar()
    sns.despine()
    plt.tight_layout()

# Location of the TIMIT corpus, passed as-is to phd.timit.TIMIT.
# NOTE(review): the "~" is not expanded here -- presumably TIMIT expands it
# itself; confirm.
timit_path = "~/phd_data/timit"
timit = phd.timit.TIMIT(timit_path)
try:
    # Best-effort: unpack the corpus tarball from the user's Dropbox folder.
    timit.untar(os.path.expanduser("~/Dropbox/LDC93S1.tgz"))
except Exception as e:
    # Keep the notebook running, but report why the untar was skipped
    # instead of silently discarding the error.
    print("Skipping TIMIT untar: %r" % (e,))

In [None]:
%%javascript
// Add a "kill and run-first" button group to the notebook toolbar, unless an
// element with id 'kill-run-first' already exists under the toolbar.
if($(IPython.toolbar.selector.concat(' > #kill-run-first')).length == 0){
  IPython.toolbar.add_buttons_group([
    {
      'label'   : 'kill and run-first',
      'icon'    : 'fa fa-angle-double-down',
      'callback': function(){
        // Restart the kernel, then react once to the kernel-ready event.
        IPython.notebook.kernel.restart();
        $(IPython.events).one('kernel_ready.Kernel', function(){
          // Execute the first cell (index 0), then restore the selection
          // that was active before.
          var idx = IPython.notebook.get_selected_index();
          IPython.notebook.select(0);
          IPython.notebook.execute_cell();
          IPython.notebook.select(idx);
        });
      }
    }
  ], 'kill-run-first');
}

In [None]:
# Fresh TIMIT handle with its file filter's spkr_id set to "CAG0"
# (presumably restricting results to that speaker -- confirm), then fetch
# the samples for the word 'she'.
timit = phd.timit.TIMIT(timit_path)
timit.filefilt.spkr_id = "CAG0"
samples = timit.word_samples(['she'])

In [None]:
# Configure an AudioFeatures model on the first 'she' sample: fs taken from
# the corpus, 64 frequencies from erbspace(20, 4000, 64), and 13 cepstra.
model = phd.sermo.AudioFeatures()
model.fs = timit.fs
model.audio = samples['she'][0]
model.freqs = phd.filters.erbspace(20, 4000, 64)
model.n_cepstra = 13
# t_audio is later passed to sim.run(); presumably the audio duration.
print(model.t_audio)

In [None]:
# Get MFCCs
# Set dt on the model's mfcc object to 0.001 before computing the features.
model.mfcc.dt = 0.001
x = model.mfcc()
# Function-call form of print: valid on Python 2 and Python 3 alike, and
# consistent with the print(model.t_audio) call used elsewhere.
print(x.shape)

In [None]:
# Show the MFCC array x as a colour mesh.
img(x)
# Trailing semicolon keeps the Text repr of the title out of the cell output.
plt.title("Mel-frequency cepstral coefficients");

In [None]:
from scipy.stats import zscore
# Same colour-mesh plot, with x z-scored along axis 0 first.
img(zscore(x, axis=0))

In [None]:
# Get NCCs
# Use a gammatone auditory filter at the model's frequencies; this is set
# before the network is built.
model.periphery.auditory_filter = phd.filters.gammatone(model.freqs)
net = model.build()

# Probes are added inside the built network. The three periphery probes are
# unfiltered (synapse=None); the cepstra output is probed with synapse=0.01.
with net:
    ihc_p = nengo.Probe(net.periphery.ihc, synapse=None)
    an_in_p = nengo.Probe(net.periphery.an.input, synapse=None)
    an_p = nengo.Probe(net.periphery.an.add_neuron_output(), synapse=None)
    c_p = nengo.Probe(net.cepstra.output, synapse=0.01)

# Simulate with a 1 ms timestep for model.t_audio.
sim = nengo.Simulator(net, dt=0.001)
sim.run(model.t_audio)

In [None]:
# Cochleogram of the probed periphery.ihc data against simulation time and
# the model's frequencies.
phd.plots.cochleogram(sim.data[ihc_p], sim.trange(), model.freqs)

In [None]:
# Cochleogram of the probed periphery.an.input data against simulation time
# and the model's frequencies.
phd.plots.cochleogram(sim.data[an_in_p], sim.trange(), model.freqs)

In [None]:
# Raster plot of the probed neuron output of periphery.an over simulation time.
rasterplot(sim.trange(), sim.data[an_p])
# y axis spans n_neurons * n_ensembles; the semicolon suppresses the repr.
plt.ylim(0, net.periphery.an.n_neurons * net.periphery.an.n_ensembles);

In [None]:
# Colour-mesh plot of the probed cepstra output (probe c_p).
img(sim.data[c_p])

In [None]:
# Same plot with the probed cepstra output z-scored along axis 0.
# Relies on zscore having been imported by an earlier cell.
img(zscore(sim.data[c_p], axis=0))

## With derivatives

## Phoneme classification with SVM

Because the SVM needs every sample to have the same number of features,
all samples have to be the same length,
so we classify vowel and consonant phonemes separately.
We also lengthen every sample to the
length of the longest sample
using simple linear interpolation.

In [None]:
# Start with the vowel phonemes: load the samples for every phoneme in
# timit.vowels from the "train" corpus.
timit = phd.timit.TIMIT(timit_path)
audio = timit.phn_samples(timit.vowels, corpus="train")

In [None]:
# Convert everything to MFCCs
# One AudioFeatures model is reused for every sample: fs from the corpus,
# 64 frequencies from erbspace(20, 4000, 64), and 13 cepstra.
model = phd.sermo.AudioFeatures()
model.fs = timit.fs
model.freqs = phd.filters.erbspace(20, 4000, 64)
model.n_cepstra = 13

# mfccs maps each label to the list of MFCC results for its samples,
# in the order the samples appear in audio[label].
mfccs = {}
for label in audio:
    label_mfccs = []
    for sample in audio[label]:
        # Point the shared model at this sample, then compute its MFCCs.
        model.audio = sample
        label_mfccs.append(model.mfcc())
    mfccs[label] = label_mfccs

In [None]:
# Lengthen all samples to the longest with linear interpolation.
# We have to do this across all labels!
from scipy.interpolate import interp1d
i_mfccs = {}  # label -> list of (n_frames, n_features) arrays
f_mfccs = {}  # label -> list of the same arrays flattened to 1-D

# Target length: the largest frame count over every sample of every label.
n_frames = max(max(m.shape[0] for m in mfccs[label]) for label in audio)
# n_frames = 611 / 13
for label in audio:
    i_mfccs[label] = []
    f_mfccs[label] = []
    for sample in mfccs[label]:
        if sample.shape[0] <= 1:
            # Too short -- just ignore it
            continue
        if sample.shape[0] < n_frames:
            # Spread the original frames over [0, n_frames - 1] so that the
            # first and last frames land exactly on the first and last points
            # of the resampled grid, np.arange(n_frames). Using n_frames as
            # the upper bound would put the last frame one step past the
            # grid, so it would never be reached.
            interp_x = np.linspace(0, n_frames - 1, sample.shape[0])
            f = interp1d(interp_x, sample, axis=0, assume_sorted=True)
            sample = f(np.arange(n_frames))
        # Now, we have each utterance as an array of shape (n_frames, n_features).
        # We'll flatten these so that the SVM considers
        # each utterance as a separate "feature vector"
        i_mfccs[label].append(sample)
        f_mfccs[label].append(sample.reshape(-1))
        # Should also try z-scoring it
# Function-call form of print: valid on both Python 2 and Python 3.
print(f_mfccs['oy'][0].shape)

In [None]:
# Prep X, y for SVM classification

# SVM functions take as input
#  X, shape (n_samples, n_features) holding the training samples,
#  y, shape (n_samples,) with class labels (strings or integers)
lbls = sorted(audio)
# Stack every flattened sample, label by label in sorted order. Flattening
# over the labels in a single pass means a label left with no samples
# contributes nothing, instead of making np.vstack fail on an empty list.
X = np.vstack([vec for lbl in lbls for vec in f_mfccs[lbl]])
print(X.shape)
# One label per row of X, in the same order as the rows were stacked.
y = []
for lbl in lbls:
    y.extend([lbl] * len(f_mfccs[lbl]))
y = np.array(y)
print(y.shape)

In [None]:
# Try LinearSVC (we're already in a high dimensional space)
from sklearn import svm
clf = svm.LinearSVC()
# Time the fit on the full X, y.
with Timer() as t:
    clf.fit(X, y)
# Function-call form of print: valid on both Python 2 and Python 3.
print("Took %s seconds" % t.duration)

In [None]:
# Predict our training set to see how it does
with Timer() as t:
    pred_y = clf.predict(X)
# Function-call form of print: valid on both Python 2 and Python 3.
print("Took %s seconds" % t.duration)
# Fraction of the training samples whose predicted label equals the true one.
train_acc = np.mean(pred_y == y)
print("Train accuracy: %0.3f" % train_acc)