In [1]:
from IPython.display import Audio
def video(fname, mimetype, width="100%"):
    """Return an HTML5 ``<video>`` player with the file embedded inline.

    The file's bytes are base64-encoded and placed in a ``data:`` URI used as
    the ``src`` of the ``<video>`` tag.

    Parameters
    ----------
    fname : str
        Path of the video file to read.
    mimetype : str
        Subtype inserted after ``video/`` in the data URI (e.g. ``"mp4"``).
    width : str, optional
        Value of the tag's ``width`` attribute. Defaults to ``"100%"``.

    Returns
    -------
    IPython.display.HTML
        Display object wrapping the generated ``<video>`` markup.
    """
    import base64
    from IPython.display import HTML

    # Use a context manager so the file handle is closed deterministically.
    with open(fname, "rb") as video_file:
        raw_bytes = video_file.read()

    # ``bytes.encode("base64")`` exists only on Python 2; ``b64encode`` works
    # on both major versions. Decode to text so ``format`` does not embed a
    # ``b'...'`` repr on Python 3.
    video_encoded = base64.b64encode(raw_bytes).decode("ascii")

    # Close the element explicitly so following content is not parsed as
    # fallback content of the <video> tag.
    video_tag = (
        '<video controls alt="test" src="data:video/{0};base64,{1}" width="{2}">'
        '</video>'
    ).format(mimetype, video_encoded, width)
    return HTML(data=video_tag)

##  

## Biologically inspired methods in speech recognition and synthesis: closing the loop

<h3>
  PhD defense presentation <br>
  February 4, 2016 <br>
  Trevor Bekolay
</h3>

# Motivation

In [2]:
# Embed spaun.mp4 inline, rendered at 80% width.
video("spaun.mp4", "mp4", width="80%")

<img class="center" width="600" src="fig/presentation/spectrotemporal.svg">

# Closed-loop modeling

<img class="center" width="500" src="fig/presentation/perception-action.svg">

<img class="fragment center" style="margin-left:212px" width="670" src="fig/presentation/perception-action-discrete.svg">

# Conceptual model: Sermo

1. Subsystems must operate
   in a continuous, online fashion.
2. Subsystems must be implementable
   in biologically plausible spiking neurons.

# Conceptual model: Sermo

<img class="center" width="460" src="fig/presentation/sermo.svg">

# Conceptual model: Sermo

<img class="center" width="500" src="fig/presentation/sermo-implemented.svg">

## Automatic speech recognition

<img class="center" width="700" src="fig/background/asr.svg">

# Cepstral coefficients

<img class="center" width="350" src="fig/implementation/ncc.svg">

## Neural cepstral coefficients

<img class="center" width="960" src="fig/implementation/ncc-network.svg">

# Example features

<img class="center" width="620" src="fig/presentation/mfcc-ncc.svg">

# Evaluation

##  

<img class="center" width="960" src="fig/presentation/ncc-eval.svg">

# NCCs outperform MFCCs

<img class="inline" width="462" src="fig/presentation/ncc-phones-acc-b.svg">
<img class="fragment inline" width="462" src="fig/presentation/ncc-phones-racc-b.svg">

### Enables comparing periphery models

<img class="center" width="600" src="fig/results/ncc-periphmodel-racc-b.svg">

# Syllable production

<img class="center" width="500" src="fig/presentation/sermo-implemented.svg">

# VocalTractLab

<img class="center" width="740" src="fig/presentation/vtl.png">

# Gesture scores

<img class="center" width="480" src="fig/background/gs.svg">

<img class="fragment center" width="480" src="fig/presentation/gs-traj.svg">

### Dynamic Movement Primitives (DMPs)

<img class="center" width="960" src="fig/presentation/dmp.svg">

# Example syllable sequence

<img class="center" width="960" src="fig/results/prod-good.svg">

# Audio sample

In [3]:
# Display original.wav through IPython's Audio display object.
Audio("original.wav")

In [4]:
# Display synthesized.wav through IPython's Audio display object.
Audio("synthesized.wav")

## Enables speech of varying speeds

<img class="center" width="600" src="fig/presentation/prod-freq.svg">

# Syllable classification

<img class="center" width="500" src="fig/presentation/sermo-implemented.svg">

# Inverse DMPs

<img class="center" width="960" src="fig/presentation/idmp.svg">

## Example syllable classification

<img class="center" width="760" src="fig/results/recog-good.svg">

## Operates online with no resets

<img class="center" width="600" src="fig/presentation/recog-sequence_len.svg">

# Limitations & future work

<img class="center" width="500" src="fig/presentation/sermo-implemented.svg">

# NCCs

<img class="center" width="600" src="fig/presentation/ncc-phones-time.svg">

# Syllable production

<img class="center" width="600" src="fig/presentation/prod-sequence_len.svg">

# Syllable production

<img class="center" width="600" src="fig/presentation/prod-n_syllables.svg">

# Syllable classification

<img class="center" width="600" src="fig/presentation/recog-freq.svg">

# Takeaways

<ol>
  <li>Speech: spiking neuron models</li>
  <li class="fragment">Machine learning: NCCs, discrete production information data set</li>
  <li class="fragment">Neural modeling: iDMPs, linking temporal inputs/outputs to Spaun, ears and vocal tract</li>
</ol>

# Thank you

<img class="center" width="500" src="fig/presentation/thanks.png">

# Periphery models

<img class="center" width="720" src="fig/methods/gammatone.svg">

# Discrete Cosine Transform

\begin{align}
  &~ \\
  y_k =& {x_0 \over \sqrt{N}} + \sqrt{2 \over N} \sum_{n=1}^{N-1}
  x_n \cos \left( {\pi \over N} n \left( k + {1 \over 2} \right) \right)
  \hspace{-3em} \\
  &\text{for } 0 \le k < N
\end{align}

\begin{align}
  \mathbf{k} &= \left[ 0, 1, \ldots, N-1 \right] & 1 \times N \text{ vector}
    \hspace{-4em} \\
  \mathbf{s} &= \left[ \frac{1}{\sqrt{2}}, 1, 1, \ldots, 1 \right] & 1 \times N \text{ vector}
    \hspace{-4em} \\
  \mathbf{T} &= \sqrt{\frac{2}{N}} \, \mathbf{s} \circ \cos \left( \frac{\pi}{N} \left(
    \mathbf{k} + \frac{1}{2} \right) \otimes \mathbf{k} \right)
    & N \times N \text{ matrix}
    \hspace{-4em} \\
  \mathbf{y} &= \mathbf{T}\mathbf{x} & N \times 1 \text{ vector}
    \hspace{-4em}
\end{align}

## Syllable production network

<img class="center" width="470" src="fig/implementation/prod-network.svg">

## Syllable classification network

<img class="center" width="500" src="fig/implementation/recog-network.svg">