In [None]:
import scipy as sp
import scipy.io.wavfile as wavfile
import os
import gmmhmm as hmm
import MFCC
import numpy as np
import re
import random
import pickle
from tqdm import tqdm 

import warnings
warnings.filterwarnings("ignore")

Speech recognition is a cool application of Hidden Markov Models when we allow the observation space to be continuous rather than discrete - Continuous Density Hidden Markov Models. Here I use one type of these, the Gaussian Mixture Model Hidden Markov Model (GMMHMM).


The following function accepts a GMMHMM and an integer n_sim, and simulates the GMMHMM process,
generating n_sim observations.

In [None]:
def sample_gmmhmm(gmmhmm, n_sim):
    """
    Simulate from a GMMHMM.

    The first state is drawn from the initial distribution ``pi``; every
    later state is drawn from the row of the transition matrix ``A`` that
    belongs to the previous state. Given the state, a mixture component is
    drawn from that state's weights and the observation is drawn from the
    component's multivariate normal distribution.

    Parameters
    ----------
    gmmhmm : sequence of length 5
        ``[A, weights, means, covars, pi]`` for N states, M mixture
        components and K-dimensional observations:
        A : (N, N) row-stochastic transition matrix, ``A[i, j]`` is the
            probability of moving from state i to state j
        weights : (N, M) mixture weights for each state
        means : (N, M, K) component means
        covars : (N, M, K, K) component covariance matrices
        pi : (N,) initial state distribution
    n_sim : int
        Number of time steps to simulate.

    Returns
    -------
    states : ndarray of shape (n_sim,)
        The sequence of states (integer state indices)
    obs : ndarray of shape (n_sim, K)
        The generated observations (vectors of length K)
    """
    A, weights, means, covars, pi = gmmhmm
    A, weights, pi = np.asarray(A), np.asarray(weights), np.asarray(pi)
    means, covars = np.asarray(means), np.asarray(covars)

    # The observation dimension K comes from the means, not from the number
    # of mixture components (the two only coincide by accident).
    K = means.shape[-1]
    states = np.zeros(n_sim, dtype=int)
    obs = np.zeros((n_sim, K))

    def draw(probabilities):
        # Index of the single success in one multinomial trial.
        return np.argmax(np.random.multinomial(1, probabilities))

    state = None
    for t in range(n_sim):
        # Initial state from pi, afterwards follow the Markov chain.
        state = draw(pi) if t == 0 else draw(A[state, :])
        component = draw(weights[state, :])
        obs[t] = np.random.multivariate_normal(means[state, component, :],
                                               covars[state, component, :, :])
        states[t] = state

    return states, obs

In [None]:
# Example GMMHMM: 2 hidden states, 3 mixture components per state,
# 3-dimensional observations.

# Initial state distribution and state transition matrix.
pi = np.array([.8, .2])
A = np.array([[.65, .35],
              [.15, .85]])

# Mixture weights, one row per state.
weights = np.array([[.7, .2, .1],
                    [.1, .5, .4]])

# Component means: one (3, 3) array per state, one row per component.
means1 = np.array([[0., 17., -4.],
                   [5., -12., -8.],
                   [-16., 22., 2.]])
means2 = np.array([[-5., 3., 23.],
                   [-12., -2., 14.],
                   [15., -32., 0.]])
means = np.stack((means1, means2))

# Component covariances: scaled 3x3 identity matrices.
covars1 = np.array([scale * np.eye(3) for scale in (5, 7, 1)])
covars2 = np.array([scale * np.eye(3) for scale in (10, 3, 4)])
covars = np.stack((covars1, covars2))

gmmhmm = [A, weights, means, covars, pi]

In [None]:
# Draw 4 samples from the example model. The sampler is not seeded, so the
# displayed states and observations change on every run.
sample_gmmhmm(gmmhmm, n_sim = 4)

(array([1., 0., 1., 0.]),
 array([[ -4.08227529,   0.79816729,  19.43895353],
        [  0.12786178,  18.32327316,  -3.72006605],
        [ 12.69725559, -30.29012604,  -1.98135426],
        [ -0.6455404 ,  12.21394581,  -5.93210441]]))

## Problem 2

Samples.zip contains 31 recordings for each of the words/phrases mathematics, biology, political science, psychology, and statistics. These audio samples are 2 seconds in
duration, recorded at a rate of 44100 samples per second, with samples stored as 16-bit signed
integers in WAV format. 
Load the recordings into Python using scipy.io.wavfile.read

Extract the MFCCs from each sample using code from the file MFCC.py.
Store the MFCCs for each word in a separate list. You should have five lists, each containing
31 MFCC arrays, corresponding to each of the five words under consideration.

In [None]:
# Skip the repeated recordings and collect each word's MFCC arrays in `mels`.
repeats = {"Biology00.wav", "Mathematics00.wav", "PoliticalScience.wav", 
           "Psychology00.wav", "Statistics00.wav"}
mels = {'Biology': [], 'Mathematics': [], 'PoliticalScience': [], 
        'Psychology': [], 'Statistics': []} 

filepath = "./Samples"

# os.listdir returns names in arbitrary order; sorting makes the later
# train/test split (which slices these lists) reproducible across runs.
for doc in sorted(os.listdir(filepath)):
    if doc in repeats:
        continue

    # The word is the part of the file name before the first space.
    label = doc.split(" ")[0]
    if label not in mels:
        # Not a recording of one of the five words (e.g. a stray file).
        continue

    try:
        # wavfile.read returns (sample rate, signal); only the signal is used.
        rate, signal = wavfile.read(os.path.join(filepath, doc))
        mel = MFCC.extract(signal, show=False)
    except Exception as err:
        # Stay best-effort, but report what was dropped instead of hiding it.
        print("Skipping {}: {!r}".format(doc, err))
        continue

    mels[label].append(mel)

# Unpack the lists (dict insertion order) and check how many arrays each holds.
bio, math, polysci, psych, stats = mels.values()
for word_mels in (bio, math, polysci, psych, stats):
    print(len(word_mels), end=" ")

30 30 30 30 30 

## Problem 3

Partition each list of MFCCs into a training set of 21 samples, and a test set of
the remaining 10 samples.
Using the training sets, train a GMMHMM on each of the words from the previous problem
with at least 10 random restarts, keeping the best model for each word (the one with the highest
log-likelihood). This process may take several minutes. Since you will not want to run this
more than once, you will want to save the best model for each word to disk using the pickle
module so that you can use it later.

In [None]:
def initialize(n_states):
    """
    Build a random initial state distribution and transition matrix.

    Both start from the uniform distribution 1/n_states, are perturbed
    entrywise by uniform noise on [-1/n_states, 1/n_states) (so entries stay
    non-negative), and are then renormalized to sum to one.

    Parameters
    ----------
    n_states : int
        Number of hidden states.

    Returns
    -------
    startprob : ndarray of shape (n_states,)
        Initial state distribution (sums to 1).
    transmat : ndarray of shape (n_states, n_states)
        Transition matrix whose rows each sum to 1.
    """
    bound = 1. / n_states

    # Use numpy's generator directly: the `scipy.random` alias is deprecated
    # and missing from recent SciPy releases. The draw order (transition rows
    # first, then the start distribution) matches the row-by-row original.
    transmat = bound + np.random.uniform(-bound, bound, (n_states, n_states))
    transmat /= transmat.sum(axis=1, keepdims=True)

    startprob = bound + np.random.uniform(-bound, bound, n_states)
    startprob /= startprob.sum()
    return startprob, transmat

In [None]:
words = list(mels)
samples = [bio, math, polysci, psych, stats]

n_train = 20      # samples per word used for training; the rest are test data
n_restarts = 10   # random restarts per word

# Train one GMMHMM per word, keeping the restart with the highest log-likelihood.
for word, word_samples in zip(words, samples):

    x_train = word_samples[:n_train]

    # Reset both per word so a model from the previous word can never be
    # saved under this word's name.
    best, best_model = -np.inf, None

    for _ in tqdm(range(n_restarts)):

        # Fresh random starting point for every restart.
        startprob, transmat = initialize(5)
        model = hmm.GMMHMM(n_components=5, n_mix=3, transmat=transmat, startprob=startprob, cvtype='diag')
        model.covars_prior = 0.01
        model.fit(x_train, init_params='mc', var=0.1)

        # Track the best model for this word.
        if model.logprob > best:
            best, best_model = model.logprob, model

    if best_model is None:
        raise RuntimeError("No restart produced a usable model for {}".format(word))

    # Save the best model; the context manager closes the file handle.
    with open("{}.p".format(word), "wb") as outfile:
        pickle.dump(best_model, outfile)

100%|██████████| 10/10 [05:52<00:00, 35.30s/it]
100%|██████████| 10/10 [09:30<00:00, 57.07s/it]
100%|██████████| 10/10 [08:27<00:00, 50.73s/it]
100%|██████████| 10/10 [09:14<00:00, 55.44s/it]
100%|██████████| 10/10 [07:34<00:00, 45.48s/it]


In [None]:
# `best` is reset for every word in the training loop, so this shows only the
# best log-likelihood found for the last word that was trained.
best 

-30489.250566198258

## Problem 4

Classify the 10 test samples for each word. Make a dictionary containing the accuracy
of the classification of your five testing sets where the words/phrases are the keys,
and the values are the percent accuracy.

Write a few sentences answering the following questions:
How does your system perform?
Which words are the hardest to correctly classify?

In [None]:
# Load the saved models, one per word, in the same order as `words`.
# NOTE: pickle.load can execute arbitrary code; only load the files written
# by the training cell, never pickles from an untrusted source.
models = []
for word in words:
    with open("{}.p".format(word), "rb") as infile:
        models.append(pickle.load(infile))

accs = {}

# Loop over the words and their samples; a word's label is its index.
for index, (word, sample) in enumerate(zip(words, samples)):

    # Held-out data: everything after the first 20 (training) samples.
    x_test, y_test = sample[20:], index

    # Predicted label = index of the model that scores the observation highest.
    preds = [np.argmax([model.score(obs) for model in models]) for obs in x_test]

    # Percent of test samples whose prediction matches the true label.
    acc = 100 * np.mean([pred == y_test for pred in preds])
    accs[word] = acc

    # "Biology" is short enough to need a second tab to line up with the rest.
    tabs = "\t\t" if word == "Biology" else "\t"
    print("Accuracy for {}: {}{:.2f}%".format(word, tabs, acc))

Accuracy for Biology: 		100.00%
Accuracy for Mathematics: 	100.00%
Accuracy for PoliticalScience: 	90.00%
Accuracy for Psychology: 	100.00%
Accuracy for Statistics: 	100.00%


In [None]:
accs

{'Biology': 100.0,
 'Mathematics': 100.0,
 'PoliticalScience': 90.0,
 'Psychology': 100.0,
 'Statistics': 100.0}

The system performs well overall: every word is classified with 100% accuracy except political science, which is the hardest to classify at 90%. This phrase has more syllables than the others, and 30ms might not be a large enough time partition to capture this.