# Lab 9: Audio Classification 

In [1]:
# IMPORTS
import numpy as np
from matplotlib import pyplot as plt
from scipy.io.wavfile import read, write
from scipy.signal import hann
import os
import random
import copy
from sklearn.mixture import GaussianMixture as GMM
from librosa import load
from librosa.feature import mfcc
from sklearn.neural_network import MLPClassifier

# SOUNDPLAYER FUNCTION
def sound( x, rate=44100, label=''):
    """Show an audio player for `x` in the notebook output.

    Parameters
    ----------
    x : audio samples, handed unchanged to IPython.display.Audio.
    rate : sample rate in Hz handed to Audio (default 44100).
    label : text placed in a table cell to the left of the player; an empty
        string (the default) shows the bare player with no table.

    Returns
    -------
    None; the player is rendered through IPython's display().
    """
    from IPython.display import display, Audio, HTML
    # Compare by value. The previous `label is ''` tested object identity, which
    # only works through string interning and emits a SyntaxWarning on Python >= 3.8.
    if label == '':
        display( Audio( x, rate=rate))
    else:
        # [3:] drops the first three characters of the player's HTML before it is
        # embedded in the table cell.
        display( HTML( 
        '<style> table, th, td {border: 0px; }</style> <table><tr><td>' + label + 
        '</td><td>' + Audio( x, rate=rate)._repr_html_()[3:] + '</td></tr></table>'
        ))

In [2]:
# STFT FUNCTION
#dft_size_dflt = 1024
#hop_size_dflt = 256
dft_size_dflt = 2048
hop_size_dflt = 1024
def stft( input_sound, numb_bins=0, dft_size=dft_size_dflt, hop_size=hop_size_dflt, window=0):
    """Short-time Fourier transform of a 1-D signal.

    Parameters
    ----------
    input_sound : 1-D array-like of samples.
    numb_bins : number of frequency rows to return; 0 (default) means
        dft_size//2 + 1. Each frame is transformed with an FFT of length
        (numb_bins-1)*2.
    dft_size : number of samples per analysis frame.
    hop_size : number of samples between the starts of consecutive frames.
    window : array of dft_size weights applied to every frame. 0 or None
        (default) selects hann(dft_size).

    Returns
    -------
    Complex array of shape (numb_bins, n_frames): frequencies (rows) x time
    (columns). n_frames is 0 when the padded input is shorter than dft_size.
    """
    if numb_bins == 0:
        numb_bins = dft_size//2 + 1
    # `window == 0` on an array is elementwise and cannot be used in `if`, so
    # only a scalar 0 (or None) is treated as "use the default window".
    if window is None or (np.isscalar(window) and window == 0):
        window = hann(dft_size)
    else:
        window = np.asarray(window)
    input_sound = np.asarray(input_sound)
    remainder = input_sound.size % hop_size
    if remainder != 0:
        # pad input to a multiple of the hop size so the last frame is full
        input_sound = np.pad(input_sound, (0, hop_size - remainder))
    fft_len = (numb_bins - 1)*2
    # same frame starts as `while current <= size - dft_size: current += hop_size`
    starts = range(0, input_sound.size - dft_size + 1, hop_size)
    # Preallocate the result instead of growing it with column_stack, which
    # re-copied the whole spectrogram for every frame.
    spectrogram = np.zeros((numb_bins, len(starts)), dtype=complex)
    for col, start in enumerate(starts):
        frame = input_sound[start:start + dft_size]
        spectrogram[:, col] = np.fft.rfft(window*frame, fft_len)
    return spectrogram # complex-valued spectrogram - frequencies(rows) x time(columns)

In [3]:
# ISTFT FUNCTION
def istft( stft_output, dft_size=dft_size_dflt, hop_size=hop_size_dflt):
    """Rebuild a time-domain signal from a spectrogram by overlap-add.

    Each column of `stft_output` (frequencies x time) is inverted with
    np.fft.irfft and added into the output at an offset of column*hop_size.
    No synthesis window or gain correction is applied.

    Returns the first dft_size + (n_frames-1)*hop_size samples of the
    accumulated real-valued signal.
    """
    n_bins, n_frames = stft_output.shape[0], stft_output.shape[1]
    # room for (n_frames-1) hops plus one full inverse-FFT frame
    signal = np.zeros((n_bins-1)*2 + (n_frames-1)*hop_size)
    for frame_idx in range(n_frames):
        chunk = np.fft.irfft(stft_output[:, frame_idx])
        offset = frame_idx*hop_size
        signal[offset : offset + chunk.size] += chunk
    return signal[:dft_size+(n_frames-1)*hop_size] # real-valued array of audio samples in time-domain

## Part 1: Making a speech detector

In this section we will design a simple classifier that will let us know if its input is speech or non-speech. Download the data archive from: [ https://drive.google.com/file/d/1oAnvk-hzzgzZ4di4W0pKw6v3IWLm9u2X/view?usp=sharing ] In this part we will use the dataset in data/SpeechMusic. In it you will find two directories, speech/ and music/ containing data from each class.

Randomly select 50 soundfiles from each directory to use as training data, and use the remaining sounds as testing data. For all of the sounds we will compute a representation that makes the classification easier and we will use a simple Gaussian model to classify them. Do the following:

- Perform an STFT for each sound, take its magnitude and raise it to 0.3 to improve contrast
    - We will consider each spectral slice of that to be a data point
- Using the training data of each sound:
    - Calculate the mean column and the diagonal covariance of the columns
    - You will thus get two sets of Gaussian parameters that model each sound class
- For each testing data point:
    - Calculate the likelihood of each column based on the above models
	- To calculate the entire file likelihood add all the frame likelihoods
	- Assign each soundfile to the class that gets the highest likelihood

For extra credit implement the parameter estimation and model likelihood yourself. If you are too lazy for that you can instead use ```sklearn.mixture.GaussianMixture``` to learn a diagonal single-Gaussian model per class.

What do the results look like? If you rerun this with a different training/testing set, is there an appreciable difference? On average over multiple training/testing sets what accuracy do you get?

In [4]:
# learning about generators
#gen = os.walk('data/SpeechMusic/')
#print(gen)
#print(next(gen))
#print(next(gen))
#print(next(gen))

In [5]:
# LOAD SPEECH/MUSIC
def _read_wav_tree(top):
    """Read every file found under `top` with scipy.io.wavfile.read.

    Returns (rates, clips): a float array holding one sample rate per file and
    a list with the matching sample arrays, both in os.walk order.
    """
    rates, clips = [], []
    for folder, _subdirs, names in os.walk(top):
        for name in names:
            rate, samples = read(os.path.join(folder, name))
            rates.append(rate)
            clips.append(samples)
    return np.array(rates, dtype=float), clips

music_sr, music = _read_wav_tree('data/SpeechMusic/music/')
#print('music', music_sr.shape, len(music))

speech_sr, speech = _read_wav_tree('data/SpeechMusic/speech/')
#print('speech', speech_sr.shape, len(speech))

In [6]:
# SAMPLE RATES
#print(music_sr)
#print(speech_sr)
# Sample rate in Hz, hard-coded rather than derived from music_sr / speech_sr.
# NOTE(review): presumably every file shares this rate -- confirm with the prints above.
sr = 22050

In [7]:
# STFT
# Spectrogram of every clip, in the same order as the `music` / `speech` lists.
# exec_cnt is a running progress counter printed after each clip.
exec_cnt = 0

music_stft = []
for clip in music:
    music_stft.append(stft(clip))
    exec_cnt += 1
    print(exec_cnt, end=' ')

speech_stft = []
for clip in speech:
    speech_stft.append(stft(clip))
    exec_cnt += 1
    print(exec_cnt, end=' ')

#print()
#print('music', len(music_stft), music_stft[0].shape)
#print('speech', len(speech_stft), speech_stft[0].shape)

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 

In [8]:
# learning about shallow/deep copy
# run one test at a time. reload between tests

# test 1
#sc1 = music_stft # sc1 points to data that music_stft points to
#music_stft = [0,0] # music_stft points to new data
#print(len(music_stft_save)) # references save data
#print(len(music_stft)) # references new data
#print(len(sc1)) # still references old data

# test 2
#sc2 = music_stft # sc2 points to data that music_stft points to
#music_stft[0] *= 2 # modifies data
#print(music_stft_save[0][0, 100]) # references save data
#print(music_stft[0][0, 100]) # references modified data
#print(sc2[0][0, 100]) # references same modified data

In [9]:
# SAVE
# Independent deep copies of the spectrogram lists, so later cells can modify
# music_stft / speech_stft without losing the freshly computed STFTs.
music_stft_save, speech_stft_save = (
    copy.deepcopy(specs) for specs in (music_stft, speech_stft)
)

# RELOAD
#music_stft = copy.deepcopy(music_stft_save)
#speech_stft = copy.deepcopy(speech_stft_save)

In [10]:
# RANDOMIZE ORDERING OF DATA
def _shuffle_together(clips, specs):
    """Shuffle two equally long lists with one shared random permutation.

    Returns two new lists; clips[i] and specs[i] stay paired after the shuffle.
    """
    paired = list(zip(clips, specs))
    random.shuffle(paired)
    clips_out, specs_out = zip(*paired)
    return list(clips_out), list(specs_out)

music, music_stft = _shuffle_together(music, music_stft)
#print(len(music), len(music_stft))

speech, speech_stft = _shuffle_together(speech, speech_stft)
#print(len(speech), len(speech_stft))

In [11]:
# IMPROVE CONTRAST BY POWER-LAW COMPRESSION
# Every complex spectrogram is replaced by its magnitude raised to 0.3; the phase
# is discarded. The list entries are overwritten, so running this cell a second
# time compresses the already-compressed magnitudes again.
music_stft[:] = [abs(spec)**0.3 for spec in music_stft]

speech_stft[:] = [abs(spec)**0.3 for spec in speech_stft]

In [12]:
# DIVIDE SETS INTO TRAIN/TEST DATA
# The first `split` entries of each (already shuffled) list are used for training,
# everything after them for testing.
split = 50
music_train, music_test = music[:split], music[split:]
speech_train, speech_test = speech[:split], speech[split:]

music_train_stft, music_test_stft = music_stft[:split], music_stft[split:]
speech_train_stft, speech_test_stft = speech_stft[:split], speech_stft[split:]

In [None]:
# LISTEN
#sound(music_train[0], sr)
#sound(music_test[0], sr)
#sound(speech_train[0], sr)
#sound(speech_test[0], sr)

In [None]:
# PLOT STFT
# reconstruct with phase during log scaling if you want to do this
#samp_sr = sr
#samp = music_train[0]
#samp_stft = music_train_stft[0]

# normalize axes to Hz vs s
#samp_hz2bin = (samp_stft.shape[0])*(samp_sr/2)
#samp_xticks = np.arange(0, samp_stft.shape[1], samp_sr/(samp.size/samp_stft.shape[1]))
#samp_xlabels = np.arange(samp_xticks.size)
#samp_numb_yticks = 10;
#samp_freq = np.fft.fftfreq((samp_stft.shape[0])*2)*samp_sr
#samp_yticks = np.arange(0, samp_stft.shape[0], samp_stft.shape[0]/samp_numb_yticks)
#samp_ylabels = samp_freq[:(samp_freq.size//2): samp_stft.shape[0]//samp_numb_yticks]

# plot spectrogram + play sound
#plt.pcolormesh(abs(samp_stft), cmap='cubehelix')
#plt.xticks(samp_xticks, samp_xlabels)
#plt.yticks(samp_yticks, samp_ylabels)
#ax = plt.gca()
##ax.set_xlim((110, 130))
##ax.set_ylim((0, 2000*samp_hz2bin))
#plt.show()
#sound(istft(samp_stft), rate=samp_sr) # sounds bad because of the log scaling

In [None]:
# FORMAT DATA FOR GAUSSIAN MODEL
def _spectral_frames(specs):
    """Flatten a list of (bins x frames) arrays into one list of column vectors.

    Frames keep file order: all columns of specs[0], then all of specs[1], ...
    """
    return [spec[:, col] for spec in specs for col in range(spec.shape[1])]

music_train_data = _spectral_frames(music_train_stft)
#print('music train', len(music_train_data), music_train_data[0].shape)

music_test_data = _spectral_frames(music_test_stft)
#print('music test', len(music_test_data), music_test_data[0].shape)

speech_train_data = _spectral_frames(speech_train_stft)
#print('speech train', len(speech_train_data), speech_train_data[0].shape)

speech_test_data = _spectral_frames(speech_test_stft)
#print('speech test', len(speech_test_data), speech_test_data[0].shape)

In [None]:
# TRAIN CLASSIFIERS (GAUSSIAN MODEL)
# One single-component, diagonal-covariance Gaussian per class.
# covariance_type is passed by keyword: current scikit-learn accepts only
# n_components positionally, so GMM(1, 'diag') raises a TypeError there.
music_classifier = GMM(n_components=1, covariance_type='diag')
music_classifier.fit(music_train_data)

speech_classifier = GMM(n_components=1, covariance_type='diag')
speech_classifier.fit(speech_train_data)

In [None]:
# TEST MUSIC
# per-frame scores of the music test frames under each of the two models
music_raw_scores1 = music_classifier.score_samples(music_test_data)
music_raw_scores2 = speech_classifier.score_samples(music_test_data)

# Each test file owns a contiguous run of entries in the flat score arrays:
# note how many frames each file has and where its run begins.
music_frame_counts = [spec.shape[1] for spec in music_test_stft]
music_frame_starts = [sum(music_frame_counts[:idx]) for idx in range(len(music_frame_counts))]

# music VS music classifier: file score = sum of that file's frame scores
music_scores1 = [sum(music_raw_scores1[lo:lo + n])
                 for lo, n in zip(music_frame_starts, music_frame_counts)]
#print(len(music_scores1))
#print(music_scores1)

# music VS speech classifier
music_scores2 = [sum(music_raw_scores2[lo:lo + n])
                 for lo, n in zip(music_frame_starts, music_frame_counts)]
#print(len(music_scores2))
#print(music_scores2)

# music results: a file counts as correct when the music model scores it strictly higher
results = sum(1 for own, other in zip(music_scores1, music_scores2) if own > other)
#print()
print("MUSIC RESULTS #CORRECT =", results)

In [None]:
# TEST SPEECH
# per-frame scores of the speech test frames under each of the two models
speech_raw_scores1 = speech_classifier.score_samples(speech_test_data)
speech_raw_scores2 = music_classifier.score_samples(speech_test_data)

# Each test file owns a contiguous run of entries in the flat score arrays:
# note how many frames each file has and where its run begins.
speech_frame_counts = [spec.shape[1] for spec in speech_test_stft]
speech_frame_starts = [sum(speech_frame_counts[:idx]) for idx in range(len(speech_frame_counts))]

# speech VS speech classifier: file score = sum of that file's frame scores
speech_scores1 = [sum(speech_raw_scores1[lo:lo + n])
                  for lo, n in zip(speech_frame_starts, speech_frame_counts)]
#print(len(speech_scores1))
#print(speech_scores1)

# speech VS music classifier
speech_scores2 = [sum(speech_raw_scores2[lo:lo + n])
                  for lo, n in zip(speech_frame_starts, speech_frame_counts)]
#print(len(speech_scores2))
#print(speech_scores2)

# speech results: a file counts as correct when the speech model scores it strictly higher
results = sum(1 for own, other in zip(speech_scores1, speech_scores2) if own > other)
#print()
print("SPEECH RESULTS #CORRECT =", results)

In [None]:
### --- NOTES --- ###

# speech is typically more accurate ~ 80/90 %
# music ~ 50/60/70 %

# using the same train/test set combo always yields the same results

## Part 2: Making a music genre classifier

We will repeat the above, but this time we will perform music genre classification. To do so we will use a slightly more elaborate feature representation, and a stronger classification model. If you downloaded the data archive pointed to above, you will find a subset of the GTZAN dataset in the data/genres folder; this is a benchmark data set for music genre classification.

Just as before, you will find a set of directories with examples of each sound class that we want to recognize. For each class, split the soundfiles into a training set (50% of data) and testing set (remaining 50% of data).

For a representation we will use MFCC features. For extra credit, code these yourself otherwise you can use the implementation from the ```librosa``` library. Once all the files are transformed we will have a series of MFCC frames for each recording (as opposed to spectral frames as is in the case of the STFT). We will use these as the data to classify.

For each class learn a Gaussian model (with a diagonal covariance again). This will be the same process as above.
In order to evaluate how well this works we will use the following procedure. For each sound in the training data, get the likelihood of each MFCC frame based on the learned Gaussian models and sum these over the entire file just as we did before. Use the resulting values to get a classification result for each file. Report how accurate your results are. Now report the accuracy using your testing data instead.

Now we will use a better classifier to hopefully get better accuracy. We will use a Gaussian Mixture Model (```sklearn.mixture.GaussianMixture```). Just as before you should learn one such model for each class using the corresponding training data.

How many Gaussians do you need in your GMM to get the best results? Do the MFCC parameters make a difference? Play around with the numbers to get the best possible results.

In [None]:
# LOAD GENRES
def _load_audio_tree(top, loaded_so_far):
    """Load every file under `top` with librosa's load(), printing a running count.

    `loaded_so_far` is the number of files loaded before this call; the printed
    counter continues from it. Returns (rates, clips, new_count): a float array
    with one sample rate per file, the list of decoded clips, and the updated count.
    """
    rates, clips = [], []
    for folder, _subdirs, names in os.walk(top):
        for name in names:
            samples, rate = load(os.path.join(folder, name))
            rates.append(rate)
            clips.append(samples)
            loaded_so_far += 1
            print(loaded_so_far, end=' ')
    return np.array(rates, dtype=float), clips, loaded_so_far

exec_cnt = 0

classical_sr, classical, exec_cnt = _load_audio_tree('data/genres/classical/', exec_cnt)
disco_sr, disco, exec_cnt = _load_audio_tree('data/genres/disco/', exec_cnt)
metal_sr, metal, exec_cnt = _load_audio_tree('data/genres/metal/', exec_cnt)
pop_sr, pop, exec_cnt = _load_audio_tree('data/genres/pop/', exec_cnt)
reggae_sr, reggae, exec_cnt = _load_audio_tree('data/genres/reggae/', exec_cnt)

#print()
#print('classical:', classical_sr.shape, len(classical))
#print('disco:', disco_sr.shape, len(disco))
#print('metal', metal_sr.shape, len(metal))
#print('pop', pop_sr.shape, len(pop))
#print('reggae', reggae_sr.shape, len(reggae))

In [None]:
# SAMPLE RATES
#print(classical_sr)
#print(disco_sr)
#print(metal_sr)
#print(pop_sr)
#print(reggae_sr)
# Sample rate in Hz, hard-coded rather than derived from the *_sr arrays.
# NOTE(review): presumably every clip shares this rate -- confirm with the prints above.
sr = 22050

In [None]:
# MFCC
n_mfcc_dflt=40
exec_cnt = 0

def _mfcc_for_clips(clips, done_before):
    """Return the MFCC matrix of every clip, printing a running progress count.

    `done_before` is how many clips were processed before this call, so the
    printed counter continues across genres. Uses the notebook-level `sr` and
    `n_mfcc_dflt`.
    """
    features = []
    for offset, clip in enumerate(clips, start=1):
        # y= and sr= are passed by keyword: librosa >= 0.10 rejects positional
        # audio arguments, and older releases accept the keyword form as well.
        features.append(mfcc(y=clip, sr=sr, n_mfcc=n_mfcc_dflt))
        print(done_before + offset, end=' ')
    return features

classical_mfcc = _mfcc_for_clips(classical, exec_cnt)
exec_cnt += len(classical)

disco_mfcc = _mfcc_for_clips(disco, exec_cnt)
exec_cnt += len(disco)

metal_mfcc = _mfcc_for_clips(metal, exec_cnt)
exec_cnt += len(metal)

pop_mfcc = _mfcc_for_clips(pop, exec_cnt)
exec_cnt += len(pop)

reggae_mfcc = _mfcc_for_clips(reggae, exec_cnt)
exec_cnt += len(reggae)

#print()
#print('classical', len(classical_mfcc), classical_mfcc[0].shape)
#print('disco', len(disco_mfcc), disco_mfcc[0].shape)
#print('metal', len(metal_mfcc), metal_mfcc[0].shape)
#print('pop', len(pop_mfcc), pop_mfcc[0].shape)
#print('reggae', len(reggae_mfcc), reggae_mfcc[0].shape)

In [None]:
# SAVE
#classical_mfcc_save = copy.deepcopy(classical_mfcc)
#disco_mfcc_save = copy.deepcopy(disco_mfcc)
#metal_mfcc_save = copy.deepcopy(metal_mfcc)
#pop_mfcc_save = copy.deepcopy(pop_mfcc)
#reggae_mfcc_save = copy.deepcopy(reggae_mfcc)

# RELOAD
#classical_mfcc = copy.deepcopy(classical_mfcc_save)
#disco_mfcc = copy.deepcopy(disco_mfcc_save)
#metal_mfcc = copy.deepcopy(metal_mfcc_save)
#pop_mfcc = copy.deepcopy(pop_mfcc_save)
#reggae_mfcc = copy.deepcopy(reggae_mfcc_save)

In [None]:
# RANDOMIZE ORDERING OF DATA
def _shuffle_clips_with_features(clips, feats):
    """Shuffle two equally long lists with one shared random permutation.

    Returns two new lists; clips[i] and feats[i] stay paired after the shuffle.
    """
    paired = list(zip(clips, feats))
    random.shuffle(paired)
    clips_out, feats_out = zip(*paired)
    return list(clips_out), list(feats_out)

classical, classical_mfcc = _shuffle_clips_with_features(classical, classical_mfcc)
#print('classical', len(classical), len(classical_mfcc))

disco, disco_mfcc = _shuffle_clips_with_features(disco, disco_mfcc)
#print('disco', len(disco), len(disco_mfcc))

metal, metal_mfcc = _shuffle_clips_with_features(metal, metal_mfcc)
#print('metal', len(metal), len(metal_mfcc))

pop, pop_mfcc = _shuffle_clips_with_features(pop, pop_mfcc)
#print('pop', len(pop), len(pop_mfcc))

reggae, reggae_mfcc = _shuffle_clips_with_features(reggae, reggae_mfcc)
#print('reggae', len(reggae), len(reggae_mfcc))

In [None]:
# DIVIDE SETS INTO TRAIN/TEST DATA
# The first `split` entries of each (already shuffled) list are used for training,
# everything after them for testing.
split = 50

classical_train, classical_test = classical[:split], classical[split:]
classical_train_mfcc, classical_test_mfcc = classical_mfcc[:split], classical_mfcc[split:]

disco_train, disco_test = disco[:split], disco[split:]
disco_train_mfcc, disco_test_mfcc = disco_mfcc[:split], disco_mfcc[split:]

metal_train, metal_test = metal[:split], metal[split:]
metal_train_mfcc, metal_test_mfcc = metal_mfcc[:split], metal_mfcc[split:]

pop_train, pop_test = pop[:split], pop[split:]
pop_train_mfcc, pop_test_mfcc = pop_mfcc[:split], pop_mfcc[split:]

reggae_train, reggae_test = reggae[:split], reggae[split:]
reggae_train_mfcc, reggae_test_mfcc = reggae_mfcc[:split], reggae_mfcc[split:]

In [None]:
# LISTEN
#sound(classical_train[0], sr)
#sound(classical_test[0], sr)
#sound(disco_train[0], sr)
#sound(disco_test[0], sr)
#sound(metal_train[0], sr)
#sound(metal_test[0], sr)
#sound(pop_train[0], sr)
#sound(pop_test[0], sr)
#sound(reggae_train[0], sr)
#sound(reggae_test[0], sr)

In [None]:
# FORMAT TRAIN/TEST DATA
def _mfcc_frames(mfcc_list):
    """Flatten a list of (coefficients x frames) arrays into one list of column vectors.

    Frames keep file order: all columns of mfcc_list[0], then all of mfcc_list[1], ...
    """
    return [feat[:, col] for feat in mfcc_list for col in range(feat.shape[1])]

classical_train_data = _mfcc_frames(classical_train_mfcc)
#print(len(classical_train_data))
#print(classical_train_data[0].shape)

classical_test_data = _mfcc_frames(classical_test_mfcc)
#print(len(classical_test_data))
#print(classical_test_data[0].shape)
#print()

disco_train_data = _mfcc_frames(disco_train_mfcc)
#print(len(disco_train_data))
#print(disco_train_data[0].shape)

disco_test_data = _mfcc_frames(disco_test_mfcc)
#print(len(disco_test_data))
#print(disco_test_data[0].shape)
#print()

metal_train_data = _mfcc_frames(metal_train_mfcc)
#print(len(metal_train_data))
#print(metal_train_data[0].shape)

metal_test_data = _mfcc_frames(metal_test_mfcc)
#print(len(metal_test_data))
#print(metal_test_data[0].shape)
#print()

pop_train_data = _mfcc_frames(pop_train_mfcc)
#print(len(pop_train_data))
#print(pop_train_data[0].shape)

pop_test_data = _mfcc_frames(pop_test_mfcc)
#print(len(pop_test_data))
#print(pop_test_data[0].shape)
#print()

reggae_train_data = _mfcc_frames(reggae_train_mfcc)
#print(len(reggae_train_data))
#print(reggae_train_data[0].shape)

reggae_test_data = _mfcc_frames(reggae_test_mfcc)
#print(len(reggae_test_data))
#print(reggae_test_data[0].shape)

In [None]:
# TRAIN CLASSIFIERS (GAUSSIAN MODEL)
# One 15-component, diagonal-covariance mixture per genre.
# covariance_type is passed by keyword: current scikit-learn accepts only
# n_components positionally, so GMM(15, 'diag') raises a TypeError there.
classical_classifier = GMM(n_components=15, covariance_type='diag')
classical_classifier.fit(classical_train_data)

disco_classifier = GMM(n_components=15, covariance_type='diag')
disco_classifier.fit(disco_train_data)

metal_classifier = GMM(n_components=15, covariance_type='diag')
metal_classifier.fit(metal_train_data)

pop_classifier = GMM(n_components=15, covariance_type='diag')
pop_classifier.fit(pop_train_data)

reggae_classifier = GMM(n_components=15, covariance_type='diag')
reggae_classifier.fit(reggae_train_data)

In [None]:
# TEST FUNCTION
def score_test(test_data, test_mfcc, classifiers=None):
    """Classify each file by summed per-frame model score and print a tally per class.

    Parameters
    ----------
    test_data : flat list of feature frames for all files, in file order
        (all frames of test_mfcc[0], then all of test_mfcc[1], ...).
    test_mfcc : list of per-file feature matrices; only shape[1], the frame
        count of each file, is read, to split the flat scores back into files.
    classifiers : optional sequence of (label, model) pairs, where each model
        provides score_samples(frames). Defaults to the five notebook-level
        genre models, labelled CLASSICAL, DISCO, METAL, POP and REGGAE.

    Prints one "<LABEL> = <count>" line per classifier, giving how many files
    that model scored highest on, followed by a blank line. Ties go to the
    classifier listed first. Returns None.
    """
    if classifiers is None:
        # looked up at call time, so the function can be defined before training
        classifiers = [
            ("CLASSICAL", classical_classifier),
            ("DISCO", disco_classifier),
            ("METAL", metal_classifier),
            ("POP", pop_classifier),
            ("REGGAE", reggae_classifier),
        ]

    # bounds[i]:bounds[i+1] is the run of flat frame indices belonging to file i
    frame_counts = [feat.shape[1] for feat in test_mfcc]
    bounds = np.concatenate(([0], np.cumsum(frame_counts))).astype(int)

    # file_scores[c, i] = sum of classifier c's frame scores over file i
    file_scores = np.zeros((len(classifiers), len(frame_counts)))
    for row, (_label, model) in enumerate(classifiers):
        raw = np.asarray(model.score_samples(test_data))
        for col in range(len(frame_counts)):
            file_scores[row, col] = raw[bounds[col]:bounds[col + 1]].sum()

    # each file votes for the classifier with the highest total
    wins = [0] * len(classifiers)
    for col in range(len(frame_counts)):
        wins[int(np.argmax(file_scores[:, col]))] += 1

    for (label, _model), count in zip(classifiers, wins):
        print(label + " =", count)
    print()

In [None]:
# RESULTS FOR THE TEST DATA
# For each genre's held-out files, print how many were assigned to each genre.
for banner, frames, per_file_mfcc in (
    ("~~~ CLASSICAL RESULTS ~~~", classical_test_data, classical_test_mfcc),
    ("~~~ DISCO RESULTS ~~~", disco_test_data, disco_test_mfcc),
    ("~~~ METAL RESULTS ~~~", metal_test_data, metal_test_mfcc),
    ("~~~ POP RESULTS ~~~", pop_test_data, pop_test_mfcc),
    ("~~~ REGGAE RESULTS ~~~", reggae_test_data, reggae_test_mfcc),
):
    print(banner)
    score_test(frames, per_file_mfcc)

In [None]:
# RESULTS FOR THE TRAIN DATA
# Same tally as for the test data, but on the files the models were fitted to.
for banner, frames, per_file_mfcc in (
    ("~~~ CLASSICAL RESULTS ~~~", classical_train_data, classical_train_mfcc),
    ("~~~ DISCO RESULTS ~~~", disco_train_data, disco_train_mfcc),
    ("~~~ METAL RESULTS ~~~", metal_train_data, metal_train_mfcc),
    ("~~~ POP RESULTS ~~~", pop_train_data, pop_train_mfcc),
    ("~~~ REGGAE RESULTS ~~~", reggae_train_data, reggae_train_mfcc),
):
    print(banner)
    score_test(frames, per_file_mfcc)

In [None]:
### --- NOTES --- ###

## WITH 1 GAUSSIAN:
# classical, metal, pop... accuracy=90%
# disco is misclassified about 50% of time, usually to pop or metal
# reggae is misclassified about 30-40% of the time, equally to the other genres

## WITH 2 GAUSSIANS:
# reggae sees the most noticeable increase to about 80%
# this may suggest reggae has two fairly distinct subclasses within it

## WITH 3 GAUSSIANS:
# slight bump to reggae and disco
# classical sometimes scores perfect

## WITH 7 GAUSSIANS:
# things start to flatten out... the best tests are getting worse and the worse tests are getting a little better, if at all

## WITH 15 GAUSSIANS:
# this seems to be the best!
# training data significantly outperforms test data... makes sense since each song has a strong direct influence on the gaussians created

## TRAIN VS TEST DATA:
# sees about the same accuracy
# classes with a high test data accuracy see a slight bump in train data accuracy

## MFCC PARAMETERS
# not noticing much difference when increasing n_mfcc past 20
# going too low reduces accuracy

## Part 3: Make it better (extra credit, required for 4-hour registrants)

There is no shortage of techniques (and free code) to use for classification. Revisit the two problems above and use any other type of classifier you want (Neural Nets, Boosting, Decision Trees, whatever). Also feel free to use any feature you want. Can you improve on the results you got before? How much higher can you get your accuracy for either case?

In [13]:
# FORMAT DATA FOR NEURAL NETWORK
def _frames_of(specs):
    """Flatten a list of (bins x frames) arrays into one list of column vectors, in file order."""
    return [spec[:, col] for spec in specs for col in range(spec.shape[1])]

music_train_data = _frames_of(music_train_stft)
music_train_y = ['music'] * len(music_train_data)  # one label per training frame
#print('music train', len(music_train_data), music_train_data[0].shape)

music_test_data = _frames_of(music_test_stft)
#print('music test', len(music_test_data), music_test_data[0].shape)

speech_train_data = _frames_of(speech_train_stft)
speech_train_y = ['speech'] * len(speech_train_data)  # one label per training frame
#print('speech train', len(speech_train_data), speech_train_data[0].shape)

speech_test_data = _frames_of(speech_test_stft)
#print('speech test', len(speech_test_data), speech_test_data[0].shape)

# pooled training set: all music frames first, then all speech frames
train_data = music_train_data + speech_train_data
train_y = music_train_y + speech_train_y

In [14]:
# TRAIN CLASSIFIERS (NEURAL NETWORK)
# A single MLP with library-default settings, trained on the pooled frames in
# train_data with the matching 'music' / 'speech' labels in train_y.
sm_classifier = MLPClassifier()
# Left as the cell's last expression, so the notebook displays the value fit() returns.
sm_classifier.fit(train_data, train_y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [15]:
# SCORE TEST DATA
# one predicted label per frame, for the held-out music and speech frames respectively
sm_score1, sm_score2 = (
    sm_classifier.predict(frames) for frames in (music_test_data, speech_test_data)
)

In [18]:
# PROCESS RESULTS
def _majority_vote(frame_preds, spectrograms, target):
    """Count the files whose frames were mostly predicted as `target`.

    `frame_preds` is the flat array of per-frame labels for all files in order;
    `spectrograms` supplies each file's frame count through shape[1]. For every
    file this prints "<index> <right> <wrong>" and counts the file as correct
    only when right > wrong (a tie counts as wrong). Returns the number of
    correct files.
    """
    files_correct = 0
    start = 0
    for idx, spec in enumerate(spectrograms):
        n_frames = spec.shape[1]
        right = sum(1 for pred in frame_preds[start:start + n_frames] if pred == target)
        wrong = n_frames - right
        start += n_frames
        if right > wrong:
            files_correct += 1
        print(idx, right, wrong)
    return files_correct

sm_result1 = _majority_vote(sm_score1, music_test_stft, 'music')
print("MUSIC #CORRECT =", sm_result1)
print()

# (the stray bare `print` expression that sat in the speech loop did nothing and is gone)
sm_result2 = _majority_vote(sm_score2, speech_test_stft, 'speech')
print("SPEECH #CORRECT =", sm_result2)

0 272 50
1 267 55
2 283 39
3 279 43
4 320 2
5 50 272
6 310 12
7 269 53
8 322 0
9 172 150
MUSIC #CORRECT = 9

0 321 1
1 319 3
2 311 11
3 320 2
4 317 5
5 313 9
6 313 9
7 299 23
8 316 6
9 321 1
SPEECH #CORRECT = 10
