# Speech Classifier

This notebook trains a simple MLP classifier that classifies utterances in the VariaNTS data by word class, including hyperparameter optimization using `RandomizedSearchCV`.

The trained classifier provides the class probabilities required to compute the Inception Score.

In [None]:
import os
import pickle
import sys
from tqdm import tqdm

sys.path.append('..') # append src/ directory

import numpy as np
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier

from utils.inception_score import InceptionScore
calculate_inception_score = InceptionScore.calculate_inception_score

In [3]:
# IPython magic: set CUDA_VISIBLE_DEVICES=0 in this kernel's environment.
# NOTE(review): scikit-learn's MLPClassifier runs on the CPU; presumably this
# is meant for code imported from utils -- confirm whether it is still needed.
%env CUDA_VISIBLE_DEVICES=0

env: CUDA_VISIBLE_DEVICES=0


In [None]:
dataset_path = "../../data/VariaNTS/VariaNTS_words_16kHz_HP_synth_aug_flattened_fixed-length"

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sort the .wav file names so that `data` and `labels` are built in the same
# order on every machine and every run.
wav_files = sorted(fn for fn in os.listdir(dataset_path) if fn.endswith('.wav'))

# One entry per .wav file: the features returned by InceptionScore.get_mfcc
# and the class label taken from the file name.
data, labels = [], []
for fn in tqdm(wav_files):
    # The label is the second underscore-separated field of the file name.
    label = fn.split('_')[1]
    mfcc = InceptionScore.get_mfcc(os.path.join(dataset_path, fn))
    data.append(mfcc)
    labels.append(label)

In [None]:
# Stack the per-file features and labels into arrays.
X = np.array(data)
y = np.array(labels)

# Flatten every sample to a single feature vector: (n_samples, n_features).
X = X.reshape(X.shape[0], -1)

# Hold out 10% of the samples for testing. The split is seeded (same seed as
# the hyperparameter search) so the reported scores are reproducible between
# runs instead of depending on a fresh random partition each time.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=14
)

# Notebook display: shapes of the four resulting arrays.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from scipy.stats import randint as sp_randint

# Two-step pipeline: standardise the features, then classify with an MLP.
scaler = sklearn.preprocessing.StandardScaler()
clf = MLPClassifier(max_iter=500)
pipe = Pipeline(steps=[('scaler', scaler), ('mlp', clf)])

# Search space; the `mlp__` prefix routes each parameter to the 'mlp' step.
param_dist = dict(
    # Single hidden layer whose width is a uniform random integer in [50, 500).
    mlp__hidden_layer_sizes=sp_randint(50, 500),
    mlp__activation=['logistic', 'tanh', 'relu'],
    mlp__solver=['sgd', 'adam', 'lbfgs'],
    mlp__alpha=[0.0001, 0.001, 0.01, 0.1],
    mlp__learning_rate=['constant', 'adaptive', 'invscaling'],
    mlp__learning_rate_init=[0.0001, 0.001, 0.05, 0.01],
    mlp__batch_size=[180, 120, 90],
)

# 50 sampled configurations, each scored with 5-fold cross-validation,
# run on 16 parallel workers with a fixed sampling seed.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    n_jobs=16,
    random_state=14,
    verbose=3,
)

search.fit(X_train, y_train)

In [None]:
# Report the configuration selected by the randomized search.
print("Best hyperparameters:", search.best_params_)

# Accuracy and class probabilities on both splits.
train_acc = search.score(X_train, y_train)
test_acc = search.score(X_test, y_test)
train_probs = search.predict_proba(X_train)
test_probs = search.predict_proba(X_test)

print('Train Acc.:', train_acc)
print('Test Acc.:', test_acc)

# Inception Score computed from the predicted class probabilities.
print('IS Train:', calculate_inception_score(train_probs))
print('IS Test:',  calculate_inception_score(test_probs))

In [None]:
# Fixed MLPClassifier hyperparameters used for the final model below.
mlp_params = {
    'activation': 'logistic',
    'alpha': 0.1,
    'batch_size': 90,
    'hidden_layer_sizes': 387,
    'learning_rate': 'constant',
    'learning_rate_init': 0.05,
    'solver': 'sgd',
}

In [None]:
# Rebuild the scaler + MLP pipeline with the fixed hyperparameters from
# `mlp_params` and train it on the training split.
scaler = sklearn.preprocessing.StandardScaler()
clf = MLPClassifier(max_iter=500, **mlp_params)
pipe = Pipeline(steps=[('scaler', scaler), ('mlp', clf)])
pipe.fit(X_train, y_train)

# Accuracy on the training split, then on the held-out X_test split
# (stored as `val_acc`).
train_acc = pipe.score(X_train, y_train)
val_acc = pipe.score(X_test, y_test)
print(train_acc)
print(val_acc)

# Inception Score of the predicted class probabilities: train, then test.
for split in (X_train, X_test):
    print(calculate_inception_score(pipe.predict_proba(split)))

In [None]:
# Persist the fitted pipeline (scaler + MLP) so it can be reloaded later.
model_path = '../../speech_classifier/speech_clf_pipeline_variants_aug.pickle'

# Create the target directory first: without it, open() raises
# FileNotFoundError and the freshly trained model would not be saved.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

with open(model_path, 'wb') as f:
    pickle.dump(pipe, f, protocol=pickle.HIGHEST_PROTOCOL)