In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import confusion_matrix
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2 and is not used below.
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

ModuleNotFoundError: No module named 'seaborn'

## Load Data

In [None]:
# Load the MFCC feature table; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/original_mfcc_data.csv')
df.head()  # preview the first rows

In [None]:
# distinct values of the 'label' column, i.e. the emotion classes present in the raw data
df['label'].unique()

In [None]:
dfC = df.copy()  # work on a copy so the raw frame `df` stays untouched

# Integer codes for the target emotions (alphabetical order of the class names).
label_codes = {'Angry': 0, 'Disgust': 1, 'Fear': 2, 'Happy': 3, 'Neutral': 4, 'Sad': 5}
dfC['label'] = dfC['label'].map(label_codes)

# Series.map turns every label missing from the dict into NaN without warning;
# fail here with a readable message instead of deep inside a later model fit.
unmapped = df.loc[dfC['label'].isna(), 'label'].unique()
assert len(unmapped) == 0, f'Unmapped emotion labels: {list(unmapped)}'
dfC.head()

In [None]:
# check distribution of target variable (number of rows per encoded emotion)
dfC['label'].value_counts() # Neutral emotion has least number of records in the dataset

In [None]:
# separate target and features
def separate(data):
    """Split a labelled frame into features and target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'label' column plus the feature columns.

    Returns
    -------
    X : pandas.DataFrame
        Every column of `data` except 'label', in their original order.
    y : pandas.Series
        The 'label' column.

    Raises
    ------
    KeyError
        If `data` has no 'label' column.
    """
    y = data['label']
    # Drop the target by name rather than by position: slicing off the last
    # column would leak the label into X (and discard a real feature) whenever
    # 'label' is not the final column.
    X = data.drop(columns='label')
    return X, y

In [None]:
# X: feature columns, y: integer-encoded 'label' column of the working copy
X, y = separate(dfC)

In [None]:
# manual train/test split: hold out 20% of the rows for testing;
# random_state is fixed so the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Baseline: KNN with scikit-learn's default hyperparameters, scored on the hold-out split.
knn = KNeighborsClassifier().fit(X_train, y_train)
y_preds = knn.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_preds)
print('Accuracy of Baseline KNN: ', acc)

In [None]:
len(y_preds) # number of predictions; should equal the number of rows in X_test

In [None]:
# Display names for the confusion-matrix axes, listed in the same order as the
# integer codes 0-5 assigned to the labels during encoding.
cate = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad']

In [None]:
from sklearn.metrics import confusion_matrix, classification_report 

In [None]:
# Open a figure, then wrap the raw confusion counts in a frame whose rows
# (actual) and columns (predicted) carry the emotion names.
plt.figure(figsize=(12, 10))
cf_matrix = confusion_matrix(y_test, y_preds)
cm = pd.DataFrame(cf_matrix, index=list(cate), columns=list(cate))

In [None]:
# Create the figure in the same cell that draws on it: with Jupyter's default
# inline backend a figure opened in an earlier cell is rendered and closed when
# that cell finishes, so its figsize would never reach this heatmap.
plt.figure(figsize=(12, 10))
# fmt='' prints the integer counts as-is instead of seaborn's default '.2g'.
sns.heatmap(cm, linecolor="white", annot=True, linewidth=1, cmap="Blues", fmt='')
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.show()

In [None]:
# Plain-text view of the same hold-out results: raw confusion counts followed by
# per-class precision / recall / F1 (classes appear as their integer codes).
print(confusion_matrix(y_test, y_preds))
print(classification_report(y_test, y_preds))

By observing the baseline KNN confusion matrix, we see that the majority of Fear-labeled instances are classified as Happy and Disgust, whereas KNN was able to distinguish quite well between happiness and sadness. 

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# training model using k-fold cross-validation
# A fresh default KNN is scored on the full data set (X, y) with 10 folds;
# `scores` holds one accuracy value per fold.
knn = KNeighborsClassifier()
scores = cross_val_score(knn, X, y, cv = 10, scoring = 'accuracy')
scores

With 10-fold cross-validation, the per-fold scores do not vary significantly from the baseline KNN accuracy and lie within ±5% of it.

In [None]:
# use average accuracy across the 10 folds as an estimate of out-of-sample accuracy
# (`scores` is a numpy array, so it has a mean() method)
scores.mean()

In [None]:
# search for an optimal value of K for KNN

# candidate neighbour counts: k = 1 .. 30
k_range = range(1, 31)
# mean 10-fold CV accuracy for each k, in the same order as k_range
k_scores = []

# 1. we will loop through reasonable values of k
for k in k_range:
    # 2. run KNeighborsClassifier with k neighbours
    knn = KNeighborsClassifier(n_neighbors = k)
    # 3. obtain cross_val_score for KNeighborsClassifier with k neighbours
    #    (30 values of k x 10 folds = 300 model fits, so this cell is slow)
    scores = cross_val_score(knn, X, y, cv = 10, scoring='accuracy')
    # 4. append mean of scores for k neighbors to k_scores list
    k_scores.append(scores.mean())


# NOTE: after the loop, `knn` and `scores` refer to the last k tried (k = 30).
print(k_scores)

In [None]:
# one mean CV accuracy per candidate k, so 30 entries are expected
print('Length of list', len(k_scores)) # length of scores should be 30 due to running k-fold cv method 30 times
print('Max of list', max(k_scores))  # best mean CV accuracy over all k tried

In [None]:
# Mean 10-fold CV accuracy as a function of the neighbour count k.
plt.plot(k_range, k_scores)
plt.xlabel('Neighbors (k)')
plt.ylabel('Cross-validated accuracy')
plt.title('CV Accuracy vs. K Neighbors')
# End with plt.show() like the other plotting cells, so the cell renders the
# figure instead of echoing the repr of the last call's return value.
plt.show()

From the elbow method, we found that k = 20 neighbors is where the median cross-validated accuracy occurs. Let's retrain KNN with this elbow-method-verified value of k and observe the mean cross-validated accuracy.

In [None]:
# 10-fold cross-validation with best KNN model
# calculate mean directly on results
# NOTE: despite its name, `y_pred` holds a single number (the mean CV accuracy),
# not an array of predictions.
knn = KNeighborsClassifier(n_neighbors = 20)
y_pred = cross_val_score(knn, X, y, cv = 10, scoring = 'accuracy').mean()

In [None]:
# mean 10-fold CV accuracy of the k = 20 model computed in the previous cell
y_pred

And as expected, the model with k = 20 neighbors performs slightly better than sklearn's default model with k = 5 neighbors.

## SVC

In [None]:
from sklearn.svm import SVC

In [None]:
# Baseline SVM: RBF kernel, remaining hyperparameters at their defaults,
# trained on the training split and scored on the hold-out split.
best_svc = SVC(kernel='rbf').fit(X_train, y_train)
y_preds = best_svc.predict(X_test)
svc_acc = accuracy_score(y_true=y_test, y_pred=y_preds)
print('Accuracy of Baseline SVC: ', svc_acc)

In [None]:
# Open a figure, then wrap the SVC confusion counts in a frame whose rows
# (actual) and columns (predicted) carry the emotion names.
plt.figure(figsize=(12, 10))
cf_matrix = confusion_matrix(y_test, y_preds)
cm = pd.DataFrame(cf_matrix, index=list(cate), columns=list(cate))

In [None]:
# Create the figure in the same cell that draws on it: with Jupyter's default
# inline backend a figure opened in an earlier cell is rendered and closed when
# that cell finishes, so its figsize would never reach this heatmap.
plt.figure(figsize=(12, 10))
# fmt='' matches the KNN heatmap: without it seaborn formats annotations with
# '.2g', which shows counts of 100 or more in scientific notation (e.g. 1.2e+02).
sns.heatmap(cm, linecolor="white", annot=True, linewidth=1, cmap="Blues", fmt='')
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.show()

In [None]:
# Re-predict with the fitted baseline SVC (same call as in the baseline cell),
# then print raw confusion counts and the per-class precision / recall / F1 report.
y_preds = best_svc.predict(X_test)
print(confusion_matrix(y_test, y_preds))
print(classification_report(y_test, y_preds))

The second algorithm we used is the SVM classifier, specifically the non-linear SVM, since the boundary the algorithm computes does not have to be a straight line. We think the algorithm's kernel trick can help compute a much better hyperplane for our unstructured audio data. The baseline SVC accuracy aligns with our hypothesis that our unstructured data may be better modeled by selecting an appropriate non-linear kernel function. Our confusion matrix shows that Fear has the fewest correct predictions and is largely misclassified as Happy.

In [None]:
# run cross validation on SVC
# NOTE: this rebinds `best_svc` to a new SVC object that is not fitted in this
# cell; the 10 per-fold accuracies on the full data (X, y) go into `svc_scores`.
best_svc = SVC(kernel = 'rbf')
svc_scores = cross_val_score(best_svc, X, y, cv = 10, scoring = 'accuracy')
svc_scores

In [None]:
# summary of the RBF-kernel cross-validation: mean, number of folds, best fold
print('Average k-fold score: ', svc_scores.mean())
print('Length of list', len(svc_scores)) 
print('Max of list', max(svc_scores))

The maximum cross-validated score of ~38.8% is above KNN's maximum cross-validated score by around 4%. 

In [None]:
# any improvement using poly kernel?
# same 10-fold protocol as the RBF run, only the kernel differs
svc2 = SVC(kernel = 'poly')
svc_scores2 = cross_val_score(svc2, X, y, cv = 10, scoring = 'accuracy')
svc_scores2

In [None]:
# summary of the polynomial-kernel cross-validation: mean, number of folds, best fold
print('Average k-fold score: ', svc_scores2.mean())
print('Length of list', len(svc_scores2)) 
print('Max of list', max(svc_scores2)) # not really any improvement

A polynomial kernel did not perform significantly worse, but it also rarely yielded any improvement over an SVC with the RBF kernel. Thus, we decided to stick with the radial basis function kernel for re-training with dimensionality reduction.

## SVC + PCA

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# standardize the input
# The scaler is fitted on the training split only; `.values` passes the raw
# array rather than the DataFrame.

x_train = StandardScaler().fit_transform(X_train.values)
print(x_train)
print(len(x_train[0]))  # number of feature columns

In [None]:
# sanity check over all entries: after standardization the overall mean should
# be close to 0 and the overall standard deviation close to 1
print(np.mean(x_train))
print(np.std(x_train))

In [None]:
# testing on three levels of explained variance (90%, 95%, 98%)
from sklearn.decomposition import PCA

In [None]:
# set number of components equal to number of input features
# (PCA() with no n_components keeps every component it can compute)
pca = PCA().fit(x_train) # fit as many principal components possible

# confirm findings by how much variance is explained by the principal components derived
# one ratio per component; the cumulative sums are inspected in the next cells
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio

In [None]:
# inverse of scree plot
# Only y-values are passed, so the x-axis is the 0-based component index:
# position i shows the variance explained by the first i + 1 components.
plt.plot(np.cumsum(explained_variance_ratio))
plt.title('Cumulative Explained Variance per Principal Component')
plt.xlabel('Principal Components')
plt.ylabel('Variance')

In [None]:
# cumulative explained-variance ratio for every component count
np.cumsum(explained_variance_ratio)

In [None]:
# same running total restricted to the first 65 components (the largest count used below)
np.cumsum(explained_variance_ratio[:65])

In [None]:
def refit(train_data, test_data, n_components):
    """Project both splits onto principal components learned from the training split.

    Parameters
    ----------
    train_data : array-like
        Training features; the PCA is fitted on these.
    test_data : array-like
        Test features; only transformed, never used for fitting.
    n_components : int
        Value forwarded to ``PCA(n_components=...)``.

    Returns
    -------
    tuple
        ``(projected_train, projected_test)``.
    """
    reducer = PCA(n_components=n_components)
    projected_train = reducer.fit_transform(train_data)
    return projected_train, reducer.transform(test_data)

In [None]:
# Scale the test split with statistics learned from the TRAINING split only.
# Fitting a second scaler on X_test would use test-set means/variances, so the
# test features would live on a different scale than the one the PCA and SVC
# were fitted on (and test information would leak into preprocessing).
x_test = StandardScaler().fit(X_train.values).transform(X_test.values)

In [None]:
# Three PCA projections of the standardized splits, keeping 17, 34 and 65 components.
# NOTE(review): presumably these counts correspond to the 90% / 95% / 98%
# explained-variance levels mentioned earlier; confirm against the cumulative sums.
pca_X_train1, pca_X_test1 = refit(x_train, x_test, 17)
pca_X_train2, pca_X_test2 = refit(x_train, x_test, 34)
pca_X_train3, pca_X_test3 = refit(x_train, x_test, 65)

In [None]:
# inspect the 65-component projection of the training split
pca_X_train3

In [None]:
# number of columns in the projected test split; should be 65
len(pca_X_test3[0])

In [None]:
# RBF-kernel SVC trained on the 17-component projection; accuracy is reported on the hold-out split
final_svm_model = SVC(kernel='rbf')
final_svm_model.fit(pca_X_train1, y_train)
final_pred_train = final_svm_model.predict(pca_X_train1)
final_pred_test = final_svm_model.predict(pca_X_test1)
print('Test Data Accuracy after PCA Transformation: ', accuracy_score(y_true=y_test, y_pred=final_pred_test))

In [None]:
# RBF-kernel SVC trained on the 34-component projection; accuracy is reported on the hold-out split
final_svm_model2 = SVC(kernel='rbf')
final_svm_model2.fit(pca_X_train2, y_train)
final_pred_train2 = final_svm_model2.predict(pca_X_train2)
final_pred_test2 = final_svm_model2.predict(pca_X_test2)
print('Test Data Accuracy after PCA Transformation: ', accuracy_score(y_true=y_test, y_pred=final_pred_test2))

In [None]:
# RBF-kernel SVC trained on the 65-component projection; accuracy is reported on the hold-out split
final_svm_model3 = SVC(kernel='rbf')
final_svm_model3.fit(pca_X_train3, y_train)
final_pred_train3 = final_svm_model3.predict(pca_X_train3)
final_pred_test3 = final_svm_model3.predict(pca_X_test3)
print('Test Data Accuracy after PCA Transformation: ', accuracy_score(y_true=y_test, y_pred=final_pred_test3))

Despite the slight variation in performance each time the SVM model is refit with a different number of PCA components, we observed that, most of the time, it is best to keep only the first 17 or 34 components, given the boost in performance from our baseline accuracy of 34.5% to 37.5%.

In [None]:
# cross-validated SVM w/PCA
# Scaling and PCA are wrapped in a pipeline together with the SVC so that they
# are re-fitted inside every fold on that fold's training part only. Passing a
# bare SVC with the raw X (as before) applied no PCA at all and merely repeated
# the plain SVC cross-validation.
# 34 components: the middle setting evaluated on the hold-out split above.
svc_w_pca = make_pipeline(StandardScaler(), PCA(n_components=34), SVC(kernel='rbf'))
svc_pca_scores = cross_val_score(svc_w_pca, X, y, cv=10, scoring='accuracy')
print(svc_pca_scores)
print('Average k-fold score with RBF kernel: ', svc_pca_scores.mean())
print('Length of list', len(svc_pca_scores))
print('Max of list', max(svc_pca_scores))

In [None]:
# cross-validated SVM w/PCA (polynomial kernel)
# Scaling and PCA are wrapped in a pipeline together with the SVC so that they
# are re-fitted inside every fold on that fold's training part only. Passing a
# bare SVC with the raw X (as before) applied no PCA at all and merely repeated
# the plain polynomial-kernel cross-validation.
# 34 components: the middle setting evaluated on the hold-out split above.
svc_w_pca2 = make_pipeline(StandardScaler(), PCA(n_components=34), SVC(kernel='poly'))
svc_pca_scores2 = cross_val_score(svc_w_pca2, X, y, cv=10, scoring='accuracy')
print(svc_pca_scores2)
print('Average k-fold score with poly kernel: ', svc_pca_scores2.mean())
print('Length of list', len(svc_pca_scores2))
print('Max of list', max(svc_pca_scores2))

When cross validation is performed with SVM + PCA combined, it can be seen that the new combined method with the RBF kernel still yields slightly better performance than the same with a polynomial kernel.

## LSTM

In [None]:
def to_3D(input_data):
    """Insert a length-1 axis at position 1, e.g. (n, d) -> (n, 1, d).

    Each row of `input_data` becomes a sequence with a single time step, which
    is the (samples, timesteps, features) layout the LSTM below is fed with.
    The shapes before and after are printed.

    Parameters
    ----------
    input_data : numpy.ndarray
        Array whose first axis indexes samples.

    Returns
    -------
    numpy.ndarray
        A new array (not a view of `input_data`) with the extra axis. Unlike a
        row-by-row append, an input with zero rows keeps its feature
        dimension: (0, d) -> (0, 1, d).
    """
    print("Before reshaping: ", input_data.shape)
    # np.array copies, so the result does not alias the caller's array.
    X = np.expand_dims(np.array(input_data), axis=1)
    print("After reshaping: ", X.shape)
    return X

In [None]:
# every column of the raw frame except the last one, as a NumPy array
# (assumes the last column is the label, as the targets cell below also does)
features = df.iloc[:, :-1].to_numpy()

In [None]:
# add the single-time-step axis expected by the LSTM input layer
features_3D = to_3D(features)

In [None]:
# Last column of the raw frame holds the emotion labels.
targets = df.iloc[:, -1].to_numpy()
# LabelEncoder assigns integer codes in sorted (alphabetical) class order.
le = LabelEncoder()
targets_arr = le.fit_transform(targets)
# One-hot encode the integer codes: row i of the identity matrix is the one-hot
# vector of class i. This replaces a call to `to_categorical`, which is not
# imported anywhere in this notebook, and yields the same float32 matrix.
targets_arr = np.eye(len(le.classes_), dtype='float32')[targets_arr]

In [None]:
# 80/20 split of the 3-D features and one-hot targets for the LSTM.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which the KNN and SVC
# cells above also use; re-running those cells after this one will pick up the
# 3-D arrays instead of the tabular split.
X_train, X_test, y_train, y_test = train_test_split(features_3D, targets_arr, test_size=0.2, random_state=1)

In [None]:
# NOTE(review): Sequential, InputLayer, SpatialDropout1D, LSTM and Dense are not
# imported in any visible cell of this notebook; they must be imported from
# Keras before this cell can run.
def create_model(input_shape=(1, 216), n_classes=6):
    """Build and compile the LSTM emotion classifier.

    Parameters
    ----------
    input_shape : tuple, default (1, 216)
        (timesteps, features) of one sample.
    n_classes : int, default 6
        Width of the softmax output layer.

    Returns
    -------
    The compiled Sequential model (categorical cross-entropy loss, Adam
    optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(InputLayer(input_shape=input_shape))
    model.add(SpatialDropout1D(0.1))
    model.add(LSTM(100, dropout=0.1, recurrent_dropout=0.1))
    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
    # The original line read `return modelmodel = create_model()`: the return
    # statement and the next cell's first line had been pasted together, which
    # is a syntax error. The model is instantiated and summarized in the next cell.
    return model

In [None]:
# instantiate the network and print its layer-by-layer summary
model = create_model()
model.summary()

In [None]:
# Stop once the monitored quantity has not improved for 5 epochs and roll the
# model back to the best weights seen.
earlystopping = EarlyStopping(patience=5, restore_best_weights=True)

# Weights are checkpointed during training under ckpts/.
checkpoint_path = "ckpts/cp_fs.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)
# Make sure the target directory exists before the first checkpoint is written
# (checkpoint_dir was previously computed but never used).
os.makedirs(checkpoint_dir, exist_ok=True)
cp_callback = ModelCheckpoint(filepath=checkpoint_path, save_weights_only=True, verbose=1)
batch_size = 16
epochs = 50

# 15% of the training split is held out as validation data; its loss and
# accuracy are recorded in `history` next to the training curves.
history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size,
         validation_split=0.15,
         callbacks=[cp_callback, earlystopping])

### Visualization

In [None]:
# Training vs. validation loss per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# The second curve comes from validation_split, not from the held-out test
# set, so it is labelled 'validation' rather than 'test'.
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('acc')
plt.xlabel('epoch')
# The second curve comes from validation_split, not from the held-out test
# set, so it is labelled 'validation' rather than 'test'.
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

### Metrics

In [None]:
# Evaluate on the held-out test split. NOTE: despite its name, `accuracy` is a
# sequence whose element 0 is printed as the loss and element 1 as the accuracy.
accuracy = model.evaluate(X_test, y_test)
print('Loss: {:0.3f}\nAccuracy: {:0.3f}'.format(accuracy[0],accuracy[1]))

In [None]:
# Sequential.predict_classes was removed from Keras (TensorFlow 2.6); the class
# index is the argmax over the softmax probabilities returned by predict().
prediction = np.argmax(model.predict(X_test), axis=-1)
# y_test is one-hot encoded, so the true class is the position of the 1 in each row.
label = np.argmax(y_test, axis=1)

In [None]:
confusion = confusion_matrix(labels=label, predictions=prediction, num_classes=6)

In [None]:
cate = ['Angry', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad']

In [None]:
cm = pd.DataFrame(confusion.numpy(), index = [i for i in cate] , columns = [i for i in cate])
sns.heatmap(cm, linecolor='white', cmap='Blues', linewidth=1, annot=True, fmt='')
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.show()