In [1]:
import numpy as np
import librosa, librosa.display
import tensorflow.keras as keras
from tensorflow.keras.utils import to_categorical
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
import pickle
import os

In [2]:
# data augmentation (noise)
def manipulate(data, noise_factor):
    """Return a noisy copy of ``data`` for data augmentation.

    Standard-normal noise scaled by ``noise_factor`` is drawn independently for
    every element, so each sample in a batch is perturbed differently.

    Parameters
    ----------
    data : array-like
        Signal(s) to augment, e.g. a ``(n_samples, n_points)`` batch.
    noise_factor : float
        Multiplier applied to the standard-normal noise before it is added.

    Returns
    -------
    numpy.ndarray
        Augmented data with the same shape and dtype as ``data``.
    """
    data = np.asarray(data)
    # Draw noise with the full shape of ``data``. A single 1-D vector of length
    # ``len(data[0])`` would be broadcast and add the identical noise pattern to
    # every sample of the batch.
    noise = np.random.standard_normal(data.shape)
    augmented_data = data + noise_factor * noise
    # Cast back to the input's dtype. ``type(data[0])`` is ``numpy.ndarray`` for
    # 2-D input, which NumPy interprets as the generic ``object`` dtype.
    return augmented_data.astype(data.dtype)

In [3]:
def _build_cnn_model(input_shape, num_classes, learning_rate):
    """Build and compile one CNN member of the ensemble.

    Parameters
    ----------
    input_shape : tuple
        Shape of a single input example, ``(height, width, channels)``.
    num_classes : int
        Number of units in the final softmax layer.
    learning_rate : float
        Learning rate of the Adam optimizer.

    Returns
    -------
    keras.Sequential
        Compiled model using categorical cross-entropy on one-hot targets.
    """
    layers = keras.layers

    def conv(filters, kernel_size, **kwargs):
        # Every convolution shares activation, padding and the L2 weight penalty.
        return layers.Conv2D(filters=filters, kernel_size=kernel_size, activation="relu",
                             padding="same",
                             kernel_regularizer=keras.regularizers.l2(0.001), **kwargs)

    def pool():
        return layers.MaxPool2D((3, 3), strides=(2, 2), padding="same")

    model = keras.Sequential()

    # conv layer 1 (two stacked convolutions, pooled in between)
    model.add(conv(32, (10, 10), input_shape=input_shape))
    model.add(layers.BatchNormalization())
    model.add(pool())

    model.add(conv(32, (7, 7)))
    model.add(layers.BatchNormalization())
    model.add(layers.Dropout(0.3))

    # conv layer 2
    model.add(conv(64, (7, 7)))
    model.add(layers.BatchNormalization())
    model.add(pool())
    model.add(layers.Dropout(0.3))

    # conv layer 3
    model.add(conv(128, (3, 3)))
    model.add(layers.BatchNormalization())
    model.add(pool())
    model.add(layers.Dropout(0.3))

    # flatten output and apply the fully connected head
    model.add(layers.Flatten())
    for units in (512, 256, 128):
        model.add(layers.Dense(units, activation="relu"))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(0.5))

    # softmax classifier
    model.add(layers.Dense(num_classes, activation="softmax"))

    # The targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"])
    return model


def cnn_train(X_train_raw, y_train, model_save_path, epochs=120, batch_size=64,
              learning_rate=0.0005, num_models=50, num_classes=9):
    """Train an ensemble of CNNs on MFCC features and save every member to disk.

    Steps: augment the audio with noise (doubling the data set), shuffle,
    min-max scale each recording to [0, 1], extract MFCCs, then train
    ``num_models`` CNNs, each on a random 80% subset drawn without replacement.
    Member ``i`` is written to ``<model_save_path>/model<i>.h5``.

    Parameters
    ----------
    X_train_raw : numpy.ndarray
        Raw audio, one recording per row.
    y_train : numpy.ndarray
        One integer class label per row of ``X_train_raw``.
    model_save_path : str
        Directory the trained models are saved into (created if missing).
    epochs, batch_size, learning_rate : optional
        Training hyper-parameters for every ensemble member.
    num_models : int, optional
        Number of ensemble members to train.
    num_classes : int, optional
        Width of the one-hot targets and of the softmax layer.
    """
    SUBSAMPLE_FRACTION = 0.8

    print("Input audio shape: ", X_train_raw.shape)

    # Data augmentation (add noise): the noisy copies keep the original labels
    print("Performing data augmentation (adding noise)...")
    aug_train = manipulate(X_train_raw, 0.1)
    X_all = np.concatenate((X_train_raw, aug_train))
    y_all = np.concatenate((y_train, y_train))

    # Shuffle data (with augmentation)
    shuffler = np.random.permutation(len(X_all))
    X_all = X_all[shuffler]
    y_all = y_all[shuffler]

    # Min-max normalization. MinMaxScaler works column-wise, so the data is
    # transposed to scale every recording (row) independently.
    X_all = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(X_all.T).T
    print("Training audio shape: ", X_all.shape)

    # Extract MFCC feature
    print("Extracting MFCC feature...")
    sr = 44100
    n_mfcc = 13
    n_fft_mfcc = 2048
    hop_length_mfcc = 512

    # The audio is passed as ``y=``: newer librosa releases only accept it by keyword.
    MFCC_feature_matrix = np.array([
        librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft_mfcc,
                             hop_length=hop_length_mfcc)
        for signal in X_all
    ])
    print("MFCC feature shape: ", MFCC_feature_matrix.shape)

    # Add a depth of 1 so the data can be used in CNN
    X_train_total = MFCC_feature_matrix[..., np.newaxis]

    # One-hot encode output; the width is pinned so it always matches the softmax layer
    y_train_total = to_categorical(y_all, num_classes=num_classes)

    os.makedirs(model_save_path, exist_ok=True)

    n_total = X_train_total.shape[0]
    n_subset = int(n_total * SUBSAMPLE_FRACTION)

    for i in range(num_models):
        print("Training model number ", i)

        # Drop graph/layer state left behind by the previous member so memory
        # does not keep growing while many models are created in one process.
        keras.backend.clear_session()

        sample_ind = np.random.choice(n_total, n_subset, replace=False)
        X_subset = X_train_total[sample_ind]
        y_subset = y_train_total[sample_ind]

        model = _build_cnn_model(X_subset.shape[1:], num_classes, learning_rate)

        # train model
        model.fit(X_subset, y_subset, epochs=epochs, batch_size=batch_size,
                  shuffle=True, verbose=1)

        # save model
        model.save(os.path.join(model_save_path, 'model' + str(i) + '.h5'))

In [4]:
def knn_train(X_train, y_train, model_save_path):
    """Fit a K-NN classifier on time-averaged MFCC and STFT features and pickle it.

    Parameters
    ----------
    X_train : numpy.ndarray
        Raw audio, one recording per row.
    y_train : array-like
        One class label per row of ``X_train``.
    model_save_path : str
        File path the fitted classifier is pickled to.
    """
    # KNN parameters
    N_NEIGHBORS = 3
    METRIC = 'manhattan'
    WEIGHTS = 'distance'

    # MFCC
    sr = 44100
    n_mfcc = 13
    n_fft_mfcc = 2048
    hop_length_mfcc = 512

    print("Extracting MFCC feature...")
    # The audio is passed as ``y=``: newer librosa releases only accept it by keyword.
    MFCC_feature_matrix = np.array([
        librosa.feature.mfcc(y=X_train[i], sr=sr, n_mfcc=n_mfcc, n_fft=n_fft_mfcc,
                             hop_length=hop_length_mfcc)
        for i in range(X_train.shape[0])
    ])
    # Average over the last (frame) axis: one coefficient vector per recording
    MFCC_feature_matrix = np.mean(MFCC_feature_matrix, axis=2)

    # STFT
    n_fft_stft = 4096
    hop_length_stft = 2048

    print("Extracting STFT feature...")
    STFT_feature_matrix = np.array([
        np.abs(librosa.core.stft(y=X_train[i], n_fft=n_fft_stft, hop_length=hop_length_stft))
        for i in range(X_train.shape[0])
    ])
    # Average the magnitude spectrogram over the last (frame) axis
    STFT_feature_matrix = np.mean(STFT_feature_matrix, axis=2)

    # One row per recording: MFCC means followed by STFT magnitude means
    FeatureMatrix = np.concatenate((MFCC_feature_matrix, STFT_feature_matrix), axis=1)

    knn_class_1 = KNeighborsClassifier(n_neighbors=N_NEIGHBORS,
                                       metric=METRIC,
                                       weights=WEIGHTS)

    print("Training KNN model...")
    model = knn_class_1.fit(FeatureMatrix, y_train)

    # save model; the context manager guarantees the file is flushed and closed
    with open(model_save_path, 'wb') as knnPickle:
        pickle.dump(model, knnPickle)

    print("K-NN training completed")

In [8]:
def train(X_train, y_train):
    """Train and persist both model families on the same training data.

    The CNN ensemble is saved into the ``Ensemble_Learners`` directory and the
    K-NN classifier is pickled to the file ``model_knn``.

    Parameters
    ----------
    X_train : numpy.ndarray
        Raw audio, one recording per row.
    y_train : numpy.ndarray
        One class label per row of ``X_train``.
    """
    # create folder to put ensemble learners; exist_ok avoids the
    # check-then-create race and is a no-op when the folder is already there
    os.makedirs('Ensemble_Learners', exist_ok=True)

    # train CNN and save model
    print("Training ensemble learners...")
    cnn_train(X_train, y_train, 'Ensemble_Learners')
    # train KNN and save model
    print("Training knn...")
    knn_train(X_train, y_train, 'model_knn')

In [6]:
# Load the training set from .npy files in the working directory.
# Audio matrix: 2400 recordings x 100000 samples each.
data_training = np.load('data_training.npy')
# Labels take values 1 - 8 (2400 x 1 per the original note).
labels_training = np.load('labels_training.npy')

# Class names indexed by label value. Labels start at 1, so index 0 only
# holds a place holder that keeps the indices aligned.
labels_names = [
    'place holder',
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprise',
]

In [9]:
# Train and save every model: train() writes the CNN ensemble into
# 'Ensemble_Learners' and pickles the K-NN classifier to 'model_knn'.
# NOTE: the output recorded below stops at a cuDNN initialisation error raised
# while fitting model number 0, so this saved run did not complete training.
train(data_training, labels_training)

Training ensemble learners...
Input audio shape:  (2400, 100000)
Performing data augmentation (adding noise)...
Training audio shape:  (4800, 100000)
Extracting MFCC feature...
MFCC feature shape:  (4800, 13, 196)
Training model number  0
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 13, 196, 32)       3232      
_________________________________________________________________
batch_normalization (BatchNo (None, 13, 196, 32)       128       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 7, 98, 32)         0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 7, 98, 32)         50208     
_________________________________________________________________
batch_normalization_1 (Batch (None, 7, 98, 32)         128       
_______________

UnknownError:  Failed to get convolution algorithm. This is probably because cuDNN failed to initialize, so try looking to see if a warning log message was printed above.
	 [[node sequential/conv2d/Conv2D (defined at <ipython-input-3-30850119bac8>:106) ]] [Op:__inference_train_function_3131]

Function call stack:
train_function
