### An example of 1D CNN on RAVDESS data (numpy implementation)
Wei Li

In [1]:
## RAVDESS Emotional speech audio
## https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
## Speech audio-only files (16bit, 48kHz .wav) from the RAVDESS.
##
## This portion of the RAVDESS contains 1440 files: 60 trials per actor x 24 actors = 1440.
## The RAVDESS contains 24 professional actors (12 female, 12 male),
## vocalizing two lexically-matched statements in a neutral North American accent.
## Speech emotions include calm, happy, sad, angry, fearful, surprise, and disgust expressions.
## Each expression is produced at two levels of emotional intensity (normal, strong), with an additional neutral expression.

# The filename consists of a 7-part numerical identifier (e.g., 03-01-06-01-02-01-12.wav). These identifiers define the stimulus characteristics:

# Filename identifiers:
# Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
# Vocal channel (01 = speech, 02 = song).
# Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
# Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
# Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
# Repetition (01 = 1st repetition, 02 = 2nd repetition).
# Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).
# Filename example: 03-01-06-01-02-01-12.wav

# Emotions in the RAVDESS dataset
# emotions = {
#     "01": "neutral",
#     "02": "calm",
#     "03": "happy",
#     "04": "sad",
#     "05": "angry",
#     "06": "fearful",
#     "07": "disgust",
#     "08": "surprised",
# }

In [1]:
import os
import sys
import soundfile
import librosa

# SoundFile is not available for a conda install.
# pip install SoundFile

# If error occurs: ".../envs/py38torch/lib/libffi.7.dylib' (no such file)"
# try the following:
# cd /Users/wli169/miniconda3/envs/py38torch/lib/
# ln -s libffi.6.dylib libffi.7.dylib
# This command creates a symbolic link named libffi.7.dylib
# that points to the file libffi.6.dylib.

# If running this as a .py script, uncomment the next line instead of relying on
# the working-directory change below:
# sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
# It adds the parent directory of this file (__file__) to the beginning of the
# module search path.

# Change the working directory to the parent folder (presumably the project root
# that holds the local packages imported below -- confirm for your checkout).
# The guard makes this cell safe to re-run: an unguarded os.chdir("..") would
# climb one more directory on every execution within the same kernel session.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()  # remembered so the original location stays recoverable
    os.chdir("..")

import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from utils import get_data_utils
from utils import data_processor
from models.cnn import *
from nn.modules.loss import *
from nn.modules.activation import *
from nn.modules.linear import *
from nn.modules.dropout import *
from nn.modules.initializer import *
from optim.sgd import *
from optim.adam import *
from evaluation.multiclass_eval import *

import random

random_seed = 123
os.environ["PL_GLOBAL_SEED"] = str(random_seed)
random.seed(random_seed)
np.random.seed(random_seed)

In [3]:
# %pip install watermark
# Record the author tag together with the Python and package versions of the
# environment this notebook was run in.
%load_ext watermark
%watermark -a "Wei Li" -u -t -d -v -p numpy,torch,soundfile,librosa

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Author: Wei Li

Python implementation: CPython
Python version       : 3.8.17
IPython version      : 8.12.2

numpy    : 1.21.5
torch    : 1.12.1
soundfile: 0.12.1
librosa  : 0.10.1



In [4]:
# subset of emotions under consideration
# emotions_labels = ["calm", "happy", "fearful", "disgust"]
emotions_labels = [
    "neutral",
    "calm",
    "happy",
    "sad",
    "angry",
    "fearful",
    "disgust",
    "surprised",
]

###############################################
#### -----  check on one example -------- ####
###############################################

# Load one clip at 48 kHz; `sr` comes back as the rate actually used and is
# reused below instead of repeating the literal.
example, sr = librosa.load(
    "/Users/wli169/Documents/Work/datasets/RAVDESS-speech/Actor_01/03-01-01-01-01-01-01.wav",
    sr=48000,  # default resampling rate sr=22050
)
# Note: an alternative way to load is to use soundfile.SoundFile();
# however, some files cannot be loaded properly that way,
# e.g. /Actor_20/03-01-06-01-01-02-20.wav,
# as the duration gets truncated.

librosa.get_duration(y=example, sr=sr)

type(example), example.shape
# np.ndarray shape: (158558,) where 48000*1*3.3 secs
# mono (1 channel)

librosa.feature.mfcc(
    y=example, sr=sr, n_mfcc=40
).shape  # (n_mfcc, t=num_frames)=(40, 310); librosa's own default is n_mfcc=20

# chroma_stft expects a real-valued spectrogram, so pass the magnitude of the
# complex STFT (the same thing extract_feature does), not the raw complex output.
stft = np.abs(librosa.stft(example))
librosa.feature.chroma_stft(S=stft, sr=sr, n_chroma=12).shape  #  default n_chroma=12

# Last expression of the cell: this is the value the notebook displays.
librosa.feature.melspectrogram(
    y=example, sr=sr, n_mels=128
).shape  # default n_mels=128

(128, 310)

In [5]:
######################################
#### -----  load data -------- ####
### We set chroma=True, mel=True
######################################

# This function extracts audio features from a sound file
def extract_feature(file_name, mfcc, chroma, mel, sr=48000):
    """Build a 1-D feature vector for one audio file from time-averaged spectral features.

    Parameters
    ----------
    file_name : str or path-like
        Audio file to read; handed to ``librosa.load``.
    mfcc : bool
        If true, append the per-coefficient mean (over frames) of 40 MFCCs.
    chroma : bool
        If true, append the per-bin mean (over frames) of a 12-bin chromagram
        computed from the STFT magnitude.
    mel : bool
        If true, append the per-band mean (over frames) of a 128-band mel spectrogram.
    sr : int or None, optional
        Sampling rate passed to ``librosa.load``. Defaults to 48000, the value
        that used to be hard-coded here, so existing callers are unaffected.

    Returns
    -------
    numpy.ndarray
        1-D array with the selected groups concatenated in the order
        mfcc, chroma, mel (40 + 12 + 128 = 180 values when all are enabled).
        Empty when every flag is false.
    """
    signal, sample_rate = librosa.load(file_name, sr=sr, dtype="float32")

    # Start from an empty float array, exactly like the original accumulator, so
    # the dtype of the concatenated result is unchanged.
    pieces = [np.array([])]

    if mfcc:
        mfcc_frames = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
        pieces.append(np.mean(mfcc_frames.T, axis=0))  # shape (n_mfcc,)

    if chroma:
        # The magnitude spectrogram is only needed for the chroma features.
        magnitude = np.abs(librosa.stft(signal))
        chroma_frames = librosa.feature.chroma_stft(
            S=magnitude, sr=sample_rate, n_chroma=12
        )
        pieces.append(np.mean(chroma_frames.T, axis=0))

    if mel:
        mel_frames = librosa.feature.melspectrogram(
            y=signal, sr=sample_rate, n_mels=128
        )
        pieces.append(np.mean(mel_frames.T, axis=0))

    # The flag arguments are no longer overwritten with arrays; each feature
    # group gets its own local name and is joined once at the end.
    return np.hstack(pieces)


# Sanity check on a single clip with every feature group switched on:
# 40 mfccs + 12 chroma + 128 mels = 180 features, i.e. shape (180, )
example_path = "/Users/wli169/Documents/Work/datasets/RAVDESS-speech/Actor_01/03-01-01-01-01-01-01.wav"
example_feature = extract_feature(example_path, mfcc=True, chroma=True, mel=True)
example_feature.shape


(180,)

In [6]:
# Load the RAVDESS features and labels as numpy arrays through the project helper.
x_dat, y_dat = get_data_utils.get_np_ravdess(
    emotions_labels,
    extract_feature,
    chroma_T=True,
    mel_T=True,
)
# total sample: x_dat.shape, y_dat.shape == ((1440, 180), (1440,))

# 80/20 split with a fixed seed.
# stratify=y_dat: each category is roughly equally represented in training and test data.
# Since we have a small dataset, we just use validation data as test data.
split = train_test_split(
    x_dat,
    y_dat,
    test_size=0.2,
    random_state=2023,
    stratify=y_dat,
)
x_train, x_test, y_train, y_test = split

# Get the shape of the training and testing datasets (inputs first, then targets)
for train_part, test_part in ((x_train, x_test), (y_train, y_test)):
    print((train_part.shape, test_part.shape))

((1152, 180), (288, 180))
((1152,), (288,))


In [7]:
############################################
### ----- toy MLP from scikit-learn ----- ###
############################################

# Hyper-parameters of the baseline multi-layer perceptron classifier.
# NOTE(review): per the scikit-learn docs, `learning_rate` only takes effect with
# solver="sgd"; the solver is left at its default here -- confirm this is intended.
mlp_settings = dict(
    hidden_layer_sizes=(256, 128),
    alpha=0.01,
    batch_size=128,
    epsilon=1e-08,
    learning_rate="adaptive",
    max_iter=1000,
    random_state=2023,
)
model = MLPClassifier(**mlp_settings)

# Train on the training split, then predict the held-out split
y_pred = model.fit(x_train, y_train).predict(x_test)

# Share of held-out samples whose predicted label matches the true one
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy: {:.2f}%".format(accuracy * 100))
# Accuracy: 47.22%

Accuracy: 47.22%


In [8]:
######################################
######## ----- CONV1D ----- ##########
######################################

# Every processing step below writes to a NEW name instead of overwriting
# x_train / x_test / y_train / y_test. The cell can therefore be re-run, and the
# cells above still see the split they were written for.

# process data targets: labels -> integer class ids
# (presumably the index of each label in emotions_labels -- see data_processor.label_to_num)
y_train_num = data_processor.label_to_num(y_train, emotions_labels)
y_test_num = data_processor.label_to_num(y_test, emotions_labels)
# np.bincount(y_train_num), np.bincount(y_test_num) can be used to check that
# each category is roughly equally represented in training and test data

# integer class ids -> one-hot targets with one column per emotion
y_train_onehot = data_processor.to_onehot(
    y_train_num.reshape(-1, 1), len(emotions_labels)
)
y_test_onehot = data_processor.to_onehot(
    y_test_num.reshape(-1, 1), len(emotions_labels)
)

# normalize data input
dat_norm = data_processor.input_normalizer()
# normalize using mean and std column-wise; the statistics are fitted on the
# training data only and then re-used for the test data
# NOTE(review): assumes fit_transform/transform return new arrays rather than
# modifying their argument in place -- confirm in data_processor.
x_train_norm = dat_norm.fit_transform(x_train, method="column")
# dat_norm.mu: vector of means, dat_norm.std: vector of stds
x_test_norm = dat_norm.transform(x_test)

# reshape to (sample size, number channel=1, num_features)
x_train_cnn = x_train_norm[:, np.newaxis, :]
x_test_cnn = x_test_norm[:, np.newaxis, :]

# train, validation, test data
# since we have a small dataset, we just use validation data as test data
train_data = [x_train_cnn, y_train_onehot]
test_data = [x_test_cnn, y_test_onehot]


##### ---- set up the user-defined model  ---- ######

# x (batch_size, in_channels=1, in_width=180)
# Conv1D (out_channels=8, kernel=10, stride=1) -> Tanh
# pool (kernel=2)
# Conv1D (out_channels=8, kernel=10, stride=2) -> ReLU
# pool (kernel=2)
# Conv1D (out_channels=4, kernel=4, stride=2) -> Sigmoid
# pool (kernel=1)
# Flatten1D
# Linear (out_features=8)

input_width = x_train_cnn.shape[2]  # num_features
input_channels = 1
num_linear_neurons = len(emotions_labels)  # one output per emotion class
out_channels = [8, 8, 4]
kernel_sizes = [10, 10, 4]
strides = [1, 2, 2]
pool_kernel_sizes = [2, 2, 1]
activations = [Tanh(), ReLU(), Sigmoid()]

conv_weight_init_fn = weight_init_He_CNN
# NOTE(review): the linear layer uses the same initializer as the conv layers --
# confirm this is intended.
linear_weight_init_fn = weight_init_He_CNN
bias_init_fn = bias_init_zeros
# criterion = SoftmaxCrossEntropy()
criterion = CrossEntropyLoss()
lr = 1e-2

cnn1d_model = CNN1D(
    input_width,
    input_channels,
    out_channels,
    kernel_sizes,
    strides,
    num_linear_neurons,
    activations,
    conv_weight_init_fn,
    bias_init_fn,
    linear_weight_init_fn,
    pool_kernel_sizes,
    pool_mode="average",
)

# check some utility functions
cnn1d_model.print_structure()

# parameter list handed to the optimizer in the training cell
model_paras_list = cnn1d_model.get_parameters()

-----------------------
The model architecture:
layer0:
	sublayer0: <nn.modules.conv.Conv1D object at 0x28f219730>
	sublayer1: <nn.modules.activation.Tanh object at 0x11011f1c0>
layer1:
	sublayer0: <nn.modules.conv.Pool1D object at 0x28f219520>
layer2:
	sublayer0: <nn.modules.conv.Conv1D object at 0x28f223e20>
	sublayer1: <nn.modules.activation.ReLU object at 0x28d5406a0>
layer3:
	sublayer0: <nn.modules.conv.Pool1D object at 0x28f223f10>
layer4:
	sublayer0: <nn.modules.conv.Conv1D object at 0x28f223a60>
	sublayer1: <nn.modules.activation.Sigmoid object at 0x28f219190>
layer5:
	sublayer0: <nn.modules.conv.Pool1D object at 0x28f223c10>
layer6:
	sublayer0: <nn.modules.conv.Flatten1D object at 0x28f219640>
layer7:
	sublayer0: <nn.modules.linear.Linear object at 0x28f223910>

---------------------------------
layers with learnable parameters:
layer0 
 (0)conv1d
(8, 1, 10)
(8, 1)

layer2 
 (0)conv1d
(8, 8, 10)
(8, 1)

layer4 
 (0)conv1d
(4, 8, 4)
(4, 1)

layer7 
 (0)linear
(8, 32)
(8, 1)

la

In [9]:
# training
num_epochs, batch_size = 100, 128

# One Adam optimizer built from the model's parameter list; the trainer is
# given its optimizers as a list.
optimizer = Adam(model_paras_list, lr=lr)
optimizers = [optimizer]

# The test split doubles as validation data here.
output = trainer_multiclass(
    cnn1d_model, optimizers, criterion,
    train_data, test_data,
    num_epochs, batch_size,
    print_all=False,
)

# Unpack the four values returned by the trainer
(
    training_losses,
    training_errors,
    validation_losses,
    validation_errors,
) = output

# reference performance:
# out_channels = [8, 8, 4]
# kernel_sizes = [10, 10, 4]
# strides = [1, 2, 2]
# activations = [Tanh(), ReLU(), Sigmoid()]
# Epoch: 100/100 | Train loss: 0.8035 | Validation loss: 2.2211
# Epoch: 100/100 | Train error: 0.2622 | Validation error: 0.6319

Epoch: 001/100 | Train loss: 2.0219 | Validation loss: 1.9361 
Epoch: 001/100 | Train error: 0.8047 | Validation error: 0.7431 
Time elapsed: 0.88 min
Epoch: 002/100 | Train loss: 1.9308 | Validation loss: 1.8792 
Epoch: 002/100 | Train error: 0.7674 | Validation error: 0.7535 
Time elapsed: 0.88 min
Epoch: 003/100 | Train loss: 1.8821 | Validation loss: 1.8427 
Epoch: 003/100 | Train error: 0.7405 | Validation error: 0.7292 
Time elapsed: 0.88 min
Epoch: 004/100 | Train loss: 1.8502 | Validation loss: 1.8251 
Epoch: 004/100 | Train error: 0.7240 | Validation error: 0.7049 
Time elapsed: 0.88 min
Epoch: 005/100 | Train loss: 1.8215 | Validation loss: 1.8083 
Epoch: 005/100 | Train error: 0.6979 | Validation error: 0.7257 
Time elapsed: 0.89 min
Epoch: 006/100 | Train loss: 1.7926 | Validation loss: 1.8029 
Epoch: 006/100 | Train error: 0.7040 | Validation error: 0.7153 
Time elapsed: 0.89 min
Epoch: 007/100 | Train loss: 1.7714 | Validation loss: 1.7859 
Epoch: 007/100 | Train error: 0

In [10]:
##------ evaluating ----- ####
# note: eight classes to classify

# Evaluate the trained model on the held-out data and keep the two returned
# numbers under names; the pair is also the value displayed by this cell.
test_loss, test_error = evaluator_multiclass(
    cnn1d_model, criterion, test_data, batch_size=128
)
test_loss, test_error
# error rate 0.534, compared with MLP error rate 0.528

Testing Loss:  1.7354156856716285
Testing Error:  0.5347222222222222


(1.7354156856716285, 0.5347222222222222)