### An example of RNN on RAVDESS data (numpy implementation)
Wei Li

In [1]:
## RAVDESS Emotional speech audio
## https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
## Speech audio-only files (16bit, 48kHz .wav) from the RAVDESS.
##
## This portion of the RAVDESS contains 1440 files: 60 trials per actor x 24 actors = 1440.
## The RAVDESS contains 24 professional actors (12 female, 12 male),
## vocalizing two lexically-matched statements in a neutral North American accent.
## Speech emotions include calm, happy, sad, angry, fearful, surprise, and disgust expressions.
## Each expression is produced at two levels of emotional intensity (normal, strong), with an additional neutral expression.

# The filename consists of a 7-part numerical identifier (e.g., 03-01-06-01-02-01-12.wav). These identifiers define the stimulus characteristics:

# Filename identifiers:
# Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
# Vocal channel (01 = speech, 02 = song).
# Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
# Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
# Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
# Repetition (01 = 1st repetition, 02 = 2nd repetition).
# Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).
# Filename example: 03-01-06-01-02-01-12.wav

# Emotions in the RAVDESS dataset
# emotions = {
#     "01": "neutral",
#     "02": "calm",
#     "03": "happy",
#     "04": "sad",
#     "05": "angry",
#     "06": "fearful",
#     "07": "disgust",
#     "08": "surprised",
# }

In [2]:
import os
import sys
import soundfile
import librosa

# SoundFile is not available for a conda install.
# pip install SoundFile

# If error occurs: ".../envs/py38torch/lib/libffi.7.dylib' (no such file)"
# try the following:
# cd /Users/wli169/miniconda3/envs/py38torch/lib/
# ln -s libffi.6.dylib libffi.7.dylib
# This command creates a symbolic link named libffi.7.dylib
# that points to the file libffi.6.dylib.

sys.path
# If running the .py script, uncomment the next line
# sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
# add parent directory: adds the parent directory of the module requiring it (__file__)
# to the beginning of the module search path.

# Change the working directory to the parent folder.
# A bare os.chdir("..") climbs one more level every time this cell is re-run.
# The directory the kernel started in is therefore remembered on the first run,
# and every run moves to *its* parent, which makes the cell safe to re-execute.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))


import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from utils import get_data_utils
from utils import data_processor
from models.rnn_classifier import *
from nn.modules.loss import *
from nn.modules.activation import *
from nn.modules.linear import *
from nn.modules.dropout import *
from nn.modules.initializer import *
from optim.adam_rnn import *
from optim.adam import *
from evaluation.multiclass_eval import *

import random

# One seed for every source of randomness touched here: NumPy's global
# generator, Python's `random` module, and the PL_GLOBAL_SEED environment
# variable (nothing in this notebook reads it; presumably kept for tools that do).
random_seed = 123
np.random.seed(random_seed)
random.seed(random_seed)
os.environ.update(PL_GLOBAL_SEED=str(random_seed))

In [3]:
# %pip install watermark
# Record the environment this notebook was run in: author (-a), last-updated
# date and time (-u -d -t), Python/IPython versions (-v) and the versions of
# the packages listed after -p.
%load_ext watermark
%watermark -a "Wei Li" -u -t -d -v -p numpy,torch,soundfile,librosa

Author: Wei Li

Last updated: 2023-12-28 21:27:52

Python implementation: CPython
Python version       : 3.8.17
IPython version      : 8.12.2

numpy    : 1.21.5
torch    : 1.12.1
soundfile: 0.12.1
librosa  : 0.10.1



In [4]:
# Emotion classes kept for the experiment, in RAVDESS filename-code order
# (01 = neutral ... 08 = surprised). All eight are used here; a smaller
# subset can be selected instead, e.g.
# emotions_labels = ["calm", "happy", "fearful", "disgust"]
emotions_labels = "neutral calm happy sad angry fearful disgust surprised".split()

###############################################
#### -----  check on one example -------- ####
###############################################

example, sr = librosa.load(
    "/Users/wli169/Documents/Work/datasets/RAVDESS-speech/Actor_01/03-01-01-01-01-01-01.wav",
    sr=48000,  # the files are 48 kHz; without this librosa resamples to its default sr=22050
)
# WL: an alternative way to load is soundfile.SoundFile(); however, some files
# cannot be loaded properly that way (e.g. /Actor_20/03-01-06-01-01-02-20.wav),
# as their duration gets truncated.

librosa.get_duration(y=example, sr=sr)

type(example), example.shape
# np.ndarray of shape (158558,): about 3.3 s at 48000 samples/s, mono (1 channel)

librosa.feature.mfcc(
    y=example, sr=sr, n_mfcc=10
).shape  # (n_mfcc, num_frames) = (10, 310); note that librosa's default is n_mfcc=20

# chroma_stft takes a real-valued spectrogram through S, so use the magnitude of
# the complex STFT (this is also what extract_feature2 does).
stft = np.abs(librosa.stft(example))
librosa.feature.chroma_stft(S=stft, sr=sr, n_chroma=12).shape  # default n_chroma=12

librosa.feature.melspectrogram(
    y=example, sr=sr, n_mels=128
).shape  # (n_mels, num_frames); default n_mels=128

(128, 310)

In [5]:
# Magnitude spectrogram: chroma_stft takes a real-valued spectrogram through S.
# Passing the raw complex STFT would make ch1 (and hence `result`) complex.
stft = np.abs(librosa.stft(example))
ch1 = librosa.feature.chroma_stft(S=stft, sr=48000, n_chroma=12)  # (n_chroma, num_frames)

mc1 = librosa.feature.mfcc(y=example, sr=48000, n_mfcc=10)  # (n_mfcc, num_frames)

ch1.shape, mc1.shape

# Both matrices are laid out as (n_features, num_frames) here, so stacking them
# row-wise concatenates the features of each frame.
result = np.vstack((mc1, ch1))
result.shape

(22, 310)

In [6]:
######################################
#### -----  load data -------- ####
### We set chroma=False, mel=False
######################################

# This function extracts audio features from a sound file
def extract_feature2(file_name, mfcc, chroma, mel):
    """Extract frame-level audio features from a sound file.

    The file is loaded with librosa at 48 kHz as float32. Every requested
    feature type is computed per frame, transposed to (num_frames, n_features)
    and the selected matrices are concatenated along the feature axis in the
    order MFCC, chroma, mel.

    Args:
        file_name: path of the audio file to read.
        mfcc: if truthy, include 10 MFCCs per frame.
        chroma: if truthy, include 12 chroma bins per frame, computed from the
            magnitude of the short-time Fourier transform.
        mel: if truthy, include 128 mel-spectrogram bands per frame.

    Returns:
        np.ndarray of shape (num_frames, n_mfcc + n_chroma + n_mel), where only
        the requested feature types contribute columns.

    Raises:
        ValueError: if none of `mfcc`, `chroma` and `mel` is requested.
    """
    # Read the samples into X; sample_rate echoes the requested 48000.
    X, sample_rate = librosa.load(file_name, sr=48000, dtype="float32")

    # Each entry is one feature matrix of shape (num_frames, n_features_i).
    feature_blocks = []
    if mfcc:
        feature_blocks.append(
            librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=10).T
        )
    if chroma:
        # The STFT is only needed for the chroma features.
        stft = np.abs(librosa.stft(X))
        feature_blocks.append(
            librosa.feature.chroma_stft(S=stft, sr=sample_rate, n_chroma=12).T
        )
    if mel:
        feature_blocks.append(
            librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=128).T
        )

    if not feature_blocks:
        raise ValueError("at least one of mfcc, chroma, mel must be True")

    # The matrices share the frame axis (rows) and differ in their number of
    # columns, so they have to be joined horizontally; a vertical stack would
    # require equal column counts and would mix frames with features.
    return np.hstack(feature_blocks)

### MFCC features only for this run: chroma and mel are switched off.

example_feature = extract_feature2(
    file_name="/Users/wli169/Documents/Work/datasets/RAVDESS-speech/Actor_01/03-01-01-01-01-01-01.wav",
    mfcc=True, chroma=False, mel=False,
)
example_feature.shape  # (num_frames, num_mfccs)


(310, 10)

In [7]:
# Build the numpy dataset: one feature matrix per recording, MFCC features only
# (chroma_T and mel_T are off), with max_frames=310 matching the frame count of
# the example above.
x_dat, y_dat = get_data_utils.get_np_ravdess2(
    emotions_labels,
    extract_feature2,
    max_frames=310,
    chroma_T=False,
    mel_T=False,
)
x_dat.shape, y_dat.shape

# 80/20 split. stratify=y_dat keeps the class proportions roughly the same in
# both parts. The dataset is small, so the held-out part doubles as validation
# and test data.
x_train, x_test, y_train, y_test = train_test_split(
    x_dat,
    y_dat,
    test_size=0.2,
    stratify=y_dat,
    random_state=2023,
)
x_train.shape, x_test.shape

# Shapes of the two splits: inputs are (num_examples, num_frames, num_features)
print(tuple(arr.shape for arr in (x_train, x_test)))
print(tuple(arr.shape for arr in (y_train, y_test)))

((1152, 310, 10), (288, 310, 10))
((1152,), (288,))


In [8]:
######################################
######## ----- RNN  ----- ##########
######################################

# NOTE(review): this cell rebinds y_train / y_test / x_train / x_test to their
# processed versions, so it must be run exactly once after the split cell;
# running it again would pass the one-hot targets back into label_to_num and
# normalize already-normalized inputs.

# process data targets: emotion labels -> integer class ids
# (presumably the position of each label in emotions_labels -- verify in data_processor)
y_train = data_processor.label_to_num(y_train, emotions_labels)
y_test = data_processor.label_to_num(y_test, emotions_labels)

np.bincount(y_train), np.bincount(y_test)
# each category is roughly equally represented in training and test data
# (not displayed: only the last expression of a cell is shown)

# integer class ids -> one-hot rows with len(emotions_labels) columns
y_train = data_processor.to_onehot(y_train.reshape(-1, 1), len(emotions_labels))
y_test = data_processor.to_onehot(y_test.reshape(-1, 1), len(emotions_labels))

# normalize data input
dat_norm = data_processor.input_normalizer2()
x_train = dat_norm.fit_transform(x_train, method="column")
# normalize using mean and std column-wise
dat_norm.mu.shape  # vector of means (1, 1, 10)
dat_norm.std.shape  # vector of stds (1, 1, 10)

# the test inputs go through the normalizer fitted on the training inputs
# (presumably reusing the training mean/std -- confirm in input_normalizer2)
x_test = dat_norm.transform(x_test)

# train, validation, test data
# since we have a small dataset, we just use validation data as test data
train_data = [x_train, y_train]
test_data = [x_test, y_test]

# check shapes
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((1152, 310, 10), (1152, 8), (288, 310, 10), (288, 8))

In [9]:
##### ---- user-defined RNN model: configuration and construction ---- ######

# Unrolled view of the network for two stacked rnn layers (layer0, layer1);
# the Linear layer on top produces the logits:
#
#                       (output :logits)
# layer 2                    Linear
# layer 1    RNN  --- RNN --- RNN
# layer 0    RNN  --- RNN --- RNN
#            time0---time1---time2
#
# The rnn consumes x (np.array) of shape (batch_size, seq_len, input_size).

# architecture
rnn_layers = 2  # how many recurrent layers are stacked
hidden_size = 32
input_size = x_train.shape[2]  # features per frame
output_size = len(emotions_labels)  # one logit per emotion class

# the same initializer is used for the recurrent and the linear weights
rnn_weight_init_fn = linear_weight_init_fn = weight_init_He

# My model
my_rnn_model = RNNClassifier(input_size, hidden_size, output_size, num_layers=rnn_layers)

In [10]:
type(my_rnn_model)

# utility: print the architecture of the model
my_rnn_model.print_structure()

# one parameter dict per layer: the rnn layers first, the Linear layer last
model_paras_list = my_rnn_model.get_parameters()
print(len(model_paras_list))

# (re-)initialize the weight matrices W; the other parameters are left as they are
for rnn_paras in model_paras_list[:-1]:
    # recurrent layer: input-to-hidden first, then hidden-to-hidden
    for weight_name in ("W_ih", "W_hh"):
        rnn_paras[weight_name].data = rnn_weight_init_fn(rnn_paras[weight_name].data)

# output (Linear) layer
linear_paras = model_paras_list[-1]
linear_paras["W"].data = linear_weight_init_fn(linear_paras["W"].data)

-----------------------
The model architecture:
layer0:
	sublayer0: <nn.modules.rnn_cell.RNNCell object at 0x16954cdf0>
layer1:
	sublayer0: <nn.modules.rnn_cell.RNNCell object at 0x1691ffd90>
layer2:
	sublayer0: <nn.modules.linear.Linear object at 0x16954cd00>

---------------------------------
layers with learnable parameters:
layer0 
 (0)RNNCell
(32, 10)
(32, 32)
(32,)
(32,)

layer1 
 (0)RNNCell
(32, 32)
(32, 32)
(32,)
(32,)

layer2 
 (0)linear
(8, 32)
(8, 1)

layer0 
 (0)RNNCell
(32, 10)
(32, 32)
(32,)
(32,)

layer1 
 (0)RNNCell
(32, 32)
(32, 32)
(32,)
(32,)

layer2 
 (0)linear
(8, 32)
(8, 1)

3


In [11]:
####################
#### training  #####
####################

# multiclass cross-entropy criterion
criterion = CrossEntropyLoss()

# Alternative: CrossEntropyLoss2() returns the loss as an np.ndarray of shape
# (batch size,), i.e. one loss value for each observation in the batch.
# criterion = CrossEntropyLoss2()

# learning rates: lr1 for the rnn layers, lr2 for the output layer
lr1 = lr2 = 1e-3

# the first `rnn_layers` parameter dicts belong to the rnn layers ...
optimizer1 = Adam_rnn(model_paras_list[:rnn_layers], lr=lr1)
# ... and the one after them to the output layer; it is wrapped in a list
# so that Adam receives a list of parameter dicts as well
optimizer2 = Adam([model_paras_list[rnn_layers]], lr=lr2)

optimizers = [optimizer1, optimizer2]

num_epochs, batch_size = 200, 128

output = trainer_multiclass(
    my_rnn_model, optimizers, criterion,
    train_data, test_data,
    num_epochs, batch_size,
    print_all=False,
)

# per-epoch histories returned by the trainer
(
    training_losses,
    training_errors,
    validation_losses,
    validation_errors,
) = output

Epoch: 001/200 | Train loss: 2.3488 | Validation loss: 2.1331 
Epoch: 001/200 | Train error: 0.8759 | Validation error: 0.8472 
Time elapsed: 0.01 min
Epoch: 002/200 | Train loss: 2.1153 | Validation loss: 2.0181 
Epoch: 002/200 | Train error: 0.8203 | Validation error: 0.8021 
Time elapsed: 0.01 min
Epoch: 003/200 | Train loss: 2.0317 | Validation loss: 1.9643 
Epoch: 003/200 | Train error: 0.8056 | Validation error: 0.7674 
Time elapsed: 0.01 min
Epoch: 004/200 | Train loss: 1.9835 | Validation loss: 1.9459 
Epoch: 004/200 | Train error: 0.7700 | Validation error: 0.7535 
Time elapsed: 0.01 min
Epoch: 005/200 | Train loss: 1.9464 | Validation loss: 1.9267 
Epoch: 005/200 | Train error: 0.7457 | Validation error: 0.7569 
Time elapsed: 0.01 min
Epoch: 006/200 | Train loss: 1.9148 | Validation loss: 1.9136 
Epoch: 006/200 | Train error: 0.7448 | Validation error: 0.7431 
Time elapsed: 0.01 min
Epoch: 007/200 | Train loss: 1.8891 | Validation loss: 1.8956 
Epoch: 007/200 | Train error: 0

In [12]:
##------ evaluating ----- ####
# note: eight classes to classify

# evaluate on the held-out data (the same split that served as validation data
# during training); the call returns (loss, error rate)
evaluator_multiclass(my_rnn_model, criterion, test_data, batch_size=256)
# error rate 0.684: not impressive (uniform guessing over eight classes would
# give about 0.875), yet it demonstrates that the numpy implementation of the
# simple RNN module works as intended
# (check the counterpart performance using PyTorch modules)

Testing Loss:  1.8741259876555771
Testing Error:  0.6840277777777778


(1.8741259876555771, 0.6840277777777778)