In [1]:
import os
import numpy as np
# Compatibility shim: NumPy 1.24 removed the long-deprecated `np.complex`
# alias. Re-create it when it is missing so that code still referencing
# `np.complex` keeps working (presumably an older librosa release -- TODO
# confirm, and drop this shim once that dependency is upgraded).
import numpy as _np
if not hasattr(_np, 'complex'):
    _np.complex = complex

import librosa
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, Conv1D, ZeroPadding1D,
    Activation, Multiply, Add,
    Softmax, Layer
)
from tensorflow.keras.models import Model


2025-04-25 18:44:47.255386: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745606689.306057   20650 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745606689.842916   20650 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-25 18:44:55.231173: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:

# Audio front-end: every clip is loaded at SAMPLE_RATE and then padded or cut
# to exactly MAX_SAMPLES samples by the dataset loaders.
SAMPLE_RATE = 16_000   # Hz
MAX_DURATION = 3.0     # seconds kept per clip
MAX_SAMPLES = int(MAX_DURATION * SAMPLE_RATE)

# Training hyper-parameters.
BATCH_SIZE = 8
EPOCHS = 50


# Load data

In [3]:
# Emotion classes kept for training; the dataset loaders skip every clip
# whose label is not in this list.
TARGET_EMOTIONS = [
    "neutral",
    "sad",
    "happy",
    "angry",
]

In [4]:
# Emotion code -> label. load_ravdess() looks up the third dash-separated
# field of each file name in this table.
EMO_MAP = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised"
}

def load_ravdess(path):
    """Load the RAVDESS clips whose emotion is listed in ``TARGET_EMOTIONS``.

    Every ``Actor_*/*.wav`` file under ``path`` is loaded at ``SAMPLE_RATE``,
    trimmed with ``librosa.effects.trim(top_db=20)`` and then zero-padded at
    the end or truncated to exactly ``MAX_SAMPLES`` samples.

    Args:
        path: Root directory that contains the ``Actor_*`` sub-directories.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_clips, MAX_SAMPLES)`` and ``y``
        is a 1-D array of label strings. When nothing matches, ``X`` is an
        empty ``(0, MAX_SAMPLES)`` array so it can still be concatenated with
        the output of another loader.
    """
    X, y = [], []
    # glob() yields paths in arbitrary, filesystem-dependent order. Sorting
    # fixes the sample order so the seeded shuffle/split that follows is
    # reproducible from one machine to the next.
    for fp in sorted(glob(os.path.join(path, "Actor_*", "*.wav"))):
        fields = os.path.basename(fp).split('-')
        if len(fields) < 3:
            # No emotion field in the name: skip the file instead of
            # crashing the whole load with an IndexError.
            continue
        label = EMO_MAP.get(fields[2])
        if label not in TARGET_EMOTIONS:
            continue
        sig, _ = librosa.load(fp, sr=SAMPLE_RATE)
        sig, _ = librosa.effects.trim(sig, top_db=20)
        # Force a fixed length: cut long clips, zero-pad short ones.
        sig = sig[:MAX_SAMPLES]
        if len(sig) < MAX_SAMPLES:
            sig = np.pad(sig, (0, MAX_SAMPLES - len(sig)), 'constant')
        X.append(sig)
        y.append(label)
    if not X:
        # Keep the 2-D contract even when nothing was found.
        return np.empty((0, MAX_SAMPLES), dtype=np.float32), np.array([], dtype=str)
    return np.array(X), np.array(y)


In [None]:
# Emotion code -> label. load_crema() looks up the third underscore-separated
# field of each file name in this table.
CREMA_EMO_MAP = {
    "ANG": "angry",
    "DIS": "disgust",
    "FEA": "fearful",
    "HAP": "happy",
    "NEU": "neutral",
    "SAD": "sad"
}


def load_crema(path, limit=3500):
    """Load up to ``limit`` CREMA-D clips whose emotion is in ``TARGET_EMOTIONS``.

    Every ``*.wav`` file directly under ``path`` is loaded at ``SAMPLE_RATE``,
    trimmed with ``librosa.effects.trim(top_db=20)`` and then zero-padded at
    the end or truncated to exactly ``MAX_SAMPLES`` samples.

    Args:
        path: Directory that contains the ``.wav`` files.
        limit: Maximum number of clips to keep; only clips that pass the
            emotion filter count towards it. ``None`` removes the cap.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_clips, MAX_SAMPLES)`` and ``y``
        is a 1-D array of label strings. When nothing matches, ``X`` is an
        empty ``(0, MAX_SAMPLES)`` array so it can still be concatenated with
        the output of another loader.
    """
    X, y = [], []
    # glob() yields paths in arbitrary, filesystem-dependent order, which
    # combined with `limit` would select a different subset on every machine.
    # Sorting makes the subset deterministic (the lexicographically first
    # matching files are the ones kept).
    for fp in sorted(glob(os.path.join(path, "*.wav"))):
        if limit is not None and len(X) >= limit:
            break
        parts = os.path.basename(fp).split('_')
        if len(parts) < 3:
            # No emotion field in the name: skip the file instead of
            # crashing the whole load with an IndexError.
            continue
        label = CREMA_EMO_MAP.get(parts[2])
        if label not in TARGET_EMOTIONS:
            continue
        sig, _ = librosa.load(fp, sr=SAMPLE_RATE)
        sig, _ = librosa.effects.trim(sig, top_db=20)
        # Force a fixed length: cut long clips, zero-pad short ones.
        sig = sig[:MAX_SAMPLES]
        if len(sig) < MAX_SAMPLES:
            sig = np.pad(sig, (0, MAX_SAMPLES - len(sig)), 'constant')
        X.append(sig)
        y.append(label)
    if not X:
        # Keep the 2-D contract even when nothing was found.
        return np.empty((0, MAX_SAMPLES), dtype=np.float32), np.array([], dtype=str)
    return np.array(X), np.array(y)



In [6]:
# Dataset roots, given relative to the notebook's working directory.
RAVDESS_PATH = '../data/RAVDESS_Data'
CREMA_PATH = '../data/CREMA_Data'

# RAVDESS: fixed-length waveforms plus their string labels.
print("Loading & trimming RAVDESS...")
X_ravdess, y_ravdess = load_ravdess(RAVDESS_PATH)
print(f"Loaded {len(X_ravdess)} samples from RAVDESS")

# CREMA-D: same preprocessing, capped by load_crema's default limit.
print("Loading & trimming CREMA-D...")
X_crema, y_crema = load_crema(CREMA_PATH)
print(f"Loaded {len(X_crema)} samples from CREMA-D")

Loading & trimming RAVDESS...
Loaded 672 samples from RAVDESS
Loading & trimming CREMA-D...
Loaded 3000 samples from CREMA-D


In [7]:

# Pool both corpora into one set of waveforms and string labels.
X_total = np.concatenate((X_ravdess, X_crema), axis=0)
y_total = np.concatenate((y_ravdess, y_crema), axis=0)
print(f"Tổng số samples sau khi gộp: {len(X_total)}")

# Mix the two datasets together (seeded).
X_total, y_total = shuffle(X_total, y_total, random_state=42)

# Encode the string labels as indicator vectors; `lb` is kept so that
# predictions can be mapped back to class names through lb.classes_.
lb = LabelBinarizer()
lb.fit(y_total)
y_onehot = lb.transform(y_total)

# 80/20 train/validation split, stratified on the string labels.
X_tr, X_va, y_tr, y_va = train_test_split(
    X_total,
    y_onehot,
    test_size=0.2,
    random_state=42,
    stratify=y_total,
)

# Conv1D expects a trailing channel axis: (samples, time) -> (samples, time, 1).
X_tr = np.expand_dims(X_tr, axis=-1)
X_va = np.expand_dims(X_va, axis=-1)

print(f"Train set: {X_tr.shape}, Validation set: {X_va.shape}")
print(f"Số lượng class: {len(lb.classes_)} - {lb.classes_}")

Tổng số samples sau khi gộp: 3672
Train set: (2937, 48000, 1), Validation set: (735, 48000, 1)
Số lượng class: 4 - ['angry' 'happy' 'neutral' 'sad']


# Model 

In [8]:
class AttentionPooling(Layer):
    """Collapse the time axis with learned softmax attention weights.

    A 1x1 convolution produces one scalar score per time step, the scores
    are normalised with a softmax along axis 1, and the input is summed
    along that axis using the resulting weights.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.score_conv = Conv1D(
            filters=1, kernel_size=1, padding='same', name="attn_score_conv"
        )
        self.softmax = Softmax(axis=1, name="attn_weights")

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs`` over axis 1."""
        attn = self.softmax(self.score_conv(inputs))  # (batch, time, 1)
        weighted = attn * inputs                       # broadcast over channels
        return tf.reduce_sum(weighted, axis=1)         # (batch, channels)

def conv1d_causal(x, filters, kernel_size, dilation_rate, name):
    """Apply a dilated Conv1D that only sees current and past time steps.

    The input is zero-padded on the left by ``(kernel_size - 1) *
    dilation_rate`` steps and then convolved with ``padding='valid'``, so
    the output has the same number of time steps as ``x``.

    Args:
        x: Input tensor fed to the padding layer.
        filters: Number of output channels.
        kernel_size: Width of the convolution kernel.
        dilation_rate: Dilation of the kernel.
        name: Name of the Conv1D layer; the padding layer is ``{name}_pad``.

    Returns:
        The output tensor of the convolution.
    """
    left_pad = dilation_rate * (kernel_size - 1)
    padded = ZeroPadding1D((left_pad, 0), name=f"{name}_pad")(x)
    conv = Conv1D(
        filters,
        kernel_size,
        dilation_rate=dilation_rate,
        padding='valid',
        name=name,
    )
    return conv(padded)

def dilation_block(x, res_ch, dil_ch, skip_ch, k_size, rate, name):
    """Build one gated, dilated residual block.

    Two causal dilated convolutions of ``x`` are passed through tanh and
    sigmoid respectively and multiplied together. The gated signal is then
    projected by two 1x1 convolutions: one yields the skip output, the other
    is added back to ``x`` to form the residual output.

    Args:
        x: Block input; it is added to the ``res_ch``-channel projection, so
            its channel count must match ``res_ch``.
        res_ch: Channels of the residual projection.
        dil_ch: Channels of the two dilated convolutions.
        skip_ch: Channels of the skip projection.
        k_size: Kernel size of the dilated convolutions.
        rate: Dilation rate of the dilated convolutions.
        name: Prefix for every layer name created in the block.

    Returns:
        ``(residual_out, skip_out)``.
    """
    # Filter branch.
    filt = conv1d_causal(x, dil_ch, k_size, rate, name=f"{name}_f")
    filt = Activation('tanh', name=f"{name}_tanh")(filt)

    # Gate branch.
    gate = conv1d_causal(x, dil_ch, k_size, rate, name=f"{name}_g")
    gate = Activation('sigmoid', name=f"{name}_sigmoid")(gate)

    gated = Multiply(name=f"{name}_gate")([filt, gate])

    # 1x1 projections of the gated signal.
    skip_out = Conv1D(skip_ch, 1, padding='same', name=f"{name}_skip")(gated)
    res_proj = Conv1D(res_ch, 1, padding='same', name=f"{name}_res")(gated)

    residual_out = Add(name=f"{name}_out")([res_proj, x])
    return residual_out, skip_out

def build_wavenet_ser(time_steps, n_classes,
                      res_ch=32, dil_ch=32, skip_ch=64,
                      k_size=3,
                      num_res_blocks=2,
                      num_dil_layers=5):
    """Build the WaveNet-style classifier that works on raw waveforms.

    The network stacks ``num_res_blocks`` groups of ``num_dil_layers`` gated
    dilation blocks (dilation 1, 2, 4, ... within each group), sums all skip
    outputs, applies two 1x1 convolutions, pools over time with
    ``AttentionPooling`` and finishes with a softmax.

    Args:
        time_steps: Number of samples per input waveform.
        n_classes: Number of output classes.
        res_ch: Channels of the residual path.
        dil_ch: Channels of the dilated convolutions.
        skip_ch: Channels of the skip path and of the first post-conv.
        k_size: Kernel size of the dilated convolutions.
        num_res_blocks: Number of dilation groups.
        num_dil_layers: Number of dilation blocks per group.

    Returns:
        An uncompiled Keras ``Model`` named ``"WaveNet_SER"`` that maps
        ``(batch, time_steps, 1)`` inputs to ``(batch, n_classes)``
        probabilities.
    """
    wave_in = Input(shape=(time_steps, 1), name="input_wave")
    hidden = Conv1D(res_ch, 1, padding='same', name="pre_conv")(wave_in)

    # Dilation doubles at every layer and restarts at 1 for each group.
    dilation_rates = [2 ** exp for exp in range(num_dil_layers)]

    skip_outputs = []
    for blk_no in range(1, num_res_blocks + 1):
        for layer_no, rate in enumerate(dilation_rates, start=1):
            hidden, skip = dilation_block(
                hidden, res_ch, dil_ch, skip_ch, k_size, rate,
                name=f"blk{blk_no}_dil{layer_no}",
            )
            skip_outputs.append(skip)

    # Classification head: only the summed skip connections are used.
    head = Add(name="skip_sum")(skip_outputs)
    head = Activation('relu', name="post_relu")(head)
    head = Conv1D(skip_ch, 1, activation='relu', name="post_conv1")(head)
    head = Conv1D(n_classes, 1, name="post_conv2")(head)

    pooled = AttentionPooling(name="attn_pool")(head)
    probs = Activation('softmax', name="softmax")(pooled)
    return Model(wave_in, probs, name="WaveNet_SER")

# Seed the Python, NumPy and TensorFlow RNGs before any layer is created so
# that weight initialisation (and the batch shuffling done later by fit) is
# repeatable across Restart & Run All. 42 matches the random_state already
# used for the data shuffle/split.
tf.keras.utils.set_random_seed(42)

# Input length is the fixed clip length; one output unit per emotion class.
model = build_wavenet_ser(
    MAX_SAMPLES, len(lb.classes_),
    res_ch=32, dil_ch=32, skip_ch=64,
    k_size=3, num_res_blocks=2,
    num_dil_layers=5
)

# Labels are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

I0000 00:00:1745606908.357738   20650 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5563 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


In [9]:
# Both callbacks watch the validation loss: the learning rate is halved
# after 2 epochs without improvement, and training stops after 5 such
# epochs, restoring the best weights seen.
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
    ),
]

history = model.fit(
    x=X_tr,
    y=y_tr,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_va, y_va),
    callbacks=callbacks,
)

# Score the final (best-restored) weights on the validation set.
loss, acc = model.evaluate(X_va, y_va, batch_size=BATCH_SIZE)
print(f"\nFinal validation accuracy: {acc*100:.2f}%")

2025-04-25 18:48:38.981426: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 563904000 exceeds 10% of free system memory.
2025-04-25 18:48:43.004271: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 563904000 exceeds 10% of free system memory.


Epoch 1/50


I0000 00:00:1745606939.500947   23319 service.cc:148] XLA service 0x7f7ee4003560 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1745606939.506948   23319 service.cc:156]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-04-25 18:49:00.293602: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1745606941.982224   23319 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1745606961.775990   23319 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 423ms/step - accuracy: 0.4186 - loss: 1.1870 - val_accuracy: 0.5524 - val_loss: 1.0183 - learning_rate: 0.0010
Epoch 2/50
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 387ms/step - accuracy: 0.5559 - loss: 1.0171 - val_accuracy: 0.5170 - val_loss: 1.0579 - learning_rate: 0.0010
Epoch 3/50
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 387ms/step - accuracy: 0.5461 - loss: 1.0031 - val_accuracy: 0.5905 - val_loss: 0.9878 - learning_rate: 0.0010
Epoch 4/50
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 386ms/step - accuracy: 0.5540 - loss: 0.9959 - val_accuracy: 0.5714 - val_loss: 1.0407 - learning_rate: 0.0010
Epoch 5/50
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 386ms/step - accuracy: 0.5554 - loss: 0.9673 - val_accuracy: 0.5646 - val_loss: 1.0793 - learning_rate: 0.0010
Epoch 6/50
[1m368/368[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m