## import libraries ##

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import Sequence
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay



## define spectrogram ##

In [None]:
def wav_to_spectrogram(wav_file, xdim=180, ydim=128):
    """Convert a .wav file into a fixed-size, 3-channel log-mel spectrogram.

    Args:
        wav_file: Path to the audio file (``str`` or ``os.PathLike``). The
            extension must be ``.wav`` (case-insensitive).
        xdim: Number of time frames in the output (axis 1). Shorter clips are
            padded at the end, longer clips are truncated.
        ydim: Number of mel bands requested from librosa (axis 0).

    Returns:
        ``numpy.ndarray`` of shape ``(ydim, xdim, 3)`` with dB values measured
        relative to the clip's peak power; the single spectrogram channel is
        repeated three times so it can be fed to an RGB image model.

    Raises:
        ValueError: If ``wav_file`` does not have a ``.wav`` extension.
    """
    # Normalise to a plain path string so pathlib.Path inputs work too.
    wav_path = os.fspath(wav_file)
    if not wav_path.lower().endswith('.wav'):
        raise ValueError(f"Expected .wav file, but got: {wav_file}")

    # Load the audio at its native sampling rate.
    audio, sr = librosa.load(wav_path, sr=None)

    # Mel spectrogram with n_mels=ydim, so the result already has ydim rows
    # and only the time axis needs to be resized afterwards.
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=ydim, fmax=8000)
    spectrogram = librosa.power_to_db(spectrogram, ref=np.max)

    # Pad/truncate the time axis to exactly xdim frames. With ref=np.max the
    # value 0 dB is the *loudest* point of the clip, so padding must use the
    # quietest observed level instead of 0 to represent silence.
    pad_value = spectrogram.min() if spectrogram.size else 0.0
    spectrogram = librosa.util.fix_length(
        spectrogram, size=xdim, axis=1, constant_values=pad_value
    )

    # Expand 2D -> 3D by repeating the channel three times (pseudo-RGB).
    spectrogram = np.repeat(spectrogram[..., np.newaxis], 3, axis=-1)

    return spectrogram


## define dataset class ##

In [None]:
class SpectrogramDataset(Sequence):
    """Keras ``Sequence`` that lazily turns audio files into spectrogram batches.

    Each item is a tuple ``(spectrograms, labels)``: ``spectrograms`` stacks the
    result of ``wav_to_spectrogram`` for every file in the batch and ``labels``
    holds the matching entries of ``labels``.
    """

    def __init__(self, file_paths, labels, xdim=180, ydim=180, batch_size=32, shuffle=True, **kwargs):
        """Store the dataset description and prepare the first epoch order.

        Args:
            file_paths: Sequence of audio file paths.
            labels: Sequence of labels, one per entry of ``file_paths``.
            xdim: Forwarded to ``wav_to_spectrogram`` (time frames).
            ydim: Forwarded to ``wav_to_spectrogram`` (mel bands).
            batch_size: Maximum number of samples per batch.
            shuffle: If true, the sample order is reshuffled on construction
                and every time ``on_epoch_end`` is called.
            **kwargs: Forwarded to the ``Sequence`` base-class constructor.

        Raises:
            ValueError: If ``file_paths`` and ``labels`` differ in length or
                ``batch_size`` is not positive.
        """
        # Let the Keras base class initialise its own state.
        super().__init__(**kwargs)

        if len(file_paths) != len(labels):
            raise ValueError(
                f"file_paths and labels must have the same length, "
                f"got {len(file_paths)} and {len(labels)}"
            )
        if batch_size <= 0:
            raise ValueError(f"batch_size must be positive, got {batch_size}")

        self.file_paths = list(file_paths)
        self.labels = list(labels)
        self.xdim = xdim
        self.ydim = ydim
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Sample order for the current epoch; shuffling permutes these indices
        # and leaves file_paths/labels themselves untouched.
        self.indices = np.arange(len(self.file_paths))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        # ceil (not floor) so the trailing samples are not silently dropped.
        return int(np.ceil(len(self.file_paths) / self.batch_size))

    def __getitem__(self, index):
        """Build batch number ``index``.

        Raises:
            IndexError: If ``index`` is outside ``range(len(self))``; this also
                lets plain ``for`` iteration over the dataset terminate.
        """
        if not 0 <= index < len(self):
            raise IndexError(f"batch index {index} out of range for {len(self)} batches")

        start = index * self.batch_size
        batch_indices = self.indices[start:start + self.batch_size]

        batch_spectrograms = np.array([
            wav_to_spectrogram(str(self.file_paths[i]), self.xdim, self.ydim)
            for i in batch_indices
        ])
        batch_labels = np.array([self.labels[i] for i in batch_indices])

        return batch_spectrograms, batch_labels

    def on_epoch_end(self):
        """Reshuffle the sample order when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.indices)

## data loading ##

In [None]:

# Root folder of the RAVDESS data; each sub-directory is scanned for .wav files.
Ravdess = Path("../data/RAVDESS_Data")
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/validation split) reproducible.
ravdess_directory_list = sorted(os.listdir(Ravdess))

file_emotion = []
file_path = []

for dir_name in ravdess_directory_list:
    actor_dir = Ravdess / dir_name
    if not os.path.isdir(actor_dir):
        continue
    # Enumerate the .wav files of this directory.
    for file in sorted(os.listdir(actor_dir)):
        if not file.endswith(".wav"):
            continue
        # The third dash-separated field of the file stem is the emotion ID.
        try:
            emotion_id = int(file.split('.')[0].split('-')[2])
        except (IndexError, ValueError):
            # Report and skip files that do not follow the naming scheme
            # instead of aborting the whole scan.
            print(f"Skipping file with unexpected name: {actor_dir / file}")
            continue
        file_emotion.append(emotion_id)
        file_path.append(actor_dir / file)

# Build the dataframe: one row per audio file (emotion ID + path).
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])
path_df = pd.DataFrame(file_path, columns=['Path'])
Ravdess_df = pd.concat([emotion_df, path_df], axis=1)



In [None]:

# Map the numeric emotion IDs to readable names.
emotion_names = {
    1: 'Neutral',
    2: 'Calm',
    3: 'Happy',
    4: 'Sad',
    5: 'Angry',
    6: 'Fear',
    7: 'Disgust',
    8: 'Surprise'
}
# Assign the result back to the column: calling replace(..., inplace=True) on
# a selected column is chained assignment and is not guaranteed to update the
# dataframe (it does not under pandas Copy-on-Write).
Ravdess_df['Emotions'] = Ravdess_df['Emotions'].replace(emotion_names)


In [None]:
# Encode the emotion names as integer class ids.
label_encoder = LabelEncoder()
label_encoder.fit(Ravdess_df["Emotions"])
Ravdess_df["Label"] = label_encoder.transform(Ravdess_df["Emotions"])

# Stratified 80/20 split with a fixed seed, so both subsets keep the class balance.
split_options = dict(
    test_size=0.2,
    stratify=Ravdess_df["Label"],
    random_state=42,
)
train_df, val_df = train_test_split(Ravdess_df, **split_options)

# Class names in encoder order: index i is the name of label id i.
category_labels = label_encoder.classes_
num_classes = len(category_labels)

print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("category_labels:", category_labels)


In [None]:
# Settings shared by the training and validation generators.
common_dataset_args = dict(xdim=180, ydim=180, batch_size=16)

# Training batches are reshuffled between epochs.
train_dataset = SpectrogramDataset(
    train_df["Path"].tolist(),
    train_df["Label"].tolist(),
    shuffle=True,
    **common_dataset_args,
)

# Validation batches keep a fixed order.
val_dataset = SpectrogramDataset(
    val_df["Path"].tolist(),
    val_df["Label"].tolist(),
    shuffle=False,
    **common_dataset_args,
)


## define model architecture ## 

In [None]:
# Transfer-learning model: ImageNet-pretrained VGG16 convolutional base
# followed by a small dense classification head.
conv_base = keras.applications.vgg16.VGG16(
    weights="imagenet",
    include_top=False,
    input_shape=(180, 180, 3)
)

# Keep the base frozen for the first training phase.
conv_base.trainable = False

# Build the head on top of the frozen base.
inputs = keras.Input(shape=(180, 180, 3))
# wav_to_spectrogram produces dB values from power_to_db(ref=np.max), which
# (with librosa's default top_db of 80) lie in [-80, 0]. VGG16's
# preprocess_input expects pixel values in [0, 255], so map the dB range onto
# that interval first: -80 dB -> 0, 0 dB -> 255.
x = layers.Rescaling(scale=255.0 / 80.0, offset=255.0)(inputs)
x = keras.applications.vgg16.preprocess_input(x)
x = conv_base(x)
x = layers.Flatten()(x)
x = layers.Dense(256, activation="relu")(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs, outputs)
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"]
)

model.summary()

## train the model ## 

In [None]:
# Both phase-1 callbacks watch the same validation metric.
monitored_metric = 'val_accuracy'

# Stop after 5 epochs without improvement and roll back to the best weights.
early_stop = EarlyStopping(
    restore_best_weights=True,
    patience=5,
    monitor=monitored_metric,
    verbose=1,
)

# Write a checkpoint only when the monitored metric improves.
checkpoint = ModelCheckpoint(
    filepath='best_model_phase1.h5',
    save_best_only=True,
    monitor=monitored_metric,
    verbose=1,
)

In [None]:
# Phase 1: train only the dense head (the convolutional base is frozen).
phase1_callbacks = [early_stop, checkpoint]
history1 = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=30,
    callbacks=phase1_callbacks,
    verbose=1,
)

# Accuracy curves for phase 1.
phase1_fig, phase1_ax = plt.subplots(figsize=(8, 4))
phase1_curves = (
    ("accuracy", "Train Acc (Phase 1)"),
    ("val_accuracy", "Val Acc (Phase 1)"),
)
for curve_key, curve_label in phase1_curves:
    phase1_ax.plot(history1.history[curve_key], label=curve_label)
phase1_ax.legend()
plt.show()


## fine tune model ##

In [None]:
# Phase 2: unfreeze the convolutional base, then re-freeze everything except
# its last four layers so only those are fine-tuned.
conv_base.trainable = True
layers_kept_frozen = conv_base.layers[:-4]
for frozen_layer in layers_kept_frozen:
    frozen_layer.trainable = False

# Recompile so the new trainable flags take effect; use a small learning rate.
fine_tune_optimizer = keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    optimizer=fine_tune_optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Phase-2 checkpoints are selected on validation loss.
checkpoint_phase2 = ModelCheckpoint(
    filepath='best_model_phase2.h5',
    save_best_only=True,
    monitor='val_loss',
    verbose=1,
)

phase2_callbacks = [early_stop, checkpoint_phase2]
history2 = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
    callbacks=phase2_callbacks,
    verbose=1,
)

# Accuracy curves for phase 2.
phase2_fig, phase2_ax = plt.subplots(figsize=(8, 4))
phase2_curves = (
    ("accuracy", "Train Acc (Phase 2)"),
    ("val_accuracy", "Val Acc (Phase 2)"),
)
for curve_key, curve_label in phase2_curves:
    phase2_ax.plot(history2.history[curve_key], label=curve_label)
phase2_ax.legend()
plt.show()

In [None]:
# Final loss/accuracy of the current model on the validation generator.
print("=== Evaluate on val_dataset ===")
val_metrics = model.evaluate(val_dataset, verbose=1)
val_loss, val_acc = val_metrics
print(f"Final Validation Loss = {val_loss:.4f}")
print(f"Final Validation Acc  = {val_acc:.4f}")

In [None]:
# Collect predictions and ground-truth labels batch by batch.
all_preds = []
all_labels = []
# Index the dataset explicitly instead of iterating over it, so the loop is
# bounded by len(val_dataset) and does not depend on the Sequence base class
# providing __iter__.
for batch_index in range(len(val_dataset)):
    X_batch, y_batch = val_dataset[batch_index]
    # verbose=0: avoid printing one progress bar per batch.
    preds = model.predict(X_batch, verbose=0)
    all_preds.extend(np.argmax(preds, axis=1))
    all_labels.extend(y_batch)

# Pin the label ids so rows/columns always line up with category_labels, even
# if a class does not occur in the evaluated samples.
class_ids = np.arange(len(category_labels))

print("Classification Report:")
print(classification_report(all_labels, all_preds, labels=class_ids, target_names=category_labels))

cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
disp = ConfusionMatrixDisplay(cm, display_labels=category_labels)
disp.plot(xticks_rotation='vertical')
plt.show()

In [None]:
# Persist the fine-tuned model to disk.
saved_model_file = "vgg16_model.h5"
model.save(saved_model_file)
print("Model saved to vgg16_model.h5")

In [None]:
import time
import numpy as np
from tensorflow.keras.models import load_model

# 1) Reload the trained model from disk.
deploy_model = load_model("vgg16_model.h5")
print("Loaded model from vgg16_model.h5")

# 2) Pick one .wav file to test. The first validation file is used so the cell
#    runs as-is; replace it with the path of any other .wav file if desired
#    (wav_to_spectrogram rejects non-.wav inputs).
test_wav = str(val_df["Path"].iloc[0])

# 3) Record the start time (perf_counter is a monotonic clock meant for
#    measuring intervals).
start_time = time.perf_counter()

# 4) Load the file and convert it to a spectrogram.
spectrogram = wav_to_spectrogram(test_wav, xdim=180, ydim=180)

# 5) Add the batch dimension expected by the model (batch_size=1).
spectrogram_input = np.expand_dims(spectrogram, axis=0)  # shape (1, 180, 180, 3)

# 6) Run the prediction.
pred_prob = deploy_model.predict(spectrogram_input)  # shape (1, num_classes)
pred_label_idx = np.argmax(pred_prob, axis=1)[0]
pred_label_name = category_labels[pred_label_idx]

# 7) Record the end time.
end_time = time.perf_counter()
elapsed_time = end_time - start_time

# 8) Print the result.
print(f"Test file: {test_wav}")
print(f"Predicted label index = {pred_label_idx}")
print(f"Predicted label name = {pred_label_name}")
print(f"Elapsed time: {elapsed_time:.4f} seconds")