In [74]:
import tensorflow as tf
import librosa
import numpy as np
import time
from tensorflow.keras.layers import Layer, Conv1D, Softmax
from keras.saving import register_keras_serializable

In [75]:
AUDIO_PATH = "../data/Crema_Data/1001_ITH_SAD_XX.wav"
TARGET_SR = 16000   
MAX_SAMPLES = 48000 

In [76]:
def preprocess_for_wavenet(file_path):
    y, sr = librosa.load(file_path, sr=TARGET_SR)

    y = np.asarray(y, dtype=np.float32)

    if len(y) > MAX_SAMPLES:
        y = y[:MAX_SAMPLES]
    else:
        y = np.pad(y, (0, MAX_SAMPLES - len(y)), mode='constant')

    return y.reshape(1, MAX_SAMPLES, 1)

In [77]:
# ===== 1Ô∏è‚É£ ƒê·ªãnh Nghƒ©a Custom Layer Cho WaveNet =====
@register_keras_serializable()
class AttentionPooling(Layer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.score_conv = Conv1D(1, 1, padding='same', name="attn_score_conv")
        self.softmax    = Softmax(axis=1, name="attn_weights")

    def call(self, inputs):
        score   = self.score_conv(inputs)
        weights = self.softmax(score)
        return tf.reduce_sum(weights * inputs, axis=1)

In [78]:
def preprocess_for_vgg16(file_path):
    y, sr = librosa.load(file_path, sr=TARGET_SR)
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
    S_dB = librosa.power_to_db(S, ref=np.max)

    # Resize v·ªÅ ƒë√∫ng input VGG16 (gi·∫£ ƒë·ªãnh 224x224x3)
    S_resized = tf.image.resize(S_dB[..., np.newaxis], (224,224)).numpy()
    S_rgb = np.repeat(S_resized, 3, axis=-1)  # Chuy·ªÉn grayscale -> RGB
    return np.expand_dims(S_rgb, axis=0)  # (batch, height, width, channels)

In [79]:
model_files = {
    "WaveNet": {
        "path": "wavenet_ser_model.keras",
        "preprocess": preprocess_for_wavenet
    },
    "VGG16": {
        "path": "vgg16_model.keras",
        "preprocess": preprocess_for_vgg16
    }
}

In [80]:
results = {}

for name, config in model_files.items():
    print(f"\n‚è≥ Predict v·ªõi m√¥ h√¨nh: {name}")

    # Load Model
    if name == "WaveNet":
        model = tf.keras.models.load_model(
            config["path"],
            custom_objects={"AttentionPooling": AttentionPooling},
            compile=False
        )
    else:  # VGG16
        model = tf.keras.models.load_model(config["path"], compile=False)

    # X·ª≠ l√Ω input
    audio_input = config["preprocess"](AUDIO_PATH)

    # ƒêo th·ªùi gian
    start_time = time.time()
    pred = model.predict(audio_input, verbose=0)
    end_time = time.time()

    predict_time = end_time - start_time
    predicted_class = np.argmax(pred)

    print(f"‚û°Ô∏è Th·ªùi gian predict: {predict_time:.4f} gi√¢y | D·ª± ƒëo√°n l·ªõp: {predicted_class}")
    results[name] = predict_time

# ===== 6Ô∏è‚É£ T·ªïng K·∫øt =====
print("\nüéØ T·ªïng K·∫øt Th·ªùi Gian D·ª± ƒêo√°n:")
for model_name, t in results.items():
    print(f"{model_name}: {t:.4f} gi√¢y")


‚è≥ Predict v·ªõi m√¥ h√¨nh: WaveNet
‚û°Ô∏è Th·ªùi gian predict: 1.3268 gi√¢y | D·ª± ƒëo√°n l·ªõp: 3

‚è≥ Predict v·ªõi m√¥ h√¨nh: VGG16


ValueError: Input 0 of layer "functional_6" is incompatible with the layer: expected shape=(None, 128, 180, 3), found shape=(1, 224, 224, 3)