In [28]:
# Cell 1: Import và load model
import os
import numpy as np
import librosa
import tensorflow as tf
import matplotlib.pyplot as plt
import pickle
from tensorflow.keras.models import load_model
import random

In [29]:
# Load the saved Keras model and the pickled label encoder used to map class
# indices back to command names.
# NOTE(review): pickle.load can execute arbitrary code from the file it reads;
# only load encoder files that come from a trusted training run.
model = load_model('../data/models/mel_final_model.keras')
with open('../data/models/mel_label_encoder.pkl', 'rb') as f:
    le = pickle.load(f)


In [30]:

# Cell 2: Hàm trích xuất đặc trưng
def extract_mel_spectrogram(audio_path, n_mels=128, n_fft=2048, hop_length=128,
                            target_shape=(128, 32)):
    """Turn an audio file into a fixed-size, min-max normalised log-mel image.

    Steps: load the audio, compute a power mel spectrogram, convert it to
    decibels relative to its peak, rescale to [0, 1] and resize to
    ``target_shape``.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        n_mels: Number of mel bands.
        n_fft: FFT window length in samples.
        hop_length: Hop between successive frames in samples.
        target_shape: ``(height, width)`` of the returned array. The default
            ``(128, 32)`` is the size ``predict_command`` reshapes to, so
            change it only for callers that handle another size.

    Returns:
        A 2-D numpy array of shape ``target_shape``. Values are normalised to
        [0, 1] before resizing; a spectrogram with no dynamic range (for
        example digital silence) yields all zeros.

    Raises:
        ValueError: If the file contains no audio samples.
    """
    # sr=None keeps the file's native sampling rate instead of resampling.
    y, sr = librosa.load(audio_path, sr=None)
    if y.size == 0:
        raise ValueError(f"File audio không có dữ liệu: {audio_path}")

    mel_spec = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length,
        fmin=20,
        fmax=sr / 2,
        power=2.0,
    )

    # Log scale (dB) relative to the loudest bin of this clip.
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Min-max scale to [0, 1]. A constant spectrogram has zero range, which
    # would otherwise divide by zero and feed NaNs to the model.
    db_min = mel_spec_db.min()
    db_range = mel_spec_db.max() - db_min
    if db_range > 0:
        mel_spec_norm = (mel_spec_db - db_min) / db_range
    else:
        mel_spec_norm = np.zeros_like(mel_spec_db)

    # tf.image.resize expects a channel axis; add it, resize, then drop it.
    resized = tf.image.resize(mel_spec_norm[..., np.newaxis], target_shape)
    return resized.numpy()[..., 0]


In [31]:

# Cell 3: Hàm dự đoán
def predict_command(audio_path):
    """Classify one audio file with the globally loaded model.

    Args:
        audio_path: Path of the audio file to classify.

    Returns:
        A 4-tuple ``(command, confidence, model_input, class_probs)``:
        the decoded label of the top class, its probability, the
        ``(1, 128, 32, 1)`` array fed to the model, and the 1-D array of
        probabilities for all classes.
    """
    # Add batch and channel axes around the (128, 32) feature image.
    model_input = extract_mel_spectrogram(audio_path).reshape(1, 128, 32, 1)

    raw_output = model.predict(model_input, verbose=0)

    # NOTE(review): softmax is applied on top of the model output. If the
    # saved model already ends in a softmax layer this flattens the scores a
    # second time -- confirm the model emits logits.
    class_probs = tf.nn.softmax(raw_output).numpy()[0]

    best_index = np.argmax(class_probs)
    # Map the winning class index back to its command name.
    command = le.inverse_transform([best_index])[0]

    return command, class_probs[best_index], model_input, class_probs


In [None]:
PROCESSED_DIR = '../data/processed'
test_files = []  # Global variable for test files (set by evaluate_random_samples)

def get_all_audio_files(directory, extensions=('.wav',)):
    """Recursively collect the audio files found below ``directory``.

    Args:
        directory: Root folder to walk. A missing folder yields an empty list
            because ``os.walk`` silently produces nothing for it.
        extensions: File-name suffix, or iterable of suffixes, to accept.
            Matching ignores case, so ``.WAV`` is found as well as ``.wav``.

    Returns:
        A sorted list of matching file paths. Sorting makes the result
        independent of the order in which the filesystem lists entries.
    """
    if isinstance(extensions, str):
        # A bare string would otherwise be iterated character by character.
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    audio_files = [
        os.path.join(root, name)
        for root, _, names in os.walk(directory)
        for name in names
        if name.lower().endswith(suffixes)
    ]
    audio_files.sort()
    return audio_files

def evaluate_random_samples(predict_fn, num_samples=5, seed=None):
    """Predict a random sample of files, plot each one and print the accuracy.

    Each sampled file gets one figure row: the model input on the left and
    the raw waveform on the right. The true label is taken from the name of
    the file's parent directory. The sampled paths are stored in the global
    ``test_files`` so later cells can reuse the same sample.

    Args:
        predict_fn: Callable taking an audio path and returning a 4-tuple
            whose first three items are the predicted label, its confidence
            and an array indexable as ``[0, :, :, 0]`` (the model input).
        num_samples: Number of files to draw; must be at least 1.
        seed: Optional seed for the sampling. ``None`` (default) keeps using
            the shared ``random`` module state; any other value makes the
            drawn sample reproducible.

    Raises:
        ValueError: If ``num_samples`` is below 1 or exceeds the number of
            audio files available under ``PROCESSED_DIR``.
    """
    global test_files

    # Zero or negative counts would create an empty figure and divide by zero
    # in the accuracy computation, so reject them up front.
    if num_samples < 1:
        raise ValueError(f"num_samples phải là số nguyên dương (nhận được {num_samples})")

    # Sorted so that a given seed selects the same files on every machine.
    audio_files = sorted(get_all_audio_files(PROCESSED_DIR))

    if len(audio_files) < num_samples:
        raise ValueError(f"Không đủ file audio để lấy {num_samples} mẫu (chỉ có {len(audio_files)} file)")

    sampler = random if seed is None else random.Random(seed)
    test_files = sampler.sample(audio_files, num_samples)
    correct = 0

    # squeeze=False keeps `axes` two-dimensional even when num_samples == 1.
    fig, axes = plt.subplots(num_samples, 2, figsize=(15, num_samples * 2), squeeze=False)

    # Column headings, written once for the whole figure.
    fig.text(0.25, 0.92, "Mel Spectrogram", fontsize=16, ha='center')
    fig.text(0.75, 0.92, "Waveform", fontsize=16, ha='center')

    for (ax_mel, ax_wave), audio_path in zip(axes, test_files):
        true_label = os.path.basename(os.path.dirname(audio_path))
        pred_label, confidence, mel_spec, _ = predict_fn(audio_path)

        # Column 1: mel spectrogram as fed to the model.
        ax_mel.imshow(mel_spec[0, :, :, 0], aspect='auto', origin='lower', cmap='viridis')
        ax_mel.set_title(f"True: {true_label} | Pred: {pred_label} ({confidence:.2f})")
        ax_mel.set_xlabel("Time")
        ax_mel.set_ylabel("Mel Bins")

        # Column 2: waveform. Sample k sits at k / sr seconds.
        y, sr = librosa.load(audio_path, sr=None)
        ax_wave.plot(np.arange(len(y)) / sr, y)
        ax_wave.set_title(f"True: {true_label} | Pred: {pred_label}")
        ax_wave.set_xlabel("Time (s)")
        ax_wave.set_ylabel("Amplitude")

        if pred_label == true_label:
            correct += 1

    # Leave the top 10% of the figure free for the column headings.
    fig.tight_layout(rect=[0, 0, 1, 0.9])
    plt.show()

    accuracy = correct / num_samples
    print(f"\n✅ Accuracy: {correct}/{num_samples} ({accuracy:.2%})")

# Evaluate 10 randomly drawn files; this also stores the drawn paths in the
# global test_files, which the detailed-results cell below iterates over.
evaluate_random_samples(predict_command, num_samples=10)


In [None]:

# Cell 5: Print detailed results for the files sampled by evaluate_random_samples
separator = "=" * 60
print("\nDetailed Results:")
print(separator)
for audio_path in test_files:
    # The parent directory name is the ground-truth command label.
    true_command = os.path.basename(os.path.dirname(audio_path))
    predicted_command, confidence, _, all_probs = predict_command(audio_path)

    print(f"Audio file: {os.path.basename(audio_path)}")
    print(f"True label: {true_command}")
    print(f"Prediction: {predicted_command}")
    print(f"Confidence: {confidence:.2%}")

    print("\nProbabilities for each class:")
    # Class indices ordered from most to least probable.
    ranked = sorted(range(len(all_probs)), key=lambda idx: all_probs[idx], reverse=True)
    for class_index in ranked:
        command = le.inverse_transform([class_index])[0]
        print(f"{command:15s}: {all_probs[class_index]:.2%}")

    outcome = '✓ Correct' if true_command == predicted_command else '✗ Wrong'
    print(f"\nResult: {outcome}")
    print(separator)

In [None]:
# Cell 7: Test cho một file cụ thể
def evaluate_single_file(audio_path, predict_fn):
    """Predict one audio file, then plot it, play it and print a full report.

    The true label is taken from the name of the file's parent directory.
    The report lists every class probability in descending order, decoding
    class indices with the global label encoder ``le``.

    Args:
        audio_path: Path of the audio file to evaluate.
        predict_fn: Callable taking an audio path and returning
            ``(label, confidence, model_input, class_probs)``, where
            ``model_input`` is indexable as ``[0, :, :, 0]``.

    Raises:
        ValueError: If ``audio_path`` is not an existing file.
    """
    # isfile (not exists) so a directory path gets the clear error below
    # instead of failing later inside the audio loader.
    if not os.path.isfile(audio_path):
        raise ValueError(f"Không tìm thấy file: {audio_path}")

    # Predict before creating the figure, so a failing prediction does not
    # leave an empty figure behind.
    true_label = os.path.basename(os.path.dirname(audio_path))
    pred_label, confidence, mel_spec, all_probs = predict_fn(audio_path)

    fig, (ax_mel, ax_wave) = plt.subplots(1, 2, figsize=(15, 4))

    # Column headings.
    fig.text(0.25, 0.92, "Mel Spectrogram", fontsize=16, ha='center')
    fig.text(0.75, 0.92, "Waveform", fontsize=16, ha='center')

    # Column 1: mel spectrogram as fed to the model.
    ax_mel.imshow(mel_spec[0, :, :, 0], aspect='auto', origin='lower', cmap='viridis')
    ax_mel.set_title(f"True: {true_label} | Pred: {pred_label} ({confidence:.2f})")
    ax_mel.set_xlabel("Time")
    ax_mel.set_ylabel("Mel Bins")

    # Column 2: waveform. Sample k sits at k / sr seconds.
    y, sr = librosa.load(audio_path, sr=None)
    ax_wave.plot(np.arange(len(y)) / sr, y)
    ax_wave.set_title(f"True: {true_label} | Pred: {pred_label}")
    ax_wave.set_xlabel("Time (s)")
    ax_wave.set_ylabel("Amplitude")

    # Leave the top 10% of the figure free for the column headings.
    fig.tight_layout(rect=[0, 0, 1, 0.9])
    plt.show()

    print("\n🎵 Playing audio...")
    # `display` is only available implicitly inside an IPython kernel, so it
    # is imported explicitly together with Audio.
    from IPython.display import Audio, display
    display(Audio(audio_path))

    # Detailed textual report.
    separator = "=" * 60
    print("\nDetailed Results:")
    print(separator)
    print(f"Audio file: {os.path.basename(audio_path)}")
    print(f"True label: {true_label}")
    print(f"Prediction: {pred_label}")
    print(f"Confidence: {confidence:.2%}")

    print("\nProbabilities for each class:")
    # Pair each class index with its probability, most probable first.
    sorted_probs = sorted(enumerate(all_probs), key=lambda x: x[1], reverse=True)
    for i, prob in sorted_probs:
        command = le.inverse_transform([i])[0]
        print(f"{command:15s}: {prob:.2%}")

    print(f"\nResult: {'✓ Correct' if true_label == pred_label else '✗ Wrong'}")
    print(separator)

# Usage: set audio_path to the file to evaluate. The file's parent directory
# name is used as the true label.
audio_path = "../data/processed/bat_den/bat_den_speaker04_015.wav"  # Change this path
evaluate_single_file(audio_path, predict_command)