In [1]:
import os
import numpy as np
from hmmaudio.features import extract_features
from hmmaudio.hmm import HiddenMarkovModel
from hmmaudio.utils import load_data, load_all_data, predict_label, train_hmm
from hmmaudio.eval import evaluate_models


# HMMAudio Demo

This notebook demonstrates how to train and evaluate Hidden Markov Models for audio emotion classification using diagonal covariance matrices.

In [2]:
# Define paths
TRAIN_DATA_PATH = "data/data"
TEST_DATA_PATH = "data/test_data"

# Load training and test data
print("Loading training data...")
train_features, train_files = load_all_data(TRAIN_DATA_PATH,                  
                include_mfcc=True,
                include_delta=True,
                include_delta2=True,
                num_cepstral=13)
print("\nLoading test data...")
test_features, test_files = load_all_data(TEST_DATA_PATH,                
                include_mfcc=True,
                include_delta=True,
                include_delta2=True,
                num_cepstral=13)

Loading training data...


Processing Happy: 100%|██████████| 1271/1271 [00:05<00:00, 238.34it/s]


Happy: Loaded 1271 files


Processing Sad: 100%|██████████| 1271/1271 [00:05<00:00, 223.65it/s]


Sad: Loaded 1271 files


Processing Fear: 100%|██████████| 1271/1271 [00:05<00:00, 233.30it/s]


Fear: Loaded 1271 files


Processing Neutral: 100%|██████████| 1087/1087 [00:04<00:00, 240.22it/s]


Neutral: Loaded 1087 files


Processing Anger: 100%|██████████| 1271/1271 [00:05<00:00, 224.44it/s]


Anger: Loaded 1271 files


Processing Disgust: 100%|██████████| 1271/1271 [00:05<00:00, 213.20it/s]


Disgust: Loaded 1271 files

Loading test data...


  sample_rate, audio = wavfile.read(file_path)
Processing Happy: 100%|██████████| 48/48 [00:00<00:00, 119.05it/s]


Happy: Loaded 48 files


Processing Sad: 100%|██████████| 48/48 [00:00<00:00, 138.60it/s]


Sad: Loaded 48 files


Processing Fear: 100%|██████████| 48/48 [00:00<00:00, 139.98it/s]


Fear: Loaded 48 files


Processing Neutral: 100%|██████████| 48/48 [00:00<00:00, 124.88it/s]


Neutral: Loaded 48 files


Processing Anger: 100%|██████████| 48/48 [00:00<00:00, 131.85it/s]


Anger: Loaded 48 files


Processing Disgust: 100%|██████████| 48/48 [00:00<00:00, 124.71it/s]

Disgust: Loaded 48 files





## Train Continuous HMMs with Diagonal Covariance

We'll use diagonal covariance matrices instead of full covariance matrices for faster training.

In [8]:
# Set HMM parameters
n_states = 5 
n_features = train_features["Anger"][0].shape[1]  # Number of features
max_iter = 10  # Number of Baum-Welch iterations

# Train HMMs with diagonal covariance (faster)
hmm_models = train_hmm(
    train_features, 
    n_states=n_states, 
    n_symbols=n_features,
    max_iter=max_iter,
    continuous = True,  # Use continuous HMM
    diagonal_covariance=True,  # Use diagonal covariance for speed,
)

Training HMM for Happy


Baum-Welch Training Progress: 100%|██████████| 10/10 [07:09<00:00, 42.95s/it]


Training HMM for Sad


Baum-Welch Training Progress: 100%|██████████| 10/10 [07:48<00:00, 46.85s/it]


Training HMM for Fear


Baum-Welch Training Progress: 100%|██████████| 10/10 [07:39<00:00, 45.93s/it]


Training HMM for Neutral


Baum-Welch Training Progress: 100%|██████████| 10/10 [06:10<00:00, 37.01s/it]


Training HMM for Anger


Baum-Welch Training Progress: 100%|██████████| 10/10 [07:46<00:00, 46.69s/it]


Training HMM for Disgust


Baum-Welch Training Progress: 100%|██████████| 10/10 [08:16<00:00, 49.67s/it]


## Evaluate Models

Now we'll evaluate the trained HMM models on both the training and test sets.
We'll use the new score method to calculate the log-likelihood of each sequence and normalize by sequence length.

In [9]:
# Evaluate on training set
print("\nEvaluating on training set...")
train_accuracy, train_cm, _, _, _ = evaluate_models(
    hmm_models, 
    train_features, 
    train_files,
    normalize_by_length=True  # Normalize by sequence length to handle variable-length audio
)
print(f"Train Accuracy: {train_accuracy:.2f}")
print("Train Confusion Matrix:")
print(train_cm)

# Evaluate on test set
print("\nEvaluating on test set...")
test_accuracy, test_cm, _, _, _ = evaluate_models(
    hmm_models, 
    test_features, 
    test_files,
    normalize_by_length=True
)
print(f"Test Accuracy: {test_accuracy:.2f}")
print("Test Confusion Matrix:")
print(test_cm)


Evaluating on training set...


Evaluating Happy: 100%|██████████| 1271/1271 [01:30<00:00, 14.07it/s]
Evaluating Sad: 100%|██████████| 1271/1271 [01:40<00:00, 12.61it/s]
Evaluating Fear: 100%|██████████| 1271/1271 [01:37<00:00, 13.01it/s]
Evaluating Neutral: 100%|██████████| 1087/1087 [01:21<00:00, 13.34it/s]
Evaluating Anger: 100%|██████████| 1271/1271 [01:40<00:00, 12.67it/s]
Evaluating Disgust: 100%|██████████| 1271/1271 [01:47<00:00, 11.81it/s]


Train Accuracy: 0.39
Train Confusion Matrix:
[[386 121 180  62 324 198]
 [ 47 768 103 137  25 191]
 [173 381 271  49 258 139]
 [138 262 130 230  41 286]
 [234  16  39   1 892  89]
 [187 303 158  86 190 347]]

Evaluating on test set...


Evaluating Happy: 100%|██████████| 48/48 [00:05<00:00,  9.37it/s]
Evaluating Sad: 100%|██████████| 48/48 [00:05<00:00,  9.07it/s]
Evaluating Fear: 100%|██████████| 48/48 [00:05<00:00,  9.33it/s]
Evaluating Neutral: 100%|██████████| 48/48 [00:05<00:00,  9.35it/s]
Evaluating Anger: 100%|██████████| 48/48 [00:05<00:00,  8.68it/s]
Evaluating Disgust: 100%|██████████| 48/48 [00:05<00:00,  8.73it/s]


Test Accuracy: 0.20
Test Confusion Matrix:
[[ 3  8 22  0 14  1]
 [ 1 14 16  0 15  2]
 [ 5  5 25  0 11  2]
 [ 2 13 20  0 13  0]
 [11  3 17  0 15  2]
 [ 6  5 22  0 15  0]]


## Example: Classify a Single Audio Sample

Let's test the model on a single audio sample and print the scores for each emotion.

In [10]:
# Choose a sample from the test set
true_emotion = "Anger"
sample_index = 10  # Choose any sample index
sample_features = test_features[true_emotion][sample_index]

# Predict using utility function
predicted_emotion, scores = predict_label(hmm_models, sample_features)

# Print results
print(f"True emotion: {true_emotion}")
print(f"Predicted emotion: {predicted_emotion}")
print("\nScores for each emotion (log-likelihood/frame, higher is better):")

# Normalize scores by sequence length for fair comparison
normalized_scores = {emotion: score/len(sample_features) for emotion, score in scores.items()}

# Sort and print scores from highest to lowest
for emotion, score in sorted(normalized_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"{emotion}: {score:8.2f}")

True emotion: Anger
Predicted emotion: Fear

Scores for each emotion (log-likelihood/frame, higher is better):
Fear:     0.54
Happy:    -0.25
Sad:    -0.40
Disgust:    -0.59
Neutral:    -1.08
Anger:    -1.79
