In [12]:
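# Reference: https://huggingface.co/r-f/wav2vec-english-speech-emotion-recognition
# NOTE: the cells below load a different checkpoint
# (firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3).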

In [15]:
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import librosa
import torch
import numpy as np

# Hugging Face Hub checkpoint used for audio (emotion) classification.
model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"

# Classification model resolved from the checkpoint's config.
model = AutoModelForAudioClassification.from_pretrained(model_id)

# Matching feature extractor; it turns raw waveforms into model inputs.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    model_id,
    do_normalize=True,
)

# Mapping from predicted class index to emotion label, as stored in the config.
id2label = model.config.id2label

loading configuration file config.json from cache at /Users/t.fuji/.cache/huggingface/hub/models--firdhokk--speech-emotion-recognition-with-openai-whisper-large-v3/snapshots/83e7cc6cebb3978e4cc314ebad9f1614c177a94a/config.json
Model config WhisperConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "apply_spec_augment": false,
  "architectures": [
    "WhisperForAudioClassification"
  ],
  "attention_dropout": 0.0,
  "begin_suppress_tokens": [
    220,
    50257
  ],
  "bos_token_id": 50257,
  "classifier_proj_size": 256,
  "d_model": 1280,
  "decoder_attention_heads": 20,
  "decoder_ffn_dim": 5120,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 32,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 20,
  "encoder_ffn_dim": 5120,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 32,
  "eos_token_id": 50257,
  "id2label": {
    "0": "angry",
    "1": "disgust",
    "2": "fearful",
    "3": "happy",
    "4": "neutral",
    "5": "sad",
   

In [17]:
def preprocess_audio(audio_path, feature_extractor, max_duration=30.0):
    """Load an audio file and convert it into fixed-length model inputs.

    The file is read with ``librosa.load`` at the extractor's sampling rate,
    then cut or zero-padded at the end so that it holds exactly
    ``int(sampling_rate * max_duration)`` samples before being passed to
    ``feature_extractor``.

    Args:
        audio_path: Path handed unchanged to ``librosa.load``.
        feature_extractor: Callable exposing a ``sampling_rate`` attribute.
        max_duration: Target clip length in seconds.

    Returns:
        The object returned by ``feature_extractor`` when called with
        ``return_tensors="pt"``.
    """
    target_sr = feature_extractor.sampling_rate

    # The rate reported by librosa equals the one requested, so it is not kept.
    waveform, _ = librosa.load(audio_path, sr=target_sr)

    n_samples = int(target_sr * max_duration)
    overflow = len(waveform) - n_samples
    if overflow > 0:
        # Too long: keep only the leading n_samples.
        waveform = waveform[:n_samples]
    else:
        # Too short (or exact): append zeros up to n_samples.
        waveform = np.pad(waveform, (0, -overflow))

    return feature_extractor(
        waveform,
        sampling_rate=target_sr,
        max_length=n_samples,
        truncation=True,
        return_tensors="pt",
    )

In [28]:
def predict_emotion(audio_path, model, feature_extractor, id2label, max_duration=30.0):
    """Classify the emotion of a single audio file.

    The audio is prepared with ``preprocess_audio``, the model and inputs are
    moved to CUDA when available (CPU otherwise), and a forward pass is run
    without gradient tracking.

    Args:
        audio_path: Path of the audio file to classify.
        model: Callable model with a ``.to(device)`` method whose output has a
            ``logits`` attribute.
        feature_extractor: Forwarded to ``preprocess_audio``.
        id2label: Mapping from predicted class index to label.
        max_duration: Clip length in seconds, forwarded to ``preprocess_audio``.

    Returns:
        A ``(score, label)`` tuple. ``score`` is the raw logit of the winning
        class as a 0-dim tensor (not a probability); ``label`` is
        ``id2label[index]``.
    """
    features = preprocess_audio(audio_path, feature_extractor, max_duration)

    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(target)

    # Every tensor in the batch must live on the same device as the model.
    batch = {}
    for name, tensor in features.items():
        batch[name] = tensor.to(target)

    with torch.no_grad():
        logits = model(**batch).logits

    # .item() requires a single prediction, i.e. one clip per call.
    best_index = logits.argmax(dim=-1).item()
    best_label = id2label[best_index]

    return logits[0][best_index], best_label

In [29]:
# Sample clip to classify; the path is relative to the current working directory.
audio_path = "./test.mp3"

# predict_emotion returns a (score, label) pair, so the printed value is that tuple.
predicted_emotion = predict_emotion(
    audio_path=audio_path,
    model=model,
    feature_extractor=feature_extractor,
    id2label=id2label,
)
print(f"Predicted Emotion: {predicted_emotion}")

Predicted Emotion: (tensor(4.7272), 'happy')


In [20]:
# Display the class-index -> label mapping taken from model.config.id2label.
id2label

{0: 'angry',
 1: 'disgust',
 2: 'fearful',
 3: 'happy',
 4: 'neutral',
 5: 'sad',
 6: 'surprised'}

In [None]:
def predicted_label_from_logits(logits, id2label):
    """Return the label of the highest-scoring class for one prediction.

    This wraps what was a loose fragment: a bare ``return`` at cell level is a
    SyntaxError, and ``logits`` was not defined in that scope.

    Args:
        logits: Tensor of class scores for a single example; the arg-max is
            taken over the last dimension.
        id2label: Mapping from class index to label.

    Returns:
        ``id2label[index]`` for the arg-max index.
    """
    # .item() only works when argmax yields exactly one element.
    predicted_id = torch.argmax(logits, dim=-1).item()
    return id2label[predicted_id]