<a href="https://colab.research.google.com/github/wuliopulio/EmotionClassificationModel/blob/main/Audio_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


from torch.utils.data import Dataset, Subset, DataLoader


import torchvision
from torchvision import transforms


import matplotlib.pyplot as plt
import numpy as np
import os
from PIL import Image
import librosa


# Floating-point dtype constant for the notebook; the device-setup cell further
# down assigns the same value again.
dtype = torch.float




In [None]:
import kagglehub

# Fetch (or reuse the cached copy of) the newest RAVDESS release from Kaggle.
RAVDESS_HANDLE = "uwrfkaggler/ravdess-emotional-speech-audio"
datapath = kagglehub.dataset_download(RAVDESS_HANDLE)

print("Path to dataset files:", datapath)

Path to dataset files: /kaggle/input/ravdess-emotional-speech-audio


In [None]:
from pathlib import Path
import pandas as pd

base_dir = '/kaggle/input/ravdess-emotional-speech-audio/'

# RAVDESS encodes the emotion as the third dash-separated field of the file name
# (01 neutral, 02 calm, 03 happy, 04 sad, 05 angry, 06 fearful, 07 disgust,
# 08 surprised). 'calm' is folded into 'neutral' and 'surprised' into 'happy'
# so that the labels match the six classes the model predicts.
emotion_map = {
    '01': 'neutral',
    '02': 'neutral',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fear',
    '07': 'disgust',
    '08': 'happy'
}


def build_ravdess_dataframe(root_dir, code_to_emotion=None):
    """Index every unique ``.wav`` clip below ``root_dir``.

    Args:
        root_dir: Directory that is walked recursively.
        code_to_emotion: Mapping from the two-digit emotion code to a label.
            Defaults to the module-level ``emotion_map``.

    Returns:
        A DataFrame with the columns ``Emotion`` and ``Path`` (one row per
        clip). The frame is empty, but still has both columns, when nothing
        is found.

    Notes:
        * The recorded run of this notebook found 2880 files, twice the 1440
          speech clips of the corpus, i.e. every clip is present under two
          directories. Keeping both copies lets the random train/test split
          put identical audio on both sides, so clips are de-duplicated by
          file name (RAVDESS file names are unique per clip).
        * Directories and files are visited in sorted order so the resulting
          frame, and which copy of a duplicate is kept, is deterministic.
        * File names without an emotion field, and unknown codes, are
          labelled ``'Unknown'`` instead of raising ``IndexError``.
    """
    if code_to_emotion is None:
        code_to_emotion = emotion_map

    seen_names = set()
    records = []
    for root, dirs, files in os.walk(root_dir):
        dirs.sort()  # in-place sort steers os.walk into a deterministic order
        for file in sorted(files):
            if not file.endswith('.wav') or file in seen_names:
                continue
            seen_names.add(file)

            parts = file.split('-')
            emotion_code = parts[2] if len(parts) > 2 else None
            records.append({
                'Emotion': code_to_emotion.get(emotion_code, 'Unknown'),
                'Path': os.path.join(root, file),
            })

    return pd.DataFrame(records, columns=['Emotion', 'Path'])


ravdess_df = build_ravdess_dataframe(base_dir)

# Plain-list views of the two columns, kept for code that reads them directly.
file_emotion = ravdess_df['Emotion'].tolist()
file_path = ravdess_df['Path'].tolist()

print(f"Total files found: {len(ravdess_df)}")
print(ravdess_df.head())

Total files found: 2880
   Emotion                                               Path
0    happy  /kaggle/input/ravdess-emotional-speech-audio/A...
1  neutral  /kaggle/input/ravdess-emotional-speech-audio/A...
2  disgust  /kaggle/input/ravdess-emotional-speech-audio/A...
3  disgust  /kaggle/input/ravdess-emotional-speech-audio/A...
4  neutral  /kaggle/input/ravdess-emotional-speech-audio/A...


In [None]:
import collections
# Tally the number of clips per emotion label to check the class balance.
emotion_counts = collections.Counter(ravdess_df['Emotion'])
print(emotion_counts)

Counter({'happy': 768, 'neutral': 576, 'disgust': 384, 'fear': 384, 'sad': 384, 'angry': 384})


In [None]:
# Run on the GPU when CUDA is visible to PyTorch, otherwise fall back to the CPU.
dtype = torch.float
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
# Class name -> integer index used as the training target.
STANDARD_EMOTIONS = {
    'neutral': 0,
    'happy': 1,
    'sad': 2,
    'angry': 3,
    'fear': 4,
    'disgust': 5,
}


class AudioDataset(Dataset):
    """Serve ``(mel_spectrogram_tensor, class_index)`` pairs from a clip index.

    Args:
        df: DataFrame with an ``Emotion`` (label string) and a ``Path``
            (audio file) column; rows are addressed positionally.
        sample_rate: Rate every clip is resampled to while loading.
        n_mels: Number of mel bands of the spectrogram.
        duration: Clip length in seconds; shorter clips are zero-padded at the
            end, longer ones are truncated. May be fractional.
    """

    def __init__(self, df, sample_rate, n_mels, duration):
        super().__init__()

        self.df = df
        self.sample_rate = sample_rate
        self.n_mels = n_mels
        self.duration = duration
        # Number of samples per clip. int() keeps slicing and np.pad valid when
        # duration (or sample_rate) is given as a float.
        self.max_len = int(sample_rate * duration)

        self.labels_map = STANDARD_EMOTIONS
        self.idx_to_emotion = {v: k for k, v in STANDARD_EMOTIONS.items()}

        # Spectrogram array -> tensor, rescaled to the fixed 128x128 input size.
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize((128, 128)),
        ])

    def __len__(self):
        """Number of clips in the index."""
        return len(self.df)

    def _encode_emotion(self, emotion):
        """Return the class index for a label string, or None if it is unknown."""
        emotion = emotion.lower()
        # Fold label variants into the six standard classes.
        if 'surprise' in emotion:
            emotion = 'happy'
        elif 'fearful' in emotion:
            emotion = 'fear'
        return self.labels_map.get(emotion)

    def _fit_length(self, y):
        """Zero-pad (at the end) or truncate a waveform to ``self.max_len`` samples."""
        if len(y) < self.max_len:
            return np.pad(y, (0, self.max_len - len(y)))
        return y[:self.max_len]

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(mel_tensor, label)``.

        Raises:
            ValueError: If the row's emotion is not one of the known classes.
                Returning ``None`` here (the previous behaviour) cannot be
                batched by the default DataLoader collate function and only
                surfaced as an unrelated-looking error, so fail explicitly.
        """
        row = self.df.iloc[idx]
        file_path = row['Path']

        label = self._encode_emotion(row['Emotion'])
        if label is None:
            raise ValueError(
                f"Unsupported emotion label {row['Emotion']!r} for file {file_path!r}; "
                f"expected one of {sorted(self.labels_map)}"
            )

        y, sample_rate = librosa.load(file_path, sr=self.sample_rate)
        y = self._fit_length(y)

        mel = librosa.feature.melspectrogram(y=y, sr=sample_rate, n_mels=self.n_mels)
        # Log-scale the power spectrogram relative to its own maximum.
        mel_db = librosa.power_to_db(mel, ref=np.max)
        mel_tensor = self.transform(mel_db)

        return mel_tensor, label

In [None]:
full_dataset = AudioDataset(ravdess_df, sample_rate=22050, n_mels=128, duration=3)

print(len(ravdess_df))
length = len(ravdess_df)

# Fixed seed: the model below is restored from a checkpoint, so the held-out
# split must be the same on every run for the reported metrics to be comparable.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8

indices = np.random.default_rng(SPLIT_SEED).permutation(len(full_dataset))
n_train = int(TRAIN_FRACTION * len(indices))

train_indices = indices[:n_train]
test_indices = indices[n_train:]

# Sanity check: the two subsets partition the dataset.
assert len(train_indices) + len(test_indices) == len(full_dataset)
assert np.intersect1d(train_indices, test_indices).size == 0

# NOTE(review): this is a per-clip random split; clips of the same speaker can
# land on both sides. Split by actor if speaker-independent accuracy is needed.
trainset = Subset(full_dataset, train_indices)
testset = Subset(full_dataset, test_indices)


2880


In [None]:
# Mini-batch iterators: training batches are reshuffled every epoch, evaluation
# batches keep the subset order.
trainloader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
testloader = DataLoader(dataset=testset, batch_size=32, shuffle=False)

In [None]:
class CNN(nn.Module):
    """Four conv/batch-norm/max-pool stages followed by a three-layer classifier.

    Expects single-channel 128x128 inputs: each stage halves the spatial size
    (128 -> 64 -> 32 -> 16 -> 8) while the channel count grows 1 -> 16 -> 32 ->
    64 -> 128. The output has one logit per entry of ``STANDARD_EMOTIONS``.

    Layers are registered as ``conv{i}``, ``bn{i}``, ``pool{i}`` (i = 1..4),
    ``fc1``, ``dropout``, ``fc2``, ``fc3``; saved checkpoints and the
    layer-freezing code rely on exactly these attribute names.
    """

    NUM_STAGES = 4

    def __init__(self):
        super().__init__()

        widths = (1, 16, 32, 64, 128)
        for stage, (c_in, c_out) in enumerate(zip(widths, widths[1:]), start=1):
            # Registration order (conv, bn, pool per stage) fixes the state_dict
            # key order and the order in which random initial weights are drawn.
            setattr(self, f"conv{stage}",
                    nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3, padding=1))
            setattr(self, f"bn{stage}", nn.BatchNorm2d(c_out))
            setattr(self, f"pool{stage}", nn.MaxPool2d(kernel_size=2, stride=2))

        # 128 channels on an 8x8 grid after the fourth pooling step.
        self.flattened_size = 128 * 8 * 8

        self.fc1 = nn.Linear(self.flattened_size, 256)
        self.dropout = nn.Dropout(0.5)

        self.fc2 = nn.Linear(256, 128)

        self.fc3 = nn.Linear(128, len(STANDARD_EMOTIONS))

    def forward(self, x):
        """Return raw class logits of shape ``(batch, len(STANDARD_EMOTIONS))``."""
        for stage in range(1, self.NUM_STAGES + 1):
            conv = getattr(self, f"conv{stage}")
            norm = getattr(self, f"bn{stage}")
            pool = getattr(self, f"pool{stage}")
            x = pool(torch.relu(norm(conv(x))))

        # Collapse (channels, height, width) into one feature vector per sample.
        features = x.view(x.size(0), -1)

        hidden = self.dropout(torch.relu(self.fc1(features)))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(hidden)


In [None]:
import torchvision.models as models

# Attach Google Drive; the checkpoint and the sample audio used further down are
# read from paths under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Absolute location of the saved weights on the mounted Drive.
model_path = '/content/drive/MyDrive/Emotion Classifier CNN/Models/New_Audio_attempt2_v1.pth'

# Build the network on the target device, then restore the stored parameters.
# map_location lets a checkpoint written on a GPU load on a CPU-only runtime.
model = CNN()
model = model.to(device)
pretrained_state = torch.load(model_path, map_location=device)
model.load_state_dict(pretrained_state)

<All keys matched successfully>

In [None]:
# Freeze the weights and biases of the first three convolution layers so that
# only the later layers receive gradient updates during fine-tuning.
for frozen_layer in (model.conv1, model.conv2, model.conv3):
    for weight in frozen_layer.parameters():
        weight.requires_grad = False


In [None]:
def test_model(model, dataloader):
  correct = 0.0
  total = 0.0
  loss = 0.0
  criterion = nn.CrossEntropyLoss()
  with torch.no_grad():
    for image, label in dataloader:
      image = image.to(device)
      label = label.to(device)
      outputs = model(image)
      _, predictions = torch.max(outputs, 1)


      loss += criterion(outputs, label).item()


      total += label.size(0)
      correct += (predictions == label).sum().item()


  return (100*correct/total), (loss/len(dataloader))




In [None]:
from tqdm import tqdm

optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)
num_epochs = 5
# The effective batch size is whatever the loader was built with; read it from
# there instead of keeping a second, diverging number in this cell.
batch_size = trainloader.batch_size
loss_hist = []  # one entry per optimisation step (training loss of that batch)

criterion = nn.CrossEntropyLoss()


for epoch in range(num_epochs):
  # Inference helpers switch the model to eval mode; make sure dropout and
  # batch-norm are back in training mode whenever this cell is (re-)run.
  model.train()
  for inputs, labels in tqdm(trainloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=True):
    inputs = inputs.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()

    # Call the module itself (not .forward) so registered hooks are honoured.
    outputs = model(inputs)
    loss = criterion(outputs, labels)

    loss.backward()
    optimizer.step()

    loss_hist.append(loss.item())

  print(f"Epoch: {epoch+1}/{num_epochs} complete")

Epoch 1/5: 100%|██████████| 72/72 [01:34<00:00,  1.31s/it]


Epoch: 1/5 complete


Epoch 2/5: 100%|██████████| 72/72 [01:35<00:00,  1.32s/it]


Epoch: 2/5 complete


Epoch 3/5: 100%|██████████| 72/72 [01:35<00:00,  1.32s/it]


Epoch: 3/5 complete


Epoch 4/5: 100%|██████████| 72/72 [01:33<00:00,  1.30s/it]


Epoch: 4/5 complete


Epoch 5/5: 100%|██████████| 72/72 [01:32<00:00,  1.28s/it]

Epoch: 5/5 complete





In [None]:
# Accuracy (in percent) and mean batch loss on the held-out split.
test_accuracy, test_loss = test_model(model, testloader)
(test_accuracy, test_loss)

(70.3125, 0.7721916437149048)

In [None]:
# Anchor the target at the absolute Drive mount point (the same root the
# checkpoint was loaded from). A path relative to the working directory would
# silently create a local "drive/..." folder when the notebook is not run from
# /content, and the weights would never reach Google Drive.
save_to = os.path.join('/content', 'drive', 'MyDrive', 'Emotion Classifier CNN', 'Models', "New_Audio_attempt2_v2.pth")
os.makedirs(os.path.dirname(save_to), exist_ok=True)
# Only the parameters are stored; rebuild CNN() before loading them again.
torch.save(model.state_dict(), save_to)

In [None]:
def predict_emotion(audio_path, model, device, sample_rate=22050, duration=3, n_mels=128):
    """Classify a single audio file.

    The clip goes through the same steps as the training data: resample,
    pad/truncate to ``duration`` seconds, mel spectrogram in dB, tensor
    conversion and resize to 128x128.

    Args:
        audio_path: Path of the audio file to classify.
        model: Trained classifier returning one logit per class.
        device: Device the input tensor is moved to (the model must live there).
        sample_rate: Rate the clip is resampled to. Keep equal to training.
        duration: Clip length in seconds (may be fractional). Keep equal to training.
        n_mels: Number of mel bands. Keep equal to training.

    Returns:
        Tuple ``(class_index, confidence, probabilities)`` where ``confidence``
        is the softmax probability of the predicted class and ``probabilities``
        is the list of softmax probabilities for all classes.

    Note:
        The model is put into eval mode for the prediction and restored to its
        previous train/eval mode afterwards, so calling this helper between
        training runs does not silently disable dropout for later training.
    """
    target_len = int(sample_rate * duration)

    y, sr = librosa.load(audio_path, sr=sample_rate)
    if len(y) > target_len:
        y = y[:target_len]
    else:
        y = np.pad(y, (0, target_len - len(y)))

    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)

    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize((128, 128)),
    ])
    # unsqueeze(0) adds the batch dimension the model expects.
    batch = preprocess(mel_db).unsqueeze(0).to(device)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(batch)
            probs = torch.softmax(output, dim=1)
            conf, pred = torch.max(probs, 1)
    finally:
        model.train(was_training)

    # squeeze(0) drops only the batch axis, so the result is always a list.
    return pred.item(), conf.item(), probs.squeeze(0).tolist()

# Example: classify one recording from Drive and list the per-class probabilities.
sample_audio = '/content/drive/MyDrive/Emotion Classifier CNN/Audio/Joy.m4a'
class_idx, confidence, all_probs = predict_emotion(sample_audio, model, device)

index_to_name = full_dataset.idx_to_emotion
print(f"Predicted: {class_idx} ({index_to_name[class_idx]})")
print(f"Confidence: {confidence:.2%}")
for i in range(len(all_probs)):
    prob = all_probs[i]
    print(f"{i}: {prob:.2%} {index_to_name[i]}")

Predicted: 2 (sad)
Confidence: 54.21%
0: 0.36% neutral
1: 28.87% happy
2: 54.21% sad
3: 0.13% angry
4: 15.18% fear
5: 1.25% disgust


  y, sr = librosa.load(audio_path, sr=sample_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
