In [None]:
# Mount Google Drive into the Colab runtime so files under MyDrive can be read.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## Speech Emotion Recognition 

In this notebook we load the spectrogram images and classify them into four emotions (calm, anger, fear, sad) by fine-tuning a pre-trained ResNet-34 model.

In [None]:
# Root folder of the processed spectrogram images; IndianAudioDataset lists it
# and expects one sub-folder per emotion label inside.
# NOTE(review): this is a relative path, so it resolves against the current
# working directory (presumably /content on Colab, next to the Drive mount) — confirm.
IMAGES_FOLDER = "drive/MyDrive/UNI-AMRITA-SEM2/MLforBigData/MLBD_Project/MLBD_Dataset/Processed/"

In [None]:
import os
import numpy as np

import torchvision
from torchvision.io import read_image
import torchvision.transforms as T
from torch.utils.data import Dataset, DataLoader

In [None]:
class IndianAudioDataset(Dataset):
  """In-memory dataset of emotion-labelled spectrogram images.

  The directory ``base`` must contain one sub-folder per emotion, named after
  a key of ``self.categories`` ("calm", "anger", "fear", "sad"). Every file in
  such a sub-folder is decoded with ``read_image`` during construction and kept
  in memory, so ``__getitem__`` does no I/O.

  Attributes:
    data: list of decoded images, in the same order as ``labels``.
    labels: list of integer class ids, one per entry of ``data``.
    categories: mapping from emotion folder name to integer class id.

  Raises:
    KeyError: if ``base`` contains a sub-folder whose name is not a known
      emotion.
  """
  def __init__(self, base):
    self.data = []
    self.labels = []
    self.categories = {"calm":0, "anger":1, "fear":2, "sad":3}
    # Images and labels are collected in a single pass so each image is paired
    # with the label of the folder it was actually read from. sorted() makes
    # the sample order reproducible, since os.listdir order is arbitrary.
    for emotion in sorted(os.listdir(base)):
      emotion_dir = os.path.join(base, emotion)
      if not os.path.isdir(emotion_dir):
        # Stray files next to the class folders are not samples; skip them.
        continue
      if emotion not in self.categories:
        raise KeyError(
            "unknown emotion folder %r in %r; expected one of %s"
            % (emotion, base, sorted(self.categories)))
      label = self.categories[emotion]
      for image_name in sorted(os.listdir(emotion_dir)):
        self.data.append(read_image(os.path.join(emotion_dir, image_name)))
        self.labels.append(label)

  def __len__(self):
    """Return the number of loaded samples."""
    return len(self.data)

  def __getitem__(self, idx):
    """Return the ``(image, label)`` pair stored at position ``idx``."""
    return self.data[idx], self.labels[idx]

In [None]:
# The dataset decodes every image up front; the loader then serves shuffled mini-batches of 16.
train_data = IndianAudioDataset(base=IMAGES_FOLDER)
train_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=16)

In [None]:
from torchvision.models import resnet34
import torch
import torch.nn as nn
import torch.optim as optim
# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# One output logit per emotion in IndianAudioDataset.categories
# (calm, anger, fear, sad). The previous head had 50 outputs, which let the
# network predict 46 class ids that do not exist in the data.
NUM_CLASSES = 4

# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=`; kept as-is so the notebook still runs on older versions.
resnet_model = resnet34(pretrained=True)

# Replace only the classification head; take its input width from the model
# instead of hard-coding 512.
resnet_model.fc = nn.Linear(resnet_model.fc.in_features, NUM_CLASSES)

# conv1 is intentionally NOT replaced: the stock ResNet-34 stem is already
# Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False), so re-creating
# it only discarded the pretrained first-layer filters.
resnet_model = resnet_model.to(device)

In [None]:
# Optimisation settings for fine-tuning.
epochs = 10
learning_rate = 2e-4
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=resnet_model.parameters(), lr=learning_rate)

# Per-epoch history; train() appends to these lists in place.
train_losses, train_acc = [], []
def train(model, loss_fn, train_loader, epochs, optimizer, train_losses, train_acc, change_lr=None):
  """Train ``model`` for ``epochs`` passes over ``train_loader``.

  After each epoch the mean batch loss and the accuracy (in percent) are
  appended to ``train_losses`` / ``train_acc`` and printed.

  Args:
    model: network to optimise; called as ``model(x)`` and expected to return
      per-class scores with classes on dimension 1.
    loss_fn: callable ``loss_fn(y_hat, y)`` returning a scalar tensor.
    train_loader: iterable of ``(x, y)`` batches.
    epochs: number of passes over ``train_loader``.
    optimizer: optimizer already bound to ``model``'s parameters.
    train_losses: list that receives one mean loss per epoch (mutated in place).
    train_acc: list that receives one accuracy per epoch (mutated in place).
    change_lr: optional callable ``change_lr(optimizer, epoch)`` returning the
      optimizer to use for that epoch (epochs are numbered from 1).

  Raises:
    ValueError: if ``train_loader`` yields no samples in an epoch.
  """
  # Move batches to wherever the model lives rather than relying on a global
  # `device` that happens to be defined in another notebook cell.
  first_param = next(model.parameters(), None)
  model_device = first_param.device if first_param is not None else torch.device('cpu')

  for epoch in range(1,epochs+1):
    model.train()
    if change_lr:
      optimizer = change_lr(optimizer, epoch)
    running_loss=0
    correct=0
    total=0
    num_batches=0
    for x, y in train_loader:
      optimizer.zero_grad()
      x = x.to(model_device, dtype=torch.float32)
      y = y.to(model_device, dtype=torch.long)
      y_hat = model(x)
      loss = loss_fn(y_hat, y)
      loss.backward()
      optimizer.step()
      running_loss += loss.item()
      # Predicted class = index of the largest score along dimension 1.
      _, predicted = y_hat.max(1)
      total += y.size(0)
      correct += predicted.eq(y).sum().item()
      num_batches += 1
    if total == 0:
      # Fail with a clear message instead of a bare ZeroDivisionError below.
      raise ValueError("train_loader yielded no samples; cannot compute loss/accuracy")
    # Mean of the per-batch losses (batches are counted here, so loaders
    # without __len__ also work).
    train_loss=running_loss/num_batches
    accu=100.*correct/total
    train_acc.append(accu)
    train_losses.append(train_loss)
    print('Train Loss: %.3f | Accuracy: %.3f'%(train_loss,accu))
# Fine-tune the network; per-epoch metrics are appended to train_losses / train_acc.
train(
  model=resnet_model,
  loss_fn=loss_fn,
  train_loader=train_loader,
  epochs=epochs,
  optimizer=optimizer,
  train_losses=train_losses,
  train_acc=train_acc,
)

Train Loss: 0.146 | Accuracy: 96.154
Train Loss: 0.047 | Accuracy: 100.000
Train Loss: 0.078 | Accuracy: 98.077
Train Loss: 0.062 | Accuracy: 98.077
Train Loss: 0.134 | Accuracy: 96.154
Train Loss: 0.311 | Accuracy: 94.231
Train Loss: 0.302 | Accuracy: 94.231
Train Loss: 0.199 | Accuracy: 94.231
Train Loss: 0.363 | Accuracy: 94.231
Train Loss: 0.136 | Accuracy: 94.231
