In [None]:
# Check GPU status by shelling out to `nvidia-smi` (IPython `!` escape).
# When the tool is not installed on the host, bash reports
# "command not found" instead of a GPU table.
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


After extracting features, we classify them with a 1D-CNN + LSTM model with attention.
Features: zero-crossing rate (ZCR), root-mean-square energy (RMSE), and mel-frequency cepstral coefficients (MFCC).

In [1]:
import torch
import torchvision
import matplotlib.pyplot as plt
import numpy as np
import os
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

%matplotlib inline

from torchvision import datasets
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [None]:
# Seed every random number generator this notebook has in scope so that
# results are reproducible across "Restart Kernel -> Run All".
seed = 42

# NumPy's global RNG (numpy is imported as `np` in the imports cell).
np.random.seed(seed)

# PyTorch CPU RNG; the CUDA generators are seeded explicitly when a GPU exists.
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [None]:
path_of_data =

In [None]:
# """
# Before feeding extracted audio features into the model, we need to make the
# features look like an image. This is because CNN expects an image as input.

# The following code pads the audio features so that all channels are the
# same size.
# """
# # This code was adapted from Nicolas Gervais on https://stackoverflow.com/questions/59241216/padding-numpy-arrays-to-a-specific-size on 1/10/2021
# def padding(array, xx, yy):
#     """
#     :param array: numpy array
#     :param xx: desired height
#     :param yy: desired width
#     :return: padded array
#     """
#     h = array.shape[0]
#     w = array.shape[1]

#     a = max((xx - h) // 2,0)
#     aa = max(0,xx - a - h)

#     b = max(0,(yy - w) // 2)
#     bb = max(yy - b - w,0)

#     return np.pad(array, pad_width=((a, aa), (b, bb)), mode='constant')

The CNN + LSTM architecture is based on Figure 4 of:

"Human–Computer Interaction with a Real-Time Speech
Emotion Recognition with Ensembling Techniques 1D
Convolution Neural Network and Attention"
(https://doi.org/10.3390/s23031386)


In [None]:
"""
Building Network architecture based on the paper:
Human–Computer Interaction with a Real-Time Speech
Emotion Recognition with Ensembling Techniques 1D
Convolution Neural Network and Attention
(https://doi.org/10.3390/s23031386)

We are taking the output of CNN as the input of LSTM.
CNN captures local patterns in audio features, and
LSTM learns temporal dependencies before making final prediction. This supports
robust sequence prediction.
"""

import torch.nn as nn
import torch.nn.functional as F

class CNN_LSTM(nn.Module):
    """1D-CNN + LSTM classifier with attention pooling over time.

    Pipeline:
        1. Five blocks of Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d(2, 2).
           Each block doubles the channel count (8 -> 128) and halves the
           sequence length.
        2. An LSTM over the pooled time steps.
        3. Attention: a linear layer scores each time step, the scores are
           soft-maxed over time, and the LSTM outputs are summed with those
           weights into one context vector per sample.
        4. A second LSTM applied to the context vector (a length-1 sequence).
        5. Linear(64, 32) -> BatchNorm1d -> Linear(32, num_classes).

    Args:
        in_channels: Number of input channels (feature rows). Defaults to 3,
            the value that was previously hard-coded.
        num_classes: Number of output classes. Defaults to 16, the value that
            was previously hard-coded.

    Input:
        Tensor of shape (batch, in_channels, length). ``length`` must be at
        least 32, because five stride-2 poolings must leave one time step.

    Returns (from ``forward``):
        Tensor of shape (batch, num_classes) holding raw, unnormalized logits.
        ``nn.CrossEntropyLoss`` applies log-softmax itself, so no softmax is
        applied here; use ``torch.softmax(logits, dim=1)`` when probabilities
        are needed. The arg-max class is the same either way.
    """

    def __init__(self, in_channels=3, num_classes=16):
        super().__init__()
        # bn = batch normalization
        ####################
        # Convolution blocks: 1dconv, batch norm, ReLU, max pooling
        # kernel_size=3 with padding=1 keeps the sequence length unchanged;
        # only the pooling layer shortens it.
        # Conv block 1
        self.conv1 = nn.Conv1d(in_channels=in_channels, out_channels=8, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(8)

        # Conv block 2
        self.conv2 = nn.Conv1d(in_channels=8, out_channels=16, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(16)

        # Conv block 3
        self.conv3 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm1d(32)

        # Conv block 4
        self.conv4 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.bn4 = nn.BatchNorm1d(64)

        # Conv block 5
        self.conv5 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.bn5 = nn.BatchNorm1d(128)

        # Parameter-free layers shared by all five conv blocks.
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        ####################

        ####################
        # LSTM + attention block
        # batch_first=True: tensors are (batch, time, features), so dim=1 is
        # the time axis for both the attention softmax and the weighted sum.
        self.lstm1 = nn.LSTM(input_size=128, hidden_size=64, num_layers=1,
                             bias=True, batch_first=True)

        self.attention = nn.Linear(64, 1)
        self.softmax = nn.Softmax(dim=1)

        self.lstm2 = nn.LSTM(input_size=64, hidden_size=64, num_layers=1,
                             bias=True, batch_first=True)
        ####################

        self.fc1 = nn.Linear(64, 32)  # May need to increase 32 to capture more complex data (?)
        self.bn6 = nn.BatchNorm1d(32)
        self.fc2 = nn.Linear(32, num_classes)

    def forward(self, x):
        """Compute class logits for a batch of feature sequences.

        Args:
            x: Tensor of shape (batch, in_channels, length), length >= 32.

        Returns:
            Tensor of shape (batch, num_classes) with unnormalized logits.

        Raises:
            ValueError: If ``x`` is not a 3-D tensor.
        """
        if x.dim() != 3:
            raise ValueError(
                "expected input of shape (batch, channels, length), "
                f"got a tensor with {x.dim()} dimension(s)"
            )

        # Conv blocks 1-5: conv -> batch norm -> ReLU -> max pool.
        conv_blocks = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
            (self.conv5, self.bn5),
        )
        for conv, bn in conv_blocks:
            x = self.pool(self.relu(bn(conv(x))))

        # Conv1d emits (batch, channels, time); the LSTM consumes
        # (batch, time, features), so swap the last two axes.
        x = x.permute(0, 2, 1)

        # LSTM + attention block
        lstm1_out, _ = self.lstm1(x)                                  # (batch, time, 64)

        # One score per time step, normalized over the time axis.
        attention_weights = self.softmax(self.attention(lstm1_out))   # (batch, time, 1)
        context = torch.sum(attention_weights * lstm1_out, dim=1)     # (batch, 64)

        # Feed the context vector to the second LSTM as a length-1 sequence.
        lstm2_out, _ = self.lstm2(context.unsqueeze(1))               # (batch, 1, 64)

        # Fully connected head.
        # NOTE(review): there is no non-linearity between fc1/bn6 and fc2, as
        # in the original code — confirm against the reference architecture.
        x = self.fc1(lstm2_out.squeeze(1))
        x = self.bn6(x)
        x = self.fc2(x)

        # Return logits: nn.CrossEntropyLoss applies log-softmax internally,
        # so a softmax here would be applied twice during training.
        return x

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Model on the chosen device, cross-entropy loss, and Adam with lr = 1e-4.
net = CNN_LSTM()
net = net.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=1e-4)

In [None]:
# Training loop
# Relative directory './runs'; created here if it does not exist yet
# (exist_ok=True makes re-running this cell a no-op).
root_dir = './runs'
os.makedirs(root_dir, exist_ok=True)

def train_on_features(net, optimizer, device, trainloader, critrerion, epochs):
    

