In [None]:
from torchvision.datasets import DatasetFolder
import torch
import numpy as np
from torch.utils.data import DataLoader, random_split

# Use the first GPU when one is available, otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Folder holding the STFT dataset.
path = "audio_data"

# The dataset object itself is built further down, once the .npy loader exists:
# DatasetFolder cannot be constructed without a `loader` callable and a file
# filter (`extensions` or `is_valid_file`).

# loader function for .npy files
def npy_loader(path):
    """Load one ``.npy`` file as a float32 tensor with a leading channel axis.

    Args:
        path: Location of the ``.npy`` file to read.

    Returns:
        A CPU ``torch.float32`` tensor whose shape is the stored array's shape
        with a new axis of size 1 inserted at position 0.
    """
    x = np.load(path)
    # NOTE(review): assumes the files store real-valued data (e.g. magnitudes)
    # -- confirm; complex values would not survive the float32 cast intact.
    x = np.expand_dims(x, axis=0)
    # Stay on the CPU here: this function runs inside DataLoader worker
    # processes, and CUDA tensors should not be created there. The training
    # loop is responsible for moving each batch to the target device.
    return torch.tensor(x, dtype=torch.float32)

# Build the dataset of .npy files. `extensions` is given as a tuple because
# it ends up in `str.endswith`, which does not accept a list.
# NOTE(review): `path` is set to "audio_data" earlier in this cell, but the
# root used here is "data" -- confirm which folder is the intended one.
dataset = DatasetFolder(root="data", loader=npy_loader, extensions=(".npy",))

train_ratio = 0.8
val_ratio = 0.2
assert abs(train_ratio + val_ratio - 1.0) < 1e-9, "train/val ratios must sum to 1"

dataset_size = len(dataset)
train_size = int(train_ratio * dataset_size)
# Validation takes the remainder so that no sample is lost to rounding.
val_size = dataset_size - train_size

# Split the dataset with a fixed seed so that the train/validation membership
# is identical after every "Restart Kernel -> Run All".
SPLIT_SEED = 42
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)
assert len(train_dataset) + len(val_dataset) == dataset_size

# Settings shared by both loaders; only the shuffling differs.
loader_kwargs = dict(batch_size=32, num_workers=4)

train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class EmotionDetector(nn.Module):
    """CNN followed by a bidirectional LSTM for classifying 2-D inputs.

    The input is expected as ``(batch, 1, freq, time)``. Three convolutional
    stages extract features, each ending in a 2x2 max-pool; the result is read
    as a sequence along the time axis by a bidirectional LSTM, averaged over
    time, and mapped to ``num_classes`` logits.

    Args:
        num_classes: Number of output classes.
        n_fft: FFT size used to produce the inputs. Only used to derive the
            number of frequency bins when ``freq_bins`` is not given.
        freq_bins: Height (frequency axis) of the input. Defaults to
            ``n_fft // 2 + 1``.
            NOTE(review): this default assumes a one-sided STFT -- confirm
            against the stored arrays and pass ``freq_bins`` explicitly if the
            data has a different height.

    Raises:
        ValueError: If the frequency axis is too small to survive three
            2x2 poolings.
    """

    # Number of MaxPool2d(2) layers in the convolutional stack.
    _NUM_POOLS = 3

    def __init__(self, num_classes=6, n_fft=2048, freq_bins=None):
        super().__init__()

        # All convolutions are 3x3 with padding 1, so they leave the spatial
        # size unchanged; only the pooling layers shrink it.
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 32, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
        )

        self.conv2 = nn.Sequential(
            nn.Conv2d(32, 64, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 128, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
        )

        self.conv3 = nn.Sequential(
            nn.Conv2d(128, 128, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 256, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
        )

        if freq_bins is None:
            freq_bins = n_fft // 2 + 1

        # Each MaxPool2d(2) floors the height to half, so apply the floor
        # division once per pooling layer instead of dividing by 8 in one go.
        pooled_bins = int(freq_bins)
        for _ in range(self._NUM_POOLS):
            pooled_bins //= 2
        if pooled_bins < 1:
            raise ValueError(
                f"freq_bins={freq_bins} is too small for {self._NUM_POOLS} pooling layers"
            )

        # Must be an int: nn.LSTM uses it to size its weight matrices.
        self.lstm_input_size = 256 * pooled_bins
        self.lstm = nn.LSTM(self.lstm_input_size, 128, batch_first=True, bidirectional=True)

        self.fc = nn.Linear(128 * 2, num_classes)  # bidirectional LSTM doubles hidden size

    def forward(self, x):
        """Compute class logits of shape ``(batch, num_classes)`` for ``x``."""
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)  # (batch, channels, freq, time)

        # Prepare for the LSTM: move time next to batch, then merge the
        # channel and frequency axes into one feature axis.
        x = x.permute(0, 3, 1, 2)    # (batch, time, channels, freq)
        x = x.flatten(start_dim=2)   # (batch, time, channels * freq)

        # Only the per-step outputs are needed; the final states are dropped.
        x, _ = self.lstm(x)

        # Average over time steps.
        x = x.mean(dim=1)

        # Classification head.
        return self.fc(x)




In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt

def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    With an ``optimizer`` the model is put in training mode and updated after
    every batch; without one it is put in eval mode and gradients are disabled.
    Both metrics are ``nan`` when the loader yields no samples.
    """
    training = optimizer is not None
    model.train(training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                optimizer.step()

            # `loss` is a batch mean; weight it by the batch size so the epoch
            # average stays correct when the last batch is smaller.
            loss_sum += loss.item() * inputs.size(0)
            n_correct += (outputs.argmax(dim=1) == labels).sum().item()
            n_seen += labels.size(0)

    if n_seen == 0:
        # Nothing was seen: report "no value" instead of dividing by zero.
        return float("nan"), float("nan")
    return loss_sum / n_seen, n_correct / n_seen


def train_model(model, train_loader, val_loader, 
                num_epochs=20, lr=1e-3, device='cuda'):
    """
    Train a CNN+LSTM model for emotion recognition.

    Each epoch runs one optimisation pass over ``train_loader`` followed by a
    gradient-free pass over ``val_loader``, and prints the loss and accuracy of
    both. After the last epoch the loss and accuracy curves are plotted.

    Args:
        model: PyTorch nn.Module
        train_loader: DataLoader for training set
        val_loader: DataLoader for validation set
        num_epochs: number of epochs
        lr: learning rate
        device: 'cuda' or 'cpu'

    Returns:
        The trained model, moved to ``device``.
    """
    train_losses, val_losses = [], []
    train_accs, val_accs = [], []

    # Move model to device
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        epoch_loss, epoch_acc = _run_epoch(
            model, train_loader, criterion, device, optimizer=optimizer
        )
        val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        train_losses.append(epoch_loss)
        train_accs.append(epoch_acc)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        print(f"Epoch [{epoch+1}/{num_epochs}] "
              f"Train Loss: {epoch_loss:.4f}, Acc: {epoch_acc:.4f} "
              f"Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f}")

    plt.figure(figsize=(12, 5))

    plt.subplot(1, 2, 1)
    plt.plot(val_losses, label='Validation Loss')
    plt.plot(train_losses, label='Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Loss Curve')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(val_accs, label='Validation Accuracy')
    plt.plot(train_accs, label='Training Accuracy')
    plt.xlabel('Epoch')
    # Accuracies are stored as fractions in [0, 1], not percentages.
    plt.ylabel('Accuracy')
    plt.title('Accuracy Curve')
    plt.legend()

    plt.tight_layout()
    plt.show()

    return model

