In [None]:
import hashlib
import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import librosa

Custom audio dataset that loads each clip and extracts MFCC-based features for sentiment analysis

In [None]:
#create custom dataset to load audio files and extract MFCC features for proper sentiment analysis
class AudioDataset(Dataset):
    """Dataset that turns audio files into fixed-size MFCC feature vectors.

    Each item is a ``(features, label)`` pair where ``features`` is a float32
    tensor of shape ``(1, 3 * n_mfcc)``: the time-averaged MFCCs followed by
    the time-averaged delta and delta-delta coefficients, z-score normalized
    as one vector. ``label`` is a scalar ``torch.long`` tensor.

    Computed feature vectors are stored as ``.npy`` files under ``CACHE_DIR``
    and reused on later accesses.

    Args:
        file_list: paths of the audio files.
        labels: one label per entry of ``file_list``; each must be convertible
            to an integer tensor.
        n_mfcc: number of MFCC coefficients to extract.
        sample_rate: sample rate the audio is resampled to on load.
        duration: length in seconds every clip is trimmed or zero-padded to.

    Raises:
        ValueError: if ``file_list`` and ``labels`` differ in length.
    """

    #directory (relative to the working directory) holding cached feature vectors
    CACHE_DIR = 'cache'

    def __init__(self, file_list, labels, n_mfcc = 100, sample_rate = 22050, duration = 5):
        if len(file_list) != len(labels):
            raise ValueError(
                f'file_list has {len(file_list)} entries but labels has {len(labels)}'
            )
        #list of paths of audio files
        self.file_list = file_list
        #list of labels corresponding to each file
        self.labels = labels
        #number of MFCC features to extract
        self.n_mfcc = n_mfcc
        #sample rate for audio loading
        self.sample_rate = sample_rate
        #duration to which each audio file will be trimmed or padded
        self.duration = duration
        #number of samples per file
        self.samples_per_file = int(self.sample_rate * self.duration)

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_list)

    def _cache_path(self, idx):
        """Return the cache file path for item ``idx``.

        The name embeds a digest of the absolute source path and the
        extraction settings, so files sharing a basename do not collide and a
        change of ``n_mfcc``/``sample_rate``/``duration`` never picks up
        features computed with other settings.
        """
        source_path = self.file_list[idx]
        settings = f'{os.path.abspath(source_path)}|{self.n_mfcc}|{self.sample_rate}|{self.duration}'
        digest = hashlib.sha1(settings.encode('utf-8')).hexdigest()[:16]
        return os.path.join(self.CACHE_DIR, f'mfcc_{os.path.basename(source_path)}_{digest}.npy')

    def _extract_features(self, path):
        """Load ``path`` and return its normalized ``(1, 3 * n_mfcc)`` feature array."""
        audio, sr = librosa.load(path, sr = self.sample_rate, duration = self.duration)

        #trim audio to fixed length
        audio = audio[:self.samples_per_file]
        #zero-pad short clips so every file yields the same number of frames
        padding = self.samples_per_file - len(audio)
        if padding > 0:
            audio = np.pad(audio, (0, padding))

        #MFCC matrix has one row per coefficient and one column per frame
        mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc = self.n_mfcc)
        #deltas are taken along the time axis, before averaging over frames
        #NOTE(review): the default delta window spans 9 frames; very short
        #`duration` values may produce too few frames - confirm if needed
        delta = librosa.feature.delta(mfcc)
        delta_delta = librosa.feature.delta(mfcc, order = 2)

        #summarize every coefficient track by its mean over time and concatenate
        features = np.hstack([
            np.mean(mfcc, axis=1),
            np.mean(delta, axis=1),
            np.mean(delta_delta, axis=1),
        ])

        #normalize features using z-score normalization and reshape for CNN input
        features = (features - np.mean(features)) / (np.std(features) + 1e-8)
        return features.reshape(1, -1)

    def __getitem__(self, idx):
        """Return ``(features, label)`` tensors for item ``idx``, using the cache when possible."""
        cache_path = self._cache_path(idx)

        if os.path.exists(cache_path):
            features = np.load(cache_path)
        else:
            features = self._extract_features(self.file_list[idx])
            #save to cache
            os.makedirs(self.CACHE_DIR, exist_ok = True)
            np.save(cache_path, features)

        #convert features to tensor
        return torch.tensor(features, dtype = torch.float32), torch.tensor(self.labels[idx], dtype = torch.long)

Helper that computes delta (derivative) features from MFCCs, capturing how the coefficients change from frame to frame

In [None]:
#get delta (derivative) features of MFCCs
def get_delta(features, N=2):
    """Compute regression-based delta (derivative) features along the frame axis.

    For every frame ``t`` the delta is

        sum_{n=1..N} n * (c[t+n] - c[t-n]) / (2 * sum_{n=1..N} n**2)

    where frames outside the sequence are treated as zeros.

    Args:
        features: array of shape ``(num_frames, num_features)``. A 1-D array
            is accepted as well and treated as a single feature track.
        N: number of neighbouring frames used on each side (must be >= 1).

    Returns:
        Array with the same shape as ``features``. Floating-point inputs keep
        their dtype; other inputs are returned as float64.

    Raises:
        ValueError: if ``N`` is smaller than 1 or ``features`` is not 1-D or 2-D.
    """
    if N < 1:
        raise ValueError(f'N must be at least 1, got {N}')

    features = np.asarray(features)
    if not np.issubdtype(features.dtype, np.floating):
        #integer input would otherwise truncate the fractional deltas
        features = features.astype(np.float64)

    #remember whether the caller passed a single track so the shape can be restored
    single_track = features.ndim == 1
    if single_track:
        features = features[:, np.newaxis]
    if features.ndim != 2:
        raise ValueError(f'features must be 1-D or 2-D, got {features.ndim} dimensions')

    num_frames, num_features = features.shape
    padding = np.zeros((N, num_features))

    # Pad the features at beginning and end
    padded_features = np.vstack([padding, features, padding])

    # Formula for delta computation
    denominator = 2 * sum(i**2 for i in range(1, N+1))

    #frame t sits at row t + N of the padded array, so c[t+n] and c[t-n]
    #are rows t + N + n and t + N - n; slicing handles all frames at once
    delta_sum = np.zeros((num_frames, num_features), dtype=np.float64)
    for n in range(1, N+1):
        ahead = padded_features[N + n : N + n + num_frames]
        behind = padded_features[N - n : N - n + num_frames]
        delta_sum += n * (ahead - behind)

    delta_features = (delta_sum / denominator).astype(features.dtype)
    return delta_features[:, 0] if single_track else delta_features

Read the given CSV file listing the audio file paths and labels used for training and testing

In [None]:
#load the table of audio recordings and their classifications
data_df = pd.read_csv('audio_data.csv')
#pull the path column and the classification column out as plain Python lists
file_list, labels = (data_df[column].tolist() for column in ('file_path', 'label'))

Import scikit-learn's `train_test_split` to divide the data into training and test sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
#hold out 20% of the files for testing; stratifying on the labels keeps the label proportions in both splits
train_files, test_files, train_labels, test_labels = train_test_split(
    file_list,
    labels,
    test_size = 0.2,
    random_state = 42,
    stratify = labels,
)

In [None]:
#wrap each split in an AudioDataset (default extraction settings): one for training, one for held-out testing
train_dataset = AudioDataset(file_list = train_files, labels = train_labels)
test_dataset = AudioDataset(file_list = test_files, labels = test_labels)

In [None]:
#batch both splits in groups of 32; only the training batches are shuffled
train_dataloader = DataLoader(
    dataset = train_dataset,
    batch_size = 32,
    shuffle = True,
)
test_dataloader = DataLoader(
    dataset = test_dataset,
    batch_size = 32,
    shuffle = False,
)

Import the necessary neural network packages

In [None]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

Create sentiment analysis CNN

In [None]:
#define model
class SentimentModel(nn.Module):
    """1-D convolutional classifier over a fixed-length feature vector.

    Two convolution + batch-norm stages (both length-preserving because of
    ``kernel_size = 3, padding = 1``) are followed by three fully connected
    layers; dropout is applied after the first two of them.

    Args:
        input_size: length of each input feature vector.
        hidden_size: width of the first fully connected layer; the second
            layer uses ``hidden_size // 2``.
        output_size: number of classes (size of the returned logits).
    """

    #create convolutional nn model
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()

        #convolutional/kernel scanning layers
        self.conv1 = nn.Conv1d(1, 32, kernel_size = 3, padding = 1)
        self.bn1 = nn.BatchNorm1d(32)
        self.conv2 = nn.Conv1d(32, 64, kernel_size = 3, padding = 1)
        self.bn2 = nn.BatchNorm1d(64)

        #calculate size after convolutional layers (64 channels, length unchanged)
        self.flat_size = 64 * input_size

        #fully connected/classification layer
        self.fc1 = nn.Linear(self.flat_size, hidden_size)
        self.bn3 = nn.BatchNorm1d(hidden_size)
        #dropout layer
        self.dropout = nn.Dropout(0.5)
        #second fully connected layer
        self.fc2 = nn.Linear(hidden_size, hidden_size//2)
        self.bn4 = nn.BatchNorm1d(hidden_size//2)
        #output layer
        self.fc3 = nn.Linear(hidden_size//2, output_size)

    def forward(self, x):
        """Return class logits of shape ``(batch, output_size)``.

        ``x`` may be ``(batch, input_size)`` or already carry the single
        channel axis as ``(batch, 1, input_size)``.
        """
        #add the channel axis only when it is missing; unsqueezing a tensor that
        #already has it would produce a 4-D input that Conv1d rejects
        if x.dim() == 2:
            x = x.unsqueeze(1)

        #CNN layers
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))

        #flatten layers
        x = x.view(x.size(0), -1)

        #pass data through classification layer
        x = F.relu(self.bn3(self.fc1(x)))

        #pass through dropout layer
        x = self.dropout(x)

        #second fully connected layer
        x = F.relu(self.bn4(self.fc2(x)))
        x = self.dropout(x)

        #output layer
        x = self.fc3(x)
        return x

In [None]:
# Define feature dimension and number of classes
#each sample stacks three blocks (MFCC, delta, delta-delta) of n_mfcc values each,
#so the model input length is 3 * n_mfcc rather than n_mfcc alone
feature_dim = 3 * train_dataset.n_mfcc
#based on unique labels; read from the dataframe so the count stays correct even if
#the `labels` name is rebound elsewhere in the notebook
#NOTE(review): CrossEntropyLoss expects labels encoded as 0..num_classes-1 - confirm the CSV uses that encoding
num_classes = int(data_df['label'].nunique())

In [None]:
#setup device, instantiate model, loss criterion and optimizer
#seed PyTorch's global RNG so weight initialization and batch shuffling repeat across runs
#(same value as the random_state used for the train/test split)
torch.manual_seed(42)
#cuda if GPU is available, cpu otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#initialize model
model = SentimentModel(feature_dim, 128, num_classes).to(device)
#define loss criterion
criterion = nn.CrossEntropyLoss()
#define optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

Train model

In [None]:
epochs = 20
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    #batch-specific names keep the loop from overwriting the notebook-level `labels` list
    for batch_features, batch_labels in train_dataloader:
        #move current batch to appropriate device
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
        #reset calculated gradients
        optimizer.zero_grad()
        #move input forward
        outputs = model(batch_features)
        #calculate loss
        loss = criterion(outputs, batch_labels)
        #pass loss backwards
        loss.backward()
        #update weights
        optimizer.step()
        #accumulate loss, weighted by batch size so the epoch average is per sample
        running_loss += loss.item() * batch_features.size(0)

    #print loss
    epoch_loss = running_loss / len(train_dataloader.dataset)
    print(f'Epoch: {epoch+1}/{epochs} | Loss: {epoch_loss:.4f}')

Test model

In [None]:
#evaluate model: switch to inference mode, then zero the accuracy counters
model.eval()
correct = total = 0

In [None]:
#validation test
#set inference mode and reset the counters in this cell so that re-running it
#never adds to counts left over from a previous run
model.eval()
correct = 0
total = 0

with torch.no_grad():
    #iterate through model with validation set
    #batch-specific names keep the loop from overwriting the notebook-level `labels` list
    for batch_features, batch_labels in test_dataloader:
        batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
        outputs = model(batch_features)
        #predicted class is the index of the largest logit
        _, predicted = torch.max(outputs, 1)
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

    print(f'Accuracy: {correct/total * 100:.2f}%')