In [2]:
import os
from pathlib import Path
import pandas as pd
import librosa
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


In [3]:
def train_model(model, train_loader, test_loader, device, num_epochs=25):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        correct_predictions = 0
        total_predictions = 0
        
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

            # Calculate accuracy
            _, predicted = torch.max(outputs, 1)
            correct_predictions += (predicted == labels).sum().item()
            total_predictions += labels.size(0)

        epoch_accuracy = (correct_predictions / total_predictions) * 100  # in percentage
        print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss / len(train_loader):.4f}, Accuracy: {epoch_accuracy:.2f}%")

    print("Training complete!")

In [4]:
def evaluate_model(model, data_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for features, labels in data_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return accuracy

In [5]:
class ImprovedCRNN(nn.Module):
    def __init__(self, input_channels=1, img_height=128, img_width=216, num_classes=50,
                 map_to_seq_hidden=128, rnn_hidden_size=216, num_rnn_layers=3, dropout=0.3):
        super(ImprovedCRNN, self).__init__()
        
        # CNN backbone
        self.cnn, (output_channels, output_height, output_width) = self._cnn_backbone(
            input_channels, img_height, img_width
        )
        
        # Map CNN output to sequence
        self.map_to_seq = nn.Linear(output_channels * output_height, map_to_seq_hidden)
        
        # Recurrent layers
        self.rnn1 = nn.LSTM(
            map_to_seq_hidden,
            rnn_hidden_size,
            num_layers=num_rnn_layers,
            bidirectional=True,
            dropout=dropout,
            batch_first=False
        )
        
        # Fully connected layer
        self.fc = nn.Linear(rnn_hidden_size * 2, num_classes)

    def _cnn_backbone(self, input_channels, img_height, img_width):
        
        cnn = nn.Sequential(
            # Block 1
            nn.Conv2d(input_channels, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # Halves height and width
            nn.Dropout(0.2),

            # Block 2
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),  # Halves height and width
            nn.Dropout(0.3),

            # Block 3
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)),  # Halves height, keeps width
            nn.Dropout(0.6),

            # Block 4
            nn.Conv2d(256, 512, kernel_size=3, padding=1),
            nn.BatchNorm2d(512),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=(2, 1)),  # Halves height, keeps width
            nn.Dropout(0.5)
        )

        # Calculate output shape
        final_height = img_height // (2 * 2 * 2 * 2)  # Height halved 4 times
        final_width = img_width // (2 * 2)  # Width halved 2 times
        output_shape = (512, final_height, final_width)

        return cnn, output_shape


    def forward(self, x):
        
        x = self.cnn(x)
        batch_size, channels, height, width = x.shape
        x = x.view(batch_size, channels * height, width).permute(2, 0, 1)
        x = self.map_to_seq(x)
        x, _ = self.rnn1(x)
        x = self.fc(x[-1])
        return x


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import os
from torchvision import transforms

# Custom Dataset for loading Mel Spectrograms
class MelSpectrogramDataset(Dataset):
    def __init__(self, features_dir, labels_path, transform=None):
        self.features_dir = features_dir
        self.labels = np.load(labels_path)
        self.features = np.load(os.path.join(features_dir, "features.npy"))
        self.transform = transform

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        feature_path = self.features[idx]
        feature = np.load(feature_path)  # Load Mel Spectrogram
        label = self.labels[idx]  # Load corresponding label

        if self.transform:
            feature = self.transform(feature)

        # Mel spectrograms need to have a channel dimension for PyTorch Conv2D
        feature = np.expand_dims(feature, axis=0)  # Add channel dimension

        return torch.tensor(feature, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

# Transformation for normalization (if needed)
transform = transforms.Compose([
    transforms.Lambda(lambda x: (x - np.mean(x)) / (np.std(x) + 1e-6))  # Normalize Mel spectrogram
])

# Directories for train and test data
train_dir = "C:/Users/jimmy/Desktop/Practical_Work/processed_data/mel_spectrogram/train"
test_dir = "C:/Users/jimmy/Desktop/Practical_Work/processed_data/mel_spectrogram/test"

# Prepare train and test datasets
train_dataset = MelSpectrogramDataset(
    features_dir=train_dir,
    labels_path=os.path.join(train_dir, "labels.npy"),
    transform=transform
)

test_dataset = MelSpectrogramDataset(
    features_dir=test_dir,
    labels_path=os.path.join(test_dir, "labels.npy"),
    transform=transform
)

# DataLoaders for train and test datasets
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Device configuration (GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model
model = ImprovedCRNN(
    input_channels=1,  # Mel spectrograms are treated as single-channel images
    img_height=128,  # Height of the Mel spectrogram
    img_width=216,  # Width of the Mel spectrogram
    num_classes=50  # Number of classes in the dataset
)

# Train the model
train_model(model, train_loader, test_loader, device, num_epochs=25)

# Evaluate the model on the test set
test_accuracy = evaluate_model(model, test_loader, device)
print(f"Test Accuracy: {test_accuracy:.2f}%")


  from .autonotebook import tqdm as notebook_tqdm


Epoch [1/25], Loss: 3.7680, Accuracy: 3.81%
Epoch [2/25], Loss: 3.4511, Accuracy: 7.25%
Epoch [3/25], Loss: 3.2670, Accuracy: 8.06%
Epoch [4/25], Loss: 3.0960, Accuracy: 11.62%
Epoch [5/25], Loss: 2.9155, Accuracy: 15.75%
Epoch [6/25], Loss: 2.7960, Accuracy: 19.38%
Epoch [7/25], Loss: 2.6341, Accuracy: 23.38%
Epoch [8/25], Loss: 2.4956, Accuracy: 26.12%
Epoch [9/25], Loss: 2.4253, Accuracy: 27.19%
Epoch [10/25], Loss: 2.2689, Accuracy: 31.81%
Epoch [11/25], Loss: 2.3746, Accuracy: 29.94%
Epoch [12/25], Loss: 2.1267, Accuracy: 35.69%
Epoch [13/25], Loss: 2.0082, Accuracy: 39.00%
Epoch [14/25], Loss: 1.9155, Accuracy: 42.38%
Epoch [15/25], Loss: 1.8902, Accuracy: 42.12%
Epoch [16/25], Loss: 1.7877, Accuracy: 45.31%
Epoch [17/25], Loss: 1.7322, Accuracy: 45.88%
Epoch [18/25], Loss: 1.7060, Accuracy: 48.00%
Epoch [19/25], Loss: 1.6420, Accuracy: 47.88%
Epoch [20/25], Loss: 1.5890, Accuracy: 49.44%
Epoch [21/25], Loss: 1.5985, Accuracy: 50.19%
Epoch [22/25], Loss: 1.5196, Accuracy: 51.94%
