In [1]:
import torch
import torchaudio
from datasets import load_dataset, Audio
from torch.utils.data import DataLoader
import torch.nn.functional as F
from typing import List, Tuple

N_FFT = 1024
HOP_LENGTH = 512
N_MELS = 64
SAMPLE_RATE = 16000

def create_mel_transform():
    return torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS
    )

def audio_to_mel_spec(audio_array, mel_transform):
    if not isinstance(audio_array, torch.Tensor):
        audio_tensor = torch.tensor(audio_array, dtype=torch.float32)
    else:
        audio_tensor = audio_array.to(dtype=torch.float32)
    
    mel_spec = mel_transform(audio_tensor)
    mel_spec = mel_spec.to(dtype=torch.float32)
    mel_spec = torch.log(mel_spec + 1e-9)
    mel_spec = mel_spec.unsqueeze(0)
    
    return mel_spec

def preprocess_function(examples):
    mel_transform = create_mel_transform()
    audio_arrays = [x["array"] for x in examples["audio"]]
    
    mel_specs = [audio_to_mel_spec(audio, mel_transform) for audio in audio_arrays]
    
    return {"input_values": mel_specs}

class AudioDataset(torch.utils.data.Dataset):
    def __init__(self, ds):
        self.ds = ds
    
    def __len__(self):
        return len(self.ds)
    
    def __getitem__(self, idx):
        item = self.ds[idx]
        mel_spec = item["input_values"]
        if not isinstance(mel_spec, torch.Tensor):
            mel_spec = torch.tensor(mel_spec, dtype=torch.float32)
        else:
            mel_spec = mel_spec.to(dtype=torch.float32)
        label = torch.tensor(item["label"], dtype=torch.long)
        return mel_spec, label

def collate_fn(batch: List[Tuple[torch.Tensor, torch.Tensor]]) -> Tuple[torch.Tensor, torch.Tensor]:
    specs, labels = zip(*batch)
    
    max_time = max(spec.size(-1) for spec in specs)
    
    padded_specs = []
    for spec in specs:
        pad_length = max_time - spec.size(-1)
        if pad_length > 0:
            padded_spec = F.pad(spec, (0, pad_length))
            padded_specs.append(padded_spec)
        else:
            padded_specs.append(spec)
    
    specs_tensor = torch.stack(padded_specs)
    labels_tensor = torch.stack(labels)
    
    return specs_tensor, labels_tensor

ds = load_dataset("danavery/urbansound8K")
ds = ds.remove_columns(["fsID", "start", "end", "salience", "fold", "class"])
ds = ds.rename_column("slice_file_name", "path")
ds = ds.rename_column("classID", "label")

ds = ds.cast_column("audio", Audio(sampling_rate=SAMPLE_RATE))

ds = ds.map(preprocess_function, remove_columns="audio", batched=True)
ds = ds["train"].train_test_split(test_size=0.2)

train_dataset = AudioDataset(ds["train"])
val_dataset = AudioDataset(ds["test"])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=6, pin_memory=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=6, pin_memory=True, collate_fn=collate_fn)

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio.transforms as transforms

device = "cuda" if torch.cuda.is_available() else "cpu"

class CRNN(nn.Module):
    def __init__(self, num_classes=10):
        super(CRNN, self).__init__()
        
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(kernel_size=2, padding=1)
        self.bn1 = nn.BatchNorm2d(64)
        self.elu1 = nn.ELU()
        
        self.conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(kernel_size=4, padding=1)
        self.bn2 = nn.BatchNorm2d(128)
        self.elu2 = nn.ELU()
        
        self.conv3 = nn.Conv2d(128, 128, kernel_size=3, padding=1)
        self.pool3 = nn.MaxPool2d(kernel_size=4, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.elu3 = nn.ELU()
        
        self.gru1 = nn.GRU(input_size=256, hidden_size=32, num_layers=2, batch_first=True, dropout=0.3)        
        self.fc = nn.Linear(32, num_classes)  # Output size corresponds to the number of classes
        
    def forward(self, x):
        x = self.elu1(self.bn1(self.pool1(self.conv1(x))))
        x = self.elu2(self.bn2(self.pool2(self.conv2(x))))
        x = self.elu3(self.bn3(self.pool3(self.conv3(x))))        
        batch_size, channels, height, width = x.size()
        x = x.view(batch_size, width, channels * height)
        x, _ = self.gru1(x)        
        x = x[:, -1, :]        
        x = self.fc(x)
        
        return F.log_softmax(x, dim=1)

model = CRNN().to(device)
print(model)

CRNN(
  (conv1): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=1, dilation=1, ceil_mode=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (elu1): ELU(alpha=1.0)
  (conv2): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool2): MaxPool2d(kernel_size=4, stride=4, padding=1, dilation=1, ceil_mode=False)
  (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (elu2): ELU(alpha=1.0)
  (conv3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool3): MaxPool2d(kernel_size=4, stride=4, padding=1, dilation=1, ceil_mode=False)
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (elu3): ELU(alpha=1.0)
  (gru1): GRU(256, 32, num_layers=2, batch_first=True, dropout=0.3)
  (fc): Linear(in_features=32, out_features=10, bias=True)
)


In [3]:
import torch.optim as optim
from tqdm import tqdm

model.train()

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.01)
scaler = torch.cuda.amp.GradScaler()

num_epochs = 20
for epoch in range(num_epochs):
    model.train()  
    running_loss = 0.0
    
    train_progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs} - Training", leave=False)
    for inputs, labels in train_progress_bar:
        inputs, labels = inputs.to(device), labels.to(device)  
        optimizer.zero_grad()

        with torch.cuda.amp.autocast():  
            outputs = model(inputs)
            loss = loss_fn(outputs, labels)

        scaler.scale(loss).backward() 
        scaler.step(optimizer) 
        scaler.update() 

        running_loss += loss.item()
        
        train_progress_bar.set_postfix(loss=loss.item())

    avg_loss = running_loss / len(train_loader)

    model.eval() 
    running_val_loss = 0.0
    correct = 0
    total = 0

    val_progress_bar = tqdm(val_loader, desc=f"Epoch {epoch + 1}/{num_epochs} - Validation", leave=False)
    with torch.no_grad():  
        for inputs, labels in val_progress_bar:  
            inputs, labels = inputs.to(device), labels.to(device)

            with torch.cuda.amp.autocast(): 
                outputs = model(inputs)
                loss = loss_fn(outputs, labels)
                
            running_val_loss += loss.item()

            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            val_progress_bar.set_postfix(loss=loss.item())

    avg_val_loss = running_val_loss / len(val_loader)
    accuracy = correct / total * 100  
    
    print(f"Epoch [{epoch + 1}/{num_epochs}], "
          f"Training Loss: {avg_loss:.4f}, "
          f"Validation Loss: {avg_val_loss:.4f}, "
          f"Validation Accuracy: {accuracy:.2f}%")
print("Training complete.")

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
                                                                                

Epoch [1/20], Training Loss: 1.8927, Validation Loss: 1.6121, Validation Accuracy: 54.55%


                                                                                

Epoch [2/20], Training Loss: 1.3325, Validation Loss: 1.0904, Validation Accuracy: 70.23%


                                                                                

Epoch [3/20], Training Loss: 0.9611, Validation Loss: 0.8183, Validation Accuracy: 79.28%


                                                                                

Epoch [4/20], Training Loss: 0.7250, Validation Loss: 0.6552, Validation Accuracy: 82.66%


                                                                                

Epoch [5/20], Training Loss: 0.5389, Validation Loss: 0.5154, Validation Accuracy: 87.41%


                                                                                

Epoch [6/20], Training Loss: 0.4000, Validation Loss: 0.4673, Validation Accuracy: 87.69%


                                                                                

Epoch [7/20], Training Loss: 0.3122, Validation Loss: 0.4267, Validation Accuracy: 88.21%


                                                                                

Epoch [8/20], Training Loss: 0.2467, Validation Loss: 0.3778, Validation Accuracy: 89.07%


                                                                                

Epoch [9/20], Training Loss: 0.2008, Validation Loss: 0.3231, Validation Accuracy: 90.15%


                                                                                

Epoch [10/20], Training Loss: 0.1552, Validation Loss: 0.2976, Validation Accuracy: 91.87%


                                                                                

Epoch [11/20], Training Loss: 0.1299, Validation Loss: 0.3092, Validation Accuracy: 90.56%


                                                                                

Epoch [12/20], Training Loss: 0.1078, Validation Loss: 0.2675, Validation Accuracy: 92.39%


                                                                                

Epoch [13/20], Training Loss: 0.0885, Validation Loss: 0.3038, Validation Accuracy: 90.61%


                                                                                

Epoch [14/20], Training Loss: 0.0668, Validation Loss: 0.2795, Validation Accuracy: 92.44%


                                                                                

Epoch [15/20], Training Loss: 0.0549, Validation Loss: 0.3230, Validation Accuracy: 90.67%


                                                                                

Epoch [16/20], Training Loss: 0.0629, Validation Loss: 0.3309, Validation Accuracy: 89.93%


                                                                                

Epoch [17/20], Training Loss: 0.0457, Validation Loss: 0.3692, Validation Accuracy: 89.87%


                                                                                

Epoch [18/20], Training Loss: 0.0450, Validation Loss: 0.3350, Validation Accuracy: 90.44%


                                                                                

Epoch [19/20], Training Loss: 0.0388, Validation Loss: 0.3331, Validation Accuracy: 90.61%


                                                                                

Epoch [20/20], Training Loss: 0.0324, Validation Loss: 0.3420, Validation Accuracy: 90.84%
Training complete.


