In [1]:
import soundata

# Create the soundata handle for UrbanSound8K. No data_home is passed here, so
# the library default location is used (the log below reports
# /tmp\sound_datasets\urbansound8k).
dataset = soundata.initialize('urbansound8k')

# Fetch the audio archive and the index file; the recorded run pulled ~5.6 GB.
# NOTE(review): a custom storage location is presumably chosen with the
# `data_home` argument of soundata.initialize(), not of download() -- confirm
# against the soundata documentation before relying on it.
dataset.download()

# Check that every file listed in the index is present on disk. Because this is
# the last expression of the cell, its return value is also echoed as output.
dataset.validate()


INFO: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO: NumExpr defaulting to 8 threads.
INFO: Downloading ['all', 'index']. Index is being stored in D:\anaconda\Lib\site-packages\soundata\datasets\indexes, and the rest of files in /tmp\sound_datasets\urbansound8k
INFO: [all] downloading UrbanSound8K.tar.gz
5.61GB [10:54, 9.20MB/s]                                
INFO: [index] downloading urbansound8k_index_1.0.json
1.15MB [00:01, 817kB/s]                             
100%|██████████| 1/1 [00:00<00:00, 62.05it/s]
100%|██████████| 8732/8732 [00:45<00:00, 190.83it/s]
INFO: Success: the dataset is complete and all files are valid.
INFO: --------------------


({'metadata': {}, 'clips': {}}, {'metadata': {}, 'clips': {}})

In [5]:
import soundata
import torchaudio
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.transforms import MelSpectrogram

# Same initialize / download / validate sequence as the first cell.
# On this repeat run the log below shows that the archive and the extracted
# folders already exist, so nothing is fetched again.
dataset = soundata.initialize('urbansound8k')
dataset.download()
# Re-check the files on disk against the index.
dataset.validate()




INFO: Downloading ['all', 'index']. Index is being stored in D:\anaconda\Lib\site-packages\soundata\datasets\indexes, and the rest of files in /tmp\sound_datasets\urbansound8k
INFO: [all] downloading UrbanSound8K.tar.gz
INFO: /tmp\sound_datasets\urbansound8k\UrbanSound8K.tar.gz already exists and will not be downloaded. Rerun with force_overwrite=True to delete this file and force the download.
INFO: /tmp\sound_datasets\urbansound8k\audio already exists. Run with force_overwrite=True to download from scratch
INFO: /tmp\sound_datasets\urbansound8k\FREESOUNDCREDITS.txt already exists. Run with force_overwrite=True to download from scratch
INFO: /tmp\sound_datasets\urbansound8k\metadata already exists. Run with force_overwrite=True to download from scratch
INFO: /tmp\sound_datasets\urbansound8k\UrbanSound8K_README.txt already exists. Run with force_overwrite=True to download from scratch
INFO: [index] downloading urbansound8k_index_1.0.json
INFO: D:\anaconda\Lib\site-packages\soundata\dat

In [63]:
import torchaudio.transforms as T
class FixedSizeMelSpectrogram(T.MelSpectrogram):
    """Mel spectrogram transform that always emits ``max_pad_length`` frames.

    The spectrogram is computed by the parent ``MelSpectrogram``; afterwards
    dimension 2 of the result (treated here as the frame axis) is either
    right-padded or cut so that its size equals ``max_pad_length``.

    Args:
        sample_rate: Forwarded to ``MelSpectrogram``.
        n_mels: Forwarded to ``MelSpectrogram``.
        max_pad_length: Number of frames kept along dimension 2 of the output.
        n_fft: Forwarded to ``MelSpectrogram``.
        hop_length: Forwarded to ``MelSpectrogram``.
        **kwargs: Any further ``MelSpectrogram`` keyword arguments.
    """

    def __init__(self, sample_rate=22050, n_mels=64, max_pad_length=900, n_fft=2048, hop_length=512, **kwargs):
        super().__init__(
            sample_rate=sample_rate,
            n_mels=n_mels,
            n_fft=n_fft,
            hop_length=hop_length,
            **kwargs,
        )
        self.max_pad_length = max_pad_length

    def forward(self, waveform):
        """Return the mel spectrogram of ``waveform`` with a fixed frame count.

        Args:
            waveform: Tensor whose dimension 0 is averaged away (kept as size 1)
                whenever it is larger than one, i.e. multi-channel audio is
                down-mixed before the spectrogram is computed.

        Returns:
            The spectrogram with dimension 2 of size ``max_pad_length``.
        """
        # Down-mix multi-channel input to a single channel.
        has_multiple_channels = waveform.size(0) > 1
        if has_multiple_channels:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        mel_spec = super().forward(waveform)
        frame_count = mel_spec.size(2)

        # Long enough already: keep only the leading frames.
        if frame_count >= self.max_pad_length:
            return mel_spec[:, :, :self.max_pad_length]

        # Too short: append the missing frames on the right of the last axis.
        missing_frames = self.max_pad_length - frame_count
        return torch.nn.functional.pad(mel_spec, (0, missing_frames))

# Class-name -> integer id lookup; ids follow the position in the tuple below.
label_to_int = {
    class_name: class_id
    for class_id, class_name in enumerate((
        'air_conditioner',
        'car_horn',
        'children_playing',
        'dog_bark',
        'drilling',
        'engine_idling',
        'gun_shot',
        'jackhammer',
        'siren',
        'street_music',
    ))
}


class AudioClassifier(nn.Module):
    """Small two-block CNN that classifies single-channel spectrograms.

    Layout: conv(1->16) -> ReLU -> 2x2 max-pool -> conv(16->32) -> ReLU ->
    2x2 max-pool -> flatten -> linear(128) -> ReLU -> linear(num_classes).

    The size of the first linear layer is derived from the expected input
    size instead of being hard-coded, so the model can be built for other
    spectrogram shapes. The defaults reproduce the original 64 x 900 input
    (32 * 16 * 225 = 115200 flattened features) and 10 output classes.

    Args:
        n_mels: Height of the input (dimension 2 of the batch).
        n_frames: Width of the input (dimension 3 of the batch).
        num_classes: Number of output logits.

    Raises:
        ValueError: If either spatial size is too small to survive the two
            pooling stages (fewer than 4 rows or columns).
    """

    def __init__(self, n_mels=64, n_frames=900, num_classes=10):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)

        # The 3x3 / padding=1 convolutions keep the spatial size; each 2x2
        # pooling floor-halves it, and the pool is applied twice in forward().
        pooled_height = n_mels // 2 // 2
        pooled_width = n_frames // 2 // 2
        if pooled_height < 1 or pooled_width < 1:
            raise ValueError(
                "n_mels and n_frames must both be at least 4, "
                f"got n_mels={n_mels}, n_frames={n_frames}"
            )

        self.fc1 = nn.Linear(32 * pooled_height * pooled_width, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        """Return raw class logits for a batch of spectrograms.

        Args:
            x: Batch passed to a ``Conv2d`` with one input channel; its last
                two dimensions must match ``n_mels`` and ``n_frames``.

        Returns:
            Tensor with one row per batch item and ``num_classes`` columns.
            No softmax is applied.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Keep the batch dimension, merge everything else for the linear layers.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class UrbanSound8KDataset(Dataset):
    """Map-style dataset yielding ``(audio, label_id)`` pairs from soundata clips.

    Args:
        soundata_dataset: Object exposing ``load_clips()``, which must return a
            mapping of clip id -> clip. Each clip is read through its
            ``audio_path`` and ``tags.labels`` attributes.
        transform: Optional callable applied to the loaded waveform.
        target_length: Stored on the instance but not used by any method of
            this class; kept so existing callers keep working.
    """

    def __init__(self, soundata_dataset, transform=None, target_length=1000000):
        self.clips = soundata_dataset.load_clips()
        # Snapshot the key order once. Rebuilding list(keys()) inside
        # __getitem__ would cost O(number of clips) for every single item.
        self.clip_keys = list(self.clips.keys())
        self.transform = transform
        self.target_length = target_length

    def __len__(self):
        """Return the number of clips."""
        return len(self.clip_keys)

    def __getitem__(self, idx):
        """Load one clip and return ``(waveform_or_features, label_id)``.

        Args:
            idx: Position in the clip-key order captured at construction time.

        Returns:
            A tuple of the (optionally transformed) waveform and the integer
            id of the clip's first label, looked up in ``label_to_int``.

        Raises:
            IndexError: If ``idx`` is out of range.
            KeyError: If the clip's first label is missing from ``label_to_int``.
        """
        clip = self.clips[self.clip_keys[idx]]

        # NOTE(review): the file's sample rate is discarded and no resampling
        # happens here, so a transform configured for one fixed sample rate
        # sees audio at whatever rate the file has -- confirm this is intended.
        waveform, _sample_rate = torchaudio.load(clip.audio_path)

        # Compare against None explicitly: a valid transform object may be
        # falsy (e.g. an empty container module) and must still be applied.
        if self.transform is not None:
            waveform = self.transform(waveform)

        label = label_to_int[clip.tags.labels[0]]
        return waveform, label



In [73]:

# Feature extractor: fixed-width mel spectrograms (64 mel bands, 900 frames).
transform = FixedSizeMelSpectrogram(
    max_pad_length=900,
    n_mels=64,
    sample_rate=22050,
)

# Wrap every clip of the soundata dataset and serve it in shuffled batches of 16.
train_dataset = UrbanSound8KDataset(soundata_dataset=dataset, transform=transform)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

# Model lives on the chosen device; Adam updates all of its parameters.
model = AudioClassifier().to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

# Multi-class objective computed on the raw logits the model returns.
criterion = nn.CrossEntropyLoss()

def train(model, device, train_loader, optimizer, criterion, num_epochs=50):
    """Train ``model`` in place and print loss/accuracy after every epoch.

    Args:
        model: Module to optimise; switched to training mode once at the start.
        device: Device each batch and its targets are moved to.
        train_loader: Iterable yielding ``(data, targets)`` batches. It is
            iterated once per epoch and does not need to support ``len()``.
        optimizer: Optimizer holding the model's parameters.
        criterion: Callable ``criterion(outputs, targets)`` returning the loss.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        None. One line per epoch is printed with the mean per-batch loss and
        the accuracy (in percent) measured on the training batches themselves.
        If the loader yields no batches, both values are reported as ``nan``
        instead of raising ``ZeroDivisionError``.
    """
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0.0
        correct = 0
        total = 0
        # Counted while iterating so loaders without __len__ work as well.
        num_batches = 0

        for data, targets in train_loader:
            data = data.to(device)
            targets = targets.to(device)

            optimizer.zero_grad()
            # Batches are passed to the model exactly as the loader yields them.
            outputs = model(data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            num_batches += 1

            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

        avg_loss = total_loss / num_batches if num_batches else float('nan')
        accuracy = 100 * correct / total if total else float('nan')
        print(f'Epoch {epoch+1}, Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%')

# Start training with the default number of epochs.
train(
    model=model,
    device=device,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
)


Epoch 1, Loss: 12.5799, Accuracy: 33.50%
Epoch 2, Loss: 1.5315, Accuracy: 51.82%
Epoch 3, Loss: 1.4019, Accuracy: 61.37%
Epoch 4, Loss: 1.1271, Accuracy: 65.60%
Epoch 5, Loss: 1.2163, Accuracy: 69.84%
Epoch 6, Loss: 1.0536, Accuracy: 70.11%
Epoch 7, Loss: 0.9006, Accuracy: 77.40%
Epoch 8, Loss: 0.7792, Accuracy: 80.08%
Epoch 9, Loss: 0.8212, Accuracy: 82.48%
Epoch 10, Loss: 0.7381, Accuracy: 83.67%
Epoch 11, Loss: 0.4484, Accuracy: 88.31%
Epoch 12, Loss: 0.3376, Accuracy: 91.30%
Epoch 13, Loss: 0.7144, Accuracy: 88.99%
Epoch 14, Loss: 0.7038, Accuracy: 87.44%
Epoch 15, Loss: 0.7641, Accuracy: 86.21%
Epoch 16, Loss: 0.3453, Accuracy: 93.19%
Epoch 17, Loss: 0.2770, Accuracy: 93.84%
Epoch 18, Loss: 0.1458, Accuracy: 95.98%
Epoch 19, Loss: 0.8000, Accuracy: 92.24%
Epoch 20, Loss: 0.3702, Accuracy: 94.32%
Epoch 21, Loss: 0.3860, Accuracy: 95.68%
Epoch 22, Loss: 0.3826, Accuracy: 94.31%
Epoch 23, Loss: 0.0996, Accuracy: 97.62%
Epoch 24, Loss: 0.4093, Accuracy: 96.06%
Epoch 25, Loss: 0.2236, 