In [1]:
import os
from torch.utils.data import Dataset, DataLoader
import torch
import torchaudio
import pandas as pd
from torch import nn
from cnn import CNNNetwork

# Parameters
ANNOTATIONS_FILE = "/Users/vilourenco/datasets/UrbanSound8K/metadata/UrbanSound8K.csv" # Path to the csv file containing the annotations
AUDIO_DIR = "/Users/vilourenco/datasets/UrbanSound8K/audio" # Path to the directory containing the audio samples
SAMPLE_RATE = 22050 # The sampling rate of the audio samples
NUM_SAMPLES = 22050 # The number of samples to retain in the audio samples

BATCH_SIZE = 128 # The number of samples to use for each training step
EPOCHS = 10 # The number of times to iterate over all samples
LEARNING_RATE = 0.001 # The learning rate for the optimizer

In [3]:
import mlflow

mlflow.set_experiment("urbansound8k-cnn")
mlflow.start_run()
mlflow.pytorch.autolog()

In [4]:
###############  Dataset  ##################
class UrbanSoundDataset(Dataset):

    # __init__ is the constructor of the class
    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device):
        """
            annotations_file: path to the csv file containing the annotations
            audio_dir: path to the directory containing the audio samples
        """
        # Read the csv file with the annotations
        # Get the full path to the audio files
        # Get the device
        # Get the transformation, and send it to the correct device
        # Get the target sample rate
        # Get the number of samples
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        

    def __len__(self):
        """
            Returns:
                The length of the dataset
        """
        # Return the length of the annotations dataframe
        return len(self.annotations)

    def __getitem__(self, index):
        """
            index: the index of the item to return
            Returns:
                The audio sample and its corresponding label
        """
        # Get the name of the audio sample from the pandas df
        # Load the audio sample
        # Get the audio signal tensor and the sampling frequency
        # Send the signal to the right device (for GPU processing)
        # Resample the signal if necessary
        # If the signal has multiple channels, lower to one channel
        # Cut the signal to the desired length
        # Pad the signal to the desired length
        # Use a transformation over the audio signal (e.g., MelSpectrogram)
        audio_sample_path = self.__get_audio_sample_path(index)
        label = self.__get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)   # signal -> (num_channels, samples) -> (2, 16000) -> (1, 16000)
        signal = self.__resample_if_necessary(signal, sr)
        signal = self.__mix_down_if_necessary(signal)
        signal = self.__cut_if_necessary(signal)
        signal = self.__right_pad_if_necessary(signal)
        signal = self.transformation(signal)
        signal = signal.to(self.device) # Added to test

        return signal, label
    
        # Private method to mix down the signal if necessary
    def __resample_if_necessary(self, signal, sr):
        """
            signal: the audio signal tensor
            sr: the sampling rate of the audio signal
            Returns:
                The audio signal tensor resampled at the target sampling rate
        """
        # If the sampling rate of the signal and the target sampling rate are different, resample
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate)
            resampler = resampler
            signal = resampler(signal)
        return signal
    
    # Private method to mix down the signal if necessary
    def __mix_down_if_necessary(self, signal):
        """
            signal: the audio signal tensor
            sr: the sampling rate of the audio signal
            Returns:
                The audio signal tensor with one channel
        """
        # If the signal has more than one channel, mix it down
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal
    
    def __cut_if_necessary(self, signal):
        """
            signal: the audio signal tensor
            Returns:
                The audio signal tensor with the desired number of samples
        """
        # signal -> Tensor -> (1, num_samples)

        # If the signal has more samples than the desired number of samples, cut it
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def __right_pad_if_necessary(self, signal):
        """
            signal: the audio signal tensor
            Returns:
                The audio signal tensor padded with zeros to the desired number of samples
        """
        # signal -> Tensor -> (1, num_samples)

        # If the signal has less samples than the desired number of samples, pad it
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            # Calculate the number of missing samples
            # Create the padding, (left_pad, right_pad, top_pad, bottom_pad)
            # Pad the signal on the left with zeros
            num_missing_samples = self.num_samples - length_signal
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal
    
    def __get_audio_sample_path(self, index):
        """
        Private Method
            index: the index of the item to return
            Returns:
                The path to the audio file
        """
        # Get the fold number from the pandas df
        fold = f"fold{self.annotations.iloc[index, 5]}"
        # Get the name of the audio file
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[
            index, 0])
        return path

    def __get_audio_sample_label(self, index):
        """
        Private Method
            index: the index of the item to return
            Returns:
                The label of the audio file
        """
        # Get the label from the pandas df
        return self.annotations.iloc[index, 6]


def check_device():
    """
        Check if Metal / GPU is available and set the device accordingly
        Returns:
            The device to use for training
    """
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available(): 
        device = "cuda"
    else: 
        device = "cpu"
    
    return device

In [5]:
# #Check device
# device = check_device()
# print(f"Using {device} device. PS: Not using during dataset preprocessing due to incompatibility")

device = "cpu"

In [6]:
# Create the mel spectrogram transform
mel_spectogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)
# Create the dataset
usd = UrbanSoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectogram, SAMPLE_RATE, NUM_SAMPLES, device)
# Print the length of the dataset
print(f"There are {len(usd)} samples in the dataset.")
# Get the first sample and its corresponding label
signal, label = usd[1]
print(f"Shape of the signal: {signal.shape}")

There are 8732 samples in the dataset.
Shape of the signal: torch.Size([1, 64, 44])


In [7]:
import torch
import torchaudio
from torch import nn
from torch.utils.data import DataLoader

def create_data_loader(train_data, batch_size):
    """
        train_data: the dataset to load
        batch_size: the size of the batch
        Returns:
            A DataLoader that can be used in a for loop
    """
    train_dataloader = DataLoader(train_data, batch_size=batch_size)
    return train_dataloader


def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    """
        model: the model to train
        data_loader: the DataLoader object to iterate over the dataset 
        loss_fn: the loss function to use
        optimiser: the optimiser to use to update the model weights
        device: the device to use (e.g., cpu, cuda)
    """
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)

        # calculate loss
        prediction = model(input)
        loss = loss_fn(prediction, target)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    print(f"loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimiser, device, epochs):
    """
        model: the model to train
        data_loader: the DataLoader object to iterate over the dataset
        loss_fn: the loss function to use
        optimiser: the optimiser to use to update the model weights
        device: the device to use (e.g., cpu, cuda)
        epochs: the number of epochs to train the model
    """
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")
    print("Finished training")

def check_device():
    """
        Check if Metal / GPU is available and set the device accordingly
        Returns:
            The device to use for training
    """
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available(): 
        device = "cuda"
    else: 
        device = "cpu"
    
    return device

In [8]:
train_dataloader = create_data_loader(usd, BATCH_SIZE)
# Construct model and assign it to device
cnn = CNNNetwork().to(device)
print(cnn)
# Initialise loss funtion + optimiser
loss_fn = nn.CrossEntropyLoss()
optimiser = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)

CNNNetwork(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=2560, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)


In [9]:
for input, target in train_dataloader:
    input, target = input.to(device), target.to(device)
    print(input.dtype)
    print(target.dtype)
    print(input)
    print(target)
    break

torch.float32
torch.int64
tensor([[[[8.1443e-04, 2.1588e-04, 9.1436e-04,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.4870e-03, 1.5723e-03, 4.2855e-04,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.9685e-03, 6.0449e-03, 4.1659e-03,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          ...,
          [1.7061e-04, 5.7996e-02, 7.9969e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [2.1515e-04, 1.5781e-02, 3.7936e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00],
          [3.0015e-04, 1.4416e-02, 3.1233e-01,  ..., 0.0000e+00,
           0.0000e+00, 0.0000e+00]]],


        [[[8.9202e-02, 6.2095e-01, 1.0333e-01,  ..., 3.6100e-02,
           1.7762e-01, 3.1767e-01],
          [1.3724e-01, 5.2129e-01, 3.5723e-02,  ..., 2.5212e-01,
           2.2663e-01, 8.1124e-01],
          [1.4066e-02, 3.7927e-01, 5.5285e-02,  ..., 8.5007e-01,
           4.2893e-01, 1.2757e+00],
          ...,
          [1.1717e-04, 1.9552e-

In [10]:
# train model
train(cnn, train_dataloader, loss_fn, optimiser, device, EPOCHS)

Epoch 1
loss: 2.3965976238250732
---------------------------
Epoch 2
loss: 2.3374831676483154
---------------------------
Epoch 3
loss: 2.2610104084014893
---------------------------
Epoch 4
loss: 2.1872196197509766
---------------------------
Epoch 5
loss: 2.2917957305908203
---------------------------
Epoch 6
loss: 2.108264446258545
---------------------------
Epoch 7
loss: 2.119719982147217
---------------------------
Epoch 8
loss: 2.0813279151916504
---------------------------
Epoch 9
loss: 2.132204055786133
---------------------------
Epoch 10
loss: 2.151926040649414
---------------------------
Finished training


In [11]:
mlflow.end_run()

In [9]:
# save model
torch.save(cnn.state_dict(), "feedforwardnet.pth")
print("Trained feed forward net saved at feedforwardnet.pth")

Trained feed forward net saved at feedforwardnet.pth


In [None]:
class_mapping = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music"
]

def predict(model, input, target, class_mapping):
    model.eval()
    with torch.no_grad():
        predictions = model(input)
        # Tensor (1, 10) -> [ [0.1, 0.01, ..., 0.6] ]
        predicted_index = predictions[0].argmax(0)
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
    return predicted, expected

In [None]:
# load back the model (IF NEEDED)
cnn = CNNNetwork()
state_dict = torch.load("feedforwardnet.pth", map_location=torch.device("cpu"))
cnn.load_state_dict(state_dict)

In [None]:
# load urban sound dataset dataset
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE,
    n_fft=1024,
    hop_length=512,
    n_mels=64
)
usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                        AUDIO_DIR,
                        mel_spectrogram,
                        SAMPLE_RATE,
                        NUM_SAMPLES,
                        "cpu")
# get a sample from the urban sound dataset for inference
a = random.randint(0, len(usd))
input, target = usd[a][0], usd[a][1] # [batch size, num_channels, fr, time]
input.unsqueeze_(0)
# make an inference
predicted, expected = predict(cnn, input, target,
                              class_mapping)
print(f"Predicted: '{predicted}', expected: '{expected}'")
# calculate accuracy of the model
correct = 0
incorrect = 0
for i in range(len(usd)):
    input, target = usd[i][0], usd[i][1]
    input.unsqueeze_(0)
    predicted, expected = predict(cnn, input, target,
                                  class_mapping)
    if predicted == expected:
        correct += 1
    else:
        incorrect += 1
print(f"Correct: {correct}, incorrect: {incorrect}, accuracy: {correct/(correct+incorrect)}")
