In [26]:
import torch
import os.path

from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
from torchsummary import summary
from torch.utils.tensorboard import SummaryWriter

ModuleNotFoundError: No module named 'tensorboard'

In [None]:
writer = SummaryWriter()

## Download dataset

In [2]:
def download_mnist_datasets():
    train_data = datasets.MNIST(
        root="data", download=True, train=True, transform=ToTensor()
    )
    validation_data = datasets.MNIST(
        root="data", download=True, train=False, transform=ToTensor()
    )
    return train_data, validation_data

In [3]:
train_data, _ = download_mnist_datasets()
print("MNIST dataset downloaded")

MNIST dataset downloaded


In [4]:
train_data

Dataset MNIST
    Number of datapoints: 60000
    Root location: data
    Split: Train
    StandardTransform
Transform: ToTensor()

## Make dataloader

In [5]:
BATCH_SIZE = 128
train_data_loader = DataLoader(train_data, batch_size = BATCH_SIZE)

## Make network

In [6]:
# FeedForwardNet inherits from nn.Module
class FeedForwardNet(nn.Module):
    
    # Constructor for defining the different layers
    def __init__(self):
        # Super allows you to call functions from nn.Module directly due to inheritance
        super().__init__()
        
        # Reshape into 1D tensor
        self.flatten = nn.Flatten()

        # Dense layers
        self.dense_layers = nn.Sequential(
            nn.Linear(
                28 * 28, 256
            ),  # 28*28 inputs (from image) and 256 outputs (neurons)
            nn.ReLU(),
            nn.Linear(256, 10),  # 256 inputs and 10 outputs (MNIST has 10 classes)
        )

        # Normalizes output from 0 to 1 (probability of it being that class)
        self.softmax = nn.Softmax(dim=1)  # dim = 1 -> 0 to 1

    # Specifies data flow/forward pass
    def forward(self, input_data):
        flattened_data = self.flatten(input_data)
        logits = self.dense_layers(flattened_data)
        predictions = self.softmax(logits)
        return predictions

In [7]:
# Checks if GPU (cuda) is available and use it. If not, use CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

feed_forward_net = FeedForwardNet().to(device)

Using cuda device


In [8]:
def train_one_epoch(model, data_loader, loss_fn, optimizer, device):
    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Calculate loss
        predictions = model(inputs)
        loss = loss_fn(predictions, targets)

        # Backpropagate loss and update weights
        optimizer.zero_grad()  # Reset gradients for each batch
        loss.backward()  # Backprop
        optimiser.step()  # Update weights

    print(f"Loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimizer, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_one_epoch(model, data_loader, loss_fn, optimizer, device)
        print("-----------------")
    print("Training is done")

In [9]:
# Instantiate loss function + optimiser
LEARNING_RATE = 0.001
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(feed_forward_net.parameters(), lr=LEARNING_RATE)

In [10]:
EPOCHS = 10

# If model state dict already exists, load it into the model
if os.path.isfile("feedforwardnet.pth") == True:
    state_dict = torch.load("feedforwardnet.pth")
    feed_forward_net.load_state_dict(state_dict)
else:
    train(feed_forward_net, train_data_loader, loss_fn, optimizer, device, EPOCHS)
    # Store model
    torch.save(feed_forward_net.state_dict(), "feedforwardnet.pth")
    print("Model trained and stored at feedforwardnet.pth")

## Making predictions

In [11]:
class_mapping = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]

In [12]:
def predict(model, input_, target, class_mapping):
    model.eval() # For evaluation and inference, turns off certain layers like batch_normalization and dropout
    # model.train() # Turns back on layers offed by eval (As we are only looking at inference, don't need this)
    
    # no need to calculate gradients as we are doing inference
    with torch.no_grad():
        predictions = model(input_.to(device)) # Tensor (1,10) -> [[0.1,0.1,...,0.6]] sum = 1 due to softmax
        predicted_index = predictions[0].argmax(0) # Prediction for example [0]
        predicted = class_mapping[predicted_index]
        expected = class_mapping[target]
        
    return predicted, expected

In [13]:
_, validation_data = download_mnist_datasets()

# Get sample from validation set
# Input stored at [0][0] and output stored at [0][1] for example in [0]
input_, target = (
    validation_data[0][0],
    validation_data[0][1],
)

# Make an inference
predicted, expected = predict(feed_forward_net, input_, target, class_mapping)
print(f"Predicted: '{predicted}', Expected: '{expected}'")

Predicted: '7', Expected: '7'


# Creating a class and dataloader with Urban8K

This is to preprocess all the audio so that they are in mono and have the same length

In [14]:
from torch.utils.data import Dataset

import pandas as pd
import torchaudio
import os

In [15]:
class UrbanSoundDataset(Dataset):
    def __init__(
        self,
        annotations_file,
        audio_dir,
        transformation,
        target_sample_rate,
        num_samples,
        device,
    ):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)  # Use GPU
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, index):
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        # get tensor of audio and the sample rate
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(
            self.device
        )  # Signal registered onto device, able to use GPU
        signal = self._clean_signal(signal, sr)
        signal = self.transformation(signal)
        return signal, label

    # Performs resampling, demixing and cutting/padding
    def _clean_signal(self, signal, sr):
        # signal.shape -> (num_channels, samples)
        signal = self._resample_if_necessary(signal, sr)
        # Signals might have different sample rates
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        return signal

    def _cut_if_necessary(self, signal):
        # signal is tensor of (1, num_samples)
        # if length of signal more than num_samples
        # cut until num_samples
        if signal.shape[1] > self.num_samples:
            signal = signal[:, : self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        if signal.shape[1] < self.num_samples:
            num_missing_samples = self.num_samples - signal.shape[1]
            # 1st argument represents amount to prepend padding
            # 2nd argument represents amount to append padding
            last_dim_padding = (0, num_missing_samples)
            # pad function starts padding from last dimension
            # eg if pad(1, 1, 2, 2), pad last dim by (1, 1) and 2nd to last by (2, 2)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    # Convert original sampling rate to a specified sampling rate
    # single underscore methods for internal use
    def _resample_if_necessary(self, signal, sr):
        if sr != self.target_sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate).to(device) # Use GPU
            signal = resampler(signal)
        return signal

    # Convert to mono channel
    def _mix_down_if_necessary(self, signal):
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        fold = f"fold{self.annotations.iloc[index, 5]}"  # see csv for locations
        file_name = self.annotations.iloc[index, 0]
        path = os.path.join(self.audio_dir, fold, file_name)
        return path

    def _get_audio_sample_label(self, index):
        return self.annotations.iloc[index, 6]  # see csv for locations

In [16]:
ANNOTATIONS_FILE= "UrbanSound8K/metadata/UrbanSound8K.csv"
AUDIO_DIR = "UrbanSound8K/audio"
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050

# Create callable object -> mel_spectrogram can be directly applied to the signal
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64
)

usd = UrbanSoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES, device)

In [17]:
print(f"There are {len(usd)} samples in the dataset.")

signal, label = usd[0]

There are 8732 samples in the dataset.


In [18]:
signal.shape # (channels, mels, no of frames)

torch.Size([1, 64, 44])

In [19]:
label

3

# Creating CNN for Sound Classification

In [20]:
class CNNNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        # 4 conv blocks / flatten / linear / softmax
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(
                in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(
                in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(
                in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
        )
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(
            128 * 5 * 4, 10
        )  # out_channels * freq_axis * time_axis in this layer
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_data):
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        predictions = self.softmax(logits)
        return predictions

In [21]:
cnn = CNNNetwork()
summary(cnn.cuda(), (1, 64, 44))  # From signal.shape()

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             160
              ReLU-2           [-1, 16, 66, 46]               0
         MaxPool2d-3           [-1, 16, 33, 23]               0
            Conv2d-4           [-1, 32, 35, 25]           4,640
              ReLU-5           [-1, 32, 35, 25]               0
         MaxPool2d-6           [-1, 32, 17, 12]               0
            Conv2d-7           [-1, 64, 19, 14]          18,496
              ReLU-8           [-1, 64, 19, 14]               0
         MaxPool2d-9             [-1, 64, 9, 7]               0
           Conv2d-10           [-1, 128, 11, 9]          73,856
             ReLU-11           [-1, 128, 11, 9]               0
        MaxPool2d-12            [-1, 128, 5, 4]               0
          Flatten-13                 [-1, 2560]               0
           Linear-14                   

From the above, we can see how the output shape changes over time.<br>
In Conv2d-1 the input of **(1, 64, 44)** becomes **(16, 66, 46)**<br>
Just before the flatten layer, we see that the input beomes **(128, 5, 4)**. This is where 128 * 5 * 4 is derived from to input into the dense (nn.Linear) layer.

The next part will be implemented in a proper python application

In [22]:
def create_data_loader(train_data, batch_size):
    train_data_loader = DataLoader(train_data, batch_size=batch_size)
    return train_data_loader


def train_single_epoch(model, data_loader, loss_fn, optimizer, device):
    for input, target in data_loader:
        input, target = input.to(device), target.to(device)

        # calculate loss
        prediction = model(input)
        loss = loss_fn(prediction, target)

        # backpropagate error and update weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimizer, device, epochs):
    for i in range(epochs):
        print(f"Epoch {i+1}")
        train_single_epoch(model, data_loader, loss_fn, optimizer, device)
        print("-------------------")
    print("Finished training")

In [33]:
# Checks if GPU (cuda) is available and use it. If not, use CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(f"Using {device}")

# Create callable object -> mel_spectrogram can be directly applied to the signal
mel_spectrogram = torchaudio.transforms.MelSpectrogram(
    sample_rate=SAMPLE_RATE, n_fft=1024, hop_length=512, n_mels=64
)

usd = UrbanSoundDataset(
    ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES, device
)

train_data_loader = create_data_loader(usd, BATCH_SIZE)

# construct model and assign to device
cnn = CNNNetwork().to(device)
print(cnn)

# initialize loss function + optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)

# train model
train(cnn, train_data_loader, loss_fn, optimizer, device, EPOCHS)

# save model
torch.save(cnn.state_dict(), "neuralnet.pth")
print("Trained neural net saved at neuralnet.pth")

Using cuda
CNNNetwork(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=2560, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)
Epoch 1
loss: 2.367712497711

In [32]:
torch.rand(20000,20000).cuda()
print("Allocated:", round(torch.cuda.memory_allocated(0)/10243,1), "GB")

Allocated: 661.3 GB
