<a href="https://colab.research.google.com/github/yootazi/audioClassifier/blob/main/audioClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title **Importing libraries** { form-width: "50%" }

!pip install torchaudio librosa boto3

import os
import torch
from torch.utils.data import Dataset
import torchaudio
import pandas as pd


In [None]:
#@title **Connecting to Google Drive** { form-width: "50%" }

from google.colab import drive
# Mount Google Drive under /content/gdrive/ so that later cells can use
# paths below that directory (dataset folder, saved model weights).
drive.mount('/content/gdrive/')



In [None]:
cd /content/gdrive/MyDrive/musicdata/UrbanSound8k/

In [17]:
#@title **Building a custom made Dataset** { form-width: "50%" }

class UrbanSoundDataset(Dataset):
    """Custom PyTorch dataset yielding ``(transformed_signal, label)`` pairs.

    Each item is loaded from disk with torchaudio, resampled to
    ``target_sample_rate``, mixed down to a single channel, cut or
    right-padded to exactly ``num_samples`` samples and finally passed
    through ``transformation``.

    Args:
        annotations_file: Path or URL of the annotations CSV (anything
            ``pandas.read_csv`` accepts). The code reads the file name from
            column 0, the fold number from column 5 and the class id from
            column 6.
        audio_dir: Directory containing the ``fold<N>`` sub-directories.
        transformation: Callable module applied to the prepared signal
            (moved to ``device`` on construction).
        target_sample_rate: Sample rate every clip is resampled to.
        num_samples: Exact number of samples every clip is cut/padded to.
        device: Device on which signals are processed (e.g. "cuda"/"cpu").
    """

    def __init__(self,
                 annotations_file,
                 audio_dir,
                 transformation,
                 target_sample_rate,
                 num_samples,
                 device):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples
        # One resampler per source sample rate, built lazily, so the
        # resampling kernel is not recomputed for every single item.
        self._resamplers = {}

    def __len__(self):
        """Return the number of annotated samples in the dataset."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(signal, label)`` for the annotation row at ``index``."""
        audio_sample_path = self._get_audio_sample_path(index)
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)
        signal = signal.to(self.device)
        # The transformation is configured for one sample rate, so every
        # clip has to be brought to that rate before anything else.
        signal = self._resample_if_necessary(signal, sr)
        signal = self._mix_down_if_necessary(signal)
        signal = self._cut_if_necessary(signal)
        signal = self._right_pad_if_necessary(signal)
        signal = self.transformation(signal)

        return signal, label

    def _cut_if_necessary(self, signal):
        """Truncate ``signal`` (channels, samples) to ``num_samples``."""
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]

        return signal

    def _right_pad_if_necessary(self, signal):
        """Zero-pad ``signal`` on the right up to ``num_samples`` samples."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal
            # (left, right) padding of the last dimension:
            # [1, 1, 1] -> [1, 1, 1, 0, 0]
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)

        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to ``target_sample_rate``.

        The signal is returned untouched when it is already at the target
        rate. The resampler is moved to ``self.device`` because the signal
        has already been moved there by ``__getitem__``.
        """
        if sr == self.target_sample_rate:
            return signal

        resampler = self._resamplers.get(sr)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(
                sr, self.target_sample_rate).to(self.device)
            self._resamplers[sr] = resampler
        return resampler(signal)

    def _mix_down_if_necessary(self, signal):
        """Average all channels into one: (2, N) -> (1, N)."""
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Build ``<audio_dir>/fold<N>/<file name>`` for row ``index``."""
        fold = f"fold{self.annotations.iloc[index, 5]}"
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[
            index, 0])

        return path

    def _get_audio_sample_label(self, index):
        """Return the class id stored in column 6 of row ``index``."""
        return self.annotations.iloc[index, 6]


if __name__ == "__main__":

    # Rewrite the Google Drive "share" link into a direct-download link:
    # the file id is the second-to-last path component of the share URL.
    url='https://drive.google.com/file/d/1iudegHneVDtf3U_3mryD8sHSIkkA1QNW/view?usp=sharing'
    ANNOTATIONS_FILE = f"https://drive.google.com/uc?id={url.split('/')[-2]}"

    AUDIO_DIR = './audio'
    SAMPLE_RATE = 22050
    NUM_SAMPLES = 22050  # equal to SAMPLE_RATE -> one second of audio per clip

    # Process the data on the GPU whenever one is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device {device}")

    # Callable transform that turns a waveform into a mel spectrogram.
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,       # frame size
        hop_length=512,
        n_mels=64,
    )

    usd = UrbanSoundDataset(
        annotations_file=ANNOTATIONS_FILE,
        audio_dir=AUDIO_DIR,
        transformation=mel_spectrogram,
        target_sample_rate=SAMPLE_RATE,
        num_samples=NUM_SAMPLES,
        device=device,
    )

    print(f"There are {len(usd)} samples in the dataset.")
    # Indexing the dataset yields a (signal, classID) pair.
    signal, label = usd[0]



Using device cuda
There are 8732 samples in the dataset.


In [None]:
#@title **Building the Sound Classifier (CNN)** { form-width: "50%" }


from torch import nn
from torchsummary import summary


class CNNNetwork(nn.Module):
    """Small CNN classifier: 4 conv blocks -> flatten -> linear -> softmax.

    The linear layer is sized for inputs of shape (batch, 1, 64, 44): after
    the four conv/pool blocks such an input is reduced to 128 feature maps
    of size 5 x 4.

    ``forward`` returns raw logits while the module is in training mode and
    softmax probabilities in eval mode. ``nn.CrossEntropyLoss`` applies
    log-softmax internally, so feeding it already-normalised probabilities
    (as happened when softmax was applied unconditionally) squashes the
    gradients and stalls learning. Compute any loss from the training-mode
    output.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = self._conv_block(1, 16)
        self.conv2 = self._conv_block(16, 32)
        self.conv3 = self._conv_block(32, 64)
        self.conv4 = self._conv_block(64, 128)
        self.flatten = nn.Flatten()
        self.linear = nn.Linear(128 * 5 * 4, 10)
        self.softmax = nn.Softmax(dim=1)

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Build one Conv2d(3x3, padding 2) -> ReLU -> MaxPool2d(2) block."""
        return nn.Sequential(
            nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=3,
                stride=1,
                padding=2
            ),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )

    def forward(self, input_data):
        """Run the network on ``input_data``.

        Returns:
            Tensor of shape (batch, 10): unnormalised logits in training
            mode, class probabilities (rows sum to 1) in eval mode.
        """
        x = self.conv1(input_data)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.flatten(x)
        logits = self.linear(x)
        if self.training:
            # The loss function normalises the logits itself.
            return logits
        return self.softmax(logits)


if __name__ == "__main__":
    import torch  # local import: this cell only imports torch.nn itself

    # Print a layer-by-layer summary for a (1, 64, 44) input. The device is
    # picked automatically so the cell also runs on CPU-only runtimes
    # instead of failing inside a hard-coded .cuda() call.
    cnn = CNNNetwork()
    summary_device = "cuda" if torch.cuda.is_available() else "cpu"
    summary(cnn.to(summary_device), (1, 64, 44))

In [None]:
#@title **Training the Classifier Model** { form-width: "50%" }

import torch
import torchaudio
from torch import nn
from torch.utils.data import DataLoader


# Hyper-parameters (exposed as Colab form fields through the #@param tags).
LEARNING_RATE = 0.001 #@param {type:"raw"}
EPOCHS =  10 #@param {type:"integer"}
BATCH_SIZE = 128 #@param {type:"integer"}

# Google Drive share link of the annotations CSV, rewritten into a
# direct-download link: the file id is the second-to-last URL component.
url='https://drive.google.com/file/d/1iudegHneVDtf3U_3mryD8sHSIkkA1QNW/view?usp=sharing'
ANNOTATIONS_FILE = f"https://drive.google.com/uc?id={url.split('/')[-2]}"

# Audio location and framing; NUM_SAMPLES == SAMPLE_RATE means 1 s per clip.
AUDIO_DIR = './audio'
SAMPLE_RATE = 22050
NUM_SAMPLES = 22050


def create_data_loader(train_data, batch_size):
    """Wrap ``train_data`` in a ``DataLoader`` yielding ``batch_size`` items.

    All other ``DataLoader`` options are left at their defaults.
    """
    return DataLoader(train_data, batch_size=batch_size)


def train_single_epoch(model, data_loader, loss_fn, optimiser, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Network to train; it is switched to training mode first.
        data_loader: Iterable of ``(inputs, targets)`` batches.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a
            scalar tensor.
        optimiser: Optimiser holding ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Prints the loss of the last batch. If the loader yields no batches a
    notice is printed instead of failing on an unbound ``loss`` variable.
    """
    # An earlier inference call may have left the model in eval mode.
    model.train()

    loss = None
    for inputs, targets in data_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # calculate loss
        predictions = model(inputs)
        loss = loss_fn(predictions, targets)

        # backpropagate error and update weights
        optimiser.zero_grad()
        loss.backward()
        optimiser.step()

    if loss is None:
        print("loss: n/a (data loader produced no batches)")
        return

    print(f"loss: {loss.item()}")


def train(model, data_loader, loss_fn, optimiser, device, epochs):
    """Train ``model`` for ``epochs`` passes over ``data_loader``.

    Each pass is delegated to ``train_single_epoch``; a header with the
    1-based epoch number is printed before it and a separator after it.
    """
    for epoch_number in range(1, epochs + 1):
        print(f"Epoch {epoch_number}")
        train_single_epoch(model, data_loader, loss_fn, optimiser, device)
        print("---------------------------")

    print("Finished training")


if __name__ == "__main__":
    import os  # local import: only needed to prepare the output directory

    # Train on the GPU whenever one is available.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using {device}")

    # instantiating our dataset object and create data loader
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            device)

    train_dataloader = create_data_loader(usd, BATCH_SIZE)

    # construct model and assign it to device
    cnn = CNNNetwork().to(device)
    print(cnn)

    # initialise loss function + optimiser
    loss_fn = nn.CrossEntropyLoss()
    optimiser = torch.optim.Adam(cnn.parameters(),
                                 lr=LEARNING_RATE)

    # train model
    train(cnn, train_dataloader, loss_fn, optimiser, device, EPOCHS)

    # save model
    model_save_name = 'feedforwardnet.pth'
    path = F"/content/gdrive/My Drive/ai_music_projects/audioClassifier/{model_save_name}"
    # Make sure the target directory exists; otherwise torch.save fails and
    # the result of the whole training run is lost.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(cnn.state_dict(), path)
    print("Trained feed forward net saved at {}".format(path))

Using cuda
CNNNetwork(
  (conv1): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv2): Sequential(
    (0): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv3): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (conv4): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(2, 2))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear): Linear(in_features=2560, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)
Epoch 1
loss: 2.425286054611

In [None]:
#@title **Making Predictions** { form-width: "50%" }


import torch
import torchaudio

# Human-readable class names. The list is indexed both with the model's
# predicted index and with the dataset's integer label (see predict), so
# the position of a name must equal its class id.
class_mapping = [
    "air_conditioner",
    "car_horn",
    "children_playing",
    "dog_bark",
    "drilling",
    "engine_idling",
    "gun_shot",
    "jackhammer",
    "siren",
    "street_music",
]


def predict(model, input, target, class_mapping):
    """Classify a single-sample batch and name both prediction and truth.

    Args:
        model: Network to query; it is switched to eval mode.
        input: Batch whose first element is the sample to classify.
        target: Integer index of the expected class.
        class_mapping: Sequence mapping class indices to class names.

    Returns:
        Tuple ``(predicted_name, expected_name)``.
    """
    model.eval()
    with torch.no_grad():
        # Scores have shape (1, n_classes), e.g. [[0.1, 0.01, ..., 0.6]].
        scores = model(input)
    best_index = scores[0].argmax(0)
    return class_mapping[best_index], class_mapping[target]


if __name__ == "__main__":
    # load back the model
    cnn = CNNNetwork()
    model_save_name = 'feedforwardnet.pth'
    path = F"/content/gdrive/My Drive/ai_music_projects/audioClassifier/{model_save_name}"
    # The weights may have been saved from a CUDA model; map them to the CPU
    # so loading also works on a runtime without a GPU (inference below runs
    # on the CPU anyway).
    cnn.load_state_dict(torch.load(path, map_location="cpu"))

    # load urban sound dataset
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    usd = UrbanSoundDataset(ANNOTATIONS_FILE,
                            AUDIO_DIR,
                            mel_spectrogram,
                            SAMPLE_RATE,
                            NUM_SAMPLES,
                            "cpu")

    # get a sample from the urban sound dataset for inference; index the
    # dataset once so the audio file is loaded and transformed only once
    input, target = usd[0]
    # add the batch dimension: [batch size, num_channels, fr, time]
    input.unsqueeze_(0)

    # make an inference
    predicted, expected = predict(cnn, input, target,
                                  class_mapping)
    print(f"Predicted: '{predicted}', expected: '{expected}'")