# MelSpecClassifier

Use the spectrogram of each wav file and a CNN with 2D convolutions to classify the genre.
The spectrograms are already provided in the GTZAN dataset. Each one is cropped before being passed into the model.

In [None]:
!pip install "ray[tune]"
import torch
from utils import *
import torch.nn as nn
import torch.optim as optim
import random
import tempfile
from PIL import Image
import torchvision.transforms as transforms
import torch.utils.data as Data
import os
from PIL import ImageOps
from torch.utils.data import SubsetRandomSampler
from ray import train, tune, air
from ray.train import Checkpoint
from ray.tune.schedulers import ASHAScheduler

## Mount drive
Mount Google Drive if running on Google Colab.

In [None]:
# Colab-only step: mount Google Drive under /content/drive so that the
# dataset path configured in GTZAN_MEL (which lives under that mount point)
# can be read.
from google.colab import drive
drive.mount('/content/drive')

## Constant parameters used in training

Run `setup.sh` to mount Google Drive containing GTZAN

In [3]:
# Root directory of the pre-rendered spectrogram images; it is walked as
# one sub-directory per genre, each holding that genre's image files.
GTZAN_MEL = "/content/drive/MyDrive/GTZAN/Data/images_original/"

# Border handed to PIL's ImageOps.crop for every image. Pillow interprets a
# 4-tuple border as (left, top, right, bottom) pixels to remove.
PREPROCESS_CROP = (54, 35, 42, 35)

# Presumably the (width, height) of the uncropped source images -- TODO confirm.
# It is only stored on the model and never used in a computation here.
IMAGE_INPUT_DIMENSIONS = [432, 288]
# Maps a genre directory name to the integer class label used for training.
GENRES = {'blues': 0, 'classical': 1, 'country': 2, 'disco': 3,
          'hiphop': 4, 'jazz': 5, 'metal': 6, 'pop': 7, 'reggae': 8,
          'rock': 9}

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device", DEVICE)

Using device cpu


## Training

Create a `Dataset` for the mel-spectrograms.

In [9]:
class ImageDataset(Data.Dataset):
    """In-memory dataset of cropped GTZAN mel-spectrogram images.

    On construction every image found under ``GTZAN_MEL/<genre>/`` is
    opened, cropped with ``PREPROCESS_CROP``, converted to a tensor and kept
    in memory. The name of the genre directory is translated to an integer
    label through ``GENRES``.

    Raises:
        KeyError: if a directory under ``GTZAN_MEL`` is not a key of
            ``GENRES``.
    """

    def __init__(self):
        self.images = []
        self.labels = []

        # The PIL -> tensor transform is stateless, so build it once instead
        # of once per image.
        to_tensor = transforms.Compose([transforms.ToTensor()])

        # os.listdir returns entries in arbitrary order. Sorting gives every
        # ImageDataset instance the same sample order; the training and
        # testing routines each build their own instance, so a stable order
        # keeps an index referring to the same sample in both.
        for genre in sorted(os.listdir(GTZAN_MEL)):
            genre_dir = os.path.join(GTZAN_MEL, genre)
            for song in sorted(os.listdir(genre_dir)):
                abs_path = os.path.join(genre_dir, song)

                # Open inside a context manager so the underlying file
                # handle is released as soon as the pixels have been copied.
                with Image.open(abs_path) as image:
                    # The dataset images are rendered mel spectrograms;
                    # crop so that only the spectrogram itself is passed
                    # to the CNN.
                    image_cropped = ImageOps.crop(image, PREPROCESS_CROP)
                    # Convert the PIL image to a tensor.
                    self.images.append(to_tensor(image_cropped))

                # Convert the genre tag to its associated class index.
                self.labels.append(GENRES[genre])

    def __len__(self):
        """Return the number of loaded images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return the ``(image_tensor, label)`` pair stored at ``idx``."""
        return self.images[idx], self.labels[idx]

The `MelSpecTrainer` model is a CNN with 2 convolutional layers and 2 hidden linear layers, followed by a linear classification layer.
- There are few datapoints compared to the number of input dimensions.
- The true dimensionality of each genre appears to be on the order of 10.
- There are no further hidden layers, as a balance between network complexity and the risk of overfitting.
- The kernel size and number of convolutional layers follow empirically determined industry norms for general image detection and classification.


In [10]:
class MelSpecTrainer(nn.Module):
    """CNN genre classifier for cropped mel-spectrogram images.

    The network is two convolution/ReLU/max-pool stages, a flatten, two
    ReLU-activated fully connected layers and a final 10-way linear
    classifier (one output per entry in ``GENRES``).

    Args:
        l1: Number of units in the first fully connected layer.
        l2: Number of units in the second fully connected layer.
    """

    def __init__(self, l1=256, l2=20):
        super().__init__()

        self.current_dimensions = IMAGE_INPUT_DIMENSIONS

        # Stage 1: 4 input channels -> 32 feature maps.
        self.conv_layer_1 = nn.Sequential(
            nn.Conv2d(in_channels=4, out_channels=32, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=3),
        )

        # Stage 2: 32 feature maps -> 16 feature maps.
        self.conv_layer_2 = nn.Sequential(
            nn.Conv2d(in_channels=32, out_channels=16, kernel_size=3),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=3),
        )

        self.flatten_layer = nn.Flatten()

        # 13248 = 16 * 23 * 36, the flattened output of the two stages for a
        # 218 x 336 input -- i.e. a 432 x 288 image after PREPROCESS_CROP.
        # Assumes every source image has that size; TODO confirm.
        self.linear_layer_1 = nn.Sequential(
            nn.Linear(in_features=13248, out_features=l1),
            nn.ReLU(),
        )

        self.linear_layer_2 = nn.Sequential(
            nn.Linear(in_features=l1, out_features=l2),
            nn.ReLU(),
        )

        self.classifier = nn.Linear(in_features=l2, out_features=10)

    def forward(self, x):
        """Run the batch ``x`` through the network and return the 10 class scores."""
        # Convolutional feature extraction.
        feature_maps = self.conv_layer_2(self.conv_layer_1(x))

        # Fully connected head and classifier.
        flat = self.flatten_layer(feature_maps)
        hidden = self.linear_layer_2(self.linear_layer_1(flat))
        return self.classifier(hidden)

In [11]:
# Print the name of every CUDA device PyTorch can see (prints nothing when
# there are none).
for device_index in range(torch.cuda.device_count()):
    device_properties = torch.cuda.get_device_properties(device_index)
    print(device_properties.name)

Create routines for training and validation. Perform hyperparameter tuning to find a model closer to optimal.

In [14]:
def train_mel_spec_model(config):
    """Build a model from one hyperparameter sample and train it.

    This is the function handed to Ray Tune as the trainable.

    Args:
        config: Hyperparameter dict for the trial. ``"l1"`` and ``"l2"`` are
            read here to size the model; the whole dict is forwarded to
            ``train_model``.
    """
    # nn.Module.to returns the module itself, so this can be chained.
    network = MelSpecTrainer(l1=config["l1"], l2=config["l2"]).to(DEVICE)
    spectrograms = ImageDataset()

    # The model is fitted on the training data, while Ray Tune relies on
    # the validation data to tune the hyperparameters.
    train_model(network, DEVICE, config, spectrograms)

## Testing

Create a routine for testing the model. The split used is 80% for training, 10% for validation, and 10% for testing.

In [None]:
def test_mel_spec_model(best_result):
    """Rebuild the best trial's architecture and run it through ``test_model``.

    Args:
        best_result: Result object of the winning trial. Its ``config``
            supplies the ``"l1"`` and ``"l2"`` layer widths; the object
            itself is passed on to ``test_model``.
    """
    winning_config = best_result.config
    # nn.Module.to returns the module itself, so this can be chained.
    best_model = MelSpecTrainer(
        l1=winning_config["l1"],
        l2=winning_config["l2"],
    ).to(DEVICE)

    spectrograms = ImageDataset()

    test_model(best_model, best_result, spectrograms, DEVICE)

# Main function

Here, we specify the ranges of the hyperparameters we want Ray Tune to search over, then run training with various hyperparameter combinations.

Finally, test the best model found by Ray Tune.

In [None]:
def run_mel_spec_classifier():
    """Tune the mel-spectrogram classifier with Ray Tune and test the winner.

    Defines the hyperparameter search space, runs 5 sampled trials of
    ``train_mel_spec_model`` under an ASHA scheduler minimising the reported
    ``loss``, then passes the best result to ``test_mel_spec_model``.
    """
    config = {
        "l1": tune.choice(list(range(100, 1000, 50))),
        "l2": tune.choice(list(range(5, 30, 5))),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice(list(range(5, 50, 5))),
        "num_epochs": tune.choice([25, 35, 45]),
    }

    # Only stop trials after at least 20 training iterations.
    asha_scheduler = ASHAScheduler(time_attr='training_iteration',
                                   grace_period=20)

    # Adjust resources depending on availability. A GPU is requested only
    # when one is visible: asking for a GPU on a CPU-only host would leave
    # the trials waiting for a resource that cannot be provided.
    trainable = tune.with_resources(
        tune.with_parameters(train_mel_spec_model),
        resources={"cpu": 2, "gpu": 1 if torch.cuda.is_available() else 0},
    )

    tuner = tune.Tuner(
        trainable,
        tune_config=tune.TuneConfig(
            metric='loss',
            mode="min",
            scheduler=asha_scheduler,
            num_samples=5,
        ),
        # The original was missing the comma after this argument, which made
        # the whole cell a SyntaxError.
        run_config=air.RunConfig(verbose=1),
        param_space=config,
    )

    results = tuner.fit()
    best_result = results.get_best_result("loss", "min")

    test_mel_spec_model(best_result)

# Entry point: run the hyperparameter search, then test the best trial.
run_mel_spec_classifier()