# WavClassifier

Use the WAV file directly; extract features with a CNN using 1DConv.
State-of-the-art audio classifiers use Mel-spectrograms, as described in `./mel_spec_classifier.ipynb`, but these do not preserve phase information.

In [None]:
!pip install "ray[tune]"
import torch
import torch.nn as nn
import torch.optim as optim
import random
import tempfile
from scipy.io import wavfile
import torchvision.transforms as transforms
import torch.utils.data as Data
import os
from PIL import ImageOps
from torch.utils.data import SubsetRandomSampler
from ray import train, tune
from ray.train import Checkpoint
from ray.tune.schedulers import ASHAScheduler

## Mount drive
Mount Google Drive if running on Google Colab.

In [None]:
# Colab-specific cell: `google.colab` is expected to be importable only inside
# a Google Colab runtime, so this cell is meant to be run there.
# Mounts Google Drive at /content/drive; the GTZAN_WAV path used by this
# notebook points inside that mount.
from google.colab import drive
drive.mount('/content/drive')

## Constant parameters used in training

Run `setup.sh` to mount Google Drive containing GTZAN

In [None]:
# Root folder of the GTZAN WAV files: one sub-directory per genre.
GTZAN_WAV = "/content/drive/MyDrive/GTZAN/Data/genres_original/"

# Genre name -> integer class label; the label is the position of the name in
# the tuple below.
_GENRE_NAMES = (
    'blues', 'classical', 'country', 'disco', 'hiphop',
    'jazz', 'metal', 'pop', 'reggae', 'rock',
)
GENRES = {name: label for label, name in enumerate(_GENRE_NAMES)}

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Using device", DEVICE)

## Training

Create a `Dataset` for the audio files

In [None]:
class WAVDataset(Data.Dataset):
    """In-memory dataset of GTZAN waveforms paired with integer genre labels.

    Every file under ``GTZAN_WAV/<genre>/`` is read eagerly at construction
    time with ``scipy.io.wavfile.read``. The sample rate returned by the
    reader is discarded; only the sample array is kept.

    Raises:
        KeyError: if a directory name under ``GTZAN_WAV`` is not a key of
            ``GENRES``.
    """

    def __init__(self):
        self.wav = []     # one tensor of raw samples per song
        self.labels = []  # GENRES label of the song at the same position

        # os.listdir returns entries in arbitrary order. Sorting both levels
        # keeps the index -> song mapping stable across runs and machines, so
        # a seeded, index-based split always selects the same songs.
        for genre in sorted(os.listdir(GTZAN_WAV)):
            genre_dir = os.path.join(GTZAN_WAV, genre)
            for song in sorted(os.listdir(genre_dir)):
                _, data = wavfile.read(os.path.join(genre_dir, song))

                # Wrap the sample array as a tensor. The dtype is whatever the
                # WAV reader produced; no scaling or channel axis is added.
                self.wav.append(torch.from_numpy(data))
                # Convert the genre directory name to its integer label.
                self.labels.append(GENRES[genre])

    def __len__(self):
        """Return the number of songs loaded."""
        return len(self.wav)

    def __getitem__(self, idx):
        """Return the ``(waveform_tensor, genre_label)`` pair at ``idx``."""
        return self.wav[idx], self.labels[idx]

# Builds the dataset once; WAVDataset.__init__ reads every WAV file under
# GTZAN_WAV into memory at this point.
# NOTE(review): despite the name, this holds audio waveforms, not images.
image_dataset = WAVDataset()

The `WavTrainer` model is a CNN with 2 convolutional layers and 2 hidden linear layers, followed by a final linear classification layer.

Each `wav` file is 30 seconds long and sampled at 22050 Hz, so each datapoint has ~661500 samples. We estimate, from a human listener's standpoint, how long 'musical features' need to be in order to differentiate genres: the smallest 'features' seem to be distinguishable within a significant fraction of a second.

So, the receptive field of the convolutional layer of the CNN should cover
a significant fraction of a second.

What is a significant fraction of a second? That is for hyperparameter tuning to decide. The conclusion, however, is that very small kernel sizes (such as the 3 commonly used in `2DConv`) should not be applied here, since we would learn very little about the song's features from essentially 0.0001 seconds of audio.


In [None]:
class WavTrainer(nn.Module):
    """1D-CNN classifier that operates directly on raw waveform samples.

    Two ``Conv1d -> ReLU -> MaxPool1d`` stages feed a flatten step, two hidden
    fully-connected layers with ReLU, and a final linear layer producing 10
    unnormalized class scores (no softmax is applied here).

    Args:
        l1: Width of the first hidden linear layer.
        l2: Width of the second hidden linear layer.
    """

    def __init__(self, l1: int = 256, l2: int = 20):
        super().__init__()

        # Stage 1: single input channel -> 32 feature maps, kernel of 20
        # samples, then 10x temporal down-sampling.
        self.conv_layer_1 = nn.Sequential(
            nn.Conv1d(1, 32, 20),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=10, stride=10),
        )

        # Stage 2: 32 -> 16 feature maps, same kernel and pooling sizes.
        self.conv_layer_2 = nn.Sequential(
            nn.Conv1d(32, 16, 20),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=10, stride=10),
        )

        self.flatten_layer = nn.Flatten()

        # 13248 = 16 channels x 828 time steps after the second pooling stage.
        # NOTE(review): that corresponds to inputs of 83,009-83,108 samples,
        # not a full ~661,500-sample clip -- confirm the waveforms are
        # cropped/resampled to that length before reaching the model.
        self.linear_layer_1 = nn.Sequential(nn.Linear(13248, l1),
                                            nn.ReLU())

        self.linear_layer_2 = nn.Sequential(nn.Linear(l1, l2),
                                            nn.ReLU())

        # One output score per class (10 classes).
        self.classifier = nn.Linear(l2, 10)

    # `forward` must be a method of the class: nn.Module.__call__ dispatches
    # to self.forward, and a module-level function is never picked up.
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Compute class scores for a batch of waveforms.

        Args:
            x: Tensor shaped ``(batch, 1, num_samples)`` whose dtype matches
                the layer parameters (float32 by default).

        Returns:
            Tensor shaped ``(batch, 10)`` with unnormalized class scores.
        """
        # First 1D convolution layer
        x = self.conv_layer_1(x)
        # Second 1D convolution layer
        x = self.conv_layer_2(x)

        # Linear layers and classifier
        x = self.flatten_layer(x)
        x = self.linear_layer_1(x)
        x = self.linear_layer_2(x)
        x = self.classifier(x)

        return x

Split into test/train/validation

In [None]:
def dataset_split(wav_dataset):
    """Split the dataset's indices 80/10/10 into train/validation/test.

    The indices ``0 .. len(wav_dataset) - 1`` are shuffled after seeding the
    global ``random`` module with 42, so the partition is the same on every
    call for a given dataset length.

    Args:
        wav_dataset: Any object supporting ``len()``.

    Returns:
        A tuple ``(test_indices, train_indices, validation_indices)`` of
        index lists -- note the order: test first, then train, then
        validation.
    """
    total = len(wav_dataset)
    order = list(range(total))

    # Seeds the module-level RNG (a global side effect) for a repeatable shuffle.
    random.seed(42)
    random.shuffle(order)

    train_end = int(total * 0.8)
    validation_end = train_end + int(total * 0.1)

    train_indices = order[:train_end]
    validation_indices = order[train_end:validation_end]
    # Whatever is left after train and validation becomes the test set.
    test_indices = order[validation_end:]

    return test_indices, train_indices, validation_indices