# WavClassifier

Use the WAV file directly; extract features with a CNN using 1DConv.
State-of-the-art audio classifiers use Mel-spectrograms, as described in `./mel_spec_classifier.ipynb`, but Mel-spectrograms do not preserve phase information.

In [ ]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import tempfile
from scipy.io import wavfile
import torchvision.transforms as transforms
import torch.utils.data as Data
import os
from PIL import ImageOps
from torch.utils.data import SubsetRandomSampler
from ray import train, tune
from ray.train import Checkpoint
from ray.tune.schedulers import ASHAScheduler

## Constant parameters used in training

Run `setup.sh` to mount Google Drive containing GTZAN

In [ ]:
# Directory that holds one sub-folder of audio files per genre.
GTZAN_WAV = "/content/drive/MyDrive/GTZAN/Data/genres_original/"

# Genre name -> integer class label; labels follow the order listed here.
GENRES = {
    name: label
    for label, name in enumerate((
        'blues', 'classical', 'country', 'disco', 'hiphop',
        'jazz', 'metal', 'pop', 'reggae', 'rock',
    ))
}

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Using device", DEVICE)

Create a `Dataset` for the audio files

In [ ]:
class WAVDataset(Data.Dataset):
    """In-memory dataset of raw WAV waveforms labelled by genre.

    The root directory is expected to contain one sub-directory per genre,
    each holding that genre's WAV files. Every file is decoded eagerly when
    the dataset is constructed.

    Args:
        root: Directory to scan. When omitted, ``GTZAN_WAV`` is used.

    Raises:
        KeyError: If a sub-directory containing files is not a key of
            ``GENRES``.

    Errors raised by ``os.listdir`` or ``scipy.io.wavfile.read`` (missing
    directory, unreadable file) propagate unchanged.
    """

    def __init__(self, root=None):
        super().__init__()
        root = GTZAN_WAV if root is None else root

        self.wav = []
        self.labels = []

        # Walk every genre folder and pair each waveform (X) with its genre
        # label (Y). Listings are sorted so the sample order is deterministic
        # across runs and file systems.
        for genre in sorted(os.listdir(root)):
            genre_dir = os.path.join(root, genre)
            for song in sorted(os.listdir(genre_dir)):
                # Read via the full path: the bare file name would be
                # resolved against the current working directory.
                abs_path = os.path.join(genre_dir, song)
                _, data = wavfile.read(abs_path)

                # Wrap the decoded samples (not the path) in a tensor; the
                # tensor keeps whatever dtype the file's samples were read as.
                self.wav.append(torch.from_numpy(data))
                # Convert genre tag to associated digit
                self.labels.append(GENRES[genre])

    def __len__(self):
        """Return the number of loaded audio files."""
        return len(self.wav)

    def __getitem__(self, idx):
        """Return the ``(waveform tensor, genre label)`` pair at ``idx``."""
        return self.wav[idx], self.labels[idx]

The `WavTrainer` model used is a CNN with 2 convolutional layers and 2 linear layers.
The justifications for the architecture are consistent with `mel_spec_classifier.ipynb`