In [1]:
import warnings
warnings.filterwarnings('ignore')

import random
SEED = 442
random.seed(SEED)

import os
import sys
import time
import numpy as np
import matplotlib.pyplot as plt

import librosa
import skimage.io as io

from tqdm import tqdm
from pathlib import Path
from sklearn.model_selection import train_test_split

import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
class MelSpecDataset(torch.utils.data.Dataset):
    """Map-style dataset that builds each mel-spectrogram sample on demand.

    Args:
        X: indexable collection of inputs; each item is handed to
            ``preprocess`` when the sample is requested.
        y: indexable collection of labels, aligned index-for-index with ``X``.
        transform: optional callable applied to the value returned by
            ``preprocess`` before the sample is handed back.
    """

    def __init__(self, X, y, transform=None):
        self.X, self.y, self.transform = X, y, transform

    def __len__(self):
        # The dataset size is driven by the inputs, not the labels.
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for position ``idx``."""
        # Nothing is cached: the input is run through ``preprocess`` on
        # every access.
        sample, target = preprocess(self.X[idx]), self.y[idx]
        if self.transform:
            sample = self.transform(sample)
        return sample, target


class MelSpecNet(nn.Module):
    """Small four-stage CNN classifier for single-channel spectrogram images.

    Every stage is ``BatchNorm2d -> Conv2d(3x3, no padding) -> ELU ->
    MaxPool2d(6, stride 2)``. The flattened features then go through dropout
    and one linear layer.

    Args:
        n_classes: number of output units of the final linear layer.
        dropout: drop probability applied to the flattened features.
        input_shape: ``(height, width)`` of the inputs the network will see.
            It is used only to size the linear layer. The default
            ``(512, 128)`` yields 16 * 26 * 2 = 832 features, the width that
            was previously hard-coded.

    Raises:
        ValueError: if ``input_shape`` is too small to survive the four
            conv/pool stages.
    """

    # Geometry of the feature extractor, shared by __init__ and _flat_features.
    _N_STAGES = 4
    _CONV_KERNEL = 3
    _POOL_KERNEL = 6
    _POOL_STRIDE = 2
    _OUT_CHANNELS = 16

    def __init__(self, n_classes, dropout=0.8, input_shape=(512, 128)):
        super().__init__()
        # Layers are created in the original order so that seeded weight
        # initialisation and state_dict keys stay the same.
        self.bn1 = nn.BatchNorm2d(1)
        self.conv1 = nn.Conv2d(1, 4, self._CONV_KERNEL)
        self.bn2 = nn.BatchNorm2d(4)
        self.conv2 = nn.Conv2d(4, 8, self._CONV_KERNEL)
        self.bn3 = nn.BatchNorm2d(8)
        self.conv3 = nn.Conv2d(8, 16, self._CONV_KERNEL)
        self.bn4 = nn.BatchNorm2d(16)
        self.conv4 = nn.Conv2d(16, self._OUT_CHANNELS, self._CONV_KERNEL)

        self.act = nn.ELU()
        self.pool = nn.MaxPool2d(self._POOL_KERNEL, self._POOL_STRIDE)
        self.dropout = nn.Dropout(p=dropout)
        # Sized from the input shape rather than a literal, so a change to
        # the preprocessing no longer causes a matmul shape error at run time.
        self.fc = nn.Linear(self._flat_features(input_shape), n_classes)

    @classmethod
    def _flat_features(cls, input_shape):
        """Return the length of the flattened feature vector for ``input_shape``."""
        height, width = input_shape
        reduced = []
        for size in (height, width):
            for _ in range(cls._N_STAGES):
                size -= cls._CONV_KERNEL - 1  # unpadded convolution
                # MaxPool2d in its default floor mode.
                size = (size - cls._POOL_KERNEL) // cls._POOL_STRIDE + 1
                if size < 1:
                    raise ValueError(
                        f"input_shape {tuple(input_shape)} is too small for "
                        f"{cls._N_STAGES} conv/pool stages")
            reduced.append(size)
        return cls._OUT_CHANNELS * reduced[0] * reduced[1]

    def forward(self, x):
        """Map a ``(batch, 1, H, W)`` tensor to ``(batch, n_classes)`` scores."""
        x = self.pool(self.act(self.conv1(self.bn1(x))))
        x = self.pool(self.act(self.conv2(self.bn2(x))))
        x = self.pool(self.act(self.conv3(self.bn3(x))))
        x = self.pool(self.act(self.conv4(self.bn4(x))))

        # Keep the batch dimension and flatten everything else.
        x = torch.flatten(x, 1)
        x = self.fc(self.dropout(x))
        return x


def scale(x, _min=0.0, _max=1.0):
    """Linearly rescale ``x`` so its smallest value maps to ``_min`` and its largest to ``_max``.

    A constant input (for example the spectrogram of a silent clip) has no
    spread to normalise. Every element is then mapped to ``_min`` instead of
    producing NaN from a 0/0 division.

    Args:
        x: array to rescale; must be non-empty.
        _min: lower bound of the output range.
        _max: upper bound of the output range.

    Returns:
        Array of the same shape as ``x`` with values in ``[_min, _max]``.
    """
    lowest = x.min()
    span = x.max() - lowest
    if span == 0:
        # (x - lowest) is all zeros here, so any non-zero divisor gives 0.
        span = 1
    std = (x - lowest) / span
    return std * (_max - _min) + _min


def preprocess(file_path, sr=None, start_bin=0, sample_duration=15,
               n_fft=1024, hop_length=512, n_mels=128, max_frames=512,
               scale_int8=True):
    """Load an audio file and convert it to a fixed-size log-mel spectrogram.

    The clip is cut (or zero-padded) to exactly ``sample_duration`` seconds
    starting at ``start_bin`` seconds, then peak-normalised. It is converted
    to a mel power spectrogram in dB and finally cut (or zero-padded) to
    ``max_frames`` time frames.

    Args:
        file_path: audio file, passed to ``librosa.load``.
        sr: sampling rate passed to ``librosa.load``.
        start_bin: start of the excerpt, in seconds.
        sample_duration: length of the excerpt, in seconds.
        n_fft: FFT window size for the mel spectrogram.
        hop_length: hop between successive frames, in samples.
        n_mels: number of mel bands.
        max_frames: number of time frames in the returned array.
        scale_int8: if true, min-max scale the result to 0..255 and cast it
            to ``uint8``; otherwise return the dB values unchanged.

    Returns:
        Array of shape ``(max_frames, n_mels)``.
    """
    wave_data, sr = librosa.load(file_path, sr=sr, dtype=np.float32)

    # Slice bounds must be integers even when the caller gives fractional
    # seconds.
    start_sample = int(start_bin * sr)
    max_length = int(sample_duration * sr)

    wave_data = wave_data[start_sample: start_sample + max_length]
    missing = max_length - wave_data.shape[0]
    if missing > 0:
        # Short clip: append silence so every excerpt has the same length.
        wave_data = np.pad(wave_data, (0, missing))

    wave_data = librosa.util.normalize(wave_data)
    # `y` is passed by keyword because recent librosa releases reject a
    # positional waveform argument.
    mel_spec = librosa.feature.melspectrogram(
        y=wave_data, sr=sr, n_fft=n_fft, hop_length=hop_length,
        n_mels=n_mels)
    mel_spec = librosa.power_to_db(mel_spec)
    mel_spec = np.transpose(mel_spec)  # -> (frames, n_mels)

    missing_frames = max_frames - mel_spec.shape[0]
    if missing_frames > 0:
        # NOTE(review): padded rows are 0 dB, which is not necessarily the
        # quietest value in the spectrogram. Consider padding with
        # mel_spec.min() instead, and confirm before changing the features.
        mel_spec = np.vstack(
            (mel_spec, np.zeros((missing_frames, mel_spec.shape[1]))))
    elif missing_frames < 0:
        mel_spec = mel_spec[:max_frames, :]
    if scale_int8:
        mel_spec = scale(mel_spec, 0, 255).astype(np.uint8)
    return mel_spec


def transform_dataset(data_dir, output_dir):
    """Precompute spectrograms for a ``<data_dir>/<class>/<audio file>`` tree.

    Each audio file is run through ``preprocess`` and the result is written
    to ``<output_dir>/<class>/<file stem>.npy``.

    Args:
        data_dir: root folder with one sub-folder per class (``str`` or
            ``Path``).
        output_dir: root folder for the ``.npy`` files (``str`` or ``Path``);
            it is created if missing.

    Returns:
        ``(X, y)``: the saved ``.npy`` paths and their class indices. An
        index is the position of the class folder in sorted order.
    """
    # Accept plain strings as well as Path objects.
    data_dir, output_dir = Path(data_dir), Path(output_dir)
    # Only sub-folders are classes. Stray files in data_dir are skipped, so
    # they neither crash the listing nor shift the label indices.
    class_list = sorted(p.name for p in data_dir.iterdir() if p.is_dir())
    X, y = [], []
    for i, g in enumerate(class_list):
        print(g)
        genre_dir = data_dir / g

        out_dir = output_dir / g
        out_dir.mkdir(exist_ok=True, parents=True)
        for f in tqdm(sorted(os.listdir(genre_dir))):
            mel_spec = preprocess(genre_dir / f)
            save_path = out_dir / (os.path.splitext(f)[0] + ".npy")
            np.save(save_path, mel_spec)

            X.append(save_path)
            y.append(i)
    return X, y

In [32]:
def evaluate(model, loader, class_list):
    """Print per-class accuracy of ``model`` over ``loader`` and return the mean.

    If ``model`` is an ``nn.Module`` it is switched to evaluation mode for
    the duration of the pass and restored afterwards. Dropout and batch
    statistics therefore do not distort the measurement.

    Args:
        model: callable that maps a batch ``x`` to per-class scores of shape
            ``(batch, n_classes)``.
        loader: iterable of ``(x, y)`` batches, where ``y`` holds integer
            class indices.
        class_list: class names; ``class_list[k]`` names class index ``k``.

    Returns:
        Mean of the per-class accuracies in percent. Classes with no samples
        are left out of the mean. The result is ``nan`` if no class had any
        sample.
    """
    correct_pred = {c: 0 for c in class_list}
    total_pred = {c: 0 for c in class_list}

    is_module = isinstance(model, torch.nn.Module)
    was_training = model.training if is_module else False
    if is_module:
        model.eval()
    try:
        with torch.no_grad():
            for (x, y) in loader:
                out = model(x)
                _, preds = torch.max(out, 1)
                for label, pred in zip(y, preds):
                    # Plain ints make the list lookup and comparison explicit.
                    label, pred = int(label), int(pred)
                    name = class_list[label]
                    if label == pred:
                        correct_pred[name] += 1
                    total_pred[name] += 1
    finally:
        if is_module:
            # Hand the model back in the mode the caller left it in.
            model.train(was_training)

    total_acc = []
    for c, correct in correct_pred.items():
        if total_pred[c] == 0:
            # Avoid a ZeroDivisionError for classes absent from the loader.
            print(f'Accuracy for class: {c:5s} is n/a (no samples)')
            continue
        acc = 100 * float(correct) / total_pred[c]
        print(f'Accuracy for class: {c:5s} is {acc:.1f} %')
        total_acc.append(acc)
    mean_acc = np.mean(total_acc) if total_acc else float("nan")
    print(f"Average accuracy: {mean_acc:.1f} %")
    return mean_acc


def train(data_dir="/home/s2210421/datasets/gtzan/genres", n_epochs=80,
          batch_size=32, lr=3e-4, log_step=5, seed=442,
          model_path="gtzan_model.pt"):
    """Train ``MelSpecNet`` on a ``<data_dir>/<genre>/<audio file>`` tree and report accuracy.

    The files are split 80/20 (stratified by genre). The model is trained
    with Adam and cross-entropy loss, its weights are saved to
    ``model_path``, and per-class accuracy is printed for both splits.

    Args:
        data_dir: dataset root with one sub-folder per genre. The default is
            a machine-specific absolute path; pass your own location.
        n_epochs: number of passes over the training split.
        batch_size: batch size for both loaders.
        lr: Adam learning rate.
        log_step: number of steps between progress-bar loss updates.
        seed: seed for ``random``, NumPy, PyTorch and the train/test split.
        model_path: destination of the saved ``state_dict``.
    """
    random.seed(seed)
    np.random.seed(seed)
    # Weight init, dropout masks and DataLoader shuffling all draw from
    # torch's RNG, so it is seeded here to make runs repeatable.
    torch.manual_seed(seed)

    data_dir = Path(data_dir)
    # Only sub-folders are genres; stray files in the root are ignored.
    genre_list = sorted(p.name for p in data_dir.iterdir() if p.is_dir())
    num_classes = len(genre_list)

    X, y = [], []
    for i, g in enumerate(genre_list):
        genre_dir = data_dir / g
        for f in sorted(os.listdir(genre_dir)):
            X.append(genre_dir / f)
            y.append(i)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed, stratify=y)
    print(f"Train: {len(X_train)} - Test: {len(X_test)}")

    transform = transforms.Compose([
        transforms.ToTensor(),
    ])
    train_dataset = MelSpecDataset(X_train, y_train, transform=transform)
    test_dataset = MelSpecDataset(X_test, y_test, transform=transform)
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=8)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, num_workers=4)

    model = MelSpecNet(num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(n_epochs):
        model.train()
        running_loss = 0
        # `total` is given explicitly because enumerate() hides the loader's
        # length from tqdm.
        pbar = tqdm(enumerate(train_loader, 1), total=len(train_loader))
        for step, (inputs, targets) in pbar:
            optimizer.zero_grad()

            out = model(inputs)
            loss = criterion(out, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if step % log_step == 0:
                pbar.set_postfix({"Epoch": epoch + 1, "step": step, "loss": running_loss / log_step})
                running_loss = 0
    torch.save(model.state_dict(), model_path)

    # Measure accuracy with dropout off and BatchNorm using running stats.
    model.eval()
    print("Train set accuracy: ")
    evaluate(model, train_loader, genre_list)
    print("Test set accuracy: ")
    evaluate(model, test_loader, genre_list)

0it [00:02, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x832 and 1296x10)