In [1]:
from mlp64 import data
from mlp64 import experiment
from mlp64 import models
from mlp64 import st
from mlp64 import resnet2d
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
from torch.utils.data import DataLoader
import torch
import torchaudio as ta
import librosa
import scipy.fft as scipyfft
librosa.set_fftlib(scipyfft)

In [3]:
import torch
import torch.nn as nn


# Names this cell treats as its public API: the VGG class plus one factory
# function per architecture variant defined further down.
__all__ = [
    'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn',
    'vgg19_bn', 'vgg19',
]


# Checkpoint download URLs keyed by architecture name. `_vgg` looks up
# model_urls[arch] when it is called with pretrained=True.
model_urls = {
    'vgg11': 'https://download.pytorch.org/models/vgg11-bbd30ac9.pth',
    'vgg13': 'https://download.pytorch.org/models/vgg13-c768596a.pth',
    'vgg16': 'https://download.pytorch.org/models/vgg16-397923af.pth',
    'vgg19': 'https://download.pytorch.org/models/vgg19-dcbb9e9d.pth',
    'vgg11_bn': 'https://download.pytorch.org/models/vgg11_bn-6002323d.pth',
    'vgg13_bn': 'https://download.pytorch.org/models/vgg13_bn-abd245e5.pth',
    'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth',
    'vgg19_bn': 'https://download.pytorch.org/models/vgg19_bn-c79401a0.pth',
}


class VGG(nn.Module):
    """VGG classifier: feature extractor, 7x7 adaptive average pool, MLP head.

    Args:
        features: module applied to the input first. Its output is pooled to
            7x7 and flattened, so it has to produce 512 channels to match the
            first linear layer (512 * 7 * 7 inputs).
        num_classes: output width of the last linear layer.
        init_weights: if true, ``_initialize_weights`` is run once at the end
            of construction.
    """

    def __init__(self, features, num_classes=1000, init_weights=True):
        super().__init__()
        # Registration order (features, avgpool, classifier) fixes the order
        # of modules() and of the state_dict keys.
        self.features = features
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))

        flat_width = 512 * 7 * 7
        hidden_width = 4096
        head = [
            nn.Linear(flat_width, hidden_width),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(hidden_width, hidden_width),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(hidden_width, num_classes),
        ]
        self.classifier = nn.Sequential(*head)

        if init_weights:
            self._initialize_weights()

    def forward(self, x):
        """Run features -> avgpool -> flatten (from dim 1) -> classifier."""
        pooled = self.avgpool(self.features(x))
        return self.classifier(torch.flatten(pooled, 1))

    def _initialize_weights(self):
        """Re-initialise every Conv2d, BatchNorm2d and Linear submodule.

        Conv2d: Kaiming-normal weights (fan_out, relu), bias zeroed if present.
        BatchNorm2d: weight set to 1, bias set to 0.
        Linear: weights drawn from N(0, 0.01), bias set to 0.
        """
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, 0, 0.01)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.BatchNorm2d):
                nn.init.constant_(module.weight, 1)
                nn.init.constant_(module.bias, 0)
                continue
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)


def make_layers(cfg, batch_norm=False, in_channels=1):
    """Build the convolutional feature extractor described by ``cfg``.

    Args:
        cfg: iterable whose entries are either the string ``'M'`` (append a
            2x2, stride-2 max-pool) or an int (append a 3x3, padding-1
            convolution with that many output channels, followed by ReLU).
        batch_norm: if true, insert ``BatchNorm2d`` between every convolution
            and its ReLU.
        in_channels: channel count of the network input. Defaults to 1, the
            value that used to be hard-coded here, so existing callers are
            unaffected; pass 3 for RGB-style input.

    Returns:
        An ``nn.Sequential`` holding the layers in ``cfg`` order.
    """
    layers = []
    for v in cfg:
        if v == 'M':
            layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
            continue
        layers.append(nn.Conv2d(in_channels, v, kernel_size=3, padding=1))
        if batch_norm:
            layers.append(nn.BatchNorm2d(v))
        layers.append(nn.ReLU(inplace=True))
        # The next convolution consumes what this one produced.
        in_channels = v
    return nn.Sequential(*layers)


# Layer layouts consumed by make_layers, keyed by configuration letter.
# An int is the output channel count of a 3x3 convolution; 'M' is a max-pool.
# The factory functions below map 'A' -> vgg11, 'B' -> vgg13, 'D' -> vgg16,
# 'E' -> vgg19 (and their *_bn variants).
cfgs = {
    'A': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'B': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'D': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'E': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


def _vgg(arch, cfg, batch_norm, pretrained, progress, **kwargs):
    """Construct a VGG variant and optionally load a downloaded checkpoint.

    Args:
        arch: key into ``model_urls`` selecting the checkpoint to download.
        cfg: key into ``cfgs`` selecting the layer layout.
        batch_norm: forwarded to ``make_layers``.
        pretrained: if true, skip random weight initialisation and load the
            state dict downloaded from ``model_urls[arch]``.
        progress: forwarded to ``load_state_dict_from_url``.
        **kwargs: forwarded to the ``VGG`` constructor (e.g. ``num_classes``).

    Returns:
        The constructed ``VGG`` model.

    NOTE(review): ``make_layers`` defaults to a single input channel in this
    file, and callers may pass a non-default ``num_classes``; the downloaded
    checkpoints presumably have different tensor shapes in those cases, in
    which case ``load_state_dict`` will raise. Confirm before relying on
    ``pretrained=True``.
    """
    if pretrained:
        # The checkpoint overwrites every weight, so random init is wasted work.
        kwargs['init_weights'] = False
    model = VGG(make_layers(cfgs[cfg], batch_norm=batch_norm), **kwargs)
    if pretrained:
        # Imported here because nothing at file level brings this name into
        # scope; without it the pretrained path fails with NameError.
        from torch.hub import load_state_dict_from_url

        state_dict = load_state_dict_from_url(model_urls[arch],
                                              progress=progress)
        model.load_state_dict(state_dict)
    return model


def vgg11(pretrained=False, progress=True, **kwargs):
    r"""Build the 11-layer VGG (configuration "A") without batch normalization.

    Architecture from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_

    Args:
        pretrained (bool): If True, load the checkpoint registered under ``'vgg11'``
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded to the ``VGG`` constructor
    """
    return _vgg(arch='vgg11', cfg='A', batch_norm=False,
                pretrained=pretrained, progress=progress, **kwargs)


def vgg11_bn(pretrained=False, progress=True, **kwargs):
    r"""Build the 11-layer VGG (configuration "A") with batch normalization.

    Architecture from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_

    Args:
        pretrained (bool): If True, load the checkpoint registered under ``'vgg11_bn'``
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded to the ``VGG`` constructor
    """
    return _vgg(arch='vgg11_bn', cfg='A', batch_norm=True,
                pretrained=pretrained, progress=progress, **kwargs)


def vgg13(pretrained=False, progress=True, **kwargs):
    r"""Build the 13-layer VGG (configuration "B") without batch normalization.

    Architecture from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_

    Args:
        pretrained (bool): If True, load the checkpoint registered under ``'vgg13'``
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded to the ``VGG`` constructor
    """
    return _vgg(arch='vgg13', cfg='B', batch_norm=False,
                pretrained=pretrained, progress=progress, **kwargs)


def vgg13_bn(pretrained=False, progress=True, **kwargs):
    r"""Build the 13-layer VGG (configuration "B") with batch normalization.

    Architecture from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_

    Args:
        pretrained (bool): If True, load the checkpoint registered under ``'vgg13_bn'``
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded to the ``VGG`` constructor
    """
    return _vgg(arch='vgg13_bn', cfg='B', batch_norm=True,
                pretrained=pretrained, progress=progress, **kwargs)


def vgg16(pretrained=False, progress=True, **kwargs):
    r"""Build the 16-layer VGG (configuration "D") without batch normalization.

    Architecture from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_

    Args:
        pretrained (bool): If True, load the checkpoint registered under ``'vgg16'``
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded to the ``VGG`` constructor
    """
    return _vgg(arch='vgg16', cfg='D', batch_norm=False,
                pretrained=pretrained, progress=progress, **kwargs)


def vgg16_bn(pretrained=False, progress=True, **kwargs):
    r"""Build the 16-layer VGG (configuration "D") with batch normalization.

    Architecture from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_

    Args:
        pretrained (bool): If True, load the checkpoint registered under ``'vgg16_bn'``
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded to the ``VGG`` constructor
    """
    return _vgg(arch='vgg16_bn', cfg='D', batch_norm=True,
                pretrained=pretrained, progress=progress, **kwargs)


def vgg19(pretrained=False, progress=True, **kwargs):
    r"""Build the 19-layer VGG (configuration "E") without batch normalization.

    Architecture from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_

    Args:
        pretrained (bool): If True, load the checkpoint registered under ``'vgg19'``
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded to the ``VGG`` constructor
    """
    return _vgg(arch='vgg19', cfg='E', batch_norm=False,
                pretrained=pretrained, progress=progress, **kwargs)


def vgg19_bn(pretrained=False, progress=True, **kwargs):
    r"""Build the 19-layer VGG (configuration "E") with batch normalization.

    Architecture from `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`_

    Args:
        pretrained (bool): If True, load the checkpoint registered under ``'vgg19_bn'``
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded to the ``VGG`` constructor
    """
    return _vgg(arch='vgg19_bn', cfg='E', batch_norm=True,
                pretrained=pretrained, progress=progress, **kwargs)

In [4]:
# Input front-ends tried earlier, kept for reference:
#transform = lambda y: torch.as_tensor(np.abs(librosa.cqt(y.numpy().ravel(), sr=16000, hop_length=192))[np.newaxis, :])
#transform = ta.transforms.MelSpectrogram(n_fft=800, hop_length=160)
#transform = lambda y: torch.as_tensor(st.stft(y.numpy(), n_fft, hop_length))[None, :]

# STFT parameters used by the active transform below.
n_fft = 512
hop_length = 128
window = torch.hann_window(n_fft)


def transform(x):
    """Return element 0 of ``st.stft_torch(x, n_fft, hop_length, window)``.

    A named function instead of a lambda bound to a name (PEP 8 E731), so
    tracebacks and reprs show ``transform``. ``n_fft``, ``hop_length`` and
    ``window`` are still looked up as globals at call time, as before.
    """
    return st.stft_torch(x, n_fft, hop_length, window)[0]

In [5]:
# Stage 1 data: the NSynth "test" split, itself divided into train/test frames.
path = Path("/home/tim/Desktop/MLP64/dataset/nsynth-test/")
vdf = data.create_dataset_df(path / "examples.json")
# Column used as the label; also passed to get_train_test for the split.
target = "instrument_class"
trdf, tedf = data.get_train_test(vdf, target)
# NOTE(review): both datasets point at cache="cache2" with overwrite=True --
# confirm against CachedNSynth that they do not clobber each other's cache.
trds = data.CachedNSynth(path / "audio", trdf, target_field=target, transform=transform, cache="cache2", overwrite=True)
teds = data.CachedNSynth(path / "audio", tedf, target_field=target, transform=transform, cache="cache2", overwrite=True)

batch_size = 32
# Only the training loader shuffles; the evaluation loader keeps dataset order.
trloader = DataLoader(trds, batch_size=batch_size, num_workers=6, shuffle=True)
teloader = DataLoader(teds, batch_size=batch_size, num_workers=6)

JSON loaded into DataFrame!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["instrument_class"] = df.apply(lambda x: x["instrument_source"] * 11 + x["instrument_family"], axis=1)


In [6]:
# Stage 2 data: the NSynth "valid" split, itself divided into train/test frames.
path = Path("/home/tim/Desktop/MLP64/dataset/nsynth-valid/")
vdf = data.create_dataset_df(path / "examples.json")
# Column used as the label; also passed to get_train_test for the split.
target = "instrument_class"
trdf, tedf = data.get_train_test(vdf, target)
# NOTE(review): both datasets point at cache="cache" with overwrite=True --
# confirm against CachedNSynth that they do not clobber each other's cache.
trds2 = data.CachedNSynth(path / "audio", trdf, target_field=target, transform=transform, cache="cache", overwrite=True)
teds2 = data.CachedNSynth(path / "audio", tedf, target_field=target, transform=transform, cache="cache", overwrite=True)

batch_size = 128
# The training loader now shuffles, matching the other two training loaders in
# this notebook; it was the only one built with shuffle=False. The recorded
# output of the stage-2 run predates this change.
trloader2 = DataLoader(trds2, batch_size=batch_size, num_workers=6, shuffle=True)
teloader2 = DataLoader(teds2, batch_size=batch_size, num_workers=6)

JSON loaded into DataFrame!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["instrument_class"] = df.apply(lambda x: x["instrument_source"] * 11 + x["instrument_family"], axis=1)


In [None]:
# Stage 3 data: the NSynth "train" split, itself divided into train/test frames.
path = Path("/home/tim/Desktop/MLP64/dataset/nsynth-train/")
vdf = data.create_dataset_df(path / "examples.json")
# NOTE(review): the label here is "instrument_family", whereas the two smaller
# dataset cells use "instrument_class" -- confirm this switch is intended when
# the same model is trained on all three in sequence.
target = "instrument_family"
trdf, tedf = data.get_train_test(vdf, target)
# Uses data.NSynth rather than data.CachedNSynth (no cache arguments here).
trds3 = data.NSynth(path / "audio", trdf, target_field=target, transform=transform)
teds3 = data.NSynth(path / "audio", tedf, target_field=target, transform=transform)

batch_size = 64
# Only the training loader shuffles; the evaluation loader keeps dataset order.
trloader3 = DataLoader(trds3, batch_size=batch_size, num_workers=6, shuffle=True)
teloader3 = DataLoader(teds3, batch_size=batch_size, num_workers=6)

In [7]:
# Alternative 1 (disabled): 1-D resnet18 from mlp64.models. n_fft_bins=257 is
# the value that goes with n_fft=512, hop_length=128.
#model = models.resnet18(num_classes=33, n_fft_bins=257)
# Active model: 2-D ResNet with one BasicBlock per stage, 33 output classes,
# and InstanceNorm2d as the normalisation layer.
model = resnet2d.ResNet(resnet2d.BasicBlock, [1, 1, 1, 1], num_classes=33, norm_layer=nn.InstanceNorm2d)
# Alternative 2 (disabled): the VGG-11 defined above, with 11 output classes.
#model = vgg11(pretrained=False, num_classes=11)

In [8]:
# Stage 1: train on the NSynth test dataset (the smallest of the three).
# NOTE(review): the third positional argument (10) is presumably an epoch
# count and continue_from_epoch=-1 presumably means "start fresh" -- confirm
# against experiment.Experiment. The recorded output below runs past epoch 15
# before being interrupted by hand.
exp = experiment.Experiment(model, "../experiments/classifier_instancenorm2d/", 10, trloader, teloader, continue_from_epoch=-1)
exp.run_experiment()

  0%|          | 0/80 [00:00<?, ?it/s]

Use GPU 0


loss: 1.4101, accuracy: 0.4444: 100%|██████████| 80/80 [00:22<00:00,  3.48it/s]
loss: 1.1452, accuracy: 0.7586: 100%|██████████| 20/20 [00:06<00:00,  2.92it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 0: train_acc_0.3993_train_loss_1.9494_val_acc_0.6364_val_loss_1.3235 epoch time 29.8503 seconds


loss: 0.8215, accuracy: 0.7778: 100%|██████████| 80/80 [00:10<00:00,  7.36it/s]
loss: 0.5563, accuracy: 0.7586: 100%|██████████| 20/20 [00:01<00:00, 17.40it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 1: train_acc_0.6750_train_loss_1.0344_val_acc_0.7457_val_loss_0.7734 epoch time 12.0177 seconds


loss: 0.2598, accuracy: 0.9444: 100%|██████████| 80/80 [00:10<00:00,  7.30it/s]
loss: 0.2239, accuracy: 1.0000: 100%|██████████| 20/20 [00:01<00:00, 17.20it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 2: train_acc_0.8470_train_loss_0.5859_val_acc_0.9344_val_loss_0.3726 epoch time 12.1204 seconds


loss: 0.2392, accuracy: 0.9444: 100%|██████████| 80/80 [00:11<00:00,  7.25it/s]
loss: 0.1588, accuracy: 1.0000: 100%|██████████| 20/20 [00:01<00:00, 16.77it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 3: train_acc_0.9302_train_loss_0.2930_val_acc_0.9375_val_loss_0.2682 epoch time 12.2289 seconds


loss: 0.0642, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.25it/s]
loss: 0.0904, accuracy: 0.9655: 100%|██████████| 20/20 [00:01<00:00, 17.11it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 4: train_acc_0.9590_train_loss_0.1766_val_acc_0.9608_val_loss_0.1528 epoch time 12.2047 seconds


loss: 0.0499, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.21it/s]
loss: 0.0518, accuracy: 1.0000: 100%|██████████| 20/20 [00:01<00:00, 17.45it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 5: train_acc_0.9703_train_loss_0.1277_val_acc_0.9766_val_loss_0.1013 epoch time 12.2501 seconds


loss: 0.0408, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.23it/s]
loss: 0.0643, accuracy: 0.9655: 100%|██████████| 20/20 [00:01<00:00, 16.79it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 6: train_acc_0.9867_train_loss_0.0629_val_acc_0.9639_val_loss_0.0988 epoch time 12.2608 seconds


loss: 0.0730, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.25it/s]
loss: 0.1381, accuracy: 0.9655: 100%|██████████| 20/20 [00:01<00:00, 17.43it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 7: train_acc_0.9891_train_loss_0.0544_val_acc_0.9373_val_loss_0.1693 epoch time 12.1820 seconds


loss: 0.0133, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.22it/s]
loss: 0.0154, accuracy: 1.0000: 100%|██████████| 20/20 [00:01<00:00, 17.14it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 8: train_acc_0.9941_train_loss_0.0406_val_acc_0.9922_val_loss_0.0420 epoch time 12.2442 seconds


loss: 0.0128, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.24it/s]
loss: 0.0042, accuracy: 1.0000: 100%|██████████| 20/20 [00:01<00:00, 17.27it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 9: train_acc_0.9988_train_loss_0.0131_val_acc_0.9969_val_loss_0.0252 epoch time 12.2185 seconds


loss: 0.0032, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.22it/s]
loss: 0.0031, accuracy: 1.0000: 100%|██████████| 20/20 [00:01<00:00, 17.04it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 10: train_acc_1.0000_train_loss_0.0065_val_acc_0.9984_val_loss_0.0186 epoch time 12.2536 seconds


loss: 0.0023, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.24it/s]
loss: 0.0024, accuracy: 1.0000: 100%|██████████| 20/20 [00:01<00:00, 17.02it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 11: train_acc_1.0000_train_loss_0.0040_val_acc_0.9984_val_loss_0.0176 epoch time 12.2338 seconds


loss: 0.0137, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.16it/s]
loss: 0.0021, accuracy: 1.0000: 100%|██████████| 20/20 [00:01<00:00, 17.15it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 12: train_acc_1.0000_train_loss_0.0032_val_acc_0.9984_val_loss_0.0173 epoch time 12.3349 seconds


loss: 0.0041, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.20it/s]
loss: 0.0018, accuracy: 1.0000: 100%|██████████| 20/20 [00:01<00:00, 17.07it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 13: train_acc_1.0000_train_loss_0.0027_val_acc_0.9984_val_loss_0.0162 epoch time 12.2821 seconds


loss: 0.0035, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.13it/s]
loss: 0.0016, accuracy: 1.0000: 100%|██████████| 20/20 [00:01<00:00, 15.89it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 14: train_acc_1.0000_train_loss_0.0023_val_acc_0.9984_val_loss_0.0156 epoch time 12.4838 seconds


loss: 0.0012, accuracy: 1.0000: 100%|██████████| 80/80 [00:11<00:00,  7.16it/s]
loss: 0.0014, accuracy: 1.0000: 100%|██████████| 20/20 [00:01<00:00, 16.54it/s]
  0%|          | 0/80 [00:00<?, ?it/s]

Epoch 15: train_acc_1.0000_train_loss_0.0020_val_acc_0.9984_val_loss_0.0157 epoch time 12.3895 seconds


loss: 0.0025, accuracy: 1.0000:  46%|████▋     | 37/80 [00:05<00:06,  6.88it/s]


KeyboardInterrupt: 

In [8]:
# Stage 2: continue training the same model on the NSynth validation dataset
# (medium, about 1GB), reusing the stage-1 experiment directory.
# NOTE(review): this comment originally said "from epoch 10", but the recorded
# output below resumes at epoch 16 and stops after epoch 19. The meaning of
# continue_from_epoch=-2 is presumably "resume from the latest checkpoint" --
# confirm against experiment.Experiment.
exp2 = experiment.Experiment(model, "../experiments/classifier_instancenorm2d/", 20, trloader2, teloader2, continue_from_epoch=-2)
exp2.run_experiment()

  0%|          | 0/63 [00:00<?, ?it/s]

Use GPU 0


loss: 0.0083, accuracy: 1.0000: 100%|██████████| 63/63 [01:14<00:00,  1.18s/it]
loss: 0.0054, accuracy: 1.0000: 100%|██████████| 16/16 [00:18<00:00,  1.16s/it]
  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 16: train_acc_0.9909_train_loss_0.0350_val_acc_0.9985_val_loss_0.0093 epoch time 92.6542 seconds


loss: 0.0058, accuracy: 1.0000: 100%|██████████| 63/63 [00:32<00:00,  1.95it/s]
loss: 0.0073, accuracy: 1.0000: 100%|██████████| 16/16 [00:02<00:00,  5.52it/s]
  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 17: train_acc_0.9980_train_loss_0.0093_val_acc_0.9980_val_loss_0.0150 epoch time 35.2201 seconds


loss: 0.0015, accuracy: 1.0000: 100%|██████████| 63/63 [00:32<00:00,  1.92it/s]
loss: 0.0013, accuracy: 1.0000: 100%|██████████| 16/16 [00:03<00:00,  5.33it/s]
  0%|          | 0/63 [00:00<?, ?it/s]

Epoch 18: train_acc_0.9996_train_loss_0.0041_val_acc_0.9995_val_loss_0.0030 epoch time 35.8501 seconds


loss: 0.0008, accuracy: 1.0000: 100%|██████████| 63/63 [00:32<00:00,  1.92it/s]
loss: 0.0008, accuracy: 1.0000: 100%|██████████| 16/16 [00:03<00:00,  5.28it/s]


Epoch 19: train_acc_1.0000_train_loss_0.0013_val_acc_1.0000_val_loss_0.0021 epoch time 35.8908 seconds


{'train_acc': [0.9909474206349206, 0.998015873015873, 0.9996279761904762, 1.0],
 'train_loss': [0.035000756, 0.009342753, 0.004145426, 0.001289057],
 'val_acc': [0.99853515625, 0.998046875, 0.99951171875, 1.0],
 'val_loss': [0.009302532, 0.0149792805, 0.002967929, 0.0021318619]}

In [32]:
# Stage 3: continue training from epoch 20 on the NSynth training dataset
# (large, about 37GB).
# NOTE(review): the experiment directory here is "q1", not the
# "../experiments/classifier_instancenorm2d/" directory used by the earlier
# stages -- confirm that continue_from_epoch=-2 finds the intended checkpoint.
exp3 = experiment.Experiment(model, "q1", 22, trloader3, teloader3, continue_from_epoch=-2)

Use GPU 0


In [20]:
# Run stage 3; the recorded output below covers epochs 20 and 21.
exp3.run_experiment()

loss: 0.2156, accuracy: 0.9216: 100%|██████████| 2802/2802 [06:27<00:00,  7.23it/s]
loss: 1.5690, accuracy: 0.5172: 100%|██████████| 701/701 [00:38<00:00, 18.22it/s]
  0%|          | 0/2802 [00:00<?, ?it/s]

Epoch 20: train_acc_0.8018_train_loss_0.5716_val_acc_0.5058_val_loss_1.6210 epoch time 426.1866 seconds


loss: 0.0899, accuracy: 0.9804: 100%|██████████| 2802/2802 [06:30<00:00,  7.17it/s]
loss: 0.4159, accuracy: 0.9310: 100%|██████████| 701/701 [00:38<00:00, 18.25it/s]


Epoch 21: train_acc_0.9392_train_loss_0.1739_val_acc_0.9279_val_loss_0.2184 epoch time 428.9713 seconds


{'train_acc': [0.8017876761696827, 0.9391603380288589],
 'train_loss': [0.5715694, 0.17389172],
 'val_acc': [0.5057530190860348, 0.927861675439028],
 'val_loss': [1.6209569, 0.21836306]}

# Observation:
(From this run:)
```
loss: 2.1420, accuracy: 0.2549: 100%|██████████| 2802/2802 [09:13<00:00,  5.06it/s]
loss: 2.3139, accuracy: 0.1379: 100%|██████████| 701/701 [01:27<00:00,  7.98it/s]
  0%|          | 0/2802 [00:00<?, ?it/s]

Epoch 0: train_acc_0.1815_train_loss_2.2424_val_acc_0.1846_val_loss_2.2145 epoch time 641.8255 seconds

loss: 2.1385, accuracy: 0.2549: 100%|██████████| 2802/2802 [08:59<00:00,  5.20it/s]
loss: 2.3097, accuracy: 0.1379: 100%|██████████| 701/701 [01:27<00:00,  8.04it/s]
  0%|          | 0/2802 [00:00<?, ?it/s]

Epoch 1: train_acc_0.1846_train_loss_2.2140_val_acc_0.1846_val_loss_2.2140 epoch time 626.3854 seconds

loss: 2.1383, accuracy: 0.2549: 100%|██████████| 2802/2802 [09:13<00:00,  5.06it/s]
loss: 2.3093, accuracy: 0.1379: 100%|██████████| 701/701 [01:27<00:00,  8.02it/s]
  0%|          | 0/2802 [00:00<?, ?it/s]

Epoch 2: train_acc_0.1846_train_loss_2.2139_val_acc_0.1846_val_loss_2.2139 epoch time 641.2183 seconds

loss: 2.3469, accuracy: 0.1875:   5%|▍         | 127/2802 [00:25<08:52,  5.03it/s]
```


* While both losses decrease marginally, validation accuracy stays where it is; train accuracy also increases only marginally

Todo:

* Inspect confusion matrix

# Remark after the fact:

* The problem was caused by not normalising the input

Normalising fixes the issue and makes the network train.

Still, there can be big fluctuations in validation accuracy between epochs, perhaps warranting (stronger) regularisation techniques.

# TODO:

* Confusion matrix saving