In [1]:
import torch
from torch import nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
import imageio as iio
import matplotlib.pyplot as plt
from pmqd.torch import PMQD
from typing import Tuple
import itertools
import torchaudio.transforms as T
from pyramids import LaplacianPyramid
import torch.nn.functional as F
from utils import conv as conv_utils

In [2]:
dataset = PMQD(root="/Users/up20938/Coding/datasets/pmqd", download=False)
sample_rate = PMQD.SAMPLE_RATE

In [3]:
reduced_sample_rate = 16050
window_size = 2048
hop_size = 64
num_mels = 512

In [4]:
transforms = nn.Sequential(
    T.Resample(orig_freq=sample_rate, new_freq=reduced_sample_rate),
    T.MelSpectrogram(
        n_mels=num_mels,
        n_fft=window_size,
        win_length=window_size,
        hop_length=hop_size,
        power=1,
        center=False,
        sample_rate=reduced_sample_rate,
        f_min=0,
        f_max=reduced_sample_rate / 2,
        window_fn=torch.hann_window
    ),
)



In [5]:
class SpectrogramDataset(Dataset):

    def __init__(self, dataset, transforms):
        self.data = dataset
        self.transforms = transforms

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # convert from audio to spectrogram
        audio = self.data[idx]["audio"]
        rating = self.data[idx]["rating"]
        # rating = 6-rating
        mono = audio.mean(axis=0)
        image = self.transforms(mono).unsqueeze(0)

        return image, rating

In [6]:
images = np.zeros((975,512,1024))
ratings = np.zeros((975))
for e, (X, y) in enumerate(train_dataloader):
    images[e] = X
    ratings[e] = y

NameError: name 'train_dataloader' is not defined

In [None]:
batch_size = 1

# Create data loaders.
train_data = SpectrogramDataset(dataset,transforms=transforms)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# test_dataloader = DataLoader(test_data, batch_size=batch_size)

# for X, y, s, r in train_dataloader:
for X, y in train_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

In [149]:
np.save('/Users/up20938/Coding/datasets/pmqd/specs',images)

In [150]:
np.save('/Users/up20938/Coding/datasets/pmqd/rats',ratings)

In [7]:
class SpectrogramDataset(Dataset):

    def __init__(self):
        self.images = np.load('/Users/up20938/Coding/datasets/pmqd/specs.npy')
        self.ratings = np.load('/Users/up20938/Coding/datasets/pmqd/rats.npy')

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = torch.from_numpy(self.images[idx]).float().unsqueeze(0)
        rating = torch.tensor(self.ratings[idx])
        return image, rating

In [63]:
batch_size = 5

# Create data loaders.
train_data = SpectrogramDataset()
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
# test_dataloader = DataLoader(test_data, batch_size=batch_size)

# for X, y, s, r in train_dataloader:
for X, y in train_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([5, 1, 512, 1024])
Shape of y: torch.Size([5]) torch.float64


In [9]:
class PrintLayer(nn.Module):
    def __init__(self):
        super(PrintLayer, self).__init__()

    def forward(self, x):
        print(x.shape)
        return x

In [264]:
class AlexNet(nn.Module):
    def __init__(self, num_classes: int = 1, dropout: float = 0.5) -> None:
        super().__init__()

        self.features = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
            # nn.Softmax(dim=-1)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        # x = x.argmax(axis=-1).float().requires_grad_()
        return x

In [88]:
"""
The :mod:`expert.models.networks.perceptnet` implements classes of networks
relating to PerceptNet.
"""
# Author: Alex Hepburn <alex.hepburn@bristol.ac.uk>

import torch
import torch.nn as nn

import utils.divisive_normalisation as expert_divisive_normalisation

__all__ = ['PerceptNet']

_NORMALISATIONS = {
    'batch_norm': nn.BatchNorm2d,
    'instance_norm': nn.InstanceNorm2d,
    'gdn': expert_divisive_normalisation.GDN}


class PerceptNet(nn.Module):
    """
    Neural network Network that follows the structure of human visual system
    introduced in [HEPBURN2019PER]_.
    PerceptNet is a neural network where the architecture takes inspiration
    from the various stages in the human visual system. The structure that the
    network mimics is as followed: gamma correction -> opponent colour space ->
    Von Kries transform -> center-surround filters -> LGN normalisation ->
    orientation sensitive and multiscale in V1 -> divisive normalisation in V1.
    If parameter ``pretrained`` is ``True``, then the network will be
    initialised with weights attained by maximising Pearson correlation between
    the $\ell_2$ distance in the transformed domain between an original image
    and a distorted image, and the mean opinion score (MOS) of the distorted
    image. The pretrained network was trained using a training split of the
    TID2008 This training procedure is detailed in [HEPBURN2019PER]_.
    If parameter ``normalisation`` is `'gdn'`, then the default values are
    used, which includes a `kernel_size` of 1, which means that there is
    no spatial element to the divisive normalisation.
    .. [HEPBURN2019PER] Hepburn, Alexander, et al. “PerceptNet: A Human Visual
       System Inspired Neural Network for Estimating Perceptual Distance.”
       ArXiv:1910.12548 [Cs, Eess, Stat], Oct. 2019. arXiv.org,
       http://arxiv.org/abs/1910.12548.
    Parameters
    ----------
    dims : int, optional (default=3)
        The number of dimensions of the input in the channel dimension. Usually
        either 3 for RGB images or 1 for Greyscale.
    normalisation : string, optional (default='gdn')
        The normalisation to be used in the network. The possible values are
        'batch_norm' for batch normalisation, 'instance_norm' for instance
        normalisation, and 'gdn' for general divisive normalisation.
    pretrained : boolean, optional (default=False)
        Whether to load network weights or not. The weights loaded were
        optimised for a training set of the TID2008 dataset.
    Raises
    ------
    ValueError
        ``dims`` parameter is not an integer. ``normalisation`` paramter is not
        a string or not in the known normalisation strings. ``pretrained`` is
        not a boolean.
    Attributes
    ----------
    normalisation_1 : nn.Module
        The first normalisation layer. If it is general divisive normalisation,
        the ``apply_independently`` paramter is `True` to simulate gamma
        correction being the first stage of the human visual system.
    conv_1 : nn.Conv2d
        The first convolutional layer. If default PerceptNet, this represents
        a transformation into opponent colour space.
    maxpool : nn.Module
        The max pool layer.
    normalisation_2 : nn.Module
        The second normalisation layer. If default PerceptNet, this represents
        a Von-Kries transform.
    conv_2 : nn.Conv2d
        The second convolutional layer. If default PerceptNet, this represents
        center-surround filters.
    normalisation_3 : nn.Module
        The third normalisation layer. If default PerceptNet, this represents
        LGN normalisation.
    conv_3 : nn.Conv2d
        The third convolutional layer. If default PerceptNet, this represents
        orientation sensitive and multiscale in V1.
    normalisation_4 : nn.Module
        The fourth normalistaion layer. If default PerceptNet, this represents
        the divisive normalisation in V1.
    features : nn.Sequential
        A sequential object of all the layers that create the network.
    """
    def __init__(self,
                 dims: int = 3,
                 normalisation: str = 'gdn',
                 pretrained: bool = False) -> None:
        """
        Constructs a ``PerceptNet`` class.
        """
        super(PerceptNet, self).__init__()
        # TODO: pretrained weights - look where to save the weights
        assert self._validate_input(dims, normalisation, pretrained)

        normalisation_layer = _NORMALISATIONS[normalisation]

        if normalisation_layer == expert_divisive_normalisation.GDN:
            # If GDN then make first layer channel independent
            normalisation_1 = normalisation_layer(
                dims, apply_independently=True)
        else:
            normalisation_1 = normalisation_layer(dims)
        normalisation_2 = normalisation_layer(dims)
        normalisation_3 = normalisation_layer(6)
        normalisation_4 = normalisation_layer(128)
        conv1 = nn.Conv2d(dims, dims, kernel_size=1, stride=1, padding=1)
        maxpool = nn.MaxPool2d(2)
        conv2 = nn.Conv2d(dims, 6, kernel_size=5, stride=1, padding=1)
        conv3 = nn.Conv2d(6, 128, kernel_size=5, stride=1, padding=1)
        self.linear = nn.Linear(128 * 125 * 253, 5)
        self.softmax = nn.Softmax(dim=-1)

        # Called features to be used as feature extraction just like the
        # modles in the torchvision package.
        self.features = nn.Sequential(
            normalisation_1,
            conv1,
            maxpool,
            normalisation_2,
            conv2,
            maxpool,
            normalisation_3,
            conv3,
            normalisation_4)
        # self.linear = nn.Linear(253,5)

    def _validate_input(self,
                        dims: int,
                        normalisation: str,
                        pretrained: bool):
        """
        Validates input of the generalised divisive normalisation class.
        For the description of the input parameters and exceptions raised by
        this function, please see the documentation of the
        :class:`expert.models.networks.perceptnet.PerceptNet` class.
        Returns
        -------
        is_valid
            ``True`` if input is valid, ``False`` otherwise.
        """
        is_valid = False

        if not isinstance(dims, int) or dims <= 0:
            raise TypeError('dims parameter must be an integer greater than '
                            '0.')

        if not isinstance(normalisation, str):
            raise TypeError('normalisation parameter must be a string.')

        if normalisation not in _NORMALISATIONS:
            raise ValueError('normalisation %s not defined. Please see'
                             'PerceptNet documentation for possible options.')

        if not isinstance(pretrained, bool):
            raise TypeError('pretrained parameter must be a boolean.')

        if pretrained and normalisation is not "gdn":
            raise ValueError('The pretrained network uses gdn as the '
                             'normalisation layer. If using a pretrained '
                             'network, please selects gdn as the '
                             'normalisation.')

        is_valid = True
        return is_valid

    def forward(self,
                x: torch.Tensor):
        """
        Forward Pass of the network.
        Parameters
        ----------
        x : torch.Tensor
            The input to the layer. Must be of shape [batch_size, channels,
            height, width].
        Raises
        ------
        TypeError:
            Input parameter ``x`` is not of dtype torch.float.
        Returns
        -------
        output : torch.Tensor
            Output of the network, the inputs representation in a more
            perceptually meaningful space.
        """
        if x.dtype != torch.float32:
            raise TypeError('Input x must be of type torch.float32.')

        output = self.features(x)
        # print(output.shape)
        # output = torch.mean(output,dim=(2,3)) #THIS IS WRONG
        output = torch.flatten(output, 1)
        output = self.linear(output)
        output = self.softmax(output)
        return output

  if pretrained and normalisation is not "gdn":


In [138]:
class PNet(nn.Module):

    def __init__(self, dims=1):
        super(PNet, self).__init__()
        # conv1 =
        # norm1 =
        # maxpool = nn.MaxPool2d(2)
        # conv2 = nn.Conv2d(1, 6, kernel_size=3, stride=1, padding=1)
        # norm1 = nn.BatchNorm2d(1)
        # conv3 = nn.Conv2d(6, 128, kernel_size=5, stride=1, padding=1)
        # printlayer = PrintLayer()
        # relu = nn.ReLU()
        self.linear = nn.Linear(48*16*32, 5)
        self.softmax = nn.Softmax(dim=-1)

        self.features = nn.Sequential(
            # LAYER 1
            nn.Conv2d(1, 3, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(3),
            # LAYER 2
            nn.Conv2d(3, 6, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(6),
            # LAYER 3
            nn.Conv2d(6, 12, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(12),
            # LAYER 4
            nn.Conv2d(12, 24, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(24),
            # LAYER 5
            nn.Conv2d(24, 48, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),
            nn.BatchNorm2d(48),
        )


    def forward(self, x: torch.Tensor):
        output = self.features(x)
        output = torch.flatten(output, 1)
        output = self.linear(output)
        output= self.softmax(output)
        return output



In [139]:
device='cpu'
print(f"Using {device} device")

# model = Filter(5,5,batch_size=batch_size).to(device)
# model = AlexNet()
model = PNet(dims=1)
print(model)
# for parameter in model.parameters():
#     print(parameter.shape)

Using cpu device
PNet(
  (linear): Linear(in_features=24576, out_features=5, bias=True)
  (softmax): Softmax(dim=-1)
  (features): Sequential(
    (0): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): Conv2d(3, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): BatchNorm2d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): Conv2d(6, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (11): BatchNorm2d(12, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): Conv2d(12, 24, kernel_si

In [140]:
for parameter in model.parameters():
    print(parameter.numel())

122880
5
27
3
3
3
162
6
6
6
648
12
12
12
2592
24
24
24
10368
48
48
48


In [141]:
# loss_fn = nn.MSELoss()
loss_fn = nn.CrossEntropyLoss(weight=torch.tensor([0.31, 0.18, 0.11, 0.06, 0.35]))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [142]:
def train(dataloader, model, loss_fn, optimizer):

    size = len(dataloader.dataset)//batch_size
    model.train()

    for batch, (X, y) in enumerate(dataloader):

        # y = y.float()-1 # change range from 1-5 to 0-4
        y = y.long() - 1 # change range from 1-5 to 0-4 and long for cross entropy
        # y = 5 - y.float() # change range from 1-5 to 4-0

        pred = model(X)
        # pred = torch.tensor(1.0, requires_grad=True)
        # print(pred)
        # print(pred, y)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 5 == 0:
            loss, current = loss.item(), (batch + 1)  # * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")
            print(pred.data)
            print(pred.argmax(axis=-1), y)


In [143]:
torch.set_printoptions(precision=4, threshold=None, edgeitems=None, linewidth=200, profile=None, sci_mode=False)
epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
print("Done!")

Epoch 1
-------------------------------
loss: 1.669695  [    1/  195]
tensor([[0.3749, 0.1809, 0.0720, 0.1843, 0.1880],
        [0.2259, 0.1054, 0.1957, 0.1720, 0.3010],
        [0.1843, 0.2046, 0.1233, 0.2695, 0.2183],
        [0.1917, 0.1598, 0.1791, 0.1349, 0.3347],
        [0.4062, 0.1155, 0.2019, 0.1359, 0.1405]])
tensor([0, 4, 3, 4, 0]) tensor([3, 3, 2, 3, 4])
loss: 1.754308  [    6/  195]
tensor([[    0.0000,     0.0000,     0.0000,     0.1125,     0.8875],
        [    0.0000,     0.0000,     0.0000,     1.0000,     0.0000],
        [    0.0000,     0.0000,     1.0000,     0.0000,     0.0000],
        [    0.0000,     0.0000,     0.0000,     0.9992,     0.0008],
        [    0.0000,     0.0000,     0.0000,     0.9998,     0.0002]])
tensor([4, 3, 2, 3, 3]) tensor([2, 2, 2, 0, 2])
loss: 1.525603  [   11/  195]
tensor([[    0.0000,     0.0000,     0.9248,     0.0000,     0.0752],
        [    0.0000,     0.0000,     0.0000,     1.0000,     0.0000],
        [    0.0000,     0.0000,

In [144]:
torch.save(model.state_dict(), 'models/cnnce.pth')