In [52]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from torch.utils.data import DataLoader
from dataset import DCASE

In [3]:
# Number handed to both the dataset and, later, the model as its clip length
sequence_length = 3
# Data directory is resolved against the current working directory
root_dir = f"{os.getcwd()}/ADL_DCASE_DATA/development"
dataset = DCASE(root_dir, sequence_length)

In [4]:
# One sample per batch, drawn in shuffled order
loader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=True,
)

In [5]:
iterator = iter(loader)

# Pull a single (data, label) batch to experiment with. The built-in next()
# is the supported way to advance an iterator; a `.next()` method is not part
# of the Python 3 iterator protocol.
data, label = next(iterator)

In [57]:
class CNN(nn.Module):
    """
    Convolutional Neural Network that scores a whole recording from its clips.

    Every clip is passed through the same two conv / batch-norm / pool stages
    as a single-channel image. The feature maps of all clips belonging to one
    recording are then flattened together and mapped to 15 output scores by a
    single linear layer.

    Args:
        clip_length:
            length of the clips that the spectrogram is split into
            (stored on the instance; not otherwise used by this class)
        num_clips:
            number of clips the spectrogram is split into
    """

    def __init__(
        self,
        clip_length: int,
        num_clips: int
    ):
        super().__init__()
        self.clip_length = clip_length
        self.num_clips = num_clips

        self.conv1 = nn.Conv2d(
            in_channels=1,
            out_channels=128,
            kernel_size=5
        )
        self.batch1 = nn.BatchNorm2d(128)
        self.conv2 = nn.Conv2d(
            in_channels=128,
            out_channels=256,
            kernel_size=5
        )
        self.batch2 = nn.BatchNorm2d(256)
        self.pool1 = nn.MaxPool2d(
            kernel_size=5,
            stride=5
        )
        # Fixes the pooled height at 4; `None` keeps the width of the input
        self.pool2 = nn.AdaptiveMaxPool2d((4, None))
        # 256 channels x 4 rows x 25 columns per clip, for num_clips clips.
        # Because pool2 does not fix the width, the 25 is tied to the clip
        # size fed in -- NOTE(review): confirm every clip is 60x150.
        self.fc1 = nn.Linear(256*4*25*num_clips, 15)

        self.initialise_layer(self.conv1)
        self.initialise_layer(self.conv2)

    def forward(self, xs: torch.Tensor) -> torch.Tensor:
        """
        Produce one row of scores per recording in the batch.

        Args:
            xs: tensor of shape [B, num_clips, H, W]

        Returns:
            Tensor of shape [B, 15].
        """
        batch_size, num_clips, height, width = xs.shape

        # Fold the clips into the batch dimension and give each a unitary
        # channel dimension, so the convolutions see single-channel images:
        #     [B, num_clips, H, W] -> [B * num_clips, 1, H, W]
        # For B = 1 this is the same tensor as permute(1, 0, 2, 3).
        xs = xs.reshape(batch_size * num_clips, 1, height, width)

        xs = self.batch1(self.conv1(xs))
        xs = self.pool1(F.relu(xs))
        xs = self.batch2(self.conv2(xs))
        xs = self.pool2(F.relu(xs))

        # Un-fold: gather the features of all clips of a recording into a
        # single row, so there is one prediction per recording rather than
        # one per clip:
        #     [B * num_clips, X, Y, Z] -> [B, num_clips * X * Y * Z]
        xs = xs.reshape(batch_size, -1)
        return self.fc1(xs)

    @staticmethod
    def initialise_layer(layer):
        """Zero the layer's bias and Kaiming-normal-initialise its weight, where present."""
        # A layer built with bias=False still has a `bias` attribute (set to
        # None), so test for None rather than for the attribute's existence.
        if getattr(layer, "bias", None) is not None:
            nn.init.zeros_(layer.bias)
        if getattr(layer, "weight", None) is not None:
            nn.init.kaiming_normal_(layer.weight)

In [58]:
import numpy as np
# Not echoed: a notebook cell only displays its last expression
data.shape, label, np.prod(list(data.shape))

model = CNN(sequence_length, dataset.get_num_clips())
print(dataset.get_num_clips())
# Call the module itself instead of .forward() so that nn.Module.__call__
# runs and any registered hooks fire
out = model(data)

# Predicted index next to the true label
out.argmax(1), label

10


(tensor([8]), tensor([3]))

In [42]:
# Stand-alone copies of the layers, so that each stage can be applied by hand
# in the cells below and its output shape inspected.
conv1 = nn.Conv2d(1, 128, kernel_size=5)
batch1 = nn.BatchNorm2d(num_features=128)
conv2 = nn.Conv2d(128, 256, kernel_size=5)
batch2 = nn.BatchNorm2d(num_features=256)
pool1 = nn.MaxPool2d(5, stride=5)
pool2 = nn.AdaptiveMaxPool2d(output_size=(4, None))
fc1 = nn.Linear(
    in_features=256*4*25*dataset.get_num_clips(),
    out_features=15,
)

In [43]:
# Swap the first two dims so the clips take the batch position and the
# original batch dim (size 1 here) takes the channel position
x = data.permute(1,0,2,3)

In [44]:
# 5x5 convolution: each spatial dim shrinks by 4, channels go 1 -> 128
x1 = conv1(x)
x1.shape, x.shape

(torch.Size([10, 128, 56, 146]), torch.Size([10, 1, 60, 150]))

In [45]:
# Batch norm leaves the shape untouched
x2 = batch1(x1)
x2.shape, x1.shape

(torch.Size([10, 128, 56, 146]), torch.Size([10, 128, 56, 146]))

In [46]:
# ReLU, then 5x5 max-pool with stride 5: 56x146 -> 11x29
x3 = pool1(F.relu(x2))
x3.shape, x2.shape

(torch.Size([10, 128, 11, 29]), torch.Size([10, 128, 56, 146]))

In [47]:
# Second 5x5 convolution: 11x29 -> 7x25, channels go 128 -> 256
x4 = conv2(x3)
x4.shape, x3.shape

(torch.Size([10, 256, 7, 25]), torch.Size([10, 128, 11, 29]))

In [48]:
# Batch norm leaves the shape untouched
x5 = batch2(x4)
x5.shape, x4.shape

(torch.Size([10, 256, 7, 25]), torch.Size([10, 256, 7, 25]))

In [49]:
# Apply ReLU before the adaptive pool, as CNN.forward does, so that this
# walkthrough mirrors the model. The pool fixes the height at 4 and keeps
# the width.
x6 = pool2(F.relu(x5))
x6.shape, x5.shape

(torch.Size([10, 256, 4, 25]), torch.Size([10, 256, 7, 25]))

In [54]:
# Flatten the features of every clip into a single row:
# 10 * 256 * 4 * 25 = 256000 values
x7 = x6.view(1, -1)
x7.shape, x6.shape

(torch.Size([1, 256000]), torch.Size([10, 256, 4, 25]))

In [55]:
# Linear layer maps the flattened features to 15 scores
x8 = fc1(x7)
x8.shape, x7.shape

(torch.Size([1, 15]), torch.Size([1, 256000]))

In [56]:
# Raw scores alongside the index of the largest one
x8, x8.argmax(1)

(tensor([[-0.7419, -0.4693, -0.4379,  0.8357,  0.3683, -0.5924,  0.6473,  0.2824,
          -0.0940, -0.2899,  0.2667,  1.0473,  0.0095, -0.7231, -0.5727]],
        grad_fn=<AddmmBackward>), tensor([11]))