In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
from torch.utils.data import DataLoader
from dataset import DCASE
from CNN import CNN as CNN1

In [2]:
root_dir = os.getcwd() + '/ADL_DCASE_DATA/development'
sequence_length = 3
dataset = DCASE(root_dir, sequence_length)

In [3]:
loader = DataLoader(dataset, batch_size=1, shuffle=True)

In [4]:
iterator = iter(loader)

data, label = iterator.next()

In [5]:
class CNN(nn.Module):
    """
    Convolutional Neural Network

    Args:
        clip_length:
            length of the clips that the spectogram is split into
        num_clips:
            number of clips the spectogram is split into
    """
    
    def __init__(
        self, 
        clip_length: int, 
        num_clips: int,
        batch_size: int = 1
    ):
        super().__init__()
        self.clip_length = clip_length
        self.num_clips = num_clips
        self.batch_size = batch_size
        self.height = 60
        self.width = 50 * clip_length

        self.conv1 = nn.Conv2d(
            in_channels=1,
            out_channels=128,
            kernel_size=5
        )
        self.batch1 = nn.BatchNorm2d(128)
        self.conv2 = nn.Conv2d(
            in_channels=128,
            out_channels=256,
            kernel_size=5
        )
        self.batch2 = nn.BatchNorm2d(256)
        self.pool1 = nn.MaxPool2d(
            kernel_size=5,
            stride=5
        )
        self.pool2 = nn.AdaptiveMaxPool2d((4, None))
        self.fc1 = nn.Linear(256*4*25, 15)
        
        self.initialise_layer(self.conv1)
        self.initialise_layer(self.conv2)
        
    def forward(self, xs: torch.Tensor) -> torch.Tensor:
        """
        input: [B, num_clips, H, W]

        Pre-processing, change dim from:
            [B, num_clips, H, W] -> [B * num_clips, C, H, W]
        
        where (C = 1) is the number of 'channels' of uniary depth
        """
        xs = xs.view(self.batch_size * self.num_clips, 1, self.height, self.width)
        xs = self.batch1(self.conv1(xs))
        xs = self.pool1(F.relu(xs))
        xs = self.batch2(self.conv2(xs))
        xs = self.pool2(F.relu(xs))
        """
        Re-shape and flatten to:
            [B * num_clips, X, Y, Z] -> [B * num_clips, X * Y * Z]
        """
        xs = torch.flatten(xs, 1)
        xs = self.fc1(xs)
        """
        Re-shape to:
            [B * num_clips, preds] -> [B, num_clips, preds]

        where preds is 15 for 15 class predictions
        """
        xs = xs.view(self.batch_size, self.num_clips, -1)
        return xs
        
    @staticmethod
    def initialise_layer(layer):
        if hasattr(layer, "bias"):
            nn.init.zeros_(layer.bias)
        if hasattr(layer, "weight"):
            nn.init.kaiming_normal_(layer.weight)

In [7]:
import numpy as np
data.shape, label, np.prod(list(data.shape))
model = CNN(sequence_length, dataset.get_num_clips(), batch_size=1)
out = model.forward(data)

out.mean(1).detach().argmax(1)

tensor([11])

In [8]:
out.mean(1).detach().argmax(-1)

tensor([11])

In [9]:
conv1 = nn.Conv2d(
    in_channels=1,
    out_channels=128,
    kernel_size=5
)
batch1 = nn.BatchNorm2d(128)
conv2 = nn.Conv2d(
    in_channels=128,
    out_channels=256,
    kernel_size=5
)
batch2 = nn.BatchNorm2d(256)
pool1 = nn.MaxPool2d(
    kernel_size=5,
    stride=5
)
pool2 = nn.AdaptiveMaxPool2d((4, None))
fc1 = nn.Linear(256*4*25, 15)

In [10]:
data.shape

batch = 1
num_clips = dataset.get_num_clips()

x= data.view(batch*num_clips, 1, data.shape[2], data.shape[3])
# x = data.permute(1,0,2,3)

# xd.shape

In [11]:
x1 = conv1(x)
x1.shape, x.shape

(torch.Size([10, 128, 56, 146]), torch.Size([10, 1, 60, 150]))

In [12]:
x2 = batch1(x1)
x2.shape, x1.shape

(torch.Size([10, 128, 56, 146]), torch.Size([10, 128, 56, 146]))

In [13]:
x3 = pool1(F.relu(x2))
x3.shape, x2.shape

(torch.Size([10, 128, 11, 29]), torch.Size([10, 128, 56, 146]))

In [14]:
x4 = conv2(x3)
x4.shape, x3.shape

(torch.Size([10, 256, 7, 25]), torch.Size([10, 128, 11, 29]))

In [15]:
x5 = batch2(x4)
x5.shape, x4.shape

(torch.Size([10, 256, 7, 25]), torch.Size([10, 256, 7, 25]))

In [16]:
x6 = pool2(x5)
x6.shape, x5.shape

x6[1, 1]

tensor([[-0.4544, -0.5039, -0.2890, -0.0701, -1.0898,  0.1888,  0.9238, -0.7998,
         -1.0801, -0.8860, -0.8321, -0.2204,  0.4263, -0.0442,  0.0098, -0.4575,
          0.0233,  0.2080, -0.3300,  0.5078, -0.5430, -0.9894, -0.6741,  0.2212,
         -0.0652],
        [-0.7346,  0.0399,  0.6852, -0.0701,  1.5877,  0.8423,  0.9238, -0.7998,
         -0.6065, -0.1648, -0.5405,  0.1019, -0.4892, -0.1022, -0.3889, -0.2491,
          0.1371, -0.0914,  0.7173, -0.1923,  0.0623,  0.2537, -0.2259,  0.2212,
         -0.0652],
        [ 0.9952,  1.4616,  1.9041, -1.4479,  3.8736,  1.6022,  2.0324,  0.3663,
          0.9679,  0.9734,  1.3165,  0.7411,  0.7165,  1.1762,  1.5720,  1.2958,
          0.9764,  0.3352,  0.7173,  1.3644,  0.4782,  0.5527,  0.9834,  0.6995,
          0.9802],
        [ 1.6042,  1.4616,  2.7323, -0.8269,  4.7574,  2.5914,  3.0173,  0.3663,
          1.3099,  1.1453,  1.3165,  1.3623,  1.3207,  1.2647,  1.7895,  1.3498,
          0.9764,  1.6331,  0.9238,  1.4068,  0.8506

In [17]:
x7 = x6.view(1, -1)
x7.shape, x6.shape

(torch.Size([1, 256000]), torch.Size([10, 256, 4, 25]))

In [18]:
x8 = fc1(x7)
x8.shape, x7.shape

RuntimeError: size mismatch, m1: [1 x 256000], m2: [25600 x 15] at /pytorch/aten/src/TH/generic/THTensorMath.cpp:752

In [None]:
x8, x8.argmax(1)

In [5]:
cnn = CNN1(sequence_length, 10, batch_size=1)

In [6]:
data.shape

torch.Size([1, 10, 60, 150])

In [7]:
out = cnn(data)

1
10
60
150


In [24]:
out.mean(1).shape

pred = out.mean(1)

In [26]:
pred.argmax(-1)

tensor([7])

In [27]:
label

tensor([5])