In [1]:
import torch as th
import torch 
import torch.nn as nn
import torch.nn.functional as F

# from models.model import SEANet

In [2]:
!pwd

/home/woongjib/Projects/MBSEANet


In [3]:
""" Multi band SEANet """
from torchinfo import summary
from models.model import MBSEANet

model = MBSEANet(c_out=27, c_in=5, min_dim = 32)
wav = torch.rand(4,5,55400)

summary(
    model, input_data = wav,
    col_names=['input_size','output_size'],
    depth=2
)



Layer (type:depth-idx)                        Input Shape               Output Shape
MBSEANet                                      [4, 5, 55400]             [4, 27, 55400]
├─Conv1d: 1-1                                 [4, 5, 55296]             [4, 32, 55296]
│    └─Pad: 2-1                               [4, 5, 55296]             [4, 5, 55302]
│    └─Conv1d: 2-2                            [4, 5, 55302]             [4, 32, 55296]
│    └─ELU: 2-3                               [4, 32, 55296]            [4, 32, 55296]
├─ModuleList: 1-2                             --                        --
│    └─EncBlock: 2-4                          [4, 32, 55296]            [4, 64, 27648]
│    └─EncBlock: 2-5                          [4, 64, 27648]            [4, 128, 13824]
│    └─EncBlock: 2-6                          [4, 128, 13824]           [4, 256, 1728]
│    └─EncBlock: 2-7                          [4, 256, 1728]            [4, 512, 216]
├─Sequential: 1-3                             [4, 512, 216

In [4]:
""" STFT Feature Encoder """
from torchinfo import summary
from models.feature_encoder import ResNet18

stft = torch.rand(3,1,513,120) #[B,C,F,T]
model = ResNet18(in_channels=7)
print('output shape:', model(stft).shape) #[B,C,F,T]=[B,512,16,120]

print(summary(
    model, input_data = stft,
    col_names=['input_size','output_size','kernel_size', ],
    depth=3
))

output shape: torch.Size([3, 56, 16, 120])
Layer (type:depth-idx)                   Input Shape               Output Shape              Kernel Shape
ResNet                                   [3, 1, 513, 120]          [3, 56, 16, 120]          --
├─CausalConv2d: 1-1                      [3, 1, 513, 120]          [3, 7, 256, 120]          [7, 7]
├─ReLU: 1-2                              [3, 7, 256, 120]          [3, 7, 256, 120]          --
├─MaxPool2d: 1-3                         [3, 7, 256, 120]          [3, 7, 128, 120]          3
├─Sequential: 1-4                        [3, 7, 128, 120]          [3, 7, 128, 120]          --
│    └─BasicBlock: 2-1                   [3, 7, 128, 120]          [3, 7, 128, 120]          --
│    │    └─CausalConv2d: 3-1            [3, 7, 128, 120]          [3, 7, 128, 120]          [3, 3]
│    │    └─ReLU: 3-2                    [3, 7, 128, 120]          [3, 7, 128, 120]          --
│    │    └─CausalConv2d: 3-3            [3, 7, 128, 120]          [3, 7, 12

In [5]:
from keras import backend as K
from keras.layers import Layer, LSTM, MaxPooling1D

import torch
import torch.nn as nn
import torch.nn.functional as F

class TFiLM(nn.Module):
    """
    Temporal Feature-wise Linear Modulation in PyTorch.
    
    Input shape: (batch_size, steps, num_features)
    Output shape: Same as input
    
    Num Steps must be multiple of block size
    """
    def __init__(self, block_size, input_dim):
        super(TFiLM, self).__init__()
        self.block_size = block_size
        self.max_pool = nn.MaxPool1d(kernel_size=self.block_size)
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=input_dim, num_layers=1, batch_first=True)
        
        
    def make_normalizer(self, x):
        """
        Downsamples input along temporal dimension and generates normalization weights via LSTM.
        """
        batch_size, steps, num_features = x.shape
        n_blocks = steps // self.block_size

        # MaxPool
        # print(x.shape, 'xshape:[B,T,F]')
        # print(x.permute(0,2,1).shape)
        x_down = self.max_pool(x.permute(0,2,1)).permute(0,2,1) # pooling along t axis, block-wise
        print(x_down.shape, 'after pooling') # [B,T',F]
        
        x_rnn, _ = self.lstm(x_down) # [B,T',F] -> [B,T',F]
        return x_rnn

    def apply_normalizer(self, x, x_norm):
        """
        Applies normalization weights to respective blocks.
        """
        batch_size, steps, num_features = x.shape
        n_blocks = steps // self.block_size

        x = x.reshape(batch_size, n_blocks, self.block_size, num_features)
        print(x.shape) # [B,T/block,block,F]
        x_norm = x_norm.reshape(batch_size, n_blocks, 1, num_features)
        print(x_norm.shape) #[B,T/block,1,F]

        # 정규화 가중치를 적용
        x_out = x * x_norm

        # 원래 형태로 reshape
        x_out = x_out.reshape(batch_size, steps, num_features) #[B,T,F]
        return x_out

    def forward(self, x):
        """
        Forward pass for TFiLM layer.
        """
        # 입력 텐서 형태 확인
        assert x.dim() == 3, "Input must be 3D: (batch_size, steps, num_features)"
        assert x.size(1) % self.block_size == 0, "Number of steps must be a multiple of block_size"

        x_norm = self.make_normalizer(x)
        x_out = self.apply_normalizer(x, x_norm)
        return x_out

# 사용 예시
if __name__ == "__main__":
    from torchinfo import summary
    batch_size = 1
    steps = 120  # time steps (must be divisible by block_size)
    num_features = 32
    block_size = 10

    x = torch.randn(batch_size, steps, num_features) #[b,t,f]=[4,16,8]
    tfilm = TFiLM(block_size=block_size, input_dim=num_features)
    output = tfilm(x)
    
    print(tfilm)
    print("Input shape:", x.shape)
    print("Output shape:", output.shape)
    
    print(summary(
    tfilm, input_data=x,
    col_names=['input_size','output_size','kernel_size', 'num_params'],
    depth=2
    ))

2024-12-26 14:18:14.733415: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


torch.Size([1, 12, 32]) after pooling
torch.Size([1, 12, 10, 32])
torch.Size([1, 12, 1, 32])
TFiLM(
  (max_pool): MaxPool1d(kernel_size=10, stride=10, padding=0, dilation=1, ceil_mode=False)
  (lstm): LSTM(32, 32, batch_first=True)
)
Input shape: torch.Size([1, 120, 32])
Output shape: torch.Size([1, 120, 32])
torch.Size([1, 12, 32]) after pooling
torch.Size([1, 12, 10, 32])
torch.Size([1, 12, 1, 32])
Layer (type:depth-idx)                   Input Shape               Output Shape              Kernel Shape              Param #
TFiLM                                    [1, 120, 32]              [1, 120, 32]              --                        --
├─MaxPool1d: 1-1                         [1, 32, 120]              [1, 32, 12]               10                        --
├─LSTM: 1-2                              [1, 12, 32]               [1, 12, 32]               --                        8,448
Total params: 8,448
Trainable params: 8,448
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES

In [6]:
batch_size = 1
steps = 120  # time steps (must be divisible by block_size)
num_features = 32
block_size = 10

x = torch.randn(batch_size, steps, num_features) #[b,t,f]=[4,16,8]
print(x.shape)
xb = x.reshape(x.size(0), x.size(1)//block_size, block_size, x.size(2))
print(xb.shape)

maxpool = nn.MaxPool1d(kernel_size=12)
y = maxpool(x.permute(0,2,1)).permute(0,2,1)
print(y.shape)

rnn = nn.LSTM(input_size=num_features, hidden_size=num_features, batch_first=True)
out,_ = rnn(y)
print(out.shape)

torch.Size([1, 120, 32])
torch.Size([1, 12, 10, 32])
torch.Size([1, 10, 32])
torch.Size([1, 10, 32])


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Input 설정
batch_size = 1
steps = 120  # time steps (must be divisible by block_size)
num_features = 32
block_size = 10

x = torch.randn(batch_size, steps, num_features)  # [B, T, C]
print("Input x shape:", x.shape)

# 1. Reshape F to F^{blk} -> [B, B', T/B, C]
B_prime = steps // block_size  # 블록 개수
F_blk = x.reshape(batch_size, B_prime, block_size, num_features)
print("F_blk shape (Reshaped into blocks):", F_blk.shape)  # [B, B', T/B, C]

# 2. Pooling: 각 블록의 데이터를 Pooling하여 [B, B', C] 형태로 변환
F_pool = F_blk.mean(dim=2)  # Temporal dimension (block size) 평균
print("F_pool shape (Pooled):", F_pool.shape)  # [B, B', C]

# 3. RNN 적용: 각 블록의 pooled feature를 사용해 γ와 β 생성
rnn = nn.LSTM(input_size=num_features, hidden_size=num_features*2, batch_first=True)
F_rnn, _ = rnn(F_pool)  # [B, B', C] -> [B, B', 2*C]
gamma, beta = torch.chunk(F_rnn, 2, dim=-1)  # γ와 β를 생성
print("Gamma shape:", gamma.shape)  # [B, B', C]
print("Beta shape:", beta.shape)    # [B, B', C]

# 4. Normalization: γ와 β를 적용
# F_blk: [B, B', T/B, C], gamma/beta: [B, B', C]
gamma = gamma.unsqueeze(2)  # [B, B', 1, C]
beta = beta.unsqueeze(2)    # [B, B', 1, C]
F_norm = gamma * F_blk + beta
print("F_norm shape (Normalized):", F_norm.shape)  # [B, B', T/B, C]

# 5. Reshape F_norm back to original shape [B, T, C]
F_out = F_norm.reshape(batch_size, steps, num_features)
print("Output F_out shape:", F_out.shape)  # [B, T, C]


Input x shape: torch.Size([1, 120, 32])
F_blk shape (Reshaped into blocks): torch.Size([1, 12, 10, 32])
F_pool shape (Pooled): torch.Size([1, 12, 32])
Gamma shape: torch.Size([1, 12, 32])
Beta shape: torch.Size([1, 12, 32])
F_norm shape (Normalized): torch.Size([1, 12, 10, 32])
Output F_out shape: torch.Size([1, 120, 32])
