In [1]:
from wsw.data.data_sets import AudioImageSet
from torch.utils.data import Dataset, DataLoader
import definitions

In [22]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SpectrogramCNN(nn.Module):
    def __init__(self, num_classes):
        super(SpectrogramCNN, self).__init__()

        # Convolutional layers
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=1, padding=1)  # (128x130) -> (128x130)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)  # (128x130) -> (128x130)
        self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)  # (128x130) -> (64x65)

        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)  # (64x65) -> (64x65)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=1, padding=1)  # (64x65) -> (64x65)
        self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)  # (64x65) -> (32x32)

        self.conv5 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=1, padding=1)  # (32x32) -> (32x32)
        self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)  # (32x32) -> (16x16)

        # Fully connected layers
        self.fc1 = nn.Linear(512 * 16 * 16, 256)
        self.fc2 = nn.Linear(256, num_classes)

        # Dropout to prevent overfitting
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = self.pool1(x)

        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.pool2(x)

        x = F.relu(self.conv5(x))
        x = self.pool3(x)

        x = x.view(x.size(0), -1)  # Flatten
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x
    

In [36]:
training_data_dir = definitions.ROOT_DIR.joinpath("training_data")
spec_data = AudioImageSet(training_data_dir, "annotations.csv")

sample = spec_data[0]
img, label = sample.values()
img

  image = torch.load(self.root_dir / file_name)


tensor([[[1.3944e-01, 3.5614e-01, 2.7137e-01,  ..., 2.5460e-01,
          2.7713e-01, 3.1668e-01],
         [2.4745e-02, 2.2964e-02, 4.9480e-04,  ..., 1.6078e-04,
          4.5358e-03, 3.1693e-02],
         [1.1281e-02, 1.0929e-02, 3.7795e-04,  ..., 7.9406e-05,
          1.3883e-03, 1.1402e-02],
         ...,
         [6.5978e-08, 3.4526e-07, 4.8815e-07,  ..., 4.4905e-07,
          5.6078e-07, 1.0221e-06],
         [2.1297e-08, 1.3819e-07, 2.7233e-07,  ..., 2.7359e-07,
          3.6968e-07, 9.9633e-07],
         [2.7015e-09, 1.3799e-08, 2.4112e-08,  ..., 1.3423e-08,
          1.3101e-07, 9.4424e-07]]])

In [37]:
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Example parameters
num_classes = 2  
learning_rate = 0.001
num_epochs = 10

# Create model
model = SpectrogramCNN(num_classes=num_classes)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# Dummy dataset (replace with actual spectrogram dataset)
train_loader = DataLoader(spec_data, batch_size=4, shuffle=True, num_workers=0)

# Training loop
for epoch in range(num_epochs):
    for sample in train_loader:
        images, labels = sample.values()
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


Epoch [1/20], Loss: 0.9656
Epoch [2/20], Loss: 0.7131
Epoch [3/20], Loss: 0.6818
Epoch [4/20], Loss: 0.5534
Epoch [5/20], Loss: 0.7441
Epoch [6/20], Loss: 0.6906
Epoch [7/20], Loss: 0.0000
Epoch [8/20], Loss: 0.0000
Epoch [9/20], Loss: 0.0113
Epoch [10/20], Loss: 0.0000
Epoch [11/20], Loss: 0.0083
Epoch [12/20], Loss: 0.0000
Epoch [13/20], Loss: 0.0000
Epoch [14/20], Loss: 0.0000
Epoch [15/20], Loss: 0.0000
Epoch [16/20], Loss: 0.0000
Epoch [17/20], Loss: 0.0000
Epoch [18/20], Loss: 0.0000
Epoch [19/20], Loss: 0.0000
Epoch [20/20], Loss: 0.0000


In [39]:
from torchvision.transforms import Compose
from wsw.data.transforms import ClipAudio, MelSpecFromAudio, ToTensorImg

In [41]:
tfm = Compose(
    [
        ClipAudio(target_length=3, sample_rate=22050),
        MelSpecFromAudio(sample_rate=22050, n_fft=2048),
        ToTensorImg()
    ]
)

In [56]:
import librosa
wesley, sr = librosa.load("training_data/wes_test.m4a", sr=22050)
elaine, sr = librosa.load("training_data/Recording.m4a", sr=22050)
wes_spec = tfm(wesley)
ela_spec = tfm(elaine)

In [59]:
inputs = torch.concat([wes_spec, ela_spec]).unsqueeze(1)
inputs.size()

torch.Size([2, 1, 128, 130])

In [60]:
logits = model(inputs)
probs = torch.softmax(logits, dim=1)

In [61]:
probs

tensor([[6.9662e-09, 1.0000e+00],
        [1.0000e+00, 2.0282e-14]], grad_fn=<SoftmaxBackward0>)