In [1]:
import sys

sys.path.append("./../../src")


from Dataset import SpeechCommandsDataset
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch import optim
from torch.nn import TransformerEncoder, TransformerEncoderLayer
import torchaudio.transforms as T
from torch.cuda.amp import autocast, GradScaler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [17]:
torch.cuda.empty_cache()

In [3]:
train_dataset = SpeechCommandsDataset("./../../data/train", mode="modified")

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=6)

In [4]:
import torchaudio.transforms as T

mel_transform = T.MelSpectrogram(
    sample_rate=16000,
    n_mels=64
)

# Optional: Log-compression (better for neural nets)
log_mel_transform = torch.nn.Sequential(
    mel_transform,
    T.AmplitudeToDB()
)


In [11]:
class SpeechCommandTransformer(nn.Module):
    def __init__(self, num_classes: int, n_mels: int = 64, embed_dim: int = 32, num_layers: int = 4, num_heads: int = 4, device: torch.device = None):
        super().__init__()
        
        self.device = device if device else torch.device("cpu")
        
        self.feature_extractor = T.MelSpectrogram(
            sample_rate=16000,
            n_fft=400,
            hop_length=160,
            n_mels=n_mels
        )
        
        self.db = torchaudio.transforms.AmplitudeToDB()

        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        
        self.projection = nn.Linear(n_mels, embed_dim)  # Project mel bins to embed_dim

        self.layers = nn.ModuleList([
            TransformerEncoderLayer(
                d_model=embed_dim,
                nhead=num_heads,
                dim_feedforward=512,
                dropout=0.1,
                activation="gelu",
                batch_first=True
            )
            for _ in range(num_layers)
        ])


        self.cls_token = nn.Parameter(torch.randn(1, 1, embed_dim)) # Learnable [CLS] token

        self.classifier = nn.Linear(embed_dim, num_classes)

    def forward(self, waveforms: torch.Tensor):
        """
        waveforms: List[Tensor] or Tensor shape (batch_size, samples)
        """

        if isinstance(waveforms, list):
            waveforms = torch.nn.utils.rnn.pad_sequence(waveforms, batch_first=True)

        features = self.feature_extractor(waveforms)  # (batch_size, n_mels, time)
        features = self.db(features)  # (batch_size, n_mels, time)
        features = self.cnn(features.unsqueeze(1))  # (batch_size, 64, n_mels, time)

        # Flatten the frequency and time dims into a sequence
        batch_size, channels, freq, time = features.shape
        features = features.permute(0, 2, 3, 1)  # (batch_size, freq, time, channels)
        features = features.reshape(batch_size, freq * time, channels)  # (batch_size, seq_len, channels)
        
        x = self.projection(features)  # (batch_size, seq_len, embed_dim)


        # Add CLS token
        batch_size = x.size(0)
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # (batch_size, 1, embed_dim)
        x = torch.cat((cls_tokens, x), dim=1)  # (batch_size, 1 + time, embed_dim)

        from torch.utils.checkpoint import checkpoint_sequential
        for layer in self.layers:
            x = layer(x)
            print(f"Layer output shape: {x.shape}")

        cls_output = x[:, 0]  # take output of CLS token

        logits = self.classifier(cls_output)

        return logits

In [12]:
# Create model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SpeechCommandTransformer(num_classes=len(train_dataset.class_to_idx), device=device).to(device)

# Example batch
for waveforms, labels in train_loader:
    waveforms = waveforms.squeeze(1).to(device)  # optional
    logits = model(waveforms)  # logits shape (batch_size, num_classes)
    print(logits.shape)
    break

Layer output shape: torch.Size([8, 6465, 32])
Layer output shape: torch.Size([8, 6465, 32])
Layer output shape: torch.Size([8, 6465, 32])
Layer output shape: torch.Size([8, 6465, 32])
torch.Size([8, 12])


In [15]:
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
num_epochs = 10

In [16]:
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
scaler = torch.amp.GradScaler()
for epoch in range(num_epochs):
    # Memory usage before the epoch
    print("Memory before epoch:", torch.cuda.memory_allocated(device))

    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for waveforms, labels in pbar:
        waveforms = waveforms.squeeze(1).to(device)  # (batch_size, samples)
        waveforms.requires_grad_()
        labels = labels.to(device)

        optimizer.zero_grad()

        with torch.amp.autocast(device_type='cuda' if torch.cuda.is_available() else 'cpu'):
            outputs = model(waveforms)
            loss = criterion(outputs, labels)
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()


        running_loss += loss.item() * waveforms.size(0)

        # Compute accuracy
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        pbar.set_postfix(loss=running_loss / total, acc=100. * correct / total)

    scheduler.step()
    
    # Memory usage after each batch
    print("Memory after batch:", torch.cuda.memory_allocated(device))

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {100.*correct/total:.2f}%")
    
    # Memory usage after the epoch
    print("Memory after epoch:", torch.cuda.memory_allocated(device))
    
# After training
print("Total memory allocated:", torch.cuda.memory_allocated(device))
print("Max memory allocated:", torch.cuda.max_memory_allocated(device))

# Optionally, reset max memory stats
torch.cuda.reset_max_memory_allocated(device)

Memory before epoch: 1363348480


Epoch 1/10:   0%|          | 0/6583 [00:00<?, ?it/s]

Layer output shape: torch.Size([8, 6465, 32])
Layer output shape: torch.Size([8, 6465, 32])
Layer output shape: torch.Size([8, 6465, 32])
Layer output shape: torch.Size([8, 6465, 32])


Epoch 1/10:   0%|          | 0/6583 [01:14<?, ?it/s]


KeyboardInterrupt: 

In [21]:
import torch
import torch.nn as nn
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import DataLoader
from Dataset import SpeechCommandsDataset

class SimpleCNN(nn.Module):
    def __init__(self, num_classes: int, n_mels: int = 64):
        super(SimpleCNN, self).__init__()

        # Mel spectrogram transform
        self.mel_transform = T.MelSpectrogram(sample_rate=16000, n_mels=n_mels)
        self.db_transform = T.AmplitudeToDB()

        # Simple CNN model
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * n_mels * 8, 128)  # Adjust input size based on output of conv layers
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, waveforms: torch.Tensor):
        # Apply mel spectrogram transform and convert to decibels
        mel_spec = self.mel_transform(waveforms)  # (batch_size, n_mels, time)
        mel_spec = self.db_transform(mel_spec)  # Convert to dB scale

        # Add a channel dimension for Conv2d
        mel_spec = mel_spec.unsqueeze(1)  # (batch_size, 1, n_mels, time)

        # Convolutional layers
        x = self.pool(torch.relu(self.conv1(mel_spec)))  # (batch_size, 32, n_mels, time/2)
        x = self.pool(torch.relu(self.conv2(x)))  # (batch_size, 64, n_mels, time/4)

        # Flatten and feed through fully connected layers
        x = x.view(x.size(0), -1)  # Flatten
        x = torch.relu(self.fc1(x))  # (batch_size, 128)
        x = self.fc2(x)  # (batch_size, num_classes)

        return x


In [22]:
# Load the dataset
train_dataset = SpeechCommandsDataset("./../../data/train", mode="modified")
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)

# Create the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleCNN(num_classes=len(train_dataset.class_to_idx)).to(device)


In [23]:
import torch.optim as optim
from tqdm import tqdm

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    
    for waveforms, labels in pbar:
        waveforms = waveforms.squeeze(1).to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        # Forward pass
        outputs = model(waveforms)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Update running loss and accuracy
        running_loss += loss.item() * waveforms.size(0)
        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        pbar.set_postfix(loss=running_loss / total, acc=100. * correct / total)

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(train_loader):.4f}, Accuracy: {100.*correct/total:.2f}%")


Epoch 1/10:   0%|          | 0/1646 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x20480 and 32768x128)