In [None]:
import sys

sys.path.append("./../../src")


from Dataset import SpeechCommandsDataset
from torch.utils.data import DataLoader
import torch
import torch.nn as nn


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
train_dataset = SpeechCommandsDataset("./../../data/train")

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=6)

In [None]:
class SpeechCommandRNN(nn.Module):
    def __init__(self, input_size=40, hidden_size=128, num_layers=2, num_classes=35):
        super(SpeechCommandRNN, self).__init__()

        # Apply a 1D Conv to reduce dimensionality and extract features
        self.conv = nn.Sequential(
            nn.Conv1d(1, 32, kernel_size=80, stride=40),  # (16000 -> 400)
            nn.ReLU(),
            nn.Conv1d(32, input_size, kernel_size=3, padding=1),
            nn.ReLU(),
        )

        # RNN
        self.rnn = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )

        # Output layer
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        x = x.squeeze(1)
        x = x.unsqueeze(1)  # (batch, 1, 16000)
        x = self.conv(x)  # (batch, input_size, time_steps)
        x = x.permute(0, 2, 1)  # (batch, time_steps, input_size)
        out, _ = self.rnn(x)  # (batch, time_steps, hidden_size*2)
        out = out[:, -1, :]  # Use last timestep
        out = self.fc(out)  # (batch, num_classes)
        return out

In [10]:
model = SpeechCommandRNN(num_classes=30).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    model.train()
    for waveforms, labels in train_loader:
        waveforms, labels = waveforms.to(device), labels.to(device)
        outputs = model(waveforms)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 3.4002
Epoch 2, Loss: 3.3987
Epoch 3, Loss: 3.4656
Epoch 4, Loss: 0.9384
Epoch 5, Loss: 0.9320
Epoch 6, Loss: 0.5614
Epoch 7, Loss: 0.4878
Epoch 8, Loss: 0.5486
Epoch 9, Loss: 0.1285
Epoch 10, Loss: 0.4356


In [11]:
train_dataset = SpeechCommandsDataset("./../../data/train")

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=12)

model = SpeechCommandRNN(num_classes=30).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    model.train()
    for waveforms, labels in train_loader:
        waveforms, labels = waveforms.to(device), labels.to(device)
        outputs = model(waveforms)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 3.4109
Epoch 2, Loss: 3.3893
Epoch 3, Loss: 3.3449
Epoch 4, Loss: 1.6897
Epoch 5, Loss: 0.4870
Epoch 6, Loss: 0.8981
Epoch 7, Loss: 0.4208
Epoch 8, Loss: 0.3664
Epoch 9, Loss: 0.2690
Epoch 10, Loss: 0.2942


In [16]:
train_dataset = SpeechCommandsDataset("./../../data/train")

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=12,
    pin_memory=True,
    persistent_workers=True,
)

model = SpeechCommandRNN(num_classes=30).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):
    model.train()
    for waveforms, labels in train_loader:
        waveforms, labels = waveforms.to(device), labels.to(device)
        outputs = model(waveforms)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 3.3854
Epoch 2, Loss: 3.3965
Epoch 3, Loss: 3.4031
Epoch 4, Loss: 3.3818
Epoch 5, Loss: 3.4566
Epoch 6, Loss: 1.8159
Epoch 7, Loss: 1.3793
Epoch 8, Loss: 0.4796
Epoch 9, Loss: 0.7491
Epoch 10, Loss: 0.6969


I check 128 previously and now 32, 64 seems like a sweet spot might test it later

In [17]:
test_dataset = SpeechCommandsDataset("./../../data/test")
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for waveforms, labels in dataloader:
            waveforms, labels = waveforms.to(device), labels.to(device)
            outputs = model(waveforms)  # shape: (batch_size, num_classes)
            _, predicted = torch.max(outputs, 1)  # Get predicted class index
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return accuracy


test_accuracy = evaluate(model, test_loader, device)
print(f"Test Accuracy: {test_accuracy:.2%}")

Test Accuracy: 77.22%


Hmm its not even best model and it has 77% accuracy. Am I genius?

In [20]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt


def get_all_preds(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for waveforms, labels in dataloader:
            waveforms, labels = waveforms.to(device), labels.to(device)
            outputs = model(waveforms)
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return all_labels, all_preds


true_labels, pred_labels = get_all_preds(model, test_loader, device)

# Optional: get label names
label_names = list(test_dataset.class_to_idx.keys())

# Compute confusion matrix
cm = confusion_matrix(true_labels, pred_labels)

# Plot it
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_names)
fig, ax = plt.subplots(figsize=(12, 12))
disp.plot(include_values=True, xticks_rotation="vertical", ax=ax, cmap="Blues")
plt.title("Confusion Matrix")
plt.show()

AttributeError: 'SpeechCommandsDataset' object has no attribute 'class_to_idx'