In [1]:
import torchaudio
from torch.utils.data import Dataset, DataLoader, random_split
import torch
from torch.nn.utils.rnn import pad_sequence

# Custom Dataset for MFCC
class CustomSpeechCommandsDataset(Dataset):
    """Serve ``(mfcc, label)`` pairs computed on the fly from a wrapped dataset.

    Args:
        subset_dataset: Indexable, sized collection whose items unpack as
            ``(waveform, sample_rate, label, speaker_id, utterance_number)``.
        mfcc_transform: Optional callable applied to each waveform. When left
            as ``None`` a ``torchaudio.transforms.MFCC()`` with the library
            defaults is created, which is what this class always did before.
    """

    def __init__(self, subset_dataset, mfcc_transform=None):
        self.subset_dataset = subset_dataset
        if mfcc_transform is None:
            mfcc_transform = torchaudio.transforms.MFCC()
        self.mfcc_transform = mfcc_transform

    def __getitem__(self, index):
        """Return the transformed features and the raw label for ``index``."""
        # Only the waveform and the label are used; the other fields are ignored.
        # NOTE(review): the per-item sample rate is not forwarded to the
        # transform -- confirm it matches the transform's configured rate.
        waveform, _sample_rate, label, _speaker_id, _utterance_number = self.subset_dataset[index]
        features = self.mfcc_transform(waveform)
        # Drop a leading size-1 dimension, then swap the two remaining axes.
        # Assumes a single-channel waveform so the result is (frames, coeffs)
        # -- TODO confirm for multi-channel input.
        mfcc = features.squeeze(0).transpose(0, 1)
        return mfcc, label

    def __len__(self):
        """Return the number of items in the wrapped dataset."""
        return len(self.subset_dataset)

# Initialize the dataset (download=False: no download is attempted here)
root_path = './'
speech_commands_dataset = torchaudio.datasets.SPEECHCOMMANDS(root=root_path, download=False)

# Split the dataset 80% / 10% / remainder into train / validation / test
total_size = len(speech_commands_dataset)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
# The test split absorbs whatever the two int() truncations leave over
test_size = total_size - train_size - val_size

# Seed the split so that the same samples land in the same subset on every
# run; without a fixed generator a re-run (or a reloaded checkpoint) would be
# evaluated on a different, possibly previously-trained-on, test set.
# NOTE(review): a random split does not keep speakers disjoint across subsets.
split_seed = 42
train_set, val_set, test_set = random_split(
    speech_commands_dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(split_seed),
)

# Create Custom Dataset for MFCC
train_set_mfcc = CustomSpeechCommandsDataset(train_set)
val_set_mfcc = CustomSpeechCommandsDataset(val_set)
test_set_mfcc = CustomSpeechCommandsDataset(test_set)

from torch.nn.utils.rnn import pad_sequence
import torch

# Words in the dataset; the position of a word here is its class index
words = (
    "Backward Bed Bird Cat Dog Down Eight Five "
    "Follow Forward Four Go Happy House Learn Left "
    "Marvin Nine No Off On One Right Seven Sheila "
    "Six Stop Three Tree Two Up Visual Wow Yes Zero"
).split()

# Map each lower-cased word to its integer class index
label_to_int = dict(zip(map(str.lower, words), range(len(words))))

# Update collate function to handle string labels
def collate_fn(batch):
    """Collate ``(mfcc, label)`` pairs into a padded batch and a label tensor.

    Args:
        batch: Sequence of ``(mfcc, label)`` pairs. Each ``mfcc`` is a tensor
            that ``pad_sequence`` can pad along its first dimension; each
            ``label`` is a string whose lower-cased form is a key of
            ``label_to_int``.

    Returns:
        Tuple ``(mfccs, labels)``: ``mfccs`` is the ``pad_sequence`` result
        with ``batch_first=True`` and ``labels`` is a 1-D ``torch.long``
        tensor of class indices in batch order.

    Raises:
        KeyError: if a label is missing from ``label_to_int``.
    """
    mfccs, labels = zip(*batch)
    # NOTE(review): the unpadded sequence lengths are discarded here, so
    # downstream code cannot tell padding apart from real frames.
    mfccs = pad_sequence(list(mfccs), batch_first=True)
    # Build the integer tensor directly; going through a float32 tensor and
    # then .long() silently rounds indices above 2**24.
    labels = torch.tensor(
        [label_to_int[label.lower()] for label in labels], dtype=torch.long
    )
    return mfccs, labels

# Build one DataLoader per split with the custom collate function; only the
# training loader shuffles, validation and test keep a fixed order
train_loader, val_loader, test_loader = (
    DataLoader(
        split,
        batch_size=128,
        shuffle=reshuffle,
        num_workers=2,
        collate_fn=collate_fn,
    )
    for split, reshuffle in (
        (train_set_mfcc, True),
        (val_set_mfcc, False),
        (test_set_mfcc, False),
    )
)



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence


# Conformer hyperparameters
input_dim = 40  # per-frame feature size; assumes 40-dimensional MFCC
num_heads = 4
ffn_dim = 128
num_layers = 4  # how many Conformer blocks are stacked
dropout_prob = 0.22

# Use CUDA when it is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output class per entry of the label mapping
num_classes = len(label_to_int)

# Conformer encoder followed by a linear classification head, both on `device`
conformer_model = torchaudio.models.Conformer(
    input_dim=input_dim,
    num_heads=num_heads,
    ffn_dim=ffn_dim,
    num_layers=num_layers,
    depthwise_conv_kernel_size=17,
    dropout=dropout_prob,
)
conformer_model = conformer_model.to(device)

classifier_layer = nn.Linear(input_dim, num_classes)
classifier_layer = classifier_layer.to(device)

# Cross-entropy loss; a single AdamW optimizer updates both modules
criterion = nn.CrossEntropyLoss()

optimizer = optim.AdamW(
    [*conformer_model.parameters(), *classifier_layer.parameters()],
    lr=0.008,
    weight_decay=0.02,
)


# Training and Evaluation Loop
def _forward_logits(mfcc):
    """Run the encoder and the classifier on one padded batch.

    Args:
        mfcc: Padded feature batch already placed on the target device;
            its first two dimensions are read as (batch, frames).

    Returns:
        The classifier output computed from the encoder output averaged
        over dimension 1.
    """
    # Every sequence is reported at the full padded length, created directly
    # on the batch's device to avoid a CPU allocation plus a copy.
    # NOTE(review): because true lengths are not available here, padded
    # frames are treated like real frames by the encoder and by the mean.
    lengths = torch.full(
        (mfcc.size(0),), mfcc.size(1), dtype=torch.long, device=mfcc.device
    )
    encoded, _ = conformer_model(mfcc, lengths)
    return classifier_layer(encoded.mean(dim=1))


num_epochs = 7
for epoch in range(num_epochs):
    # ---- training pass ----
    conformer_model.train()
    classifier_layer.train()
    running_loss = 0.0

    for i, (mfcc, labels) in enumerate(train_loader):
        # Move data and labels to device
        mfcc, labels = mfcc.to(device), labels.to(device)

        optimizer.zero_grad()
        output = _forward_logits(mfcc)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Print the running mean of the per-batch loss every 100 batches
        if (i + 1) % 100 == 0:
            print(
                f"Epoch {epoch+1}, Batch {i+1}, Intermediate Training Loss: {running_loss / (i+1)}"
            )

    # ---- validation pass ----
    conformer_model.eval()
    classifier_layer.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for mfcc, labels in val_loader:
            # Move data and labels to device
            mfcc, labels = mfcc.to(device), labels.to(device)

            output = _forward_logits(mfcc)
            loss = criterion(output, labels)
            val_loss += loss.item()

            # Count correct predictions for validation accuracy; argmax is
            # used instead of reading the deprecated `.data` attribute
            predicted = output.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Both losses are means over batches, not over individual samples
    print(
        f"Epoch {epoch+1}, Final Training Loss: {running_loss / len(train_loader)}, Validation Loss: {val_loss / len(val_loader)}"
    )
    print(f"Validation Accuracy: {(correct / total) * 100}%")

Epoch 1, Batch 100, Intermediate Training Loss: 2.3573558163642883
Epoch 1, Batch 200, Intermediate Training Loss: 1.7296855649352074
Epoch 1, Batch 300, Intermediate Training Loss: 1.4111414967974028
Epoch 1, Batch 400, Intermediate Training Loss: 1.217072047367692
Epoch 1, Batch 500, Intermediate Training Loss: 1.083378227829933
Epoch 1, Batch 600, Intermediate Training Loss: 0.9862859053909778
Epoch 1, Final Training Loss: 0.936647387384648, Validation Loss: 0.48994318297110406
Validation Accuracy: 85.62653562653563%
Epoch 2, Batch 100, Intermediate Training Loss: 0.4270892927050591
Epoch 2, Batch 200, Intermediate Training Loss: 0.4224137419462204
Epoch 2, Batch 300, Intermediate Training Loss: 0.41093695213397347
Epoch 2, Batch 400, Intermediate Training Loss: 0.4029212475568056
Epoch 2, Batch 500, Intermediate Training Loss: 0.39622187420725824
Epoch 2, Batch 600, Intermediate Training Loss: 0.3872585309793552
Epoch 2, Final Training Loss: 0.38277460639210265, Validation Loss: 0.

In [3]:
from sklearn.metrics import classification_report
import numpy
# Initialize lists to store true and predicted labels
y_true = []
y_pred = []

# Test evaluation
conformer_model.eval()  # Set the model to evaluation mode
classifier_layer.eval()
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for mfcc, labels in test_loader:
        # Move data and labels to device
        mfcc, labels = mfcc.to(device), labels.to(device)

        # Forward pass; every sequence is reported at the padded length and
        # the lengths tensor is created directly on the target device
        lengths = torch.full(
            (mfcc.size(0),), mfcc.size(1), dtype=torch.long, device=device
        )
        output, _ = conformer_model(mfcc, lengths)
        output = classifier_layer(output.mean(dim=1))

        # Calculate loss
        loss = criterion(output, labels)
        test_loss += loss.item()

        # Count correct predictions for test accuracy (argmax replaces the
        # deprecated `.data` access)
        predicted = output.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Store true and predicted labels for classification report
        y_true.extend(labels.cpu().tolist())
        y_pred.extend(predicted.cpu().tolist())

# The reported loss is the mean over batches
print(f"Final Test Loss: {test_loss / len(test_loader)}")
print(f"Test Accuracy: {(correct / total) * 100}%")

# Generate classification report
print("Classification Report:")
# `labels` is passed explicitly so the report always has one row per class;
# without it, a class missing from both y_true and y_pred makes the number of
# observed classes differ from len(target_names) and the call raises.
print(
    classification_report(
        y_true,
        y_pred,
        labels=list(range(num_classes)),
        target_names=[str(i) for i in range(num_classes)],
    )
)

Final Test Loss: 0.236755488687251
Test Accuracy: 92.86659108087679%
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.89      0.93       159
           1       0.90      0.92      0.91       201
           2       0.97      0.89      0.93       209
           3       0.89      0.90      0.89       206
           4       0.82      0.97      0.89       209
           5       0.94      0.91      0.93       375
           6       0.95      0.98      0.96       364
           7       0.82      0.99      0.89       417
           8       0.84      0.86      0.85       146
           9       0.91      0.74      0.82       144
          10       0.90      0.95      0.93       389
          11       0.99      0.85      0.91       372
          12       0.85      0.95      0.90       203
          13       0.98      0.90      0.94       200
          14       0.91      0.86      0.88       188
          15       0.90      0.96      0.93

In [4]:
# Destination files for the two trained modules
model_path = "conformer_model.pth"
classifier_path = "classifier_layer.pth"

# For each module, write two files: the whole pickled module at its path, and
# only its state dictionary at the same path with a "_state_dict" suffix.
# The Conformer is written first, then the classifier layer.
for _module, _path in (
    (conformer_model, model_path),
    (classifier_layer, classifier_path),
):
    torch.save(_module, _path)
    torch.save(_module.state_dict(), f"{_path}_state_dict")


In [None]:
# Option 1: load the fully pickled modules.
# These files contain whole nn.Module objects, so they cannot be read in
# weights-only mode (the default on recent PyTorch releases). Unpickling can
# execute arbitrary code: only load files you created yourself.
# map_location lets a checkpoint saved on GPU be opened on a CPU-only machine.
loaded_conformer_model = torch.load(model_path, map_location=device, weights_only=False)
loaded_classifier_layer = torch.load(classifier_path, map_location=device, weights_only=False)

# Option 2: if you saved the state dictionaries, first initialize the model
# and classifier with their respective architectures, and then load the state
# dictionaries. The architecture must match the one that was trained.
conformer_model = torchaudio.models.Conformer(
    input_dim,
    num_heads,
    ffn_dim,
    num_layers,
    # Required constructor argument (omitting it raises TypeError); 17 is the
    # kernel size used when the model was trained in this notebook.
    depthwise_conv_kernel_size=17,
    dropout=dropout_prob,
).to(device)

classifier_layer = nn.Linear(input_dim, num_classes).to(device)

conformer_model.load_state_dict(
    torch.load(f"{model_path}_state_dict", map_location=device)
)
classifier_layer.load_state_dict(
    torch.load(f"{classifier_path}_state_dict", map_location=device)
)
# Freshly built modules start in training mode; call .eval() on both before
# using them for inference.