In [1]:
import torchaudio
from torch.utils.data import Dataset, DataLoader, random_split
import torch
from torch.nn.utils.rnn import pad_sequence

# Custom Dataset for MFCC
class CustomSpeechCommandsDataset(Dataset):
    """Dataset wrapper that serves ``(mfcc, label)`` pairs.

    Every item of the wrapped dataset is unpacked as the 5-tuple
    ``(waveform, sample_rate, label, speaker_id, utterance_number)``; only the
    waveform and the label are used.
    """

    def __init__(self, subset_dataset):
        """Keep a reference to ``subset_dataset`` and build a default MFCC transform."""
        self.mfcc_transform = torchaudio.transforms.MFCC()
        self.subset_dataset = subset_dataset

    def __len__(self):
        """Return the number of items in the wrapped dataset."""
        return len(self.subset_dataset)

    def __getitem__(self, index):
        """Return ``(mfcc, label)`` for the item stored at ``index``.

        The transform output has its leading dimension squeezed away and its
        two remaining leading axes swapped.
        """
        sample = self.subset_dataset[index]
        waveform, _sample_rate, label, _speaker_id, _utterance_number = sample
        features = self.mfcc_transform(waveform)
        # assumes a single-channel waveform, giving (time, n_mfcc) — TODO confirm
        features = features.squeeze(0)
        return features.transpose(0, 1), label

# Initialize the dataset. download=False means nothing is fetched here, so the
# data presumably has to exist under root_path already — verify before running.
root_path = './'
speech_commands_dataset = torchaudio.datasets.SPEECHCOMMANDS(root=root_path, download=False)

# Split the dataset 80% / 10% / 10%; the test split absorbs the rounding remainder
total_size = len(speech_commands_dataset)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size

# Seed the split so that train/val/test membership is identical after every
# kernel restart; an unseeded split makes the reported metrics irreproducible.
SPLIT_SEED = 42
train_set, val_set, test_set = random_split(
    speech_commands_dataset,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Create Custom Dataset for MFCC (one wrapper per split)
train_set_mfcc = CustomSpeechCommandsDataset(train_set)
val_set_mfcc = CustomSpeechCommandsDataset(val_set)
test_set_mfcc = CustomSpeechCommandsDataset(test_set)

from torch.nn.utils.rnn import pad_sequence
import torch

# Vocabulary of spoken words; a word's position in this list is its class index
words = [
    "Backward", "Bed", "Bird", "Cat", "Dog",
    "Down", "Eight", "Five", "Follow", "Forward",
    "Four", "Go", "Happy", "House", "Learn",
    "Left", "Marvin", "Nine", "No", "Off",
    "On", "One", "Right", "Seven", "Sheila",
    "Six", "Stop", "Three", "Tree", "Two",
    "Up", "Visual", "Wow", "Yes", "Zero",
]

# Lookup table: lower-cased word -> integer class index
label_to_int = dict(zip((word.lower() for word in words), range(len(words))))

# Update collate function to handle string labels
def collate_fn(batch):
    """Collate ``(mfcc, label)`` pairs into a padded feature batch and a label tensor.

    Args:
        batch: sequence of ``(mfcc, label)`` pairs, where ``label`` is a string
            whose lower-cased form is a key of ``label_to_int``.

    Returns:
        A tuple ``(mfccs, labels)``. ``mfccs`` is the result of ``pad_sequence``
        with ``batch_first=True``, so shorter sequences are padded up to the
        longest one in the batch. ``labels`` is a 1-D ``torch.long`` tensor of
        class indices in batch order.

    Raises:
        KeyError: if a label is not present in ``label_to_int``.
    """
    mfccs, labels = zip(*batch)
    mfccs = pad_sequence(mfccs, batch_first=True)
    # Build the int64 tensor directly; torch.Tensor(...) would create a float32
    # tensor first and only then cast it, which is lossy for large integers.
    labels = torch.tensor(
        [label_to_int[label.lower()] for label in labels], dtype=torch.long
    )
    return mfccs, labels

# Build one DataLoader per split with the custom collate function; only the
# training loader shuffles its samples.
_loader_options = {"batch_size": 128, "num_workers": 2, "collate_fn": collate_fn}
train_loader = DataLoader(train_set_mfcc, shuffle=True, **_loader_options)
val_loader = DataLoader(val_set_mfcc, shuffle=False, **_loader_options)
test_loader = DataLoader(test_set_mfcc, shuffle=False, **_loader_options)



In [None]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

# --- Conformer model configuration ---
input_dim = 40   # feature size fed to the model; assumes 40-dimensional MFCC
num_heads = 4    # attention heads
ffn_dim = 128    # feed-forward dimension
num_layers = 4   # number of Conformer blocks

# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One output class per entry of the label lookup table
num_classes = len(label_to_int)
def _forward_logits(conformer_model, classifier_layer, mfcc):
    """Run the encoder and the linear classifier on a padded batch; return logits.

    Every sequence is declared to have the full padded length ``mfcc.size(1)``.
    NOTE(review): zero-padded frames therefore take part in the encoder and in
    the average over dim 1 below. Passing true lengths would require the
    collate function to return them — confirm whether this is intended.
    """
    lengths = torch.full(
        (mfcc.size(0),), mfcc.size(1), dtype=torch.long, device=mfcc.device
    )
    # The second value returned by the encoder is not needed here.
    encoded, _ = conformer_model(mfcc, lengths)
    return classifier_layer(encoded.mean(dim=1))


def _validation_accuracy(conformer_model, classifier_layer, loader):
    """Return accuracy in percent over ``loader`` (0.0 if it yields no samples)."""
    conformer_model.eval()
    classifier_layer.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for mfcc, labels in loader:
            mfcc, labels = mfcc.to(device), labels.to(device)
            logits = _forward_logits(conformer_model, classifier_layer, mfcc)
            predicted = logits.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        # Avoid ZeroDivisionError on an empty validation loader.
        return 0.0
    return (correct / total) * 100


# Function to be optimized
def objective(trial):
    """Optuna objective: train a Conformer classifier and score it on validation data.

    Samples ``learning_rate``, ``dropout_prob`` and ``weight_decay`` from the
    trial, trains a fresh model on ``train_loader`` for a fixed number of
    epochs, and reports the validation accuracy to the trial after each epoch.

    Args:
        trial: object providing ``suggest_float``, ``report`` and
            ``should_prune`` (an Optuna trial).

    Returns:
        Validation accuracy in percent measured after the last epoch.

    Raises:
        optuna.exceptions.TrialPruned: when ``trial.should_prune()`` is true
            after an epoch's accuracy has been reported.
    """
    # Hyperparameters to be optimized
    learning_rate = trial.suggest_float("learning_rate", 0.008, 0.016)
    dropout_prob = trial.suggest_float("dropout_prob", 0.24, 0.29)
    weight_decay = trial.suggest_float("weight_decay", 0.005, 0.2)

    # Fresh encoder and classification head for every trial, moved to the device
    conformer_model = torchaudio.models.Conformer(
        input_dim,
        num_heads,
        ffn_dim,
        num_layers,
        depthwise_conv_kernel_size=17,
        dropout=dropout_prob
    ).to(device)
    classifier_layer = nn.Linear(input_dim, num_classes).to(device)

    # Loss and optimizer; a single optimizer updates both modules
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(
        list(conformer_model.parameters()) + list(classifier_layer.parameters()),
        lr=learning_rate,
        weight_decay=weight_decay
    )

    # Training and evaluation loop
    num_epochs = 5
    val_accuracy = 0.0
    for epoch in range(num_epochs):
        conformer_model.train()
        classifier_layer.train()

        for mfcc, labels in train_loader:
            mfcc, labels = mfcc.to(device), labels.to(device)

            optimizer.zero_grad()
            logits = _forward_logits(conformer_model, classifier_layer, mfcc)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

        # Validation: the per-epoch accuracy is the intermediate value used for pruning
        val_accuracy = _validation_accuracy(conformer_model, classifier_layer, val_loader)
        trial.report(val_accuracy, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return val_accuracy

def _print_study_summary(study):
    """Print the trial count and the best trial's value and parameters.

    Returns the best trial so the caller can keep a reference to it.
    """
    print("Number of finished trials: ", len(study.trials))
    print("Best trial:")
    best = study.best_trial

    print(f"Value: {best.value}")
    print("Params: ")
    for name, setting in best.params.items():
        print(f"    {name}: {setting}")
    return best


if __name__ == "__main__":
    # Maximize the value returned by `objective` over 100 trials.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)

    trial = _print_study_summary(study)

[I 2023-10-18 15:30:25,892] A new study created in memory with name: no-name-a67c5931-7412-4fd6-acf7-9de4be9c53a6
[I 2023-10-18 15:34:25,655] Trial 0 finished with value: 88.31033831033831 and parameters: {'learning_rate': 0.013522788412423428, 'dropout_prob': 0.28233408393227505, 'weight_decay': 0.08817480045278472}. Best is trial 0 with value: 88.31033831033831.
[I 2023-10-18 15:38:52,607] Trial 1 finished with value: 86.60933660933661 and parameters: {'learning_rate': 0.015449809523678238, 'dropout_prob': 0.28126336330952456, 'weight_decay': 0.12963409056935382}. Best is trial 0 with value: 88.31033831033831.
[I 2023-10-18 15:43:22,030] Trial 2 finished with value: 89.70893970893971 and parameters: {'learning_rate': 0.011559886236620917, 'dropout_prob': 0.2596110278181936, 'weight_decay': 0.04678156861583808}. Best is trial 2 with value: 89.70893970893971.
[I 2023-10-18 15:47:45,084] Trial 3 finished with value: 87.74333774333775 and parameters: {'learning_rate': 0.00928752329484136

In [None]:
from sklearn.metrics import classification_report
import numpy
# Initialize lists to store true and predicted labels
y_true = []
y_pred = []

# objective() keeps its models as local variables, so no trained model exists
# at notebook scope. Rebuild one with the best hyperparameters found by the
# study and train it here, so this cell works after Restart & Run All.
best_params = study.best_params
conformer_model = torchaudio.models.Conformer(
    input_dim,
    num_heads,
    ffn_dim,
    num_layers,
    depthwise_conv_kernel_size=17,
    dropout=best_params["dropout_prob"],
).to(device)
classifier_layer = nn.Linear(input_dim, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(
    list(conformer_model.parameters()) + list(classifier_layer.parameters()),
    lr=best_params["learning_rate"],
    weight_decay=best_params["weight_decay"],
)


def _final_logits(mfcc):
    """Return class logits for a padded batch, declaring every sequence full length."""
    lengths = torch.full(
        (mfcc.size(0),), mfcc.size(1), dtype=torch.long, device=mfcc.device
    )
    encoded, _ = conformer_model(mfcc, lengths)
    return classifier_layer(encoded.mean(dim=1))


FINAL_EPOCHS = 5  # same number of epochs that objective() trains for
for epoch in range(FINAL_EPOCHS):
    conformer_model.train()
    classifier_layer.train()
    for mfcc, labels in train_loader:
        mfcc, labels = mfcc.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(_final_logits(mfcc), labels)
        loss.backward()
        optimizer.step()

# Test evaluation
conformer_model.eval()  # Set the model to evaluation mode
classifier_layer.eval()
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():
    for mfcc, labels in test_loader:
        # Move data and labels to device
        mfcc, labels = mfcc.to(device), labels.to(device)

        # Forward pass
        output = _final_logits(mfcc)

        # Accumulate the per-batch loss; it is averaged over batches below
        test_loss += criterion(output, labels).item()

        # Count correct predictions for test accuracy
        predicted = output.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Store true and predicted labels for classification report
        y_true.extend(labels.cpu().tolist())
        y_pred.extend(predicted.cpu().tolist())

print(f"Final Test Loss: {test_loss / len(test_loader)}")
print(f"Test Accuracy: {(correct / total) * 100}%")

# Generate classification report. Passing `labels` keeps the report aligned
# with `target_names` even if some class never occurs in y_true / y_pred;
# without it a missing class makes the name count mismatch and raises.
print("Classification Report:")
print(
    classification_report(
        y_true,
        y_pred,
        labels=list(range(num_classes)),
        target_names=words,  # words[i] is the word whose class index is i
        zero_division=0,
    )
)