### AlexNet Variant

#### Model Architecture

In [None]:
class AlexNetVariant(nn.Module):
    """AlexNet-style CNN for single-channel 2-D inputs with 6 output classes.

    The network is five convolutions (with three max-pools, two dropouts and
    one batch-norm) followed by three fully connected layers. ``forward``
    returns log-probabilities (``log_softmax``) of shape ``(batch, 6)``.

    Attributes:
        hyperparameters: Dict supplied by the caller (or None). Its
            ``'epochs'`` entry, when present, sets the number of training
            epochs; all entries are encoded into the checkpoint file name.
        input_shape: ``[height, width]`` of the feature map after the last
            pooling layer (updated layer by layer during construction).
        train_losses / validation_losses: Per-epoch mean losses.
        current_epoch: Number of epochs completed so far.
    """

    hyperparameters = None
    input_shape = None
    current_epoch = 0

    def __init__(self, input_shape, hyperparameters = None):
        """Build the layers.

        Args:
            input_shape: Two-element sequence ``[height, width]`` of one
                input sample (without batch/channel dimensions).
            hyperparameters: Optional dict of run settings.
        """
        super(AlexNetVariant, self).__init__()

        # Copy the shape: updateShape mutates it in place and must not
        # modify the caller's list.
        self.input_shape = list(input_shape)
        self.hyperparameters = hyperparameters

        # Per-instance history (class-level lists would be shared between
        # every model instance).
        self.train_losses = []
        self.validation_losses = []
        self.current_epoch = 0

        self.conv1 = nn.Conv2d(in_channels=1, out_channels=96, kernel_size=(5, 5))
        self.updateShape(5, 1, 0)
        self.maxpool1 = nn.MaxPool2d(kernel_size=(3, 3), stride=(3, 3))
        self.updateShape(3, 3, 0)

        self.conv2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=(5, 5))
        self.updateShape(5, 1, 0)
        self.maxpool2 = nn.MaxPool2d(kernel_size=(3, 3), stride=(3, 3))
        self.updateShape(3, 3, 0)

        self.dropout1 = nn.Dropout(p=0.3)

        self.conv3 = nn.Conv2d(in_channels=256, out_channels=384, kernel_size=(3, 3))
        self.updateShape(3, 1, 0)
        self.conv4 = nn.Conv2d(in_channels=384, out_channels=384, kernel_size=(3, 3))
        self.updateShape(3, 1, 0)

        self.dropout2 = nn.Dropout(p=0.4)

        self.conv5 = nn.Conv2d(in_channels=384, out_channels=256, kernel_size=(3, 3))
        self.updateShape(3, 1, 0)
        self.maxpool3 = nn.MaxPool2d(kernel_size=(3, 3), stride=(3, 3))
        self.updateShape(3, 3, 0)

        self.bn1 = nn.BatchNorm2d(256)
        # input_shape now holds the spatial size after maxpool3; 256 is the
        # channel count produced by conv5.
        flattened = self.input_shape[0] * self.input_shape[1] * 256
        self.fc1 = nn.Linear(in_features=flattened, out_features=4096)
        self.fc2 = nn.Linear(in_features=4096, out_features=4096)
        self.fc3 = nn.Linear(in_features=4096, out_features=6)

    def forward(self, x):
        """Return log-probabilities of shape ``(batch, 6)`` for input ``x``.

        Args:
            x: Tensor of shape ``(batch, 1, height, width)``.
        """
        # The pooling layers are modules owned by this model; they are not
        # functions of torch.nn.functional.
        x = self.maxpool1(F.relu(self.conv1(x)))
        x = self.maxpool2(F.relu(self.conv2(x)))

        x = self.dropout1(x)

        x = F.relu(self.conv3(x))
        x = F.relu(self.conv4(x))
        x = self.dropout2(x)

        x = F.relu(self.conv5(x))
        x = self.maxpool3(x)
        x = self.bn1(x)
        x = torch.flatten(x, 1)  # keep the batch dimension, flatten the rest
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return F.log_softmax(self.fc3(x), dim=1) # Log softmax instead of softmax for numerical stability and faster computation

    def _epochLoss(self, loader, criterion, optimizer = None):
        """Run one pass over ``loader`` and return the mean per-sample loss.

        When ``optimizer`` is given, a gradient step is taken on every batch;
        otherwise the pass only evaluates the loss.
        """
        total = 0.0
        for _, _, melSpectogram, labels in loader:
            melSpectogram, labels = melSpectogram.to(device), labels.to(device)
            melSpectogram = melSpectogram.unsqueeze(1)  # add the channel dimension
            if optimizer is not None:
                optimizer.zero_grad()
            outputs = self(melSpectogram)
            loss = criterion(outputs, labels)
            if optimizer is not None:
                loss.backward()
                optimizer.step()
            # Weight by batch size so the final division yields a per-sample mean
            # even when the last batch is smaller.
            total += loss.item() * melSpectogram.size(0)
        return total / len(loader.dataset)

    def trainEpochs(self, criterion, optimizer, train_loader, val_loader, num_epochs = 10, lr_decay = False, lr_decay_epoch = 5, lr_decay_factor = 0.1, save_each = None):
        """Train the model, recording train/validation loss after each epoch.

        Training resumes from ``self.current_epoch``. Each batch yielded by the
        loaders must be a 4-tuple whose third and fourth items are the input
        features and the labels.

        Args:
            criterion: Loss callable taking ``(outputs, labels)``.
            optimizer: Optimizer over this model's parameters.
            train_loader: Loader for the training split.
            val_loader: Loader for the validation split.
            num_epochs: Total epoch count; used only when ``hyperparameters``
                has no ``'epochs'`` entry (that entry takes precedence).
            lr_decay: Whether to decay the learning rate periodically.
            lr_decay_epoch: Decay every this many epochs.
            lr_decay_factor: Multiplier applied to the learning rate.
            save_each: If truthy, save a checkpoint every this many epochs.
        """
        if self.hyperparameters and 'epochs' in self.hyperparameters:
            num_epochs = self.hyperparameters['epochs']

        for epoch in range(self.current_epoch, num_epochs):
            self.train()
            train_loss = self._epochLoss(train_loader, criterion, optimizer)

            self.eval()
            with torch.no_grad():
                val_loss = self._epochLoss(val_loader, criterion)

            self.train_losses.append(train_loss)
            self.validation_losses.append(val_loss)

            print("Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}".format(self.current_epoch+1, train_loss, val_loss))
            if lr_decay and (epoch+1) % lr_decay_epoch == 0:
                for param_group in optimizer.param_groups:
                    param_group['lr'] *= lr_decay_factor
            if save_each and (epoch+1) % save_each == 0:
                self.saveModel()
            self.current_epoch += 1

    def predict(self, x):
        """Return the index of the most likely class for each sample in ``x``."""
        x = self.forward(x)
        return torch.argmax(x, dim=1)

    def analyze(self, test_loader):
        """Evaluate on ``test_loader`` and display the results.

        Plots the loss history, prints a classification report and shows a
        confusion matrix. Labels are reduced with ``argmax`` over dim 1, so
        they are presumably one-hot (or per-class scores) - confirm against
        the dataset.
        """
        self.eval()
        output = []
        label = []
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for _, _, melSpectogram, labels in test_loader:
                melSpectogram, labels = melSpectogram.to(device), labels.to(device)
                melSpectogram = melSpectogram.unsqueeze(1)
                output.append(self.predict(melSpectogram))
                label.append(torch.argmax(labels, dim=1))
        output = torch.cat(output)
        label = torch.cat(label)
        self.plotHistory()
        print(classification_report(label.cpu(), output.cpu(), target_names=stringLabels))
        ConfusionMatrixDisplay(confusion_matrix(label.cpu(), output.cpu()), display_labels=stringLabels).plot()
        plt.show()

    def updateShape(self, kernel_size, stride, padding):
        """Update ``self.input_shape`` in place for a square conv/pool layer.

        Applies ``(size - kernel_size + 2*padding) // stride + 1`` to both
        spatial dimensions.
        """
        self.input_shape[0] = (self.input_shape[0] - kernel_size + 2*padding) // stride + 1
        self.input_shape[1] = (self.input_shape[1] - kernel_size + 2*padding) // stride + 1

    def plotHistory(self):
        """Plot the recorded training and validation loss per epoch."""
        plt.plot(self.train_losses, label = "Train")
        plt.plot(self.validation_losses, label = "Validation")
        plt.xlabel("Epoch")
        plt.ylabel("Loss")
        plt.legend()
        plt.show()

    def saveModel(self):
        """Save the state dict under ``models/``.

        The file name encodes every hyperparameter and the 1-based number of
        the epoch being completed (this is called before ``current_epoch`` is
        incremented in ``trainEpochs``).
        """
        import os

        path = "models/AlexNetVariant_"
        # Tolerate a model constructed without hyperparameters.
        for key, value in (self.hyperparameters or {}).items():
            path += key + "_" + str(value) + "_"
        path += "current_epoch_" + str(self.current_epoch +1)
        path += ".pt"
        # torch.save does not create missing parent directories.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        torch.save(self.state_dict(), path)

#### Running the Model

In [None]:
# Spectrogram settings forwarded to getFeatures.
N_FFT = 512
HOP_SIZE = 160
N_MELS = 40

# Flags forwarded to getFeatures (presumably data-augmentation toggles -
# confirm against getFeatures); all disabled for this run.
noise = False
time_shift = False
change_speed = False
pitch_shift = False
volume_scale = False


x_train, x_val, x_test, y_train, y_val, y_test = getFeatures(
    files=files,
    labels=labels,
    n_fft=N_FFT,
    hop_size=HOP_SIZE,
    n_mels=N_MELS,
    noise=noise,
    time_shift=time_shift,
    change_speed=change_speed,
    pitch_shift=pitch_shift,
    volume_scale=volume_scale,
)

train_dataset = AudioDataset(x_train, y_train)
val_dataset = AudioDataset(x_val, y_val)
test_dataset = AudioDataset(x_test, y_test)

# NOTE(review): the line below was a stray, indented fragment at module level.
# It raised an IndentationError and referenced `self` / `data`, which are not
# defined in this cell. It looks like it belongs inside a dataset class -
# confirm and move it there if it is still needed.
# self.zcr = torch.tensor([data[i][0] for i in range(len(data))], dtype=torch.float32)


In [None]:
# Run configuration.
BATCH_SIZE = 128
EPOCHS = 100
LEARNING_RATE = 1e-5
LR_DECAY = True
LR_DECAY_EPOCH = 30
LR_DECAY_FACTOR = 0.25
WEIGHT_DECAY = 1e-2
SAVE_EACH = 5

# Passed to the model: 'epochs' sets the training length and every entry is
# encoded into the checkpoint file name.
hyperparameters = {"batch_size": BATCH_SIZE, "epochs": EPOCHS, "learning_rate": LEARNING_RATE, "lr_decay": LR_DECAY, "lr_decay_epoch": LR_DECAY_EPOCH, "lr_decay_factor": LR_DECAY_FACTOR, "weight_decay": WEIGHT_DECAY}

# Only the training data needs shuffling; evaluation metrics do not depend on
# sample order.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# The third item of each sample is used as the model input, so its 2-D shape
# defines the network's input size.
model = AlexNetVariant([x_train[0][2].shape[0], x_train[0][2].shape[1]], hyperparameters = hyperparameters).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay= WEIGHT_DECAY)
# Train on the training split; the test split is reserved for the final
# evaluation below so that the reported metrics are not contaminated.
model.trainEpochs(criterion, optimizer, train_dataloader, val_dataloader, lr_decay=LR_DECAY, lr_decay_epoch=LR_DECAY_EPOCH, lr_decay_factor=LR_DECAY_FACTOR, save_each = SAVE_EACH)
model.analyze(test_dataloader)