In [1]:
import numpy as np
import os
import tensorflow as tf
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, models, transforms
import torch.nn.init as init
import matplotlib.pyplot as plt
from IPython.display import Audio
import librosa.display
import librosa
import zipfile
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from torchvision import datasets, models, transforms
from torch.optim import lr_scheduler
import time
from PIL import Image
import copy
from collections import Counter
import cv2
torch.cuda.empty_cache()

In [2]:
from melspecdataset import MelSpecDataset

In [3]:
transform = transforms.Compose([
    # transforms.Resize((224, 224)),  # Optionally resize images
    transforms.ToTensor(),            # Convert images to tensors
])


# Define the mapping from class names to class indices
class_mapping = {
    'car_horn': 1,
    'dog_barking': 2,
    'drilling': 3,
    'Fart': 4,
    'Guitar': 5,
    'Gunshot_and_gunfire': 6,
    'Hi-hat': 7,
    'Knock': 8,
    'Laughter': 9,
    'Shatter': 10,
    'siren': 11,
    'Snare_drum': 12,
    'Splash_and_splatter': 13
}

# Define the directories
train_directory = "train"
val_directory = "val"

# Create datasets
train_dataset = MelSpecDataset(train_directory, class_mapping, transform)
val_dataset = MelSpecDataset(val_directory, class_mapping, transform)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=2)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=2)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


datasets = {"train": train_dataset, "val": val_dataset}
dataloaders = {"train": train_dataloader, "val": val_dataloader}
dataset_sizes = {x: len(datasets[x]) for x in ['train', 'val']}

In [4]:
train_dataset.class_data, sum(train_dataset.class_data.values())

({'Snare_drum': 1000,
  'siren': 1000,
  'Hi-hat': 1000,
  'Gunshot_and_gunfire': 1000,
  'car_horn': 1000,
  'drilling': 1000,
  'Guitar': 1000,
  'Fart': 1000,
  'Laughter': 1000,
  'Splash_and_splatter': 1000,
  'dog_barking': 1000,
  'Shatter': 1000,
  'Knock': 1000},
 13000)

In [5]:
class SimplifiedResNet(nn.Module):
    def __init__(self, input_shape, num_classes):
        super(SimplifiedResNet, self).__init__()
        self.in_channels = 32  # Reduced number of initial channels

        # Initial convolutional layer
        self.conv1 = nn.Conv2d(3, self.in_channels, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_channels)
        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # ResNet blocks
        self.layer1 = self._make_layer(ResNetBlockSimple, self.in_channels, blocks=2, stride=1)
        self.layer2 = self._make_layer(ResNetBlockSimple, self.layer1[0].conv2.out_channels, blocks=2, stride=2)
        self.layer3 = self._make_layer(ResNetBlockSimple, self.layer2[0].conv2.out_channels, blocks=2, stride=2)

        # Global average pooling
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # Fully connected layer
        self.fc = nn.Linear(self.layer3[0].conv2.out_channels, num_classes)

    def _make_layer(self, block, out_channels, blocks, stride):
        layers = []
        layers.append(block(self.in_channels, out_channels, stride))
        self.in_channels = out_channels
        for _ in range(1, blocks):
            layers.append(block(out_channels, out_channels))
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x

class ResNetBlockSimple(nn.Module):
    def __init__(self, in_channels, out_channels, stride=1):
        super(ResNetBlockSimple, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        if stride != 1 or in_channels != out_channels:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels)
            )

    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        out = self.relu(out)
        return out



In [6]:
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        print('-' * 10)

        print("HELLO")

        #NO VALIDATION PHASE AS THAT IS HAPPENING OUT OF TRAINING [ONLY FOR HYPERPARAMETER TUNING]
        for phase in ['train']:
            model.train()  # Set model to training mode
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            index = 0
            print("STARTING ITERATION")
            for inputs, labels in dataloaders[phase]:
              print("BATCH NUMBER = ", index)

              index += 1
              optimizer.zero_grad()
              with torch.set_grad_enabled(phase == 'train'):
                  outputs = model(inputs)
                  _, preds = torch.max(outputs, 1)
                  loss = criterion(outputs, labels)

                  if phase == 'train':
                      loss.backward()
                      optimizer.step()

              running_loss += loss.item() * inputs.size(0)
              running_corrects += torch.sum(preds == labels.data)
                
            scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print(f'EPOCH: {epoch} {phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
            
        print()

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    
    return model

In [None]:
import optuna
input_shape = (128, 345, 3)
num_classes = 13  # Assuming 14 output classes

def objective(trial):
    # Define the search space
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    gamma = trial.suggest_uniform('gamma', 0.9, 0.99)

    # Instantiate the model
    model_ft = SimplifiedResNet(input_shape, num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer_ft = optim.Adam(model_ft.parameters(), lr=lr)
    exp_lr_scheduler = lr_scheduler.ExponentialLR(optimizer_ft, gamma=gamma)
    
    # Train the model
    model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler, num_epochs=5)
    
    # Evaluate on the validation set
    model_ft.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for inputs, labels in val_dataloader:
            outputs = model_ft(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total

    return accuracy

# Perform hyperparameter optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

# Get the best hyperparameters
best_lr = study.best_params['lr']
best_gamma = study.best_params['gamma']
best_accuracy = study.best_value

print(f'Best LR: {best_lr}, Best Gamma: {best_gamma}, Best Validation Accuracy: {best_accuracy}')


  from .autonotebook import tqdm as notebook_tqdm
[I 2024-03-24 17:50:28,030] A new study created in memory with name: no-name-d4caf0e1-a31d-4014-8a2f-893706faa99b


Epoch 0/4
----------
HELLO
STARTING ITERATION


  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
  gamma = trial.suggest_uniform('gamma', 0.9, 0.99)


BATCH NUMBER =  0
BATCH NUMBER =  1
BATCH NUMBER =  2


In [9]:
# Define input shape and number of classes

# Instantiate the model
model_ft = SimplifiedResNet(input_shape, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer_ft = optim.Adam(model_ft.parameters(), lr=0.001)


#DIFFERENT SCHEDULER THIS TIME
exp_lr_scheduler =  lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)