In [19]:
# Set path variables
import os
import sys
cwd = os.getcwd()
project_dir = os.path.abspath(os.path.join(cwd, os.pardir))
sys.path.append(project_dir)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from torchsummary import summary

from src import audio_util
from src.audio_dataset import AudioDS
from src.trainer import Trainer
from src.model_yc import CRNN

In [22]:
# Check if MPS is available
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS device.")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    print("CUDA is not available. Using CPU device.")
    device = torch.device("cpu")

print("Selected device:", device)

Using MPS device.
Selected device: mps


In [3]:
# Load label annotation csv
train_annotations = 'mtat_train_label.csv'
val_annotations = 'mtat_val_label.csv'
test_annotations = 'mtat_test_label.csv'

# Data path
from pathlib import Path
cwd = Path.cwd()
DATA_DIR = cwd.parent / 'data'

In [4]:
# Transformations on dataset
SAMPLE_RATE = 16000
DURATION_IN_SEC = 29.1
MEL_SPEC_DB_TRANSFORMATION = audio_util.get_audio_transforms(SAMPLE_RATE,
                                                            n_fft=512,
                                                            hop_length=256,
                                                            n_mels=96,
                                                            top_db=80)

train_data = AudioDS(annotations_file=train_annotations,
                     data_dir=DATA_DIR,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=MEL_SPEC_DB_TRANSFORMATION)

val_data = AudioDS(annotations_file=val_annotations,
                   data_dir=DATA_DIR,
                   target_sample_rate=SAMPLE_RATE,
                   target_length=DURATION_IN_SEC,
                   transformation=MEL_SPEC_DB_TRANSFORMATION)

test_data = AudioDS(annotations_file=val_annotations,
                    data_dir=DATA_DIR,
                    target_sample_rate=SAMPLE_RATE,
                    target_length=DURATION_IN_SEC,
                    transformation=MEL_SPEC_DB_TRANSFORMATION)

In [5]:
# Hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 0.001
EPOCHS = 10

In [6]:
train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [7]:
# Display batch information
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([32, 1, 96, 1819])
Labels batch shape: torch.Size([32, 50])


### CRNN Model

Add description of architecture / reference

In [15]:
# Instantiate model
crnn = CRNN()

# Instantiate trainer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(crnn.parameters(), lr=LEARNING_RATE)

trainer = Trainer(crnn, train_dataloader, val_dataloader, criterion, optimizer, device=device)

In [9]:
input_size = (train_features.size()[1:])
print(summary(crnn, input_size))

RuntimeError: input.size(-1) must be equal to input_size. Expected 512, got 174624

In [13]:
# Run training
trainer.train(epochs=EPOCHS)

Training: 100%|██████████| 10/10 [1:09:14<00:00, 415.44s/it, epoch=10, training loss=0.124, validation loss=0.142]


In [14]:
trainer.evaluate(test_dataloader)

(0.1423581662494324,
 0.08595194085027727,
 0.8910117290756215,
 0.4172853250163826)

In [15]:
# Save model
path = '../models/spec_crnn.pth'
trainer.save_model(path)

In [11]:
# Load a model
# path = '../models/MODELNAME.pth'
# trainer.load_model(path)
# print(trainer.history)

{'train_loss': [], 'train_accuracy': [], 'train_roc_auc': [], 'train_pr_auc': [], 'val_loss': [], 'val_accuracy': [], 'val_roc_auc': [], 'val_pr_auc': []}
