In [1]:
# Set path variables
import os
import sys
# Make the parent of the working directory importable; the `src` package
# imported below is expected to live there.
cwd = os.getcwd()
project_dir = os.path.abspath(os.path.join(cwd, os.pardir))
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if project_dir not in sys.path:
    sys.path.append(project_dir)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

from torchsummary import summary

from src import audio_util
from src.audio_dataset import AudioDS
from src.trainer import Trainer
from src.model_yc import CRNN1, CRNN2

In [2]:
# Pick the compute device: CUDA when present, otherwise CPU.
# (An MPS branch was sketched here earlier but is disabled, so only CUDA and
# CPU are considered.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu":
    print("CUDA is not available. Using CPU device.")

print("Selected device:", device)

CUDA is not available. Using CPU device.
Selected device: cpu


In [3]:
# Annotation CSV file names, one per split (train / validation / test)
train_annotations, val_annotations, test_annotations = (
    'mtat_train_label.csv',
    'mtat_val_label.csv',
    'mtat_test_label.csv',
)

# Data root: the `data` directory that sits beside the working directory.
# Note that `cwd` is rebound here to a Path (it was a plain string earlier).
from pathlib import Path
cwd = Path.cwd()
DATA_DIR = cwd.parent.joinpath('data')

In [4]:
# Audio preprocessing settings shared by every split.
SAMPLE_RATE = 16000     # handed to AudioDS as target_sample_rate
DURATION_IN_SEC = 29.1  # handed to AudioDS as target_length

# Transform applied to every clip. Presumably a mel-spectrogram -> dB
# pipeline, going by the helper's name and arguments; see src.audio_util
# for the exact steps.
MEL_SPEC_DB_TRANSFORMATION = audio_util.get_audio_transforms(SAMPLE_RATE,
                                                            n_fft=512,
                                                            hop_length=256,
                                                            n_mels=96,
                                                            top_db=80)

# One dataset per split; only the annotation file differs between them.
train_data = AudioDS(annotations_file=train_annotations,
                     data_dir=DATA_DIR,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=MEL_SPEC_DB_TRANSFORMATION)

val_data = AudioDS(annotations_file=val_annotations,
                   data_dir=DATA_DIR,
                   target_sample_rate=SAMPLE_RATE,
                   target_length=DURATION_IN_SEC,
                   transformation=MEL_SPEC_DB_TRANSFORMATION)

# Fix: the test split must read the test annotations. It previously reused
# val_annotations, so "test" evaluation silently ran on the validation split.
test_data = AudioDS(annotations_file=test_annotations,
                    data_dir=DATA_DIR,
                    target_sample_rate=SAMPLE_RATE,
                    target_length=DURATION_IN_SEC,
                    transformation=MEL_SPEC_DB_TRANSFORMATION)

In [5]:
# Training configuration used by the cells below
BATCH_SIZE = 32        # batch size for every DataLoader
LEARNING_RATE = 1e-3   # optimizer learning rate
EPOCHS = 10            # number of epochs passed to trainer.train

In [6]:
# Batch each split; only the training loader shuffles its samples.
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_dataloader = DataLoader(
    dataset=val_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
)
test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [7]:
# Pull a single training batch and report the feature / label tensor shapes
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.shape}")
print(f"Labels batch shape: {train_labels.shape}")

Feature batch shape: torch.Size([32, 1, 96, 1819])
Labels batch shape: torch.Size([32, 50])


### CRNN Model

**TODO:** Add a description of the CRNN architecture and a reference to its source.

In [8]:
# Loss: BCE-with-logits, i.e. the model is expected to output raw logits
criterion = nn.BCEWithLogitsLoss()

# Model, moved onto the selected device
crnn = CRNN1()
crnn.to(device)

# Adam over all model parameters
optimizer = optim.Adam(params=crnn.parameters(), lr=LEARNING_RATE)

# Trainer wires the model, the train/validation loaders, loss and optimizer together
trainer = Trainer(
    crnn,
    train_dataloader,
    val_dataloader,
    criterion,
    optimizer,
    device=device,
)

In [9]:
# Per-sample input shape: the batch shape with its leading batch dimension dropped
input_size = train_features.shape[1:]
print(input_size)

torch.Size([1, 96, 1819])


In [10]:
# Model summary via torchsummary, currently disabled.
# NOTE(review): the saved output below appears to come from an earlier run with
# this line enabled; it is mostly debug prints (layer output shapes and a raw
# tensor dump), presumably from the model's forward pass. Confirm, then clear it.
#print(summary(crnn, input_size))

torch.Size([2, 1, 96, 1819])
conv1 output shape: torch.Size([2, 64, 96, 1819])
conv2 output shape: torch.Size([2, 128, 96, 1819])
conv3 output shape: torch.Size([2, 256, 96, 1819])
conv4 output shape: torch.Size([2, 512, 96, 1819])
torch.Size([2, 174624, 512])
tensor([[[-0.0162, -0.0226, -0.0756,  ..., -0.1116, -0.0348, -0.0389],
         [-0.1095,  0.0015, -0.0368,  ..., -0.1866, -0.0758, -0.0047],
         [-0.0916, -0.0874,  0.0142,  ..., -0.2438,  0.0149,  0.1337],
         ...,
         [ 0.0324,  0.2763, -0.2798,  ...,  0.0146, -0.1977,  0.0671],
         [ 0.0197,  0.2777, -0.2286,  ..., -0.0194, -0.1925,  0.0595],
         [ 0.0266,  0.1213, -0.1164,  ...,  0.0738, -0.1751,  0.0830]],

        [[-0.0453, -0.0666, -0.1230,  ..., -0.0122, -0.0248,  0.0468],
         [ 0.0492,  0.0163, -0.1377,  ...,  0.0115, -0.1339,  0.1033],
         [ 0.0690,  0.0697, -0.1436,  ..., -0.1535, -0.1475,  0.2917],
         ...,
         [-0.2315,  0.3587, -0.3228,  ...,  0.1154, -0.0143,  0.1384],

In [11]:
# Run training for EPOCHS epochs.
# NOTE(review): the saved output below ends in KeyboardInterrupt with the
# progress bar still at 0/10 after about two minutes, so this recorded run never
# completed an epoch. Re-run to completion before relying on the evaluate/save
# cells that follow.
trainer.train(epochs=EPOCHS)

Training:   0%|          | 0/10 [00:00<?, ?it/s]

torch.Size([32, 1, 96, 1819])
conv1 output shape: torch.Size([32, 64, 96, 1819])
conv2 output shape: torch.Size([32, 128, 96, 1819])


Training:   0%|          | 0/10 [02:04<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Run the trainer's evaluation over test_dataloader. What is computed and
# reported is defined by Trainer.evaluate in src.trainer.
trainer.evaluate(test_dataloader)

In [None]:
# Save model
# The path is relative, so it resolves against the notebook's working directory.
# NOTE(review): whether save_model creates ../models when it is missing is not
# visible here; confirm in src.trainer.
path = '../models/spec_crnn.pth'
trainer.save_model(path)

In [None]:
# Load a model
# Disabled template: replace MODELNAME with a saved checkpoint file name, then
# uncomment the lines below.
# path = '../models/MODELNAME.pth'
# trainer.load_model(path)
# print(trainer.history)