In [1]:
# Set path variables
import os
import sys

cwd = os.getcwd()
project_dir = os.path.abspath(os.path.join(cwd, os.pardir))
sys.path.append(project_dir)
data_path = os.path.join(project_dir, 'data/')
print(project_dir)
print(data_path)

/Users/bonuobonuo/Documents/GitHub/Tagging-Music-Sequences
/Users/bonuobonuo/Documents/GitHub/Tagging-Music-Sequences/data/


In [21]:
# for data loading process
import torch
from src.data_loader import *
import pandas as pd
from torch.utils.data import DataLoader

# load your libraries here
import torch.nn as nn
import torch.optim as optim

from torch.nn import functional as F


In [27]:
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

# Modeling (Adjust to whatever model you want to do)

## Data loading

In [6]:
# Load label annotation csv
train_annotations = pd.read_csv(data_path + 'mtat_train_label.csv', index_col=0).reset_index(drop=True)
val_annotations = pd.read_csv(data_path + 'mtat_val_label.csv', index_col=0).reset_index(drop=True)
test_annotations = pd.read_csv(data_path + 'mtat_test_label.csv', index_col=0).reset_index(drop=True)

### FOR RAW AUDIO DATA

Set transformation parameter to None

In [7]:
# Define global parameters across all classes
DATA_DIR = data_path
SAMPLE_RATE = 16000
DURATION_IN_SEC = 30

train_data = AudioDS(annotations_file=train_annotations, 
                     data_dir=DATA_DIR, 
                     target_sample_rate=SAMPLE_RATE, 
                     target_length=DURATION_IN_SEC, 
                     transformation=None)

val_data = AudioDS(annotations_file=val_annotations,
                     data_dir=DATA_DIR,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=None)

test_data = AudioDS(annotations_file=val_annotations,
                     data_dir=DATA_DIR,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=None)

In [8]:
# Load data from created datasets
BATCH_SIZE = 64

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [9]:
# Display batch information
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([64, 1, 480000])
Labels batch shape: torch.Size([64, 50])


In [11]:
# Retrieve a sample
idx = 9
waveform = train_features[idx]
label = train_labels[idx]
decoded_labels = train_data.decode_labels(label)
file_path = train_data.get_filepath(idx)

print(f"Audio file path: {file_path}")
print(f"Label: {label}")
print(f"Decoded labels: {decoded_labels}")

Audio file path: /Users/bonuobonuo/Documents/GitHub/Tagging-Music-Sequences/data/mtat/0/american_bach_soloists-joseph_haydn__masses-04-quoniam_tu_solus__allegro-30-59.mp3
Label: tensor([ True, False, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False,  True, False, False,
        False, False, False, False, False, False, False, False, False, False])
Decoded labels: ['guitar', 'rock', 'country']


In [12]:
# shape of waveform
# first element: number of channels in our case 1
# second element: number of samples in 30 seconds audio at a sampling rate of 16000 samples/s 
# -> 480000 = 30s * 16000 samples/s
waveform.shape

torch.Size([1, 480000])

### FOR TRANSFORMED AUDIO DATA (mel spectrograms with db)

Set transformation parameter to MEL_SPEC_DB_TRANSFORMATION

In [13]:
# Define global parameters across all classes
DATA_DIR = data_path
SAMPLE_RATE = 16000
DURATION_IN_SEC = 30
MEL_SPEC_DB_TRANSFORMATION = AudioUtil.get_audio_transforms(SAMPLE_RATE)

train_data_melspec = AudioDS(annotations_file=train_annotations, 
                     data_dir=DATA_DIR, 
                     target_sample_rate=SAMPLE_RATE, 
                     target_length=DURATION_IN_SEC, 
                     transformation=MEL_SPEC_DB_TRANSFORMATION)

val_data_melspec = AudioDS(annotations_file=val_annotations,
                     data_dir=DATA_DIR,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=MEL_SPEC_DB_TRANSFORMATION)

test_data_melspec = AudioDS(annotations_file=val_annotations,
                     data_dir=DATA_DIR,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=MEL_SPEC_DB_TRANSFORMATION)

In [14]:
# Load data from created datasets
BATCH_SIZE = 64

train_dataloader_melspec = DataLoader(train_data_melspec, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader_melspec = DataLoader(val_data_melspec, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader_melspec = DataLoader(test_data_melspec, batch_size=BATCH_SIZE, shuffle=False)

In [15]:
# Display batch information
train_features, train_labels = next(iter(train_dataloader_melspec))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([64, 1, 64, 3001])
Labels batch shape: torch.Size([64, 50])


In [16]:
# Retrieve a sample
idx = 9
mel_spec = train_features[idx]
label = train_labels[idx]
decoded_labels = train_data_melspec.decode_labels(label)
file_path = train_data_melspec.get_filepath(idx)

print(f"Audio file path: {file_path}")
print(f"Label: {label}")
print(f"Decoded labels: {decoded_labels}")

Audio file path: /Users/bonuobonuo/Documents/GitHub/Tagging-Music-Sequences/data/mtat/0/american_bach_soloists-joseph_haydn__masses-04-quoniam_tu_solus__allegro-30-59.mp3
Label: tensor([False,  True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False,  True, False, False, False,
        False, False, False, False, False, False, False, False,  True, False,
        False, False, False, False, False,  True, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False])
Decoded labels: ['classical', 'violin', 'solo', 'cello']


In [17]:
# [64, 1, 64, 3001]) tells you that your DataLoader is outputting batches 
# of 64 Mel spectrograms,
# each with a single channel, 
# 64 Mel frequency bins, 
# and a sequence length of 3001 time frames
mel_spec.shape

torch.Size([1, 64, 3001])

In [23]:

# Define the CRNN model
class CRNNModel(nn.Module):
    def __init__(self):
        super(CRNNModel, self).__init__()
        
        # 2D CNN layers
        self.conv1 = nn.Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn1 = nn.BatchNorm2d(64)
        self.elu1 = nn.ELU()
        self.dropout1 = nn.Dropout2d(0.1)
        
        self.conv2 = nn.Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn2 = nn.BatchNorm2d(128)
        self.elu2 = nn.ELU()
        self.dropout2 = nn.Dropout2d(0.1)
        
        self.conv3 = nn.Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn3 = nn.BatchNorm2d(256)
        self.elu3 = nn.ELU()
        self.dropout3 = nn.Dropout2d(0.1)
        
        self.conv4 = nn.Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        self.bn4 = nn.BatchNorm2d(512)
        self.elu4 = nn.ELU()
        self.dropout4 = nn.Dropout2d(0.1)
        
        # GRU layers
        self.gru1 = nn.GRU(input_size=512, hidden_size=256, num_layers=2, batch_first=True, dropout=0.1)
        self.gru2 = nn.GRU(input_size=256, hidden_size=128, num_layers=2, batch_first=True, dropout=0.1)
        
        # Fully connected layer
        self.fc = nn.Linear(128, num_classes)
        
    def forward(self, x):
        # Apply 2D CNN layers
        x = self.elu1(self.bn1(self.conv1(x)))
        x = self.dropout1(x)
        
        x = self.elu2(self.bn2(self.conv2(x)))
        x = self.dropout2(x)
        
        x = self.elu3(self.bn3(self.conv3(x)))
        x = self.dropout3(x)
        
        x = self.elu4(self.bn4(self.conv4(x)))
        x = self.dropout4(x)
        
        # Reshape for GRU
        x = x.permute(0, 3, 1, 2)  # Change dimensions for GRU
        
        # Apply GRU layers
        x, _ = self.gru1(x)
        x, _ = self.gru2(x)
        
        # Take the output from the last time step
        x = x[:, -1, :]
        
        # Fully connected layer
        x = self.fc(x)
        
        return torch.sigmoid(x)

In [28]:
# Define hyperparameters
num_classes = 50  # Adjust this based on the number of classes in your dataset
learning_rate = 0.001
batch_size = 64
num_epochs = 10

# Instantiate the model
model = CRNNModel()

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [29]:
# Training loop
for epoch in range(num_epochs):
    # Set the model in training mode
    model.train()

    # Iterate over the training DataLoader
    for batch_idx, (inputs, targets) in enumerate(train_dataloader_melspec):
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Calculate the loss
        loss = criterion(outputs, targets)

        # Backward pass
        loss.backward()

        # Update the weights
        optimizer.step()

        # Print training statistics (optional)
        if batch_idx % 10 == 0:
            print(f'Epoch {epoch + 1}/{num_epochs}, Batch {batch_idx}/{len(train_dataloader_melspec)}, Loss: {loss.item()}')

# Optional: Save the trained model
torch.save(model.state_dict(), 'trained_model.pth')

: 