In [1]:
# Set path variables
import os
import sys

cwd = os.getcwd()
project_dir = os.path.abspath(os.path.join(cwd, os.pardir))
sys.path.append(project_dir)
data_path = os.path.join(project_dir, 'data/')
print(project_dir)
print(data_path)

/home/seuh/Tagging-Music-Sequences
/home/seuh/Tagging-Music-Sequences/data/


In [2]:
# for data loading process
from src.data_loader import *
from torch.utils.data import DataLoader
import pandas as pd

import math
import numpy as np

import torch
from torch import nn
import torch.optim as optim
from torch.autograd import Variable
import os
from tqdm import tqdm

In [3]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


# CNN+Attention

## Data loading

In [4]:
# Load label annotation csv
train_annotations = pd.read_csv(data_path + 'mtat_train_label.csv', index_col=0).reset_index(drop=True)
val_annotations = pd.read_csv(data_path + 'mtat_val_label.csv', index_col=0).reset_index(drop=True)
test_annotations = pd.read_csv(data_path + 'mtat_test_label.csv', index_col=0).reset_index(drop=True)

### FOR RAW AUDIO DATA

Set transformation parameter to None

In [50]:
# Define global parameters across all classes
DATA_DIR = data_path
SAMPLE_RATE = 16000
DURATION_IN_SEC = 15

train_data = AudioDS(annotations_file=train_annotations, 
                     data_dir=DATA_DIR, 
                     target_sample_rate=SAMPLE_RATE, 
                     target_length=DURATION_IN_SEC, 
                     transformation=None)

val_data = AudioDS(annotations_file=val_annotations,
                     data_dir=DATA_DIR,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=None)

test_data = AudioDS(annotations_file=val_annotations,
                     data_dir=DATA_DIR,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=None)

In [66]:
# Load data from created datasets
BATCH_SIZE = 16

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [52]:
# Display batch information
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([64, 1, 240000])
Labels batch shape: torch.Size([64, 50])


In [53]:
# Retrieve a sample
idx = 9
waveform = train_features[idx]
label = train_labels[idx]
decoded_labels = train_data.decode_labels(label)
file_path = train_data.get_filepath(idx)

print(f"Audio file path: {file_path}")
print(f"Label: {label}")
print(f"Decoded labels: {decoded_labels}")

Audio file path: /home/seuh/Tagging-Music-Sequences/data/mtat/0/american_bach_soloists-joseph_haydn__masses-04-quoniam_tu_solus__allegro-30-59.mp3
Label: tensor([ True, False, False, False, False,  True, False, False, False,  True,
        False,  True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False, False, False, False])
Decoded labels: ['guitar', 'vocal', 'rock', 'male', 'bass']


In [54]:
# shape of waveform
# first element: number of channels in our case 1
# second element: number of samples in 30 seconds audio at a sampling rate of 16000 samples/s 
# -> 480000 = 30s * 16000 samples/s
waveform.shape

torch.Size([1, 240000])

## Build Model

### Front-end CNN+waveform

In [55]:
class Conv1(nn.Module):
    def __init__(self, input_channels, output_channels):
        super(Conv1, self).__init__()
        self.conv = nn.Conv1d(input_channels, output_channels, 1)
        self.bn = nn.BatchNorm1d(output_channels)
        self.relu = nn.ReLU()
    def forward(self, x):
        out = self.relu(self.bn(self.conv(x)))
        return out


class Conv7(nn.Module):
    def __init__(self, input_channels, output_channels):
        super(Conv7, self).__init__()
        self.conv = nn.Conv1d(input_channels, output_channels, 7, padding=3)
        self.bn = nn.BatchNorm1d(output_channels)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        out = self.relu(self.bn(self.conv(x)))
        return out


class Conv3(nn.Module):
    def __init__(self, input_channels, output_channels):
        super(Conv3, self).__init__()
        self.conv = nn.Conv1d(input_channels, output_channels, 3, padding=1)
        self.bn = nn.BatchNorm1d(output_channels)
        self.relu = nn.ReLU()
        self.mp = nn.MaxPool1d(3)

    def forward(self, x):
        out = self.mp(self.relu(self.bn(self.conv(x))))
        return out


class InitConv(nn.Module):
    def __init__(self, output_channels):
        super(InitConv, self).__init__()
        self.conv = nn.Conv1d(1, output_channels, 3, stride=3, padding=1)
        self.bn = nn.BatchNorm1d(output_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.conv(x)
        out = self.relu(self.bn(out))
        return out

In [56]:
class LeeConvModule(nn.Module):
    def __init__(self, conv_channels, num_classes):
        super(LeeConvModule, self).__init__()
        
        # initial convolution
        self.init_conv = InitConv(conv_channels)

        # stack convolution
        c = conv_channels
        channels = [c, c, c, c*2, c*2, c*2, c*2, c*2, c*4]
        self.convs = nn.ModuleList([Conv3(channels[i], channels[i+1]) for i in range(len(channels)-1)])

        # Adding three Conv7 layers
        self.conv7x1_1 = Conv7(channels[-1], channels[-1])
        
        # Final classification layer
        self.classifier = nn.Linear(channels[-1], num_classes)

    def forward(self, x):
        out = self.init_conv(x)
        for layer in self.convs:
            out = layer(out)
            
#         # Pass through Conv7 layers
#         out = self.conv7x1_1(out)
                                     
        # Global average pooling before the classification layer
        out = torch.mean(out, dim=-1)

        # Apply the final classifier
        logits = self.classifier(out)
        return logits

In [57]:
class TrainLeeConvModule:
    def __init__(self, conv_channel, num_classes=50, lr=0.001, epochs=10, model_save_path='../models/model.pth', use_cuda=True):
        self.lr = lr
        self.epochs = epochs
        self.model_save_path = model_save_path
        self.use_cuda = use_cuda and torch.cuda.is_available()
        
        # Initialize the model
        self.model = LeeConvModule(conv_channel, num_classes)
        if self.use_cuda:
            self.model.cuda()

        # Loss function and optimizer
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

    def save_checkpoint(self, epoch):
        checkpoint_path = f'{self.model_save_path}_epoch_{epoch}.pth'
        torch.save(self.model.state_dict(), checkpoint_path)
        print(f'Model saved to {checkpoint_path}')

    def train(self, train_features, train_labels):
        num_batches = train_features.shape[0]
        # Iterate over batches with tqdm for progress display
        tqdm_bar = tqdm(range(self.epochs), desc=f'Training Progress', leave=True)
        for epoch in tqdm_bar:
            self.model.train()
            running_loss = 0.0

            if self.use_cuda:
                    train_features, train_labels = train_features.cuda(), train_labels.cuda()

            # Zero the parameter gradients
            self.optimizer.zero_grad()

            # Forward pass
            outputs = self.model(train_features)
            loss = self.criterion(outputs, train_labels)

            # Backward pass and optimize
            loss.backward()
            self.optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / len(train_features)
            # Update tqdm bar
            tqdm_bar.set_postfix(loss=avg_loss)

        
            # Save model checkpoint
            self.save_checkpoint(epoch+1)

            # Print average loss at the end of the epoch
            avg_loss = running_loss / num_batches
            print(f'Epoch [{epoch+1}/{self.epochs}] completed, Average Loss: {avg_loss:.4f}')

        # Save the final model
        torch.save(self.model.state_dict(), self.model_save_path)
        print(f'Final model saved to {self.model_save_path}')

In [62]:

# Initialize training
config = {
    'conv_channel':64,
    'num_classes': 50,   # As per your requirement
    'lr': 0.001,         # Learning rate
    'epochs': 10,        # Number of epochs
    'model_save_path': '../models/wave_conv_model.pth', # Path to save the model
}

trainer = TrainLeeConvModule(**config)
trainer.train(train_features, train_labels)






Training Progress:   0%|          | 0/10 [00:00<?, ?it/s][A[A[A[A


OutOfMemoryError: CUDA out of memory. Tried to allocate 60.00 MiB. GPU 0 has a total capacty of 7.79 GiB of which 21.25 MiB is free. Including non-PyTorch memory, this process has 7.76 GiB memory in use. Of the allocated memory 6.76 GiB is allocated by PyTorch, and 10.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### different architecture

In [59]:
import torch.nn.functional as F

class WaveformNet(nn.Module):
    def __init__(self, num_classes=50):
        super(WaveformNet, self).__init__()
        
        # Strided convolution to reduce dimensionality
        self.strided_conv = nn.Conv1d(1, 128, kernel_size=3, stride=3, padding=1)
        self.bn0 = nn.BatchNorm1d(128)

        # Convolutional blocks
        self.conv_blocks = nn.ModuleList()
        for i in range(9):
            out_channels = 128 * (2 if i > 0 else 1)  # Double the channels after the first block
            self.conv_blocks.append(nn.Conv1d(128, out_channels, kernel_size=3, padding=1))
            self.conv_blocks.append(nn.BatchNorm1d(out_channels))
            self.conv_blocks.append(nn.ReLU())
            self.conv_blocks.append(nn.MaxPool1d(kernel_size=3, stride=3))

        # Global max pooling
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)

        # Fully connected layers
        self.fc1 = nn.Linear(512, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        # Initial strided convolution
        x = F.relu(self.bn0(self.strided_conv(x)))

        # Convolutional blocks
        for block in self.conv_blocks:
            x = block(x)
        
        # Global max pooling
        x = self.global_max_pool(x)
        x = x.view(x.size(0), -1)  # Flatten

        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

In [73]:
from tqdm import tqdm
import torch.optim as optim

class TrainWaveformNet:
    def __init__(self, num_classes, learning_rate, epochs, model_save_path, use_cuda):
        self.epochs = epochs
        self.model_save_path = model_save_path
        self.use_cuda = use_cuda and torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")
        
        # Initialize the model
        self.model = WaveformNet(num_classes=num_classes)
        self.model.to(self.device)

        # Loss function and optimizer
        self.criterion = nn.BCEWithLogitsLoss()  # Using BCEWithLogitsLoss for stability
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def save_checkpoint(self, epoch):
        # Save a checkpoint of the model
        checkpoint_path = f'{self.model_save_path}_checkpoint_epoch_{epoch}.pth'
        torch.save({
            'epoch': epoch,
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
        }, checkpoint_path)
        print(f'Checkpoint saved: {checkpoint_path}')

    def train(self, train_features, train_labels):
        # Split the dataset into smaller chunks if necessary
        chunk_size = 8  # Determine a suitable chunk size based on your GPU memory
        num_chunks = len(train_features) // chunk_size
        
        # Progress bar setup
        pbar = tqdm(total=self.epochs * num_chunks, desc='Training', leave=True)

        for epoch in range(self.epochs):
            self.model.train()
            running_loss = 0.0
            
            for chunk in range(num_chunks):
                # Get the current chunk of data
                start_index = chunk * chunk_size
                end_index = start_index + chunk_size
                batch_features = train_features[start_index:end_index]
                batch_labels = train_labels[start_index:end_index]
                
                batch_features, batch_labels = batch_features.to(self.device), batch_labels.to(self.device)

                # Forward pass
                self.optimizer.zero_grad()
                outputs = self.model(batch_features)
                loss = self.criterion(outputs, batch_labels)
                running_loss += loss.item()

                # Backward and optimize
                loss.backward()
                self.optimizer.step()

                # Update progress bar
                pbar.set_postfix({'epoch': epoch+1, 'loss': running_loss / (chunk + 1)})
                pbar.update(1)

            # Save checkpoint after each epoch
            self.save_checkpoint(epoch)

        # Save the final model
        final_model_path = f'{self.model_save_path}_final.pth'
        torch.save(self.model.state_dict(), final_model_path)
        print(f'Final model saved to {final_model_path}')

        # Close the progress bar
        pbar.close()

In [74]:
# Parameters for the model and training
num_classes = 50
learning_rate = 0.001
epochs = 10
model_save_path = '../models/waveform_model'  # Adjust the path as needed

# Assume train_features and train_labels are already tensors with the correct shape
# Example usage:
trainer = TrainWaveformNet(num_classes, learning_rate, epochs, model_save_path, use_cuda=torch.cuda.is_available())
trainer.train(train_features, train_labels)  # train_features and train_labels should be tensors






Training:   0%|          | 0/80 [00:00<?, ?it/s][A[A[A[A

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacty of 7.79 GiB of which 5.25 MiB is free. Including non-PyTorch memory, this process has 7.78 GiB memory in use. Of the allocated memory 6.78 GiB is allocated by PyTorch, and 11.35 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [69]:
torch.cuda.empty_cache()