In [1]:
# Set path variables
import os
import sys
cwd = os.getcwd()
project_dir = os.path.abspath(os.path.join(cwd, os.pardir))
sys.path.append(project_dir)

In [2]:
import torch
from torch.utils.data import DataLoader
from src import audio_util
from src.audio_dataset import AudioDS

# Modeling (Adjust to whatever model you want to do)

## Data loading


### FOR RAW AUDIO DATA

Set transformation parameter to None

In [3]:
# File names
train_annotations = 'mtat_train_label.csv'
val_annotations = 'mtat_val_label.csv'
test_annotations = 'mtat_test_label.csv'

# Data path
from pathlib import Path
cwd = Path.cwd()
DATA_PATH = cwd.parent / 'data'

# Define global parameters across all classes
SAMPLE_RATE = 16000
DURATION_IN_SEC = 29.1

train_data = AudioDS(annotations_file=train_annotations, 
                     data_dir=DATA_PATH, 
                     target_sample_rate=SAMPLE_RATE, 
                     target_length=DURATION_IN_SEC, 
                     transformation=None)

val_data = AudioDS(annotations_file=val_annotations,
                     data_dir=DATA_PATH,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=None)

test_data = AudioDS(annotations_file=val_annotations,
                     data_dir=DATA_PATH,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=None)

In [4]:
# Load data from created datasets
BATCH_SIZE = 64

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [5]:
# Display batch information
train_features, train_labels = next(iter(train_dataloader))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([64, 1, 465600])
Labels batch shape: torch.Size([64, 50])


In [6]:
# Retrieve a sample
idx = 9
waveform = train_features[idx]
label = train_labels[idx]
decoded_labels = train_data.decode_labels(label)
file_path = train_data.get_filepath(idx)

print(f"Audio file path: {file_path}")
print(f"Label: {label}")
print(f"Decoded labels: {decoded_labels}")

Audio file path: /Users/ab/Projects/Tagging-Music-Sequences/data/mtat/0/american_bach_soloists-joseph_haydn__masses-04-quoniam_tu_solus__allegro-30-59.mp3
Label: tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=torch.float64)
Decoded labels: ['classical', 'violin', 'cello']


In [7]:
# shape of waveform
# first element: number of channels in our case 1
# second element: number of samples in 30 seconds audio at a sampling rate of 16000 samples/s 
# -> 480000 = 30s * 16000 samples/s
waveform.shape

torch.Size([1, 465600])

### FOR TRANSFORMED AUDIO DATA (mel spectrograms with db)

Set transformation parameter to MEL_SPEC_DB_TRANSFORMATION

In [8]:
# Define global parameters across all classes
SAMPLE_RATE = 16000
DURATION_IN_SEC = 29.1
MEL_SPEC_DB_TRANSFORMATION = audio_util.get_audio_transforms(SAMPLE_RATE,
                                                            n_fft=512,
                                                            hop_length=256,
                                                            n_mels=96,
                                                            top_db=80)

train_data_melspec = AudioDS(annotations_file=train_annotations, 
                     data_dir=DATA_PATH, 
                     target_sample_rate=SAMPLE_RATE, 
                     target_length=DURATION_IN_SEC, 
                     transformation=MEL_SPEC_DB_TRANSFORMATION)

val_data_melspec = AudioDS(annotations_file=val_annotations,
                     data_dir=DATA_PATH,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=MEL_SPEC_DB_TRANSFORMATION)

test_data_melspec = AudioDS(annotations_file=val_annotations,
                     data_dir=DATA_PATH,
                     target_sample_rate=SAMPLE_RATE,
                     target_length=DURATION_IN_SEC,
                     transformation=MEL_SPEC_DB_TRANSFORMATION)

In [9]:
# Load data from created datasets
BATCH_SIZE = 64

train_dataloader_melspec = DataLoader(train_data_melspec, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader_melspec = DataLoader(val_data_melspec, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader_melspec = DataLoader(test_data_melspec, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
# Display batch information
train_features, train_labels = next(iter(train_dataloader_melspec))
print(f"Feature batch shape: {train_features.size()}")
print(f"Labels batch shape: {train_labels.size()}")

Feature batch shape: torch.Size([64, 1, 96, 1819])
Labels batch shape: torch.Size([64, 50])


In [11]:
# Retrieve a sample
idx = 9
mel_spec = train_features[idx]
label = train_labels[idx]
decoded_labels = train_data_melspec.decode_labels(label)
file_path = train_data_melspec.get_filepath(idx)

print(f"Audio file path: {file_path}")
print(f"Label: {label}")
print(f"Decoded labels: {decoded_labels}")

Audio file path: /Users/ab/Projects/Tagging-Music-Sequences/data/mtat/0/american_bach_soloists-joseph_haydn__masses-04-quoniam_tu_solus__allegro-30-59.mp3
Label: tensor([1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       dtype=torch.float64)
Decoded labels: ['guitar', 'classical', 'string', 'india']


In [12]:
# [64, 1, 64, 3001]) tells you that your DataLoader is outputting batches 
# of 64 Mel spectrograms,
# each with a single channel, 
# 64 Mel frequency bins, 
# and a sequence length of 3001 time frames
mel_spec.shape

torch.Size([1, 96, 1819])