In [1]:
from sklearn.model_selection import train_test_split
from pathlib import Path
import soundfile as sf
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchaudio
import librosa
from tqdm import tqdm, trange
import pickle
import matplotlib.pyplot as plt




In [3]:
rawAudioPath = Path('../data/musicnet_lousy/train_data')
song_id = 1727

sequence, samplingRate = librosa.load(rawAudioPath / (str(song_id) + '.wav'), sr=16000)
sequence = torch.tensor(sequence).float()

In [4]:
sequence.shape[0]

7152954

In [5]:
labeling = pd.read_csv(f'../data/musicnet_lousy/train_labels/{song_id}.csv')

In [5]:
labeling

Unnamed: 0,start_time,end_time,instrument,note,start_beat,end_beat,note_value
0,9182,90078,43,53,4.000,1.500,Dotted Quarter
1,9182,33758,42,65,4.000,0.500,Eighth
2,9182,62430,1,69,4.000,1.000,Quarter
3,9182,202206,44,41,4.000,3.500,Whole
4,9182,62430,1,81,4.000,1.000,Quarter
...,...,...,...,...,...,...,...
6575,19196894,19421150,44,29,365.000,3.000,Dotted Half
6576,19226590,19233758,1,60,365.375,0.125,Thirty Second
6577,19226590,19233758,1,48,365.375,0.125,Thirty Second
6578,19233758,19421150,1,65,365.500,2.500,Dotted Half


In [6]:
# the sampling in the label files are for the 44KHz sampling - need to convert to our 16KHz

In [6]:
labeling['start_time'] = labeling['start_time'] / 44100
labeling['end_time'] = labeling['end_time'] / 44100

In [12]:
labeling

Unnamed: 0,start_time,end_time,instrument,note,start_beat,end_beat,note_value
0,0.208209,2.042585,43,53,4.000,1.500,Dotted Quarter
1,0.208209,0.765488,42,65,4.000,0.500,Eighth
2,0.208209,1.415646,1,69,4.000,1.000,Quarter
3,0.208209,4.585170,44,41,4.000,3.500,Whole
4,0.208209,1.415646,1,81,4.000,1.000,Quarter
...,...,...,...,...,...,...,...
6575,435.303719,440.388889,44,29,365.000,3.000,Dotted Half
6576,435.977098,436.139637,1,60,365.375,0.125,Thirty Second
6577,435.977098,436.139637,1,48,365.375,0.125,Thirty Second
6578,436.139637,440.388889,1,65,365.500,2.500,Dotted Half


In [8]:
n_notes = 128 # include silence as additional note
n_instruments = 128 # include silence as an instrument

Each vector in the latent representation spans a 10ms window from the original audio, ie. $0.01 \times 16000 = 160$ samples

In [9]:
window_size_ms = 10

In [10]:
downsampling = window_size_ms / 1000 * samplingRate

In [11]:
n_windows = int(np.ceil(sequence.shape[0] / downsampling))

In [12]:
target_matrix_tensor = torch.zeros(n_windows, n_notes + 1, n_instruments + 1)

In [13]:
for i in trange(n_windows):
    end = (i+1) * window_size_ms / 1000
    playing = labeling[(labeling['start_time'] <= end) & \
                       (end <= labeling['end_time'])][['note', 'instrument']].values
    notes, instruments = playing[:, 0], playing[:,1]
    if len(notes) == 0:
        # if silence, then instrument == 0 plays note == 0
        target_matrix_tensor[i, 0, 0] = 1
    else:
        target_matrix_tensor[i, notes, instruments] = 1

100%|██████████| 44706/44706 [01:10<00:00, 637.46it/s]


In [18]:
# check: the first 20 tensors should be silent, then we have 5 notes being playes:

for i in range(21):
    print(target_matrix_tensor[i].sum())

tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(1.)
tensor(5.)


Wrap the above into a function:

In [185]:
def MusicTranscripter(sequence_path,
                      labels_path,
                      path_to_save,
                      n_notes=128,
                      window_size_ms=30):
    
    '''
    Provides binary target matrices denoting all instruments that play during
    for each window_size_ms window in the latent representations
    '''

    labeling = pd.read_csv(labels_path)

    # convert time columns to seconds for simplicity
    labeling['start_time'] = labeling['start_time'] / 44100
    labeling['end_time'] = labeling['end_time'] / 44100
    
    rawAudioPath = Path(sequence_path)
    sequence, samplingRate = librosa.load(sequence_path, sr=16000)
    downsampling = window_size_ms / 1000 * samplingRate
    n_windows = int(np.ceil(sequence.shape[0] / downsampling)) # allow for aggregating windows
    transcription = torch.zeros(n_windows, n_notes + 1)
    

    for i in range(n_windows):
        
        start= (i) * window_size_ms / 1000
        end = (i+1) * window_size_ms / 1000
    
        notes = labeling[(labeling['start_time'] <= end) & \
                       (start<= labeling['start_time'])].note.values

        if len(notes) == 0:
            # if silence, then instrument == 0 plays note == 0
            transcription[i, 0] = 1
        else:
            transcription[i, notes] = 1
    track_id = sequence_path.split('.')[-2].split('/')[-1]
    torch.save(transcription, f'{path_to_save}/label_{track_id}.pt')

# Exploring data

In [186]:
from time import time
# longest track on training data
track_id=2315
sequence_path = f"../data/musicnet_lousy/train_data/{track_id}.wav" 
labels_path = f"../data/musicnet_lousy/train_labels/{track_id}.csv"

t1= time()
label_transcription =MusicTranscripter(sequence_path,
                      labels_path,
                      "../data/musicnet_lousy",
                      n_notes=128,
                      window_size_ms=30)
print(f"elapsed time creating transcription label: {time()-t1}")


elapsed time creating transcription label: 24.079020738601685


In [178]:
sequence_path.split('.')[-2].split('/')[-1]

'2315'

In [154]:
t1=time()
torch.save(label_transcription, f'label_t_{track_id}.pt')
print(f"elapsed time saving label: {time()-t1}")

elapsed time saving label: 0.038186073303222656


In [168]:
t1=time()
loaded_label = torch.load('label_t_2315.pt', map_location=torch.device('cpu'))
print(f"elapsed time loading label: {time()-t1}")

elapsed time loading label: 0.01041555404663086


# Creating labels

In [230]:
from tqdm.notebook import tqdm
import os
def create_transcription_labels(path_metadata, path_to_labels, path_to_seqs, path_to_save, windows_size=30, select_ensemble=None):
    '''
    Creates transcription labels per track given a window size.

    path_metadata : Path to csv metadata file
    path_to_labels: Path to labels directory 
    path_to_seqs  : Path to wav files directory
    path_to_save  : Path to directory where labels will be stored
    window_size   : time window size for which the labels will be created
    select_enseble: name of the ensemble to filter by, by default None, i.e. all ensembles are considered 
    '''
    metadata = pd.read_csv(path_metadata) 
    
    if not os.path.isdir(path_to_save):
        os.mkdir(path_to_save)

    if select_ensemble is not None:
        metadata = metadata[metadata['ensemble'] == select_ensemble]
    track_ids = metadata.id.values
    
    list(map(lambda x: MusicTranscripter(f"{path_to_seqs}/{x}.wav", f"{path_to_labels}/{x}.csv", path_to_save, window_size_ms= windows_size),
     tqdm(track_ids)))

  0%|          | 0/153 [02:03<?, ?it/s]


In [232]:
path_metadata = "../data/musicnet_metadata_train.csv"
path_to_labels= "../data/musicnet_lousy/train_labels"
path_to_seqs= "../data/musicnet_lousy/train_data"
path_to_save= "../data/musicnet_lousy/train_transcription_labels"

create_transcription_labels(path_metadata, path_to_labels, path_to_seqs, path_to_save, windows_size=300, select_ensemble='Solo Piano')

  0%|          | 0/153 [00:00<?, ?it/s]