This notebook explains how we build a validation set from the MIREX dataset in the following steps:
1. Clean the MIREX dataset
2. Data Augmentation
3. Query and candidate

# Clean MIREX Dataset


1. Make sure the MIDI is aligned with the audio, using the `change_tempo` function from `./src/utils.py` to correct the tempo where needed. The following changes have been made:
    * Bach monophonic (`bachBWV889Fg`) was corrected, from 120 BPM to 84 BPM.
    * Beethoven monophonic was corrected, from 120 BPM to 192 BPM.
    

2. Integrate the metadata from the .krn file and the audio in order to make sure the MIDI is aligned with the audio.
    * In practice, step 1 is done after step 2, for the two pieces whose `midi end/s` and `audio end/s` diverge.


3. Get all repeated patterns from the dataset. The timestamps of the patterns are first calculated in beats, then converted to seconds given the BPM. 

In [1]:
import os
import numpy as np
import pandas as pd

# Root of the JKUPDD ground-truth data; the cells below read files from
# `{piece}/{version}/midi` and `{piece}/{version}/audio` underneath it.
data_dir = "./data/JKUPDD-Aug2013/groundTruth/"

## Correct MIDI Tempo

In [3]:
import pretty_midi
from src.utils import change_tempo

# Rewrite the MIDI files whose tempo does not match the audio recording.
#
# NOTE: every corrected file overwrites its source in place, so this cell is not
# idempotent -- presumably a second run would rescale timing that has already
# been corrected. Run it once on a pristine copy of the dataset.
tempo_corrections = [
    # (MIDI file, tempo assumed for the stored file, tempo matching the audio)
    ("./data/JKUPDD-Aug2013/groundTruth/beethovenOp2No1Mvt3/monophonic/midi/sonata01-3.mid", 120, 192),
    # 84 BPM is the corrected Bach tempo: it is the value given in the notes at
    # the top of the notebook and in the metadata table (beat 21 <-> 15 s).
    ("./data/JKUPDD-Aug2013/groundTruth/bachBWV889Fg/monophonic/midi/wtc2f20.mid", 120, 84),
]

for fname, orig_tempo, new_tempo in tempo_corrections:
    pm = pretty_midi.PrettyMIDI(fname)
    corrected_pm = change_tempo(pm, orig_tempo=orig_tempo, new_tempo=new_tempo)
    corrected_pm.write(fname)

## Metadata

The metadata contains the bpm of each piece and the onset of the first note in midi, audio and beat. 

In [2]:
from src.clean_mirex import integrate_metadata

# Collect the metadata of every piece from its .krn, MIDI and audio files
meta_df = integrate_metadata(data_dir)

# bpm values fixed by hand, based on the beat count and the audio duration
bpm_overrides = {
    ("beethovenOp2No1Mvt3", "monophonic"): 192,
    ("gibbonsSilverSwan1612", "monophonic"): 150,
}
for (piece_name, version_name), bpm_value in bpm_overrides.items():
    is_target = (meta_df["piece"] == piece_name) & (meta_df["version"] == version_name)
    meta_df.loc[is_target, "bpm"] = bpm_value

# Export disabled; later cells read this file back with pd.read_csv:
# meta_df.to_csv("./metadata/mirex.csv")
meta_df

Unnamed: 0,piece,version,bpm,midi start/s,midi end/s,audio start/s,audio end/s,beat start,beat end
0,beethovenOp2No1Mvt3,monophonic,192,0.0,174.375,0.622721,175.937506,-1.0,556.0
1,chopinOp24No4,monophonic,138,0.0,243.47792,0.867302,245.217415,-1.0,558.0
2,mozartK282Mvt2,monophonic,120,0.0,264.5,0.997778,267.0,-1.0,527.0
3,gibbonsSilverSwan1612,monophonic,150,0.4,168.4,0.397778,169.6,1.0,420.0
4,bachBWV889Fg,monophonic,84,14.999985,240.714045,14.997732,242.142857,21.0,336.0
5,beethovenOp2No1Mvt3,polyphonic,192,0.0,174.375,0.622721,175.937506,-1.0,556.0
6,chopinOp24No4,polyphonic,138,0.0,243.47792,0.867302,245.217415,-1.0,558.0
7,mozartK282Mvt2,polyphonic,120,0.0,264.5,0.997778,267.0,-1.0,527.0
8,gibbonsSilverSwan1612,polyphonic,54,1.111111,94.444435,1.108889,97.777778,1.0,84.0
9,bachBWV889Fg,polyphonic,84,0.5,80.713636,0.712063,82.142857,1.0,112.0


## Get Start End Time of Patterns

In [4]:
from src.clean_mirex import get_pattern_info

# Gather the start/ending beats of all patterns from the files under data_dir.
# The cells below merge the result with meta_df on `piece`/`version` and read its
# `pattern`, `start/beat` and `end/beat` columns.
pattern_df = get_pattern_info(data_dir)
# Optional export (disabled): pattern_df.to_csv("./data/mirex/pattern_all.csv", index=False)

### Select patterns with proper duration

In [3]:
# Attach the per-piece metadata (bpm, first beat, audio onset) to every pattern row.
# Re-running this cell merges the metadata a second time; rebuild pattern_df first.
pattern_df = pattern_df.merge(meta_df, on=['piece', 'version'])

# Beat positions -> seconds in the audio recording
for beat_col, sec_col in (('start/beat', 'start/s'), ('end/beat', 'end/s')):
    beats_from_first = pattern_df[beat_col] - pattern_df['beat start']
    pattern_df[sec_col] = beats_from_first*60/pattern_df['bpm'] + pattern_df['audio start/s']

# Drop the segments shorter than t_thresh seconds
t_thresh = 7.5

pattern_df['duration/s'] = (pattern_df['end/s'] - pattern_df['start/s']).round(2)
long_enough = pattern_df['duration/s'] >= t_thresh
pattern_df = pattern_df[long_enough].reset_index(drop=True)

In order to avoid the same pattern appearing under different names, we reject the sectional patterns that contain other patterns. This was done manually. The following patterns have been rejected:

1. mozartK282Mvt2_polyphonic: K
2. beethovenOp2No1Mvt3_monophonic: I
3. beethovenOp2No1Mvt3_polyphonic: I
4. chopinOp24No4_monophonic: C
5. chopinOp24No4_polyphonic: C, E

In [4]:
# Sectional patterns to drop, listed per "<piece>_<version>"
rejected_patterns = {
    'mozartK282Mvt2_polyphonic': ['K'],
    'beethovenOp2No1Mvt3_monophonic': ['I'],
    'beethovenOp2No1Mvt3_polyphonic': ['I'],
    'chopinOp24No4_monophonic': ['C'],
    'chopinOp24No4_polyphonic': ['C', 'E'],
}
to_reject = [f"{prefix}_{name}" for prefix, names in rejected_patterns.items() for name in names]

# "<piece>_<version>_<pattern>" identifies a pattern across the whole dataset
pattern_df['title'] = pattern_df['piece'].str.cat(pattern_df[['version', 'pattern']], sep='_')
is_rejected = pattern_df['title'].isin(to_reject)
pattern_df = pattern_df[~is_rejected].reset_index(drop=True)

In [12]:
# Turn every pattern occurrence into one or more segments of exactly t_frame seconds
t_frame = 10  # segment length in seconds
t_hop = 5     # offset in seconds between consecutive segments cut from a long pattern

# Pad the segments < t_frame
to_pad = pattern_df[pattern_df['duration/s'] < t_frame].copy()
to_pad.reset_index(drop=True, inplace=True)

# Enough audio after the pattern: extend the end of the segment ...
fits_forward = (to_pad['start/s'] + t_frame) <= to_pad['audio end/s']
to_pad.loc[fits_forward, 'end/s'] = to_pad.loc[fits_forward, 'start/s'] + t_frame

# ... otherwise move the start backwards instead
runs_past_audio = (to_pad['start/s'] + t_frame) > to_pad['audio end/s']
to_pad.loc[runs_past_audio, 'start/s'] = to_pad.loc[runs_past_audio, 'end/s'] - t_frame


# Trim the segments between t_frame and t_frame + t_hop
to_trim = pattern_df[(pattern_df['duration/s'] >= t_frame) & (pattern_df['duration/s'] < (t_frame + t_hop))].copy()
to_trim.reset_index(drop=True, inplace=True)
to_trim['end/s'] = to_trim['start/s'] + t_frame


# Cut the segments with duration >= t_frame + t_hop into several overlapping segments
to_expand = pattern_df[pattern_df['duration/s'] >= (t_frame + t_hop)].copy()
to_expand.reset_index(drop=True, inplace=True)

# Rows are collected in a list and turned into a frame once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
expanded_records = []
for _, row in to_expand.iterrows():

    # Segment i covers [start + i*t_hop, start + i*t_hop + t_frame], so the number of
    # segments that fit inside the pattern depends on t_hop. Dividing by t_frame here
    # left patterns shorter than 2*t_frame with a single segment.
    n = int((row['duration/s'] - t_frame)/t_hop) + 1
    t_st = row['start/s']

    for i in range(n):
        record = row.to_dict()
        record['pattern'] = f"{row['pattern']}_{i}"
        record['start/s'] = t_st + i * t_hop
        record['end/s'] = record['start/s'] + t_frame
        expanded_records.append(record)

expanded_df = pd.DataFrame(expanded_records, columns=to_expand.columns)

df = pd.concat([to_pad, to_trim, expanded_df])
df.reset_index(drop=True, inplace=True)

In [13]:
# Recompute the beat positions of the segments from their new timestamps
for sec_col, beat_col in (('start/s', 'start/beat'), ('end/s', 'end/beat')):
    secs_from_audio_start = df[sec_col] - df['audio start/s']
    df[beat_col] = secs_from_audio_start*df['bpm']/60 + df['beat start']

# Order the segments, keep the columns needed downstream and save them
sort_keys = ['piece', 'version', 'start/beat', 'pattern']
saved_columns = ['piece', 'version', 'pattern', 'start/s', 'end/s']
df = df.sort_values(by=sort_keys).reset_index(drop=True)[saved_columns]
df.to_csv("./data/mirex/pattern.csv", index=False)

# Data Augmentation

For each selected segment, 4 transformations were generated with a different tempo or key. Each pattern has at least 2 segments, which results in at least 8 variants of the pattern in addition to its original form.

In [3]:
# Generate transformations
from glob import glob

import librosa
import numpy as np
import pretty_midi
import pandas as pd

from src.periodicity import get_onset_env
from src.utils import change_tempo, change_key

pattern_df = pd.read_csv("./data/mirex/pattern.csv")
meta_df = pd.read_csv("./metadata/mirex.csv")

# One row per (piece, version) that still has at least one selected segment
pieces = pattern_df.drop_duplicates(subset=['piece', 'version'])
pieces = pieces.merge(meta_df, on=['piece', 'version'])[['piece', 'version', 'bpm']]


sr = 22050
tempo_shifts = [-10, +10]  # added to the bpm of the piece
key_shifts = [-3, +3]      # same values as the key_shift used when the query/candidate set is built

# Onset envelopes keyed by "{piece}_{version}_{bpm}_{bpm shift}_{key shift}"
onset_env_dict = {}

for _, row in pieces.iterrows():

    p, v, orig_tempo = row['piece'], row['version'], row['bpm']

    fname = glob(f"./data/JKUPDD-Aug2013/groundTruth/{p}/{v}/midi/*.mid")[0]
    pm = pretty_midi.PrettyMIDI(fname)

    # from original audio
    audio_fname = glob(f"./data/JKUPDD-Aug2013/groundTruth/{p}/{v}/audio/*.wav")[0]
    y, _ = librosa.load(audio_fname, sr=sr)
    onset_env_dict[f"{p}_{v}_{int(orig_tempo)}_0_0"] = get_onset_env(y)


    # Synthesize audio from midi with shifted tempo
    for tempo_shift in tempo_shifts:
        new_tempo = orig_tempo + tempo_shift
        new_pm = change_tempo(pm, orig_tempo, new_tempo)
        y = new_pm.fluidsynth(fs=float(sr))
        onset_env_dict[f"{p}_{v}_{int(new_tempo)}_{tempo_shift}_0"] = get_onset_env(y)

    # Synthesize audio from midi with shifted key. The tempo is unchanged, so the
    # entry is stored under the original bpm with a bpm shift of 0; reusing the
    # tempo-shift key here would overwrite the last tempo variant.
    for key_shift in key_shifts:
        new_pm = change_key(pm, key_shift=key_shift)
        y = new_pm.fluidsynth(fs=float(sr))
        onset_env_dict[f"{p}_{v}_{int(orig_tempo)}_0_{key_shift}"] = get_onset_env(y)

# The envelopes computed above are only persisted when the next line is enabled;
# as written, the cell continues with the archive already on disk.
# np.savez("./data/mirex/onset_env", **onset_env_dict, allow_pickle=True)
onset_env_dict = dict(np.load("./data/mirex/onset_env.npz", allow_pickle=True))

# Build Query and Candidate Set

In [1]:
import numpy as np
import pandas as pd

# Update pattern timestamps with shifted tempo
def update_df_with_new_tempo(orig_df, tempo_shift=10, t_frame=10):
    """Re-time the segments of `orig_df` for a rendition played `tempo_shift` bpm faster.

    `orig_df` is left untouched. In the returned copy:
      * `bpm` is increased by `tempo_shift`, which is also stored in `bpm_shift`;
      * `midi start/s` is scaled by the ratio old bpm / new bpm;
      * `start/s` becomes the scaled offset from `audio start/s`, counted from the
        new `midi start/s`, and `end/s` is `start/s + t_frame`;
      * `audio end/s` is recomputed from the beat span at the new bpm.

    Rows whose `end/s` lies after the recomputed `audio end/s` are dropped; the
    remaining rows keep their original index labels.
    """
    shifted = orig_df.copy()
    shifted['bpm_shift'] = tempo_shift
    shifted['bpm'] = shifted['bpm'] + tempo_shift

    # < 1 when the tempo goes up: every duration shrinks by this factor
    time_ratio = orig_df['bpm']/shifted['bpm']

    shifted['midi start/s'] = shifted['midi start/s']*time_ratio
    offset_in_audio = shifted['start/s'] - shifted['audio start/s']
    shifted['start/s'] = offset_in_audio*time_ratio + shifted['midi start/s']
    shifted['end/s'] = shifted['start/s'] + t_frame

    n_beats = shifted['beat end'] - shifted['beat start']
    shifted['audio end/s'] = n_beats*60/shifted['bpm'] + shifted['midi start/s']

    fits_in_audio = shifted['end/s'] <= shifted['audio end/s']
    return shifted[fits_in_audio]

def update_df_with_new_key(orig_df, key_shift=3, t_frame=10):
    """Re-time the segments of `orig_df` for a transposed rendition.

    Returns a copy of `orig_df` in which `key_shift` is stored in a column of the
    same name, `start/s` is moved from the audio time line (`audio start/s`) to the
    MIDI time line (`midi start/s`), and `end/s` is `start/s + t_frame`. No row is
    removed and `orig_df` is left untouched.
    """
    transposed = orig_df.copy()
    transposed['key_shift'] = key_shift

    offset_in_audio = transposed['start/s'] - transposed['audio start/s']
    transposed['start/s'] = offset_in_audio + transposed['midi start/s']
    transposed['end/s'] = transposed['start/s'] + t_frame
    return transposed

In [2]:
# Segment length in seconds, shared by all variants
t_frame = 10

# Saved segments, each with the metadata of its piece attached
meta_df = pd.read_csv("./metadata/mirex.csv")
pattern_df = pd.read_csv("./data/mirex/pattern.csv").merge(meta_df, on=['piece', 'version'])

# Timestamps of the same segments in the four transformed renditions
fast_df = update_df_with_new_tempo(pattern_df, tempo_shift=10, t_frame=t_frame)
slow_df = update_df_with_new_tempo(pattern_df, tempo_shift=-10, t_frame=t_frame)
high_df = update_df_with_new_key(pattern_df, key_shift=3, t_frame=t_frame)
low_df = update_df_with_new_key(pattern_df, key_shift=-3, t_frame=t_frame)

variants = [pattern_df, fast_df, slow_df, high_df, low_df]
kept_columns = ['piece', 'version', 'bpm',  'bpm_shift', 'key_shift', 'pattern', 'start/s', 'end/s']

# A variant that lacks one of the shift columns gets NaN there after the concat;
# NaN stands for "no shift", hence the replacement by 0 before the integer cast.
total_df = pd.concat(variants).reset_index(drop=True)
total_df = total_df[kept_columns].replace(to_replace=np.nan, value=0)
total_df = total_df.astype({'bpm_shift': int, 'key_shift': int})

### Split Query and Candidate

In [6]:
import random

# 'test' rows form the query set, 'train' rows the candidate set.
# Each pattern is expected to have at least 2 segments in its original form:
# one is drawn as candidate, another one as query.

# Dedicated generator with a fixed seed, so that the split is reproducible
rng = random.Random(0)
n_augmented = 4  # transformed segments added to the candidates for every pattern

group_keys = ['piece', 'version', 'pattern']
orig_df = total_df.loc[(total_df['bpm_shift'] == 0) & (total_df['key_shift'] == 0)]
augmented_df = total_df.loc[(total_df['bpm_shift'] != 0) | (total_df['key_shift'] != 0)].copy()

# Row labels of the transformed segments of every pattern
augmented_idx = {key: group.index.to_list() for key, group in augmented_df.groupby(by=group_keys)}

train_idx = []
test_idx = []
skipped = []

for key, group in orig_df.groupby(by=group_keys):
    # A pattern with a single original segment cannot give both a query and a
    # candidate; random.sample would raise ValueError on it.
    if len(group) < 2:
        skipped.append(key)
        continue

    selected_idx = rng.sample(group.index.to_list(), k=2)
    train_idx.append(selected_idx[0])
    test_idx.append(selected_idx[1])

    # Randomly sample up to n_augmented variations as the rest of the candidates.
    # Tempo variants that run past the end of the audio were dropped earlier, so a
    # pattern can have fewer variations than requested.
    pool = augmented_idx.get(key, [])
    train_idx += rng.sample(pool, k=min(n_augmented, len(pool)))

if skipped:
    print(f"Skipped {len(skipped)} pattern(s) with fewer than 2 original segments: {skipped}")

In [9]:
# Label the selected rows; 'test' is written last, so it wins if a row is in both lists
df = total_df.copy()
for split_name, row_labels in (('train', train_idx), ('test', test_idx)):
    df.loc[row_labels, 'split'] = split_name

# Rows left without a label (NaN in 'split') belong to neither set
df = df.dropna().reset_index(drop=True)

df.to_csv("./data/mirex/pattern_split.csv", index=False)

### Calculate Feature for Query and Candidate set

In [1]:
import librosa
import numpy as np
import pandas as pd

from src.periodicity import periodicity_spectrum

sr = 22050       # sampling rate (Hz)
hop_size = 512   # samples between two onset-envelope frames
t_frame = 10     # segment length (s)
# Number of envelope frames in a segment of t_frame seconds (rounded up)
frame_len = int((t_frame * sr-1)/hop_size) + 1

df = pd.read_csv("./data/mirex/pattern_split.csv")
onset_env_dict = dict(np.load("./data/mirex/onset_env.npz", allow_pickle=True))
df['start frame'] = (df['start/s']/(hop_size/sr)).astype(int)

spectra = []
for _, row in df.iterrows():
    # The envelopes are stored under "{piece}_{version}_{bpm}_{bpm shift}_{key shift}",
    # so the bpm must be part of the lookup key; without it every lookup fails with
    # a KeyError.
    title = (f"{row['piece']}_{row['version']}_{int(row['bpm'])}"
             f"_{int(row['bpm_shift'])}_{int(row['key_shift'])}")
    start_frame = int(row['start frame'])
    onset_env = onset_env_dict[title][start_frame: start_frame + frame_len]
    spectrum = periodicity_spectrum(onset_env=onset_env)
    spectra.append(spectrum)

spectra = np.array(spectra)
np.save("./data/mirex/periodicity_spectra_pattern.npy", spectra)

In [2]:
# Candidate set: cut every onset envelope into consecutive, non-overlapping
# windows of frame_len frames and compute one periodicity spectrum per window
spectra_ = []
# mirex_label = []

for title, onset_env in onset_env_dict.items():
    # Not an envelope: presumably the `allow_pickle=True` argument of the np.savez
    # call that wrote the archive, stored as an entry of its own
    if title == 'allow_pickle':
        continue

    # Count only the complete windows. Deriving the count from the duration and a
    # hard-coded 10 s (instead of t_frame) could include a truncated last window,
    # because frame_len frames last slightly longer than t_frame seconds.
    n = len(onset_env) // frame_len

    for i in range(n):
        start_idx = i*frame_len
        end_idx = start_idx + frame_len
        spectra_.append(periodicity_spectrum(onset_env=onset_env[start_idx:end_idx]))

#     mirex_label += [title for _ in range(n)]
spectra_ = np.array(spectra_)
np.save("./data/mirex/periodicity_spectra.npy", spectra_)