In [2]:
import numpy as np
import math
import matplotlib.pylab as plt
import librosa
import librosa.display
import pandas as pd
import midi
import glob

%matplotlib inline

In [3]:
def parse_midi(fname, instrument_id, n_voices=4):
    """Extract the note sequence of one voice from a multi-track MIDI file.

    Track 0 of the file is read for the tempo; tracks ``1 .. n_voices`` are
    read as one monophonic voice each.

    Parameters
    ----------
    fname : str
        Path of the MIDI file, passed to ``midi.read_midifile``.
    instrument_id : int
        Zero-based voice index (track ``instrument_id + 1``) to extract.
    n_voices : int, optional
        Number of voice tracks following the tempo track (default 4).

    Returns
    -------
    pandas.DataFrame
        One row per change of the selected voice, with integer column labels:
        0 = cumulative tick, 1 = tick / ``pattern.resolution``,
        2 = time in milliseconds (computed from the first tempo event only,
        later tempo changes are ignored), 3 = sounding pitch, ``0`` meaning
        the voice is silent.  An empty frame with the same four columns is
        returned when the voice tracks contain no note events.
    """
    pattern = midi.read_midifile(fname)

    # Only the first tempo event is used.  A file without any tempo event
    # plays at the MIDI default of 120 BPM instead of raising IndexError.
    tempo_events = [e for e in pattern[0] if isinstance(e, midi.SetTempoEvent)]
    bpm = tempo_events[0].bpm if tempo_events else 120.0

    note_kinds = {
        midi.NoteOnEvent: 1,
        midi.NoteOffEvent: -1,
    }

    # Flatten all voices into (absolute_tick, voice, kind, pitch) tuples.
    events = []
    for track_idx in range(1, n_voices + 1):
        track = pattern[track_idx]
        # Event ticks are deltas; accumulate them into absolute ticks.
        abs_ticks = np.cumsum([e.tick for e in track], dtype=np.int32)
        for tick, event in zip(abs_ticks, track):
            kind = note_kinds.get(event.__class__)
            if kind is None:
                continue
            # A note-on with velocity 0 is the MIDI convention for note-off.
            if kind == 1 and getattr(event, 'velocity', 1) == 0:
                kind = -1
            events.append((tick, track_idx - 1, kind, event.pitch))

    if not events:
        return pd.DataFrame(columns=range(4))

    # At equal ticks, note-offs (-1) sort before note-ons (1), so a note that
    # starts exactly when the previous one ends is not cleared again.
    events.sort(key=lambda ev: (ev[0], ev[2]))

    # Replay the events, snapshotting the sounding pitches of all voices
    # each time the tick advances.
    chord = [0] * n_voices
    chords = []
    prev_tick = None
    for tick, voice, kind, pitch in events:
        if prev_tick is not None and tick != prev_tick:
            chords.append((prev_tick, tuple(chord)))
        chord[voice] = pitch if kind == 1 else 0
        prev_tick = tick
    chords.append((prev_tick, tuple(chord)))

    # Keep only the snapshots where the selected voice actually changes.
    score = []
    for tick, snapshot in chords:
        pitch = snapshot[instrument_id]
        if score and score[-1][-1] == pitch:
            continue
        score.append((
            tick,
            tick / pattern.resolution,
            tick * 60000.0 / bpm / pattern.resolution,
            pitch,
        ))

    return pd.DataFrame(score)

def wav_features(fname, sr=44100, n_fft=1024, hop_length=441, offset=0.023, n_mfcc=5):
    """Compute frame-wise audio features for one recording.

    Parameters
    ----------
    fname : str
        Path of the audio file, passed to ``librosa.load``.
    sr : int, optional
        Sampling rate the audio is loaded at (default 44100).
    n_fft : int, optional
        Analysis window length in samples (default 1024).
    hop_length : int, optional
        Hop between frames in samples (default 441, i.e. 10 ms at 44100 Hz).
    offset : float, optional
        Seconds skipped at the start of the file (default 0.023).
    n_mfcc : int, optional
        Number of MFCC coefficients (default 5).

    Returns
    -------
    numpy.ndarray
        One row per frame; columns are the stacked outputs of RMS energy,
        spectral centroid, spectral bandwidth and the MFCCs, in that order.
    """
    r, _ = librosa.load(fname, sr=sr, offset=offset)

    # Newer librosa releases expose the RMS feature as `rms`; older ones only
    # have `rmse`.  Use whichever this installation provides.
    rms = getattr(librosa.feature, 'rms', None) or librosa.feature.rmse

    # The signal is passed as `y=` because newer librosa releases no longer
    # accept it positionally; the keyword form works on old releases too.
    features = np.vstack([
        rms(y=r, frame_length=n_fft, hop_length=hop_length),
        librosa.feature.spectral_centroid(y=r, sr=sr, n_fft=n_fft, hop_length=hop_length),
        librosa.feature.spectral_bandwidth(y=r, sr=sr, n_fft=n_fft, hop_length=hop_length),
        librosa.feature.mfcc(y=r, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length),
    ]).T
    return features

In [4]:
# Voice index of each instrument; the value is passed to parse_midi() as the
# voice to extract.  The keys are interpolated verbatim into the wav file
# names below, so their spelling must match the files on disk.
instruments = {
    'violin': 0,
    'clarinet': 1,
    'saxphone': 2,
    'bassoon': 3,
}

instrument = 'clarinet'

# Keep every FRAME_STRIDE-th alignment row / feature frame.
FRAME_STRIDE = 4

# One directory per piece; each holds <piece>.mid, <piece>-<instrument>.wav
# and <piece>.asl, where <piece> is the directory's own name.
dirs = sorted(glob.glob('./data/Bach10/??-*'))
piece_names = [d.split('/')[-1] for d in dirs]
mid_files = ['%s/%s.mid' % (d, name) for d, name in zip(dirs, piece_names)]
wav_files = ['%s/%s-%s.wav' % (d, name, instrument) for d, name in zip(dirs, piece_names)]
asl_files = ['%s/%s.asl' % (d, name) for d, name in zip(dirs, piece_names)]

alignments = []
features = []
scores = []
# Iterate over every piece that was found rather than a fixed count of 10.
for i in range(len(dirs)):
    alignment = pd.read_csv(asl_files[i], sep='\t', header=None, index_col=None)
    score = parse_midi(mid_files[i], instruments[instrument])
    feats = wav_features(wav_files[i])

    # Alignment rows and feature frames are paired by position (presumably
    # one row per audio frame -- TODO confirm); trim both to the common
    # length so the boolean mask below always fits.
    n_frames = min(alignment.shape[0], feats.shape[0])
    alignment = alignment.iloc[:n_frames]
    feats = feats[:n_frames]

    # Keep only the frames whose alignment column 2 is positive.
    ids = alignment[2] > 0.0
    # .copy() so the in-place shift below acts on an independent frame and
    # does not trigger SettingWithCopyWarning.
    alignment = alignment[ids].copy()
    feats = feats[ids.values]

    # Shift column 2 down by the integer part of its smallest value, so the
    # first kept position keeps only its fractional part.
    first_pos = alignment[2].min()
    alignment[2] -= first_pos - np.modf(first_pos)[0]

    alignments.append(alignment.iloc[::FRAME_STRIDE])
    features.append(feats[::FRAME_STRIDE])

    # Make score column 1 start at zero.
    score[1] -= score[1][0]
    scores.append(score)

In [5]:
def grountruth_matrix(alignment, s):
    """Build a frame-by-segment indicator matrix.

    Parameters
    ----------
    alignment : pandas.DataFrame
        Frame table; the column labelled ``2`` holds each frame's position.
    s : array-like
        Segment boundaries.  Consecutive pairs ``(s[j], s[j + 1])`` define
        the half-open intervals ``[s[j], s[j + 1])``.

    Returns
    -------
    numpy.ndarray
        ``int32`` array of shape ``(n_frames, len(s) - 1)`` whose entry
        ``[i, j]`` is 1 when frame ``i`` lies in interval ``j`` and 0
        otherwise.  A frame outside every interval yields an all-zero row.
    """
    # Convert to plain numpy first: multi-dimensional indexing such as
    # `series[:, None]` is not supported by current pandas.
    b = np.asarray(alignment[2])
    bounds = np.asarray(s)
    Y = ((b[:, None] >= bounds[None, :-1]) & (b[:, None] < bounds[None, 1:])).astype(np.int32)
    return Y

def prepare(scores, alignments):
    """Turn scores and alignments into integer-labelled training targets.

    Parameters
    ----------
    scores : list of pandas.DataFrame
        One frame per piece; column ``1`` holds the row onsets and column
        ``3`` the value sounding from that onset.  Rows are assumed to be in
        onset order with a default 0..n-1 index.
    alignments : list of pandas.DataFrame
        One frame per piece, parallel to ``scores``; the third column holds
        each frame's position, assumed non-decreasing.

    Returns
    -------
    GTs : list of numpy.ndarray
        ``grountruth_matrix(alignment, score[1])`` for each piece.
    Bs : list of numpy.ndarray
        ``int32`` label of every score row except the last, per piece.
    SYs : list of numpy.ndarray
        ``int32`` label per alignment frame: the label of the last score row
        whose onset is not after the frame's position.
    K : int
        Number of distinct values in column ``3`` over all scores.  Labels
        are ``0 .. K - 1``, numbered in the set's iteration order.
    """
    all_keys = [v for score in scores for v in score[3].tolist()]
    S = set(all_keys)
    K = len(S)
    key_to_id = {k: i for i, k in enumerate(S)}

    GTs = []
    Bs = []
    SYs = []

    for k in range(len(scores)):
        score = scores[k]
        alignment = alignments[k]

        GTs.append(grountruth_matrix(alignment, score[1]))

        keys = score[3].tolist()
        Bs.append(np.array([key_to_id[v] for v in keys[:-1]], dtype=np.int32))

        onsets = score[1].values
        positions = alignment.iloc[:, 2].values
        last_row = len(onsets) - 1

        SY = np.zeros(positions.shape[0], dtype=np.int32)
        j = 0
        for i, pos in enumerate(positions):
            # Advance to the score row containing this frame.  The bound on
            # j keeps frames at or after the final onset on the last row
            # instead of indexing past the end of the score.
            while j < last_row and onsets[j + 1] <= pos:
                j += 1
            SY[i] = key_to_id[keys[j]]
        SYs.append(SY)

    return GTs, Bs, SYs, K
    

In [6]:
# Build, for every piece, the ground-truth matrix (GTs), the score label
# sequence (Bs) and the per-frame labels (SYs); K is the number of labels.
GTs, Bs, SYs, K = prepare(scores, alignments)

In [7]:
# Number of distinct values found in column 3 of all scores.
K

18

In [8]:
# Piece 0: largest alignment position vs. largest score onset, ignoring the
# final score row.
np.max(alignments[0][2]), np.max(scores[0][1][:-1])

(31.948999999999998, 31.0)

In [9]:
# Piece 0: (number of kept alignment frames, number of score rows - 1).
GTs[0].shape

(601, 26)

In [14]:
# Piece 0: label assigned to each alignment frame.
SYs[0]

array([ 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 16, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 16, 16, 16, 16, 16, 16, 16,  0,  0,  0,  0,  0,  0,  0,
       16, 16, 16, 16, 16, 16, 16, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
       14, 14, 14, 14, 14, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
       16, 16, 16, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  3

In [15]:
# Piece 0: label of each score row except the last.
Bs[0]

array([ 3,  2, 16,  0, 16, 14, 16,  0,  3,  2,  3,  2,  3,  2, 16,  3,  1,
        3,  2,  3, 16, 14,  5,  3,  1,  0], dtype=int32)