In [None]:
import glob
import numpy as np
import pandas as pd
import sklearn

from sklearn.ensemble import RandomForestClassifier

from project_utils import *

%matplotlib inline

## Загрузка данных

In [None]:
# Instrument name -> index; the index is what gets passed to parse_midi below.
instruments = {name: idx for idx, name in enumerate([
    'violin',
    'clarinet',
    # NOTE(review): this spelling is used verbatim to build the .wav file names
    # below, so it presumably matches the dataset on disk -- confirm before "fixing".
    'saxphone',
    'bassoon',
])}

# The single instrument whose audio track and MIDI part are loaded.
instrument = 'bassoon'

# One directory per piece; sorting keeps the piece order stable between runs.
dirs = sorted(glob.glob('./data/Bach10/??-*'))
mid_files = ['{}/{}.mid'.format(x, x.split('/')[-1]) for x in dirs]
wav_files = ['{}/{}-{}.wav'.format(x, x.split('/')[-1], instrument) for x in dirs]
asl_files = ['{}/{}.asl'.format(x, x.split('/')[-1]) for x in dirs]

alignments = []
features = []
scores = []
# The notebook expects exactly 10 pieces: later cells split them 5 / 5.
for i in range(10):
    alignment = pd.read_csv(asl_files[i], sep='\t', header=None, index_col=None)
    score = parse_midi(mid_files[i], instruments[instrument])
    feats = wav_features(wav_files[i])

    # Trim the feature rows to the number of alignment rows so both can be
    # filtered with the same boolean mask.
    feats = feats[:alignment.shape[0]]

    # Keep only the rows whose alignment column 2 is strictly positive.
    ids = alignment[2] > 0.0
    # Explicit copy: the column is modified in place right below, and doing that
    # on a filtered frame without copying triggers pandas' SettingWithCopyWarning.
    alignment = alignment[ids].copy()
    feats = feats[ids]

    # Shift column 2 by the integer part of its minimum, so the column starts
    # at the fractional part of its original minimum.
    onset = alignment[2].min()
    alignment[2] -= onset - np.modf(onset)[0]

    # Keep every 4th row of both the alignment and the features.
    alignments.append(alignment.iloc[::4])
    features.append(feats[::4])

    # Make element 1 of the parsed score relative to its own first entry.
    score[1] -= score[1][0]
    scores.append(score)

In [None]:
# Build the per-piece training/evaluation targets from the scores and alignments.
# `prepare` is not defined in this notebook (presumably it comes from project_utils).
# NOTE(review): judging only by how the results are used further down, GTs are the
# ground-truth alignment matrices, Bs the per-piece class indices of the score
# positions and SYs the per-frame class labels -- confirm against `prepare` itself.
GTs, Bs, SYs, K = prepare(scores, alignments)

## Логистическая регрессия

In [None]:
# Import the estimator explicitly: a bare `import sklearn` does not guarantee that
# the `sklearn.linear_model` submodule has been loaded.
from sklearn.linear_model import LogisticRegression

# Split by piece: the last pieces (index 5 onwards) are used for training,
# the first five are held out for testing.
X_train = np.vstack(features[5:])
Y_train = np.concatenate(SYs[5:])

X_test = np.vstack(features[:5])
Y_test = np.concatenate(SYs[:5])

# Standardise every feature with statistics estimated on the training set only.
# `m` and `s` stay in the notebook namespace: average_MAD() reuses them.
m, s = np.mean(X_train, axis=0, keepdims=True), np.std(X_train, axis=0, keepdims=True)
X_train = (X_train - m) / s
X_test = (X_test - m) / s

# C=1e4 means very weak regularisation.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases --
# check it against the installed version.
clf = LogisticRegression(
    C=1e4,
    solver='lbfgs',
    multi_class='multinomial',
    verbose=True,
    max_iter=1000
)
clf.fit(X_train, Y_train)

In [None]:
# For the training split and then the test split (separated by '---'), print the
# mean log-probability read out at the true-label column, followed by the accuracy.
# NOTE(review): indexing the probability columns by label value assumes the labels
# coincide with the column order of the classifier's outputs -- confirm.
for split_no, (X_part, Y_part) in enumerate([(X_train, Y_train), (X_test, Y_test)]):
    if split_no > 0:
        print('---')
    row_ids = np.arange(X_part.shape[0])
    print(np.mean(clf.predict_log_proba(X_part)[row_ids, Y_part]))
    print(np.mean(clf.predict(X_part) == Y_part))


In [None]:
def DTW(theta):
    """Find the cheapest monotone path through a frame-by-state cost matrix.

    ``theta[t, j]`` is the cost of assigning frame ``t`` to state ``j``. Between
    consecutive frames the path either stays in the same state or advances by
    exactly one state. It is forced to end in the last state at the last frame
    and (whenever ``n_frames >= n_states``) to start in state 0 at frame 0.
    Ties are resolved in favour of staying in the same state.

    Parameters
    ----------
    theta : np.ndarray, shape (n_frames, n_states)
        Local assignment costs.

    Returns
    -------
    cost : np.ndarray, shape (n_frames, n_states)
        ``cost[t, j]`` is the minimal accumulated cost of a path that is in
        state ``j`` at frame ``t``; ``cost[-1, -1]`` is the total cost of the
        returned path. Unreachable cells hold values of the order of 1e20.
    path : np.ndarray of int32, shape (n_frames, n_states)
        Indicator matrix of the optimal path (a single 1 in every row).
    """
    n_frames, n_states = theta.shape
    big = 1e20  # finite stand-in for "unreachable"
    state_ids = np.arange(n_states)

    # D is padded with an extra leading row and column: D[0, 0] = 0 is the
    # virtual start, every other padding cell is unreachable.
    D = np.zeros((n_frames + 1, n_states + 1))
    D[0, 1:] = big
    D[1:, 0] = big

    # Q[t, j] is a one-hot back-pointer for frame t / state j:
    # channel 0 = "came from the same state", channel 1 = "came from state j - 1".
    # Its extra last row/column only hosts the sentinel used by the backtrace.
    Q = np.zeros((n_frames + 1, n_states + 1, 2), dtype=np.int32)

    for i in range(1, n_frames + 1):
        # Column 0: stay in the same state; column 1: advance from the previous one.
        candidates = np.stack([D[i - 1, 1:], D[i - 1, :-1]], axis=1) + theta[i - 1][:, None]
        choice = np.argmin(candidates, axis=1)
        Q[i - 1, state_ids, choice] = 1
        D[i, 1:] = candidates[state_ids, choice]

    # Backtrace. The sentinel cell (n_frames, n_states) "arrives diagonally",
    # which pins the path to the last state at the last frame.
    Y = np.zeros((n_frames + 1, n_states + 1), dtype=np.int32)
    Y[-1, -1] = 1
    Q[-1, -1] = [0, 1]
    for i in range(n_frames - 1, -1, -1):
        # Cell (i, j) is on the path if the path cell of frame i + 1 points back to it.
        Y[i, :-1] = Y[i + 1, :-1] * Q[i + 1, :-1, 0] + Y[i + 1, 1:] * Q[i + 1, 1:, 1]

    # D is padded at the front, Y at the back, so each is cropped accordingly.
    # (Cropping D with [:-1, :-1] would return the padding and drop the final costs.)
    return D[1:, 1:], Y[:-1, :-1]

def MAD(Y, GT):
    """Score a predicted alignment matrix against the ground truth.

    The element-wise difference ``Y - GT`` is cumulatively summed along axis 1,
    squared, summed over all entries and divided by the number of rows of ``GT``.
    Despite the name, the quantity is built from squared (not absolute) values.
    """
    running_gap = np.cumsum(Y - GT, axis=1)
    total = np.sum(running_gap ** 2)
    return total / GT.shape[0]

def average_MAD(features, m, Bs, GTs):
    """Frame-weighted average of MAD over the first five pieces.

    For each piece the features are standardised, the classifier's class
    probabilities are gathered at the class indices ``Bs[i]`` and turned into
    costs ``-log(p + 0.1)``, the costs are aligned with DTW, and the resulting
    path is scored against ``GTs[i]`` with MAD. Each per-piece score is printed.

    Besides its arguments the function reads two notebook globals: the fitted
    classifier ``clf`` and the feature scale ``s`` (only the mean ``m`` is
    passed in explicitly).

    Parameters
    ----------
    features : sequence of arrays, one (n_frames, n_features) array per piece
    m : array broadcastable against the feature rows; subtracted before scaling
    Bs : sequence of 1-D integer arrays indexing columns of ``clf.predict_proba``
    GTs : sequence of ground-truth matrices, compared against the DTW path

    Returns
    -------
    The sum of ``MAD * n_frames`` over the pieces divided by the total number
    of frames evaluated.
    """
    weighted_sum = 0
    n_frames_total = 0
    # The first five pieces are the ones the training cells hold out for testing.
    for i in range(5):
        piece = features[i]
        proba = clf.predict_proba((piece - m) / s)
        # Outer indexing: rows = frames, columns = the classes listed in Bs[i].
        # The +0.1 keeps the log finite and bounds the cost of unlikely classes.
        theta = -np.log(
            proba[np.arange(piece.shape[0])[:, None], Bs[i][None, :]]
            + 0.1
        )
        _, Y = DTW(theta)
        piece_mad = MAD(Y, GTs[i])  # computed once, then printed and accumulated
        print(piece_mad)
        weighted_sum += piece_mad * Y.shape[0]
        n_frames_total += Y.shape[0]

    # Normalise by the frames actually evaluated rather than by a global array.
    return weighted_sum / n_frames_total

In [None]:
# Report the frame-weighted average MAD over the first five pieces. average_MAD
# reads the classifier from the global `clf`, so the number reflects whichever
# model was fitted last.
print('Average MAD', average_MAD(features, m, Bs, GTs))

## Random forest

In [None]:
# Same split by piece as before: pieces from index 5 onwards train the model,
# the first five are held out.
X_train, Y_train = np.vstack(features[5:]), np.concatenate(SYs[5:])
X_test, Y_test = np.vstack(features[:5]), np.concatenate(SYs[:5])

# Standardise with training-set statistics; `m` and `s` are also read later by
# average_MAD(), so they are rebound here together with `clf`.
m = X_train.mean(axis=0, keepdims=True)
s = X_train.std(axis=0, keepdims=True)
X_train, X_test = (X_train - m) / s, (X_test - m) / s

# A fixed random_state keeps the forest reproducible between runs.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, Y_train)

In [None]:
# For the training split and then the test split (separated by '---'), print the
# mean probability read out at the true-label column, followed by the accuracy.
# NOTE(review): indexing the probability columns by label value assumes the labels
# coincide with the column order of the classifier's outputs -- confirm.
for split_no, (X_part, Y_part) in enumerate([(X_train, Y_train), (X_test, Y_test)]):
    if split_no > 0:
        print('---')
    row_ids = np.arange(X_part.shape[0])
    print(np.mean(clf.predict_proba(X_part)[row_ids, Y_part]))
    print(np.mean(clf.predict(X_part) == Y_part))


In [None]:
# Report the frame-weighted average MAD over the first five pieces. average_MAD
# reads the classifier from the global `clf`, so the number reflects whichever
# model was fitted last.
print('Average MAD', average_MAD(features, m, Bs, GTs))