In [236]:
import os
import pickle


import librosa
import librosa.display
import IPython.display
import numpy as np
from time import time

import matplotlib.style as ms
import matplotlib.pyplot as plt
ms.use('seaborn-muted')
%matplotlib inline

from collections import Counter, defaultdict
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

## Load Datasets

In [27]:
def get_labels(label_path, fileName):
    with open(os.path.join(label_path,'REF_key_'+fileName.split('.')[-2]+'.txt')) as f:
        """
        [('0', 'f'),
         ('1', 'f'),
         ('2', 'f')]
        """
        time_list = []
        labels = []
        for line in f.readlines():
            time, key = line.strip().split('\t')

            if time in time_list:
                continue

            labels.append((time,key))
        return labels

In [38]:
def read_get_features(dataset_dir, 
                      label_dir, 
                      y_source='source',
                      gain = 10):

    
    feature_dict = {} # {song: {'window_chroma':43?, 'chroma_features':[],labels:[]}}

    for dir_path, _, fileNames in os.walk(dataset_dir):
        for fileName in fileNames:
            print(os.path.join(dir_path,fileName))

            content_dict = {'window_chroma':0, 'chroma_features':[],'labels':[]}
            y, sr = librosa.load(os.path.join(dir_path,fileName))
            
            if y_source == 'harmonic':
                    # We'll use the harmonic component to avoid pollution from transients
                    # y_harmonic, y_percussive = librosa.effects.hpss(y)
                    y, _ = librosa.effects.hpss(y)
                    
            S=np.abs(librosa.stft(y))
            C = librosa.feature.chroma_stft(S=np.log(1.0 + gain * S), sr=sr)

            content_dict['chroma_features'] = C
            content_dict['labels'] = get_labels(label_path, fileName)
            content_dict['window_chroma'] = int(sr/512)

#             print(len(content_dict['labels']))
#             print(content_dict['window_chroma'])

            feature_dict[fileName.split('.')[-2]] = content_dict
            
    return feature_dict

In [40]:
dataset_dir = '../datasets/BPS_piano'
label_path = '../datasets/BPS_piano_label'

In [41]:
feature_dict = read_get_features(dataset_dir, label_path, y_source='source',gain = 10)


with open("../datasets/bps_chroma_features.pkl","wb") as f:
    pickle.dump(feature_dict,f)

print("Features Dumped")

../datasets/BPS_piano/1.wav
../datasets/BPS_piano/14.wav
../datasets/BPS_piano/3.wav
../datasets/BPS_piano/5.wav
../datasets/BPS_piano/20.wav
../datasets/BPS_piano/23.wav
../datasets/BPS_piano/18.wav
../datasets/BPS_piano/6.wav
../datasets/BPS_piano/16.wav
../datasets/BPS_piano/21.wav
../datasets/BPS_piano/27.wav
../datasets/BPS_piano/8.wav
../datasets/BPS_piano/31.wav
../datasets/BPS_piano/26.wav
../datasets/BPS_piano/32.wav
../datasets/BPS_piano/12.wav
../datasets/BPS_piano/19.wav
../datasets/BPS_piano/11.wav
../datasets/BPS_piano/28.wav
../datasets/BPS_piano/24.wav
../datasets/BPS_piano/25.wav
../datasets/BPS_piano/13.wav
../datasets/BPS_piano/22.wav
Features Dumped


In [None]:
# Read features from pickle
# with open("../datasets/spectrum_features.pkl","rb") as f:
#     feature_dict = pickle.load(f)
    
# print("Features Loads")

## Time Modeling


Model the signal and tome to one second a frame

In [122]:
def slide_windows(chroma, windows):
    """
    chroma    chroma value
    windows   time_gap, how many chroma value is one second
    """

    length = chroma.shape[1]
    frames_length = int(length/windows)+1
    features = np.zeros((0,12,windows), np.float32) # (total seconds, chroma_length, windows)
    
    for i in range(0,length,windows):
#         print(features.shape)
        if i + windows < length:
            X = chroma[:,i:i+windows]
        else:
            break # give up the last one
            X = chroma[:,i:]
            padding = windows-(length-i)
            X_pad = np.zeros((12,padding))
            X = np.concatenate((X, X_pad), axis=1)
            
        X = X.reshape(1,12,windows)
        features = np.vstack([features, X])
    return features
    

In [127]:
train_list = [1, 3, 5, 11, 16, 19, 20, 22, 25, 26, 32]
valid_list = [6, 13, 14, 21, 23, 31]
test__list = [8, 12, 18, 24, 27, 28]
final_train_list = train_list+valid_list

In [177]:
feature_to_int_dict = {'A':0,'B-':1,'B':2,'C':3,'C+':4,'D-':4,'D':5,'E-':6,
                       'E':7,'F':8,'G-':9,'G':10,'G+':11,'A-':11, 'F+':9, 'D+':6}
feature_to_int_dict = {**feature_to_int_dict, **{k.lower(): v+12 for k, v in feature_to_int_dict.items()}}
# feature_to_int_dict

In [159]:
feature_to_int = np.vectorize(lambda x: (feature_to_int_dict[x]))

In [151]:
def get_features_labels(file_name_list, feature_dict):
    features_list = []
    labels_list = []
    for i in file_name_list:
        features = slide_windows(feature_dict[str(i)]['chroma_features'], feature_dict[str(i)]['window_chroma'])
        for j, feature in enumerate(features):
            feature_shape = feature.shape
            feature = feature.reshape(-1)

            try:
                labels_list.append(feature_dict[str(i)]['labels'][j][1])
                features_list.append(feature)
            except:
                print(i, j)
                
    return (np.array(features_list), np.array(labels_list))
    

In [190]:

perfect_fifth_dict = {3:10, 10:5, 5:0, 0:7, 7:2, 2:9, 9:4, 4:11, 11:6, 6:1, 1:8, 8:3, 
                      12:19,19:14, 14:21, 21:16, 16:23, 23:18, 18:13, 13:20, 20:15, 15:22, 22:17, 17:12}
relative_dict = {3:12, 12:3, 10:19, 19:10, 5:14, 14:5, 0:21, 21:0, 7:16, 16:7, 2:23, 23:2,  
                 9:18, 18:9, 4:13, 13:4, 11:20, 20:11, 6:15, 15:6, 1:22, 22:1, 8:17, 17:8}
    
def evaluation(y_true_dict, 
               y_predict_dict, 
               weighted = False, 
               to_classification_report=False, 
               target_names = ["A","A#","B","C","C#","D","D#","E","F","F#","G","G#","a","a#","b","c","c#","d","d#","e","f","f#","g","g#"], 
               perfect_fifth_dict = perfect_fifth_dict, 
               relative_dict = relative_dict):
    # classification report
    if to_classification_report:
        
        print(classification_report(y_true_dict, y_predict_dict, target_names=target_names))
        
    class_acc = []
    if weighted:
        print("Weighted accuracy")

        correct = 0.0
        for i, label in enumerate(y_true_dict):
            if y_predict_dict[i] == label: # same
                correct += 1
            if perfect_fifth_dict[y_predict_dict[i]] == label: # perfect fifth error
                correct += 0.5
            if relative_dict[y_predict_dict[i]] == label: # Relative major/minor error
                correct += 0.3
            if (y_predict_dict[i]%12) == (label%12): # parallel major/minor
                correct += 0.2
        class_acc.append(correct/len(y_predict_dict))
        print("{:.2f}%".format(correct/len(y_predict_dict)*100))
    else:
        print("Average accuracy")

        correct = 0.0
        for i, label in enumerate(y_true_dict):
            if label == y_predict_dict[i]:
                correct += 1
        try:
            class_acc.append(correct/len(y_predict_dict))
        except:
            class_acc.append(0.0)
        
        print(str(correct/len(y_predict_dict)*100)+'%')

In [280]:

all_train_features, all_train_labels = get_features_labels(final_train_list, feature_dict)

print(all_train_features.shape)
print(all_train_labels.shape)

train_features, train_labels = get_features_labels(train_list, feature_dict)

print(train_features.shape)
print(train_labels.shape)

valid_features, valid_labels = get_features_labels(valid_list, feature_dict)
print(valid_features.shape)
print(valid_labels.shape)

test_features, test_labels = get_features_labels(test__list, feature_dict)
print(valid_features.shape)
print(valid_labels.shape)

valid_labels_int = feature_to_int(valid_labels).astype(int)
train_labels_int = feature_to_int(train_labels).astype(int)
test_labels_int = feature_to_int(test_labels).astype(int)


32 836
(14820, 516)
(14820,)
32 836
(9878, 516)
(9878,)
(4942, 516)
(4942,)
(4942, 516)
(4942,)


## Random Forest

In [129]:
from sklearn.ensemble import RandomForestClassifier

In [217]:
# build randomforest
clf = RandomForestClassifier(random_state=42, 
                             n_estimators=128, 
                             verbose=1, 
                             n_jobs=-1)
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=42, verbose=1, warm_start=False)

In [281]:
# Training
clf.fit(all_train_features, feature_to_int(all_train_labels))

[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 128 out of 128 | elapsed:    3.6s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=128, n_jobs=-1,
            oob_score=False, random_state=42, verbose=1, warm_start=False)

In [282]:
test_ = clf.predict(test_features)
test_.shape

[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 128 out of 128 | elapsed:    0.1s finished


(4929,)

In [283]:
evaluation(test_, test_labels_int, weighted = True, to_classification_report=True)

             precision    recall  f1-score   support

          A       0.00      0.00      0.00         0
         A#       0.16      0.28      0.21       180
          B       0.00      0.00      0.00         0
          C       0.36      0.10      0.15       384
         C#       0.00      0.08      0.01        13
          D       0.29      0.05      0.09        93
         D#       0.42      0.39      0.40      1052
          E       0.62      0.22      0.33       344
          F       0.16      0.03      0.05       261
         F#       0.00      0.00      0.00         0
          G       0.53      0.09      0.16       445
         G#       0.50      0.29      0.37       562
          a       0.00      0.00      0.00         0
         a#       0.00      0.00      0.00         1
          b       0.06      0.19      0.09        62
          c       0.35      0.33      0.34       763
         c#       0.03      1.00      0.05         1
          d       0.00      0.00      0.00   

## DNN

### Features

In [274]:
import tensorflow as tf

# consturct your DNN model Graph
# The maxnumber of
def dnn(num_hidden_uni, num_class,mode, f_dim):
    num_hidden = len(num_hidden_uni)
    with tf.variable_scope('dnn'):
        # Tensor for input layer protocol
        features = tf.placeholder(
            tf.float32, shape=[None, f_dim], name='input_features')
        hid = tf.layers.dense(features, num_hidden_uni[
                             0], activation=tf.nn.relu)
        for i in range(min(num_hidden, 5) - 1):
            hid = tf.layers.dense(hid, num_hidden_uni[
                                 i + 1], activation=tf.nn.relu)
            hid = tf.layers.dropout(hid, rate=0.7, training=mode, name="Dropout") # set the dropout layer and give a name 

        # Unscaled propability of each class
        output_logits = tf.layers.dense(
            hid, num_class, activation=None, name='output_layer')
        return output_logits


# parameters for training
batch_size = 50
num_epochs = 1000
init_learning_rate = 0.00013
epsilon = 1e-6
num_class = 24

In [279]:

with tf.Graph().as_default(), tf.Session() as sess:
        mode = tf.placeholder(tf.bool, name ="Mode") 
        # define your own fully connected DNN
        output = dnn([256, 128, 128], num_class, mode,f_dim=train_features.shape[1])

        # tensor for prediction the class
        prediction = tf.argmax(output, -1)
        # Add training ops into graph.
        with tf.variable_scope('train'):
            # tensor for labels
            label_ = tf.placeholder(
                tf.int32, shape=(None,), name='labels')
            label = tf.one_hot(label_,depth=24)
            
            # tensor for calculate loss by softmax cross-entroppy
            loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                labels=label,
                logits=output,
                name='loss_op'
            ))
            
            global_step = tf.Variable(
                0, name='global_step', trainable=False,
                collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                             tf.GraphKeys.GLOBAL_STEP])
            optimizer = tf.train.AdamOptimizer(
                learning_rate=init_learning_rate,
                epsilon=epsilon)
            train_op = optimizer.minimize(
                loss, global_step=global_step, name='train_op')
            arg_label = tf.argmax(label, -1)

            acc = tf.reduce_mean(
                tf.cast(tf.equal(prediction, arg_label), tf.float32), name='acc_op')
            
            tf.summary.scalar('cross_entropy', tf.cast(loss, tf.float32))
            
            merged = tf.summary.merge_all()
            
            train_writer = tf.summary.FileWriter('train_log/',
                                                 sess.graph)
            
        sess.run(tf.global_variables_initializer())

        # Assign the required tensors to do the operation
        
        global_step_tensor = sess.graph.get_tensor_by_name(
            'train/global_step:0')
        features_tensor = sess.graph.get_tensor_by_name(
            'dnn/input_features:0')
        train_op = sess.graph.get_operation_by_name('train/train_op')
        acc_op = sess.graph.get_tensor_by_name('train/acc_op:0')
        
        labels_tensor = sess.graph.get_tensor_by_name('train/labels:0')
        loss_tensor = sess.graph.get_tensor_by_name('train/loss_op:0')
        
        # Start training
        print('Start training...')
        print('Using dataset is: ' + dataset_dir)
        t0 = time()
        epo = 0
        while epo < num_epochs:
            st = 0
            for _ in range(round(train_features.shape[0] / batch_size)):
                [num_steps, _, loss_out] = sess.run([global_step_tensor, train_op, loss_tensor], 
                                                    feed_dict={
                                                        features_tensor: train_features[st:st + batch_size], 
                                                        labels_tensor: train_labels_int[st:st + batch_size],
                                                        mode:True})
                st += batch_size
            
#             print('loss:', '%g' % np.mean(loss_out))
            [acc, p] = sess.run([acc_op, prediction], feed_dict={
                features_tensor: valid_features, labels_tensor: valid_labels_int, mode:False})
            train_writer.add_summary(summary, epo)
            epo += 1
            if epo%50 ==0:
                print("# of epochs: ", epo,
                      ', valid accuracy : ', '%.4f' % (acc),
                      ', loss:', '%g' % np.mean(loss_out))
            
        [acc, p] = sess.run([acc_op, prediction], feed_dict={
                features_tensor: test_features, labels_tensor: test_labels_int, mode:False})
        print('Finish training in {:4.2f} sec!'.format(time() - t0))
        print('Now test the trained DNN model....\n')
        print("Test accuracy : %.4f" % (acc))
#         print(classification_report(test_labels, p))
        evaluation(p, test_labels_int, weighted = True, to_classification_report=True)
        

Start training...
Using dataset is: ../datasets/BPS_piano
# of epochs:  50 , valid accuracy :  0.0882 , loss: 2.00082
# of epochs:  100 , valid accuracy :  0.1313 , loss: 1.86388
# of epochs:  150 , valid accuracy :  0.1941 , loss: 1.78053
# of epochs:  200 , valid accuracy :  0.2086 , loss: 1.70366
# of epochs:  250 , valid accuracy :  0.2285 , loss: 1.79402
# of epochs:  300 , valid accuracy :  0.2200 , loss: 1.63176
# of epochs:  350 , valid accuracy :  0.2355 , loss: 1.93537
# of epochs:  400 , valid accuracy :  0.2236 , loss: 1.60719
# of epochs:  450 , valid accuracy :  0.2129 , loss: 1.52376
# of epochs:  500 , valid accuracy :  0.2094 , loss: 1.63776
# of epochs:  550 , valid accuracy :  0.2224 , loss: 1.38766
# of epochs:  600 , valid accuracy :  0.2218 , loss: 1.54792
# of epochs:  650 , valid accuracy :  0.2317 , loss: 1.18084
# of epochs:  700 , valid accuracy :  0.2230 , loss: 1.41266
# of epochs:  750 , valid accuracy :  0.2218 , loss: 1.40884
# of epochs:  800 , valid ac