# __CNN__

We use a CNN (convolutional neural network) for the task of guitar tablature estimation. Previous work by Andrew Wiggins and Youngmoo Kim showed that CNNs are promising for translating guitar audio into tabs, and CNNs have also been explored for various other tasks within music information retrieval, such as musical tempo estimation, key classification, singing voice detection, and instrument classification. This makes a CNN a well-suited tool for the purpose of our study.

## __Import libraries__

In [None]:
# Import required packages 

#various
import datetime
import pathlib
import IPython.display as display
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from PIL import Image
import warnings


#sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score

#tensorflow
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
#from tensorflow.keras import layers

#keras
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import ReduceLROnPlateau
from keras import backend as K

# Load the TensorBoard notebook extension (IPython magic, only valid inside a notebook)
%load_ext tensorboard

# Seed passed as random_state to the train_test_split calls so the splits are reproducible
RSEED = 42

# NOTE(review): this silences every warning for the whole session, deprecation notices included
warnings.filterwarnings('ignore')

In [None]:
# Print the installed TensorFlow version and set TensorFlow's log verbosity to INFO
print(tf.__version__)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

## __Define Input Shapes__

In [None]:
# Model parameters
FRAME_HEIGHT = 192  # first dimension of the model's input shape
FRAME_WIDTH = 9  # second dimension of the model's input shape
N_CLASSES = 21  # number of classes predicted per string
N_STRINGS = 6  # number of strings; the model outputs one class distribution per string
BATCH_SIZE = 128  # batch size passed to fit()
EPOCHS = 50  # number of epochs passed to fit()

In [None]:
# Folder the model is exported to, and folder holding the preprocessed training data
BASE_PATH = '../app/model/'
INPUT_PATH = '../data/output/'

# The trained model is saved under (and later reloaded from) this path
model_name = f'{BASE_PATH}swizzle_model'

## __Load Data__

In [None]:
# Open the two .npz archives with the input frames and their annotations;
# the arrays themselves are read later through the 'arr_0' key
images = np.load(f'{INPUT_PATH}training_data_solo_0.npz')
annots = np.load(f'{INPUT_PATH}training_labels_solo_0.npz')

## __Do train & test split__

In [None]:
# First we split the dataset into a train and a test set. With test_size=0.2 this keeps 80% of
# the data for training and holds out 20% for testing.
train_images, test_images, train_annots, test_annots = train_test_split(images['arr_0'], annots['arr_0'], test_size= 0.2, shuffle=True, random_state= RSEED )

In [None]:
# A validation set is needed as well, so the training portion is split once more:
# 10% of it becomes the validation set, the remaining 90% stays available for training.
(train_images, validate_images,
 train_annots, validate_annots) = train_test_split(
    train_images,
    train_annots,
    test_size=0.1,
    shuffle=True,
    random_state=RSEED,
)

In [None]:
# Print the shape of every split, one per line: first the three image sets
# (train, test, validation), then the three annotation sets in the same order
print(
    *(split.shape for split in (
        train_images, test_images, validate_images,
        train_annots, test_annots, validate_annots,
    )),
    sep='\n',
)

## __Define our softmax function by string__

In [None]:
def softmax_by_string(t):
    """Apply a separate softmax to the class scores of every string.

    Args:
        t: Tensor that is sliced as ``t[:, i, :]`` for each string ``i`` in
            ``range(N_STRINGS)``.

    Returns:
        Tensor built by concatenating the per-string softmax results along axis 1.
    """
    # Each slice loses axis 1 when indexed, so it is re-added before concatenating.
    per_string = [
        K.expand_dims(K.softmax(t[:, i, :]), axis=1)
        for i in range(N_STRINGS)
    ]
    return K.concatenate(per_string, axis=1)

In [None]:
def catcross_by_string(target, output):
    """Sum the categorical cross-entropy of all strings.

    Args:
        target: Ground-truth tensor, sliced as ``target[:, i, :]`` per string.
        output: Prediction tensor, sliced the same way.

    Returns:
        The cross-entropy terms of the ``N_STRINGS`` strings added together.
    """
    return sum(
        K.categorical_crossentropy(target[:, string, :], output[:, string, :])
        for string in range(N_STRINGS)
    )

In [None]:
def avg_acc(y_true, y_pred):
    """Return the mean rate at which the arg-max class of the prediction equals that of the truth.

    The arg-max is taken over the last axis of both tensors and the matches are
    averaged over all remaining positions.
    """
    true_classes = K.argmax(y_true, axis=-1)
    predicted_classes = K.argmax(y_pred, axis=-1)
    return K.mean(K.equal(true_classes, predicted_classes))

## __Building our CNN Model__

In [None]:
# Factory for the CNN used in this notebook
def cnn_swizzle_model():
    """Build the (uncompiled) swizzleCNN.

    Input:
        Frames of shape ``[FRAME_HEIGHT, FRAME_WIDTH, 1]``, i.e. a single channel.

    Output:
        A tensor reshaped to ``(N_STRINGS, N_CLASSES)`` per frame, with a softmax
        applied to every string separately (``softmax_by_string``).
        NOTE(review): the original description reads the 21 classes as 19 frets plus
        two entries telling whether a string is played or not — confirm against the
        label encoding.

    Architecture:
        Three 3x3 convolutions (32, 64, 64 filters, ReLU), 2x2 max pooling, dropout 0.25,
        flatten, dense layer with 128 ReLU units, dropout 0.5, and a linear dense layer
        with ``N_CLASSES * N_STRINGS`` units that is reshaped and normalised per string.

    Returns:
        Keras Sequential: The swizzleCNN architecture.
    """
    layers = tf.keras.layers
    return tf.keras.Sequential([
        layers.InputLayer(input_shape=[FRAME_HEIGHT, FRAME_WIDTH, 1]),
        layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
        layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
        layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(N_CLASSES * N_STRINGS),
        layers.Reshape((N_STRINGS, N_CLASSES)),
        layers.Activation(softmax_by_string),
    ])

In [None]:
# Build an instance of the swizzleCNN (it is compiled in a later step)
swizzle_model = cnn_swizzle_model()

In [None]:
# Inspect the model summary to see the layers and their output shapes:
# the 3-dimensional input is flattened to 1 dimension for the dense layers, and the final
# reshape yields one row per guitar string (6) with 21 class scores each
swizzle_model.summary()

## Define model metrics for the cnn

In [None]:
# Metric: the average accuracy (avg_acc) is used because we want a good overall prediction
# and every tone, and therefore every class, is equally important to us.
#
# Optimizer: Adadelta, configured with a learning rate of 1.0.
#
# Loss function: categorical cross-entropy computed per string and summed over the strings,
# because every string is a separate multi-class prediction.

metrics = avg_acc

optimizer = tf.keras.optimizers.Adadelta(learning_rate=1.0)

# compile() documents `metrics` as a list, so the single metric function is wrapped in one;
# the logged metric name ('avg_acc') is unaffected.
swizzle_model.compile(loss=catcross_by_string, optimizer=optimizer, metrics=[metrics])

## __Train CNN__

In [None]:
# Create folder for model
def my_makedirs(path):
    """Create the folder ``path`` including missing parent folders.

    Nothing happens if the folder already exists.

    Args:
        path: Path of the folder to create.
    """
    # exist_ok=True replaces the separate isdir() check and thereby avoids failing when the
    # folder appears between the check and the creation.
    os.makedirs(path, exist_ok=True)

# Make sure the model output folder exists before anything is written into it
my_makedirs('../app/model')

In [None]:
# Per-epoch metrics are logged to a CSV file inside the model folder.
# model_name already starts with that folder, so only its last path component goes into the
# file name; appending all of model_name would point into a directory that does not exist.
csv_logger = tf.keras.callbacks.CSVLogger(
    os.path.join(BASE_PATH, 'metrics_' + os.path.basename(model_name) + '.csv')
)

In [None]:
# Train the model with the batch size and number of epochs defined in the constants.
# use_multiprocessing is no longer passed: it only applies to generator/Sequence input, so it
# had no effect on these in-memory arrays, and newer Keras versions reject the argument.
history = swizzle_model.fit(
    train_images,
    train_annots,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_data=(validate_images, validate_annots),
    callbacks=[csv_logger],
)

# Read the metrics back from exactly the file the logger wrote to, instead of rebuilding the
# path by hand
swizzle_model_metrics = pd.read_csv(csv_logger.filename)
print(swizzle_model_metrics.to_markdown())

In [None]:
# Show plots for our loss function and the accuracy (training vs. validation)
fig = plt.figure(figsize=(10, 10))

ax = plt.subplot(2, 2, 1)
plt.plot(history.history['loss'], label='Loss', color= '#7900AA')
plt.plot(history.history['val_loss'], label='Validation Loss', color = 'c')
plt.legend()
plt.title('Training - Loss Function')

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

ax2 = plt.subplot(2, 2, 2)
plt.plot(history.history['avg_acc'], label='avg. Accuracy', color = '#7900AA')
plt.plot(history.history['val_avg_acc'], label='Validation avg. Accuracy', color = 'c')
plt.legend()
plt.title('Train - Accuracy')

ax2.spines['right'].set_visible(False)
ax2.spines['top'].set_visible(False)

# model_name contains a directory part, so only its last component is used in the file name;
# the target folder is created first because savefig does not create missing folders.
plots_dir = '../data/model'
os.makedirs(plots_dir, exist_ok=True)
fig.savefig(os.path.join(plots_dir, 'plots_' + os.path.basename(model_name) + '.png'))

In [None]:
# Evaluate the trained model on the test split and print loss and accuracy;
# score[0] is the loss, score[1] the compiled metric
score = swizzle_model.evaluate(test_images, test_annots, verbose=0)
print(f'Test Loss : {score[0]:.4f}')
print(f'Test Accuracy : {score[1]:.4f}')

In [None]:
# Save the entire model to the path stored in model_name
swizzle_model.save(model_name)

# Predict on test set

In [None]:
# Load the previously saved model. compile=False loads it without compiling; the loaded model
# is only used for predict() in this notebook.
# NOTE(review): the model contains the custom activation softmax_by_string — confirm that
# loading works without custom_objects for the save format in use.
loaded_swizzle_model = keras.models.load_model(model_name, compile = False)

In [None]:
# Use the held-out test split as prediction input (X_images) and ground truth (y_true)
X_images = test_images
y_true = test_annots

In [None]:
# Predict the per-string class probabilities for every test frame with the reloaded model
y_pred = loaded_swizzle_model.predict(X_images)

In [None]:
# Save model predictions
# NOTE(review): the target name is model_name, the same path the model itself was saved under;
# np.save appends '.npy' when the name has no such suffix — confirm this location is intended.
np.save(model_name, y_pred, allow_pickle=True, fix_imports=True)

In [None]:
# Check shapes of truth and prediction (have to match!) and show their dtypes.
# The dtype itself is printed (not the type of the dtype object), and the second dtype is
# taken from the prediction so that the two arrays can actually be compared.
np.set_printoptions(threshold=np.inf)
print(y_true.shape)
print(y_true.dtype)
print(y_true[0])
print(y_pred.shape)
print(y_pred.dtype)
print(np.round(y_pred[0]))


# Model evaluation

In [None]:
# NOTE(review): test_array is loaded from the current working directory and is compared with
# the one-hot predictions to tell empty frames apart — presumably it holds the encoding of an
# empty frame; confirm its contents and location.
test_array = np.load('test_array.npy')

In [None]:
# For every predicted frame: turn the probabilities into a one-hot matrix (arg-max per string)
# and record 0 when it equals test_array, otherwise 1.
result = []

for frame_probs in y_pred:
    one_hot = np.zeros_like(frame_probs)
    for string_idx, string_probs in enumerate(frame_probs):
        one_hot[string_idx][np.argmax(string_probs)] = 1

    result.append(0 if np.array_equal(test_array, one_hot) else 1)

print('length of the list below:',len(result),'entries')
df = pd.DataFrame(result)
print('1: Frames which are NOT empty!')
print('0: Frames which are empty')
df.value_counts()


In [None]:
from sklearn.metrics import accuracy_score, precision_score,recall_score,f1_score

# Show one example of the ground truth next to the shape of the predictions
print(y_true.shape)
print(y_true[20])
print('#__________________________________________________________#')
print(y_pred.shape)
y_true_ravel = y_true.ravel()

# Convert the predicted probabilities to one-hot rows: the most likely class of every string
# in every frame is set to 1, everything else stays 0
corr_y_pred = np.zeros_like(y_pred)
winning_classes = np.argmax(y_pred, axis=-1)

for fidx, frame_winners in enumerate(winning_classes):
    for sidx, winner in enumerate(frame_winners):
        corr_y_pred[fidx, sidx, winner] = 1

print(corr_y_pred[20])

# Flattened view used for the element-wise sklearn scores
corr_y_pred_ravel = corr_y_pred.ravel()


In [None]:
# Element-wise scores over the flattened one-hot arrays
acc = accuracy_score(y_true_ravel, corr_y_pred_ravel)
prec = precision_score(y_true_ravel, corr_y_pred_ravel)
rec = recall_score(y_true_ravel, corr_y_pred_ravel)
f1 = f1_score(y_true_ravel, corr_y_pred_ravel)

# Print every score below a separator line
separator = '#__________________________________________________________#'
for label, value in (
    ('Accuracy score:', acc),
    ('Precision score:', prec),
    ('Recall score:', rec),
    ('f1_score:', f1),
):
    print(separator)
    print(label, value)

# Data characteristics and Error analysis

In [None]:
def data_characteristics(labels: np.ndarray, verbose: bool = True):
    """Count empty, single-note and multi-note frames in a label array.

    A frame is empty when every string has its first class set to 1 and all other
    entries are 0. For all other frames the number of notes is 6 minus the sum of the
    first-class column.

    Args:
        labels (np.ndarray): Labels of shape (n, 6, 21). Arrays with any other trailing
            shape are not analysed and yield all-zero counts.
        verbose (bool): If true, prints the counts as a small table. Defaults to True.

    Returns:
        dict: Counts under the keys 'empty_frames', 'single_note_frames' and
            'multi_note_frames'.
    """
    dc = {
        'empty_frames': 0,
        'single_note_frames': 0,
        'multi_note_frames': 0
    }

    if labels.shape[1:] == (6, 21):
        # template of an empty frame: first class set on all six strings
        empty = np.zeros((6, 21))
        empty[:, 0] = 1

        is_empty = np.all(labels == empty, axis=(1, 2))
        # number of notes per frame = number of strings whose first class is not set
        n_notes = 6 - labels[:, :, 0].sum(axis=1)

        dc['empty_frames'] = int(np.sum(is_empty))
        dc['single_note_frames'] = int(np.sum(~is_empty & (n_notes == 1)))
        dc['multi_note_frames'] = int(np.sum(~is_empty & (n_notes > 1)))

    if verbose:
        print("-"*30)
        print("|", " "*5, "Label analysis", " "*5, "|")
        print("-"*30)

        for key, value in dc.items():
            print(f"| {key:<19}: {value:>5} |")

        print("-"*30)

    return dc

In [None]:
# Print the label statistics of the test annotations; the returned dict is not needed here
_ = data_characteristics(y_true, verbose=True)

In [None]:
def error_analysis(true: np.ndarray, test: np.ndarray, transform_preds: bool = True, verbose: bool = True):
    """Takes true labels and (transformed) test labels in the (n, 6, 21) shape and performs error analysis.

    Every frame of ``true`` is compared with the frame at the same index of ``test``.
    A true frame is "empty" when the first class is set on every string and nothing else is
    set; empty frames only feed the ``null_*`` counters, all other frames feed the rest.

    Counters (all count frames):
        null_correct / null_wrong / null_total: empty true frames predicted exactly / not exactly.
        strings_correct / strings_wrong: first-class column identical / different.
        frets_correct / frets_wrong: all remaining columns identical / different.
        part_correct: frets are wrong, but at least one played string is predicted exactly.
        fully_wrong: not a single string is predicted exactly.
        fully_correct: the whole frame is predicted exactly.
        total: number of frames processed.

    Args:
        true (np.array): True labels. Shape expected (n, 6, 21)
        test (np.array): Test labels. Shape expected (n, 6, 21)
        transform_preds (bool): If true, transforms prediction probabilities to 0 or 1 using argmax. Defaults to True.
        verbose (bool): If true, prints out results. Defaults to True.

    Returns:
        dict: Dictionary with error analysis data.
    """
    true = np.asarray(true)
    test = np.asarray(test)

    ea = {
        'frets_correct': 0,
        'frets_wrong': 0,
        'strings_correct': 0,
        'strings_wrong': 0,
        'null_correct': 0,
        'null_wrong': 0,
        'null_total': 0,
        'fully_correct': 0,
        'part_correct': 0,
        'fully_wrong': 0,
        'total': 0
    }

    # transform predictions to be [0, 1]: one-hot of the arg-max class of every string
    if transform_preds and test.size:
        one_hot = np.zeros_like(test)
        np.put_along_axis(one_hot, np.argmax(test, axis=-1)[..., np.newaxis], 1, axis=-1)
        test = one_hot

    # for all frames in true labels
    for idx, frame in enumerate(true):
        pred = test[idx]
        same = frame == pred

        # if no notes were played: first class set on every string, nothing else set
        if np.all(frame[:, 0] == 1) and not np.any(frame[:, 1:]):

            if np.all(same):
                ea['null_correct'] += 1
            else:
                ea['null_wrong'] += 1

            ea['null_total'] += 1

        # if a note was played
        else:
            # Whole strings are compared instead of single cells: with one-hot rows almost
            # all cells are matching zeros, so a cell-wise "any match" is practically
            # always true and cannot tell partially right from completely wrong.
            string_matches = np.all(same, axis=1)
            string_played = np.any(frame[:, 1:], axis=1)

            # strings match
            if np.all(frame[:, 0] == pred[:, 0]):
                ea['strings_correct'] += 1
            else:
                ea['strings_wrong'] += 1

            # frets match
            if np.all(frame[:, 1:] == pred[:, 1:]):
                ea['frets_correct'] += 1
            else:
                ea['frets_wrong'] += 1
                # only some of the played strings are predicted correctly
                if np.any(string_matches & string_played):
                    ea['part_correct'] += 1

            # nothing matches: no string is predicted correctly
            if not np.any(string_matches):
                ea['fully_wrong'] += 1

            # everything matches
            if np.all(same):
                ea['fully_correct'] += 1

        # increase frame counter
        ea['total'] += 1

    if verbose:
        print("-"*26)
        print("|", " "*3, "Error analysis", " "*3, "|")
        print("-"*26)

        for key, value in ea.items():
            if key in ['strings_correct', 'null_correct', 'fully_correct']:
                print('-'*26)

            print(f"| {key:<15}: {value:>5} |")

        print("-"*26)

    return ea

In [None]:
# Compare the test-set predictions with the ground truth; the predicted probabilities are
# converted to one-hot rows first (transform_preds=True)
errors = error_analysis(y_true, y_pred, transform_preds=True, verbose=True)

In [None]:
def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole``; NaN when ``whole`` is 0."""
    # guards against a ZeroDivisionError when a category did not occur at all
    return part / (whole / 100) if whole else float('nan')


# share of all frames that were predicted entirely right (played frames plus empty frames)
t = _percent(errors['fully_correct'] + errors['null_correct'], errors['total'])
print('Total percentage of right predicted values of all values:',t)

# share of the non-empty frames whose played/not-played pattern was predicted right
s = _percent(errors['strings_correct'], errors['strings_correct'] + errors['strings_wrong'])
print('Percentage of right predicted strings out of strings:',s)

# share of the non-empty frames whose frets were all predicted right
f = _percent(errors['frets_correct'], errors['frets_correct'] + errors['frets_wrong'])
print('Percentage of right predicted frets out of frets:',f)