# Combined Pipeline (2-Vocal Separation + Pitch Transcription)

- Separate 2 vocals using pre-trained Conv-TasNet model
- Transcribe pitch of both vocals using trained CREPE model (x16)
- Evaluate pipeline on case study examples

In [29]:
import IPython.display as ipd

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import itertools
import random
import numpy as np
import librosa
import os
import torch
import torchaudio
from torchaudio.pipelines import CONVTASNET_BASE_LIBRI2MIX
from torchmetrics.audio import ScaleInvariantSignalNoiseRatio, SignalDistortionRatio, PermutationInvariantTraining
from torchmetrics.functional.audio import scale_invariant_signal_noise_ratio, signal_distortion_ratio
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import functional as F


from tqdm import tqdm

import sys
sys.path.append('../')
from src.crepe_model import CREPEModel
from src.train_test_split_list import train_artists, val_artists, test_artists
from src.utils import get_split_by_artist


# use GPU if available, otherwise, use cpu
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [9]:
files = os.listdir('../MIR-1K/MIR-1K/Wavfile')
num_files = len(files)

In [5]:
# # Save artist names for reproducibility
# # Note: artist 'tammy' has only one track

# train_artists = ['khair',
#  'heycat',
#  'amy',
#  'fdps',
#  'Ani',
#  'Kenshin',
#  'bobon',
#  'yifen',
#  'davidson',
#  'bug',
#  'annar',
#  'abjones',
#  'geniusturtle',
#  'stool',
#  'tammy']

# val_artists = ['titon', 'ariel']
# test_artists = ['jmzen', 'leon']

In [4]:
# def get_split_by_artist(artist_name, train, val, test):
#     if artist_name in train:
#         return 'train'
#     elif artist_name in val:
#         return 'val'
#     elif artist_name in test:
#         return 'test'

In [10]:
test_files = [file for file in files for artist in test_artists if artist in file]

## 2-Vocal Separation

In [15]:
def get_tempo(file_name):
    audio, sample_rate = librosa.load(f"../MIR-1K/MIR-1K/UndividedWavfile/{file_name}")
    onset_env = librosa.onset.onset_strength(y=audio, sr=sample_rate)
    tempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sample_rate)
    return tempo

In [16]:
tempo_file_names = []
tempos=[]
for file_name in os.listdir('../MIR-1K/MIR-1K/UndividedWavfile'):
    if file_name.split("_")[0] in test_artists:
        tempos.append(get_tempo(file_name))
        tempo_file_names.append(file_name)

In [17]:
tempo_df = pd.DataFrame([tempo_file_names, tempos]).T
tempo_df.columns = ['file', 'tempo']
tempo_df['tempo'] = tempo_df['tempo'].apply(lambda x: x[0])

In [18]:
tempo_df.sort_values('tempo')

Unnamed: 0,file,tempo
0,jmzen_4.wav,86.132812
6,jmzen_1.wav,95.703125
3,leon_9.wav,99.384014
7,leon_6.wav,103.359375
10,leon_4.wav,112.347147
1,jmzen_5.wav,117.453835
8,leon_7.wav,117.453835
12,leon_3.wav,123.046875
5,jmzen_3.wav,129.199219
13,leon_2.wav,143.554688


leon_7.wav and jmzen_5.wav have the same tempo. Let's use them as our test set pair, by selecting a middle section of their vocal recordings

In [19]:
def combine_vocals(vocals1, vocals2, sample_rate, save_as=None):
    """
    Combine 2 vocal audio waves.

    Saves mixed vocals to path `save_as`
    Returns mixed vocal wave.
    """
    if vocals1.shape[0] > vocals2.shape[0]:
        shorter, longer = vocals2, vocals1
    else:
        shorter, longer = vocals1, vocals2
    
    combined_len = shorter.shape[0]
    # resize longer vocals to match shape of shorter vocals
    longer_trimmed = longer[:combined_len]

    # mix both vocals
    mixed_tensor = torch.cat([longer_trimmed.reshape(1, combined_len), shorter.reshape(1, combined_len)])
    mixed = torch.mean(mixed_tensor, dim=0, keepdim=True)
    
    if save_as:
        torchaudio.save(f'{save_as}', mixed, sample_rate)
        
    return mixed, mixed_tensor

def load_vocals(file_pair, target_sr=8000):
    audio1, sample_rate = torchaudio.load(f"../MIR-1K/MIR-1K/{file_pair[0]}")
    audio2, sample_rate = torchaudio.load(f"../MIR-1K/MIR-1K/{file_pair[1]}")

    resample_8k = torchaudio.transforms.Resample(sample_rate, target_sr)
    vocals1 = resample_8k(audio1[1])
    vocals2 = resample_8k(audio2[1])
    return vocals1, vocals2
    

def load_and_mix_vocals(file_pairs, target_sr=8000, num_voices=2, sample_len=1):
    num_pairs = len(file_pairs)
    num_samples = sample_len*target_sr
    mixed_lst = []
    separated_lst = []
    
    for i, pair in tqdm(enumerate(file_pairs)):

        vocals1, vocals2 = load_vocals(pair, target_sr=target_sr)
    
        mixed, separated = combine_vocals(vocals1, vocals2, target_sr, 
                               # save_as=f"{paired_folder}/{mixed_name}"
                              )
        
        dim2 = mixed.shape[1]//num_samples
        # print(mixed.shape, separated.shape)
        mixed = torch.reshape(input=mixed[:, :dim2*num_samples], shape=(dim2, 1, num_samples))
        
        sep1 = separated[:, :dim2*num_samples][0].reshape(dim2, 1, num_samples)
        sep2 = separated[:, :dim2*num_samples][1].reshape(dim2, 1, num_samples)
        separated = torch.cat([sep1, sep2], dim=1)

        mixed_lst.append(mixed)
        separated_lst.append(separated)
        
    return torch.cat(mixed_lst), torch.cat(separated_lst)

In [23]:
case_pair = ["UndividedWavfile/leon_7.wav", "UndividedWavfile/jmzen_5.wav"]

target_sr=8000
vocals1, vocals2 = load_vocals(case_pair, target_sr=target_sr)

mixed, separated = combine_vocals(vocals1, vocals2, target_sr, 
                       save_as=f"../audio/test_data/leon_7_jmzen_5.wav"
                      )

In [24]:
ipd.Audio(mixed.numpy(), rate=8000)

In [25]:
model = CONVTASNET_BASE_LIBRI2MIX.get_model()
model = model.to(device)
print(f"Initialized CONVTASNET_BASE_LIBRI2MIX model.")

with torch.no_grad():
    pred = model(mixed.reshape(1, 1, -1).to(device))

  state_dict = torch.load(path)


Initialized CONVTASNET_BASE_LIBRI2MIX model.


In [26]:
ipd.Audio(pred[:1, 0].cpu().detach().numpy(), rate=8000)

In [27]:
ipd.Audio(pred[:1, 1].cpu().detach().numpy(), rate=8000)

In [30]:
# Save split audio samples
torchaudio.save(f'../audio/test_data/split_full_1.wav', F.normalize(pred[:1, 0]), 8000)
torchaudio.save(f'../audio/test_data/split_full_2.wav', F.normalize(pred[:1, 1]), 8000)

The 2 split vocals do have different vocals, but they switch between singers in each track. This might be due to the fact that there are fewer overlapping vocals in this mix, and many of the vocals are during the pauses of the other track.

In [203]:
def get_metrics(pred, target):
    """
    Calculate the Permutation-invariant SI-SNR and SDR for a given predicted split VS a target split.
    """
    sisnr_pit = PermutationInvariantTraining(scale_invariant_signal_noise_ratio,
                                   mode="speaker-wise", eval_func="max")
    sisnr_pit.cuda()
    sisnr = sisnr_pit(pred, target)

    sdr_pit = PermutationInvariantTraining(signal_distortion_ratio,
                                   mode="speaker-wise", eval_func="max")
    sdr_pit.cuda()
    sdr = sdr_pit(pred, target)
    
    return sisnr.item(), sdr.item()

In [204]:
sep = separated.reshape(1, 2, -1).to(device)
stacked = torch.cat([mixed, mixed], dim=0).reshape(1, 2, -1).to(device)

sisnr, sdr = get_metrics(pred, sep)
sisnr_orig, sdr_orig = get_metrics(stacked, sep)

sisnri = sisnr - sisnr_orig
sdri = sdr - sdr_orig

print(f"Separating {case_pair} had an SI-SNRi of {sisnri} and SDRi of {sdri}")

Separating ['UndividedWavfile/leon_7.wav', 'UndividedWavfile/jmzen_5.wav'] had an SI-SNRi of -0.6518887281417847 and SDRi of -0.319210410118103


Let's test on a subset of their vocals instead.

In [31]:
case_pair = ['Wavfile/leon_7_03.wav', 'Wavfile/jmzen_5_03.wav']

target_sr=8000
vocals1, vocals2 = load_vocals(case_pair, target_sr=target_sr)

mixed, separated = combine_vocals(vocals1, vocals2, target_sr, 
                       save_as=f"../audio/test_data/leon_7_03_jmzen_5_03.wav"
                      )

In [32]:
ipd.Audio(mixed.numpy(), rate=8000)

In [33]:
model = CONVTASNET_BASE_LIBRI2MIX.get_model()
model = model.to(device)
print(f"Initialized CONVTASNET_BASE_LIBRI2MIX model.")

with torch.no_grad():
    pred = model(mixed.reshape(1, 1, -1).to(device))

  state_dict = torch.load(path)


Initialized CONVTASNET_BASE_LIBRI2MIX model.


In [34]:
# Save split audio samples
torchaudio.save(f'../audio/test_data/split_03_1.wav', F.normalize(pred[:1, 0]), 8000)
torchaudio.save(f'../audio/test_data/split_03_2.wav', F.normalize(pred[:1, 1]), 8000)

In [210]:
ipd.Audio(pred[:1, 0].cpu().detach().numpy(), rate=8000)

In [211]:
ipd.Audio(pred[:1, 1].cpu().detach().numpy(), rate=8000)

In [212]:
sep = separated.reshape(1, 2, -1).to(device)
stacked = torch.cat([mixed, mixed], dim=0).reshape(1, 2, -1).to(device)

sisnr, sdr = get_metrics(pred, sep)
sisnr_orig, sdr_orig = get_metrics(stacked, sep)

sisnri = sisnr - sisnr_orig
sdri = sdr - sdr_orig

print(f"Separating {case_pair} had an SI-SNRi of {sisnri} and SDRi of {sdri}")

Separating ['Wavfile/leon_7_03.wav', 'Wavfile/jmzen_5_03.wav'] had an SI-SNRi of 15.689836025238037 and SDRi of 15.917433023452759


Definitely better than the previous pair, probably because the longer tracks have more pauses, which results in both voices being active at different times. Let's try detecting the pitches of both tracks using CREPE.

In [35]:
leon_7 = pred[:1, 0]
jmzen_5 = pred[:1, 1]

In [36]:
def audio_to_frames(vocals):
    # make 1024-sample frames of the audio with hop length of 10 milliseconds
    num_samples = len(vocals)
    num_frames = int((num_samples - 1024) / 160) + 1
    frames = vocals.unfold(step=160, size=1024, dimension=0)
    return frames

def pitch_to_frame(annotations, num_frames, num_pitches=410):
    num_classes = num_pitches
    annotation_matrix = torch.zeros((num_frames, num_classes+1)) # Extra +1 class for NA notes
    print(annotation_matrix.shape)
    note_range = list(librosa.midi_to_note([i/10 for i in range(360 , 770)], cents=True))
    print(len(note_range))
    for idx in range(num_frames): # iterate each frame, assign pitch label to each frame
        pitch_idx = round(((512+(idx*160))/16000) / 0.02) - 1
        frame_pitch = annotations[pitch_idx]
        if frame_pitch in note_range:
            annotation_matrix[idx, note_range.index(frame_pitch)] = 1
        else:
            # If note it out of range (e.g. 0 values), assign to NA
            annotation_matrix[idx, len(note_range)] = 1
    return annotation_matrix

def load_pitch_labels(file_path):
    pitch_label = np.loadtxt(file_path, dtype=float)
    
    # Convert semitone to discrete note value
    pitch_midi = librosa.midi_to_note(np.round(pitch_label, 1), cents=True)
    pitch_midi[pitch_midi=='C-1+0'] = 'NA'
    return pitch_midi

In [37]:
note_range = list(librosa.midi_to_note([i/10 for i in range(360 , 770)], cents=True))

In [38]:
note_range

[np.str_('C2+0'),
 np.str_('C2+10'),
 np.str_('C2+20'),
 np.str_('C2+30'),
 np.str_('C2+40'),
 np.str_('C2+50'),
 np.str_('C♯2-40'),
 np.str_('C♯2-30'),
 np.str_('C♯2-20'),
 np.str_('C♯2-10'),
 np.str_('C♯2+0'),
 np.str_('C♯2+10'),
 np.str_('C♯2+20'),
 np.str_('C♯2+30'),
 np.str_('C♯2+40'),
 np.str_('D2-50'),
 np.str_('D2-40'),
 np.str_('D2-30'),
 np.str_('D2-20'),
 np.str_('D2-10'),
 np.str_('D2+0'),
 np.str_('D2+10'),
 np.str_('D2+20'),
 np.str_('D2+30'),
 np.str_('D2+40'),
 np.str_('D2+50'),
 np.str_('D♯2-40'),
 np.str_('D♯2-30'),
 np.str_('D♯2-20'),
 np.str_('D♯2-10'),
 np.str_('D♯2+0'),
 np.str_('D♯2+10'),
 np.str_('D♯2+20'),
 np.str_('D♯2+30'),
 np.str_('D♯2+40'),
 np.str_('E2-50'),
 np.str_('E2-40'),
 np.str_('E2-30'),
 np.str_('E2-20'),
 np.str_('E2-10'),
 np.str_('E2+0'),
 np.str_('E2+10'),
 np.str_('E2+20'),
 np.str_('E2+30'),
 np.str_('E2+40'),
 np.str_('E2+50'),
 np.str_('F2-40'),
 np.str_('F2-30'),
 np.str_('F2-20'),
 np.str_('F2-10'),
 np.str_('F2+0'),
 np.str_('F2+10'),


In [39]:
pitch_label_files = os.listdir("../MIR-1K/MIR-1K/PitchLabel/")

In [40]:
# Load labels for the pair
labels = []
pitch_label_files = os.listdir("../MIR-1K/MIR-1K/PitchLabel/")
for file in case_pair:
    track_name = file.split("/")[1].split(".")[0]

    pitch_midi_labels = load_pitch_labels(f"../MIR-1K/MIR-1K/PitchLabel/{track_name}.pv")
    labels.append(pitch_midi_labels)
# annotation_matrix = pitch_to_frame(pitch_midi_labels, num_frames=num_frames, num_pitches=num_classes)

In [41]:
case_pair

['Wavfile/leon_7_03.wav', 'Wavfile/jmzen_5_03.wav']

In [42]:
data = []
annotations = []
for i, vocals in enumerate([leon_7, jmzen_5]):
    # make 1024-sample frames of the audio with hop length of 10 milliseconds. Resample to 16kHz
    vocals = librosa.resample(vocals.cpu().numpy(), orig_sr=8000, target_sr=16000)
    frames = audio_to_frames(torch.tensor(vocals.T))
    data.append(frames)
    num_frames=len(frames)
    annotation_matrix = pitch_to_frame(labels[i], num_frames=num_frames, num_pitches=410)
    annotations.append(annotation_matrix)

torch.Size([679, 411])
410
torch.Size([679, 411])
410


In [44]:
def evaluate(model, data_loader, threshold=0.7, error_range=5):
    model.eval()
    accuracy_all = 0.
    accuracy = 0.
    accuracy_10 = 0.
    accuracy_50 = 0.
    num_gt_pitches = 0
    with torch.no_grad():
        for batch_inputs, batch_labels in data_loader:
            try:
                batch_inputs = torch.reshape(batch_inputs, (batch_inputs.shape[0],1024,1)).to(device)
            except:
                print(batch_inputs.shape)
            batch_labels = batch_labels.to(device)
            batch_outputs = model(batch_inputs).squeeze(dim=1)
            
            # get output prediction indices
            batch_output_arg = torch.argmax(batch_outputs, dim=1)
            
            # Count number of correct predictions (including non-pitch prediction)
            # get labels at predicted indices
            label_values = batch_labels[range(len(batch_output_arg)), batch_output_arg]
            accuracy_all += torch.count_nonzero(label_values)

            # Count number of correct pitch predictions
            pitch_labels = torch.clone(batch_labels)
            pitch_labels[:, -1] = 0
            num_gt_pitches += pitch_labels.sum()
            label_values = pitch_labels[range(len(batch_output_arg)), batch_output_arg]
            accuracy += torch.count_nonzero(label_values)

            # Add error range to labels
            batch_labels_range = torch.clone(pitch_labels)
            
            for i in range(1, error_range+1):
                # Shift label to +- i indices (each index is a 10 cent error range)
                batch_labels_range += pitch_labels.roll(shifts=i, dims=1) 
                batch_labels_range += pitch_labels.roll(shifts=-i, dims=1)
                if i == 1:
                    # Count number of correct pitch predictions with 10-cent error range
                    label_values = batch_labels_range[range(len(batch_output_arg)), batch_output_arg]
                    accuracy_10 += torch.count_nonzero(label_values)

                elif i == 5:
                    # Count number of correct pitch predictions with 50-cent error range
                    label_values = batch_labels_range[range(len(batch_output_arg)), batch_output_arg]
                    accuracy_50 += torch.count_nonzero(label_values)
                
            
    accuracy_all /= len(data_loader.dataset)
    accuracy /= num_gt_pitches
    accuracy_10 /= num_gt_pitches
    accuracy_50 /= num_gt_pitches
    return accuracy_all.item(), accuracy.item(), accuracy_10.item(), accuracy_50.item()

In [45]:
train_mean = 0.3302
train_std  = 0.6109
batch_size = 20

In [46]:
batch_size = 20
# Apply train set statistics to all splits
test_data = torch.clip((data[0] - train_mean) / train_std, min=1e-8, max=None)
test_loader = DataLoader(TensorDataset(test_data, annotations[0]), batch_size=batch_size, shuffle=False)

In [51]:
# evaluate on the case study, with different model sizes
model_size_mult = [4, 8, 16, 24, 32]

# evaluate on the test data
test_metrics = {}
for idx, mult in enumerate(model_size_mult):
    metrics = {
               "accuracy_all": [],
               "accuracy": [],
               "accuracy_10": [],
               "accuracy_50": [],
          
          }
    model = CREPEModel(mult).to(device)
    model.load_state_dict(torch.load(f'../models/best_crepe_{mult}.pkl', map_location=device))
    model.eval()
    
    test_acc_all, test_acc, test_acc_10, test_acc_50 = evaluate(model, test_loader)
    print(f'Num Filters X{mult} - Test accuracy_all: {100*test_acc_all:.2f}%; pitch accuracy: {100*test_acc:.2f}%; pitch accuracy (+-10c): {100*test_acc_10:.2f}%; pitch accuracy (+-50c): {100*test_acc_50:.2f}%')
    metrics['accuracy_all']= 100*test_acc_all
    metrics['accuracy'] = 100*test_acc
    metrics['accuracy_10'] = 100*test_acc_10
    metrics['accuracy_50'] = 100*test_acc_50
    test_metrics[str(mult)] = metrics
    
    del model
    torch.cuda.empty_cache()

  model.load_state_dict(torch.load(f'../models/best_crepe_{mult}.pkl', map_location=device))
  return F.conv2d(


Num Filters X4 - Test accuracy_all: 15.32%; pitch accuracy: 18.90%; pitch accuracy (+-10c): 19.11%; pitch accuracy (+-50c): 27.39%
Num Filters X8 - Test accuracy_all: 22.24%; pitch accuracy: 29.94%; pitch accuracy (+-10c): 30.15%; pitch accuracy (+-50c): 41.19%
Num Filters X16 - Test accuracy_all: 28.57%; pitch accuracy: 34.61%; pitch accuracy (+-10c): 34.82%; pitch accuracy (+-50c): 42.89%
Num Filters X24 - Test accuracy_all: 36.08%; pitch accuracy: 34.39%; pitch accuracy (+-10c): 34.82%; pitch accuracy (+-50c): 52.23%
Num Filters X32 - Test accuracy_all: 39.76%; pitch accuracy: 44.37%; pitch accuracy (+-10c): 45.65%; pitch accuracy (+-50c): 72.82%


In [288]:
# evaluate on the case study, with different model sizes
model_size_mult = [4, 8, 16, 24, 32]

# evaluate on the test data
test_metrics = {}
for idx, mult in enumerate(model_size_mult):
    metrics = {
               "accuracy_all": [],
               "accuracy": [],
               "accuracy_10": [],
               "accuracy_50": [],
          
          }
    model = CREPEModel(mult).to(device)
    model.load_state_dict(torch.load(f'best_crepe_{mult}.pkl'))
    model.eval()
    
    test_acc_all, test_acc, test_acc_10, test_acc_50 = evaluate(model, test_loader)
    print(f'Num Filters X{mult} - Test accuracy_all: {100*test_acc_all:.2f}%; pitch accuracy: {100*test_acc:.2f}%; pitch accuracy (+-10c): {100*test_acc_10:.2f}%; pitch accuracy (+-50c): {100*test_acc_50:.2f}%')
    metrics['accuracy_all']= 100*test_acc_all
    metrics['accuracy'] = 100*test_acc
    metrics['accuracy_10'] = 100*test_acc_10
    metrics['accuracy_50'] = 100*test_acc_50
    test_metrics[str(mult)] = metrics
    
    del model
    torch.cuda.empty_cache()

Num Filters X4 - Test accuracy_all: 15.46%; pitch accuracy: 19.11%; pitch accuracy (+-10c): 19.32%; pitch accuracy (+-50c): 27.60%
Num Filters X8 - Test accuracy_all: 22.24%; pitch accuracy: 29.94%; pitch accuracy (+-10c): 30.15%; pitch accuracy (+-50c): 41.19%
Num Filters X16 - Test accuracy_all: 28.57%; pitch accuracy: 34.61%; pitch accuracy (+-10c): 34.82%; pitch accuracy (+-50c): 42.89%
Num Filters X24 - Test accuracy_all: 36.08%; pitch accuracy: 34.39%; pitch accuracy (+-10c): 34.82%; pitch accuracy (+-50c): 52.23%
Num Filters X32 - Test accuracy_all: 39.91%; pitch accuracy: 44.59%; pitch accuracy (+-10c): 45.86%; pitch accuracy (+-50c): 73.04%


In [294]:
pd.DataFrame(test_metrics).T

Unnamed: 0,accuracy_all,accuracy,accuracy_10,accuracy_50
4,15.463917,19.108281,19.320594,27.600849
8,22.238587,29.936305,30.148619,41.188958
16,28.57143,34.60722,34.819531,42.887473
24,36.082473,34.394905,34.819531,52.229297
32,39.911634,44.585988,45.859873,73.036093


In [289]:
batch_size = 20
# Apply train set statistics to all splits
test_data = torch.clip((data[1] - train_mean) / train_std, min=1e-8, max=None)
test_loader = DataLoader(TensorDataset(test_data, annotations[1]), batch_size=batch_size, shuffle=False)

In [290]:
# evaluate on the test data
jmzen_metrics = {}
for idx, mult in enumerate(model_size_mult):
    metrics = {
               "accuracy_all": [],
               "accuracy": [],
               "accuracy_10": [],
               "accuracy_50": [],
          
          }
    model = CREPEModel(mult).to(device)
    model.load_state_dict(torch.load(f'best_crepe_{mult}.pkl'))
    model.eval()
    
    test_acc_all, test_acc, test_acc_10, test_acc_50 = evaluate(model, test_loader)
    print(f'Num Filters X{mult} - Test accuracy_all: {100*test_acc_all:.2f}%; pitch accuracy: {100*test_acc:.2f}%; pitch accuracy (+-10c): {100*test_acc_10:.2f}%; pitch accuracy (+-50c): {100*test_acc_50:.2f}%')
    metrics['accuracy_all']= 100*test_acc_all
    metrics['accuracy'] = 100*test_acc
    metrics['accuracy_10'] = 100*test_acc_10
    metrics['accuracy_50'] = 100*test_acc_50
    jmzen_metrics[str(mult)] = metrics
    
    del model
    torch.cuda.empty_cache()

Num Filters X4 - Test accuracy_all: 6.19%; pitch accuracy: 6.71%; pitch accuracy (+-10c): 8.10%; pitch accuracy (+-50c): 12.27%
Num Filters X8 - Test accuracy_all: 4.57%; pitch accuracy: 7.18%; pitch accuracy (+-10c): 7.41%; pitch accuracy (+-50c): 11.11%
Num Filters X16 - Test accuracy_all: 21.06%; pitch accuracy: 12.96%; pitch accuracy (+-10c): 13.89%; pitch accuracy (+-50c): 21.30%
Num Filters X24 - Test accuracy_all: 29.31%; pitch accuracy: 22.45%; pitch accuracy (+-10c): 26.62%; pitch accuracy (+-50c): 39.12%
Num Filters X32 - Test accuracy_all: 25.92%; pitch accuracy: 22.22%; pitch accuracy (+-10c): 26.85%; pitch accuracy (+-50c): 46.06%


In [293]:
pd.DataFrame(jmzen_metrics).T

Unnamed: 0,accuracy_all,accuracy,accuracy_10,accuracy_50
4,6.185567,6.712963,8.101851,12.268519
8,4.565538,7.175926,7.407407,11.111111
16,21.060383,12.962963,13.88889,21.296297
24,29.307806,22.453703,26.62037,39.12037
32,25.920472,22.222222,26.851851,46.064815
