<a href="https://colab.research.google.com/github/wolfisberg/zhaw-ba-online/blob/main/crepe/crepe.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
!pip install mir_eval
!pip install rt_pie_lib
import datetime
import math
import os
import shutil

import matplotlib.pyplot as plt
import mir_eval
import numpy as np
import scipy.interpolate
import tensorflow as tf
from rt_pie_lib import converters, metrics

%load_ext tensorboard


from google.colab import drive
drive.mount('/content/drive')

# Config

In [None]:
# Audio
# (min, max) bounds in dB from which the target SNR of each noisy mixture is drawn.
SNR_RANGE = (-5.0,20.0) #dB
# Frame size and hop size, in samples, used to slice the signals into frames.
FRAME_LENGTH = 1024
FRAME_STEP = 512
# Bounds of the random gain applied to the noisy mixture (sampled log-uniformly).
MIN_RAND_GAIN = 0.05
MAX_RAND_GAIN = 1.1
# Length of the random crop in seconds (multiplied by FS to get samples);
# must be shorter than the shortest noise/speech sample.
SAMPLE_LENGTH = 3 #shorter than shortest noise/speech sample
# Samples per second; used to convert between sample indices and seconds.
FS = 16000
# Time grid of the stored pitch track: one value every PITCH_SAMPLING_TIME
# seconds, offset by half of PITCH_FRAME_LENGTH.
PITCH_SAMPLING_TIME = 0.01 # s
PITCH_FRAME_LENGTH = 0.032 # s


# Data
BATCH_SIZE = 32
# Number of frames obtained from one SAMPLE_LENGTH-second crop.
NUM_FRAMES = 1 + (FS * SAMPLE_LENGTH - FRAME_LENGTH) // FRAME_STEP
# NUM_FRAMES = 1

# Training
STEPS_PER_EPOCH = 500
EPOCHS = 100
VALIDATION_STEPS = 5


# Directories
# Root folder on the mounted Google Drive; TFRecords live in its 'tfrecords' sub-folder.
_DATA_DIR = os.path.join('/content/drive/MyDrive/BA_2021/')
_TFRECORDS_DIR = os.path.join(_DATA_DIR, 'tfrecords')

# One speech and one noise directory per split: tr / cv / tt
# (used for the training / validation / test datasets respectively).
SPEECH_DATA_TR_DIR = os.path.join(_TFRECORDS_DIR, 'speech', 'tr')
NOISE_DATA_TR_DIR = os.path.join(_TFRECORDS_DIR, 'noise', 'tr')
SPEECH_DATA_CV_DIR = os.path.join(_TFRECORDS_DIR, 'speech', 'cv')
NOISE_DATA_CV_DIR = os.path.join(_TFRECORDS_DIR, 'noise', 'cv')
SPEECH_DATA_TT_DIR = os.path.join(_TFRECORDS_DIR, 'speech', 'tt')
NOISE_DATA_TT_DIR = os.path.join(_TFRECORDS_DIR, 'noise', 'tt')

# Fixed once when this cell runs; used to name the log/checkpoint folders of a run.
TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")


# Misc
# Seed passed to the dataset shuffle buffers.
SEED = 2


# Parsing
# Feature specs for tf.io.parse_single_example. Every field is declared
# variable-length; the parsers read the first value of the scalar fields.
PARSING_CONFIG_NOISE = {
    'data': tf.io.VarLenFeature(tf.string),
    'data_sampling_rate': tf.io.VarLenFeature(tf.int64),
    'data_num_channels': tf.io.VarLenFeature(tf.int64),
    'data_width': tf.io.VarLenFeature(tf.int64),
}

# Speech records additionally carry a pitch track and its confidence.
PARSING_CONFIG_SPEECH = {
    'data': tf.io.VarLenFeature(tf.string),
    'data_sampling_rate': tf.io.VarLenFeature(tf.int64),
    'data_num_channels': tf.io.VarLenFeature(tf.int64),
    'data_width': tf.io.VarLenFeature(tf.int64),
    'pitch': tf.io.VarLenFeature(tf.float32),
    'pitch_confidence': tf.io.VarLenFeature(tf.float32),
}



In [None]:
print(NOISE_DATA_TR_DIR)

# Data

## Copy Data to Runtime

In [None]:
DATA_DIR_LOCAL = '/content/data'
RECORD_DIR_LOCAL = os.path.join(DATA_DIR_LOCAL, 'tfrecords')

# Always copy from the Drive location. _TFRECORDS_DIR is repointed to the local
# copy below, so it cannot be used as the source when this cell is re-run.
_TFRECORDS_DIR_DRIVE = os.path.join(_DATA_DIR, 'tfrecords')

# Guard on the copied folder itself (not its parent) so that a missing copy is
# retried even when /content/data already exists.
if not os.path.exists(RECORD_DIR_LOCAL):
    os.makedirs(DATA_DIR_LOCAL, exist_ok=True)
    shutil.copytree(_TFRECORDS_DIR_DRIVE, RECORD_DIR_LOCAL)

_TFRECORDS_DIR = RECORD_DIR_LOCAL

# The split directories were derived from the Drive path in the config cell;
# re-derive them so the datasets actually read the local copy.
SPEECH_DATA_TR_DIR = os.path.join(_TFRECORDS_DIR, 'speech', 'tr')
NOISE_DATA_TR_DIR = os.path.join(_TFRECORDS_DIR, 'noise', 'tr')
SPEECH_DATA_CV_DIR = os.path.join(_TFRECORDS_DIR, 'speech', 'cv')
NOISE_DATA_CV_DIR = os.path.join(_TFRECORDS_DIR, 'noise', 'cv')
SPEECH_DATA_TT_DIR = os.path.join(_TFRECORDS_DIR, 'speech', 'tt')
NOISE_DATA_TT_DIR = os.path.join(_TFRECORDS_DIR, 'noise', 'tt')

## Process Data

In [None]:
def _parse_noise_record(serialized_example):
    """Decode one serialized noise example into a dict of tensors.

    The scalar metadata fields are taken from the first value of their
    variable-length feature and cast to int32; the raw audio bytes are
    decoded as int16 samples.
    """
    parsed = tf.io.parse_single_example(serialized_example, features=PARSING_CONFIG_NOISE)

    def _first_as_int32(key):
        # Scalar metadata is stored as a one-element variable-length feature.
        return tf.cast(parsed[key].values[0], tf.int32)

    return {
        "data_num_channels": _first_as_int32("data_num_channels"),
        "data_sampling_rate": _first_as_int32("data_sampling_rate"),
        "data_width": _first_as_int32("data_width"),
        "data": tf.io.decode_raw(parsed['data'].values[0], tf.int16),
    }


def _parse_speech_record(serialized_example):
    """Decode one serialized speech example into a dict of tensors.

    In addition to the audio samples and scalar metadata (same layout as the
    noise records), the result carries the full 'pitch' and
    'pitch_confidence' value arrays as float32.
    """
    parsed = tf.io.parse_single_example(serialized_example, features=PARSING_CONFIG_SPEECH)

    def _first_as_int32(key):
        # Scalar metadata is stored as a one-element variable-length feature.
        return tf.cast(parsed[key].values[0], tf.int32)

    def _all_as_float32(key):
        return tf.cast(parsed[key].values, tf.float32)

    return {
        "data_num_channels": _first_as_int32("data_num_channels"),
        "data_sampling_rate": _first_as_int32("data_sampling_rate"),
        "data_width": _first_as_int32("data_width"),
        "pitch": _all_as_float32('pitch'),
        "pitch_confidence": _all_as_float32('pitch_confidence'),
        "data": tf.io.decode_raw(parsed['data'].values[0], tf.int16),
    }


def _mix_noisy_speech(speech, noise):
    """Scale `noise` to a random target SNR and add it to `speech`.

    The target SNR (dB) is drawn uniformly from SNR_RANGE. The current SNR is
    computed from the Euclidean norms of both signals, and the noise is
    rescaled by the difference.

    Returns:
        (speech, scaled_noise, speech + scaled_noise)
    """
    snr_low, snr_high = SNR_RANGE

    speech_norm = tf.math.reduce_euclidean_norm(speech)
    noise_norm = tf.math.reduce_euclidean_norm(noise)

    # 20 * log10(ratio), written with natural logs.
    current_snr_db = 20.0*tf.math.log(speech_norm/noise_norm)/tf.math.log(10.0)
    target_snr_db = tf.random.uniform((), minval=snr_low, maxval=snr_high)

    scaled_noise = noise * tf.math.pow(10.0, (current_snr_db-target_snr_db)/20.0)
    return speech, scaled_noise, speech + scaled_noise


def _interpolate_pitch(pitch,t):
    """Look up the pitch track at the times `t` using nearest-neighbour interpolation.

    Intended to run eagerly (it is wrapped with tf.py_function), so both
    arguments expose `.numpy()`.

    Args:
        pitch: pitch values; value k is placed at
            k * PITCH_SAMPLING_TIME + PITCH_FRAME_LENGTH / 2 seconds.
        t: query times in seconds.

    Returns:
        float32 NumPy array with one pitch value per query time.

    Raises:
        ValueError: raised by scipy's interp1d when a query time lies outside
            the time span covered by the pitch track.
    """
    # Work on plain NumPy arrays; the original converted the pitch tensor but
    # then kept passing the tensor itself to SciPy.
    pitch_values = pitch.numpy()
    query_times = t.numpy()
    pitch_times = np.arange(0, len(pitch_values)) * PITCH_SAMPLING_TIME + PITCH_FRAME_LENGTH / 2
    interpolate = scipy.interpolate.interp1d(pitch_times, pitch_values, 'nearest')
    return interpolate(query_times).astype(np.float32)

def convert_hz_to_cent(f,fref=10.0):
    """Convert frequencies in Hz to cents relative to `fref` via mir_eval.

    NOTE: a later cell redefines this name with a pure-NumPy formula.
    """
    frequencies_hz = np.array(f)
    return mir_eval.melody.hz2cents(frequencies_hz, fref)

def calc_bin(freq_cent, cents_per_bin = 20, lower_bound_freq=32.7):
    """Map pitch values in cents to (fractional) bin positions.

    Bin 0 sits at `lower_bound_freq` (Hz, converted to cents with mir_eval's
    default reference). The result is clipped to the range [0, 359].

    NOTE: a later cell redefines this name with a variant that takes Hz.
    """
    lowest_cent = mir_eval.melody.hz2cents(np.array([lower_bound_freq]))
    offset_cent = np.squeeze(freq_cent) - lowest_cent
    bin_position = offset_cent / np.array([cents_per_bin])
    return np.clip(bin_position, 0, 359)

def calc_y(f_groundtruth, n_bins = 360):
    """Turn ground-truth pitch values into blurred target vectors.

    Despite its name, `f_groundtruth` is forwarded unchanged to calc_bin, which
    treats it as cents. `n_bins` is accepted but not used.
    """
    bin_positions = calc_bin(f_groundtruth)
    targets = create_bin_vector(bin_positions)
    return targets

def create_bin_vector(c_true):
    """Build one 360-entry Gaussian-blurred vector per entry of `c_true`.

    The per-entry vectors are stacked and singleton dimensions are squeezed out.
    """
    bin_indices = np.arange(360)
    rows = []
    for centre in c_true:
        rows.append(gaussian_blur(bin_indices, centre))
    return np.squeeze(rows)
    
def gaussian_blur(ci, ctrue):
    """Unnormalised Gaussian of `ci` around `ctrue` with a standard deviation of 25.

    The width is expressed in the units of `ci`/`ctrue`.
    NOTE(review): create_bin_vector passes bin indices, so the width is 25 bins there - confirm this is intended.
    """
    sigma = 25.0
    distance = ci - ctrue
    return np.exp(-distance**2 / (2.0 * sigma**2))

@tf.function
def _interpolate_pitch_tf(pitch,t):
    """Graph-compatible wrapper: runs _interpolate_pitch through tf.py_function."""
    interpolated = tf.py_function(func=_interpolate_pitch, inp=[pitch, t], Tout=tf.float32)
    return tf.squeeze(interpolated)

@tf.function
def _convert_hz_to_cent(pitch):
    """Graph-compatible wrapper: runs convert_hz_to_cent through tf.py_function."""
    cents = tf.py_function(func=convert_hz_to_cent, inp=[pitch], Tout=tf.float32)
    return tf.squeeze(cents)

@tf.function
def _calc_y(pitch_cents):
    """Graph-compatible wrapper: runs calc_y through tf.py_function."""
    targets = tf.py_function(func=calc_y, inp=[pitch_cents], Tout=tf.float32)
    return tf.squeeze(targets)

def _calc_features(speech_data, noise_data):
    """Build one training example from a parsed speech record and a parsed noise record.

    Steps: scale both int16 signals by tf.int16.max, take a random
    SAMPLE_LENGTH * FS sample crop from each, mix them at a random SNR, apply a
    random gain to the mixture, frame it, and derive the pitch target for every
    frame from the speech record's pitch track.

    Args:
        speech_data: dict as produced by _parse_speech_record.
        noise_data: dict as produced by _parse_noise_record.

    Returns:
        (noisy_frames, pitch_bins): the framed noisy mixture and the blurred
        pitch-bin targets computed by _calc_y for the frame times.
    """
    speech = tf.squeeze(tf.cast(speech_data["data"], tf.float32))
    noise = tf.squeeze(tf.cast(noise_data["data"], tf.float32))
    speech = speech / tf.int16.max
    noise = noise / tf.int16.max

    # Random crop of the noise signal.
    random_start_idx = int(tf.round(tf.random.uniform([], maxval=(
             tf.cast(len(noise), tf.float32) - SAMPLE_LENGTH * FS - PITCH_SAMPLING_TIME))))
    noise = noise[random_start_idx:random_start_idx + SAMPLE_LENGTH * FS]

    # Random crop of the speech signal, keeping a 161-sample margin at both ends.
    # random_start_idx is reassigned here, so from now on it refers to the
    # speech crop; the frame times below rely on that.
    random_start_idx = int(tf.round(tf.random.uniform([], minval=161, maxval=(
            tf.cast(len(speech), tf.float32) - SAMPLE_LENGTH * FS - 161))))
    speech = speech[random_start_idx:random_start_idx + SAMPLE_LENGTH * FS]   

    #SNR_range = SNR_RANGE
    frame_length = FRAME_LENGTH
    frame_step = FRAME_STEP
    speech, noise, noisy = _mix_noisy_speech(speech, noise)

    # Gain is sampled uniformly in the log domain between MIN_RAND_GAIN and MAX_RAND_GAIN.
    random_gain = tf.math.exp(
        tf.random.uniform([], minval=tf.math.log(MIN_RAND_GAIN), maxval=tf.math.log(MAX_RAND_GAIN)))
    noisy = random_gain * noisy

    noisy_frames = tf.signal.frame(noisy, frame_length, frame_step)
    speech_frames = tf.signal.frame(speech, frame_length, frame_step)
    noisy_frames = tf.squeeze(noisy_frames)
    speech_frames = tf.squeeze(speech_frames)
    #noisy_stft = tf.signal.stft(noisy,frame_length,frame_step)
    # Time (seconds, relative to the start of the uncropped speech signal) at
    # which the pitch is looked up for frame k:
    # crop start + k * frame_step + one full frame_length.
    frame_times = random_start_idx / FS + tf.range(0, NUM_FRAMES) * frame_step / FS + frame_length / FS
    
    pitch = tf.squeeze(speech_data["pitch"])    
    # pitch_confidence is read but not used below (the thresholding line is commented out).
    pitch_confidence = tf.squeeze(speech_data["pitch_confidence"])
    #pitch = tf.where(pitch_confidence>config['pitch_confidence_threshold'],pitch,0)
    pitch_interpolated = _interpolate_pitch_tf(pitch, frame_times)
    pitch_interpolated_cents = _convert_hz_to_cent(pitch_interpolated)
    pitch_bins = _calc_y(pitch_interpolated_cents)
    return noisy_frames, pitch_bins

## Provide Data

In [None]:
def _build_feature_dataset(speech_dir, noise_dir, repeat_count=None):
    """Build the (noisy_frames, pitch_bins) dataset for one data split.

    Args:
        speech_dir: directory containing the speech TFRecord files.
        noise_dir: directory containing the noise TFRecord files.
        repeat_count: number of passes over the records; None repeats forever.

    Returns:
        A tf.data.Dataset of per-frame examples in batches of BATCH_SIZE.
    """
    def _load_records(record_dir, parse_fn):
        # Sort the listing: os.listdir returns entries in arbitrary order,
        # which would make the seeded shuffle below non-reproducible.
        files = [os.path.join(record_dir, name) for name in sorted(os.listdir(record_dir))]
        records = tf.data.TFRecordDataset(files)
        return records.map(parse_fn).repeat(repeat_count).shuffle(buffer_size=1000, seed=SEED)

    speech_ds = _load_records(speech_dir, _parse_speech_record)
    noise_ds = _load_records(noise_dir, _parse_noise_record)

    dataset_combined = tf.data.Dataset.zip((speech_ds, noise_ds))
    dataset_features = dataset_combined.map(_calc_features, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset_features = dataset_features.batch(BATCH_SIZE, drop_remainder=True).prefetch(tf.data.experimental.AUTOTUNE)
    # just use if crepe without time component:
    # split the batches back into single frames, reshuffle and re-batch them.
    dataset_features = dataset_features.unbatch().unbatch().shuffle(3000).batch(BATCH_SIZE)
    return dataset_features


def get_training_data():
    """Endlessly repeating dataset built from the 'tr' speech/noise records."""
    return _build_feature_dataset(SPEECH_DATA_TR_DIR, NOISE_DATA_TR_DIR)


def get_validation_data():
    """Endlessly repeating dataset built from the 'cv' speech/noise records."""
    return _build_feature_dataset(SPEECH_DATA_CV_DIR, NOISE_DATA_CV_DIR)


def get_test_data():
    """Finite dataset built from the 'tt' speech/noise records (10 passes)."""
    return _build_feature_dataset(SPEECH_DATA_TT_DIR, NOISE_DATA_TT_DIR, repeat_count=10)

# Models

## CREPE

## Crepe with time component

In [None]:
from tensorflow.keras.layers import Input, Reshape, Conv2D, BatchNormalization
from tensorflow.keras.layers import MaxPool2D, Dropout, Permute, Flatten, Dense
from tensorflow.keras.models import Model


def get_model_crepe():
    """Build and compile the CREPE-style model that keeps the time axis.

    Input is a (184, 1024) float32 array. Six conv blocks
    (Conv2D -> BatchNorm -> MaxPool -> Dropout) operate along the second axis
    only, followed by a Dense(1) regression head. Compiled with Adam and MSE loss.

    NOTE: a later cell defines another function with the same name; whichever
    cell ran last determines what get_model_crepe() returns.
    """
    n_frames, n_samples = 184, 1024
    conv_specs = zip(
        range(1, 7),                                    # layer number
        [32 * scale for scale in (32, 4, 4, 4, 8, 16)],  # filters
        [512, 64, 64, 64, 64, 64],                      # kernel width
        [(1, 4)] + [(1, 1)] * 5,                        # strides
    )

    net_input = Input(shape=(n_frames, n_samples), name='input', dtype='float32')
    net = Reshape(target_shape=(n_frames, n_samples, 1), name='input-reshape')(net_input)

    for index, n_filters, kernel_width, stride in conv_specs:
        net = Conv2D(n_filters, (1, kernel_width), strides=stride, padding='same',
                     activation='relu', name="conv%d" % index)(net)
        net = BatchNormalization(name="conv%d-BN" % index)(net)
        net = MaxPool2D(pool_size=(1, 2), strides=None, padding='valid',
                        name="conv%d-maxpool" % index)(net)
        net = Dropout(0.25, name="conv%d-dropout" % index)(net)

    net = Reshape(target_shape=(n_frames, 2048), name='output-reshape')(net)
    net = Dense(1, name="classifier")(net)

    model = Model(inputs=net_input, outputs=net)
    model.compile('adam', 'mse', metrics=['mse', 'mae'])
    return model


## Crepe without time component

In [None]:
from tensorflow.keras.layers import Input, Reshape, Conv2D, BatchNormalization
from tensorflow.keras.layers import MaxPool2D, Dropout, Permute, Flatten, Dense
from tensorflow.keras.models import Model

def get_model_crepe():
    """Build and compile the per-frame CREPE-style classifier.

    Input is a single frame of FRAME_LENGTH samples. Six conv blocks
    (Conv2D -> BatchNorm -> MaxPool -> Dropout) are followed by a 360-unit
    sigmoid layer. Compiled with Adam and binary cross-entropy.

    NOTE: this redefines the get_model_crepe from the previous cell.
    """
    conv_specs = zip(
        range(1, 7),                                    # layer number
        [32 * scale for scale in (32, 4, 4, 4, 8, 16)],  # filters
        [512, 64, 64, 64, 64, 64],                      # kernel height
        [(4, 1)] + [(1, 1)] * 5,                        # strides
    )

    net_input = Input(shape=(FRAME_LENGTH,), name='input', dtype='float32')
    net = Reshape(target_shape=(FRAME_LENGTH, 1, 1), name='input-reshape')(net_input)

    for index, n_filters, kernel_height, stride in conv_specs:
        net = Conv2D(n_filters, (kernel_height, 1), strides=stride, padding='same',
                     activation='relu', name="conv%d" % index)(net)
        net = BatchNormalization(name="conv%d-BN" % index)(net)
        net = MaxPool2D(pool_size=(2, 1), strides=None, padding='valid',
                        name="conv%d-maxpool" % index)(net)
        net = Dropout(0.25, name="conv%d-dropout" % index)(net)

    net = Permute((2, 1, 3), name="transpose")(net)
    net = Flatten(name="flatten")(net)
    net = Dense(360, activation='sigmoid', name="classifier")(net)

    model = Model(inputs=net_input, outputs=net)
    model.compile('adam', 'binary_crossentropy', metrics=['mse', 'mae'])
    return model

# Training

## Load Data

In [None]:
dataset_training = get_training_data()
dataset_validation = get_validation_data()
dataset_test = get_test_data()

## Load Model

In [None]:
model = get_model_crepe()

In [None]:
model.summary()

## Fit Model

In [None]:
%tensorboard --logdir /content/drive/MyDrive/BA_2021/crepe/logs

In [None]:
# USE IF IT IS INITIAL TRAINING

MODEL_USED = 'crepe'
# Label the run with the framing and epoch settings that are actually configured,
# so the folder name cannot drift away from FRAME_LENGTH / FRAME_STEP / EPOCHS.
RUN_NAME = f'{TIMESTAMP}_{FRAME_LENGTH}_{FRAME_STEP}_{EPOCHS}_Epochs'
LOG_DIR = os.path.join(_DATA_DIR, MODEL_USED, 'logs', RUN_NAME)
CHECKPOINT_DIR = os.path.join(_DATA_DIR, MODEL_USED, 'checkpoints', RUN_NAME)
for run_dir in (LOG_DIR, CHECKPOINT_DIR):
    os.makedirs(run_dir, exist_ok=True)

In [None]:
# JUST USE IF CONTINUING TRAINING

CHECKPOINT_DIR = '/content/drive/MyDrive/BA_2021/crepe/checkpoints/20210427-145400'
# The training cell reads LOG_DIR, so that is the name that has to be set here.
LOG_DIR = '/content/drive/MyDrive/BA_2021/crepe/logs/20210427-145400'
LOGDIR = LOG_DIR  # previous (misspelled) name, kept as an alias
model.load_weights(os.path.join(CHECKPOINT_DIR, '50-2063.93.hdf5'))

In [None]:
# Callbacks: TensorBoard logging, a checkpoint after every epoch (file name
# carries epoch number and validation loss), and early stopping on val_loss.
tensorboard_callback = tf.keras.callbacks.TensorBoard(LOG_DIR, histogram_freq=1)
checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath=os.path.join(CHECKPOINT_DIR,'{epoch:02d}-{val_loss:.2f}.hdf5'))
early_stopping =  tf.keras.callbacks.EarlyStopping(monitor='val_loss', min_delta=0, patience=32, mode='min')

callbacks = [checkpoint, tensorboard_callback, early_stopping]


# The training dataset repeats forever, so the epoch length is set explicitly.
history = model.fit(
    dataset_training,
    steps_per_epoch=STEPS_PER_EPOCH,
    epochs=EPOCHS,  # taken from the config cell instead of a second hard-coded value
    # initial_epoch=30,
    verbose = 1,
    validation_data = dataset_validation,
    validation_steps=VALIDATION_STEPS,
    callbacks = callbacks)

# Quick sanity check on a single test batch.
loss = model.evaluate(dataset_test, steps=1)

# Metrics

## Load Model from Drive

In [None]:
# 1024 / 512
model.load_weights(os.path.join('/content/drive/MyDrive/BA_2021/crepe/checkpoints', '20210502-200638_1024_512_100_Epochs', '100-0.19.hdf5'))

In [None]:
# 512 / 256
model.load_weights(os.path.join('/content/drive/MyDrive/BA_2021/crepe/checkpoints', '20210503-140919_512_256_100_Epochs', '100-0.21.hdf5'))

In [None]:
# 256 / 128 
model.load_weights(os.path.join('/content/drive/MyDrive/BA_2021/crepe/checkpoints', '20210503-193459_256_128_100_Epochs', '84-0.22.hdf5'))

## Predictions

In [None]:
converters.convert_hz_to_cent(np.array([32.7]))

In [None]:
from scipy.signal import argrelextrema
def convert_bin_to_local_average_cents(salience, center=None):
    """Convert a salience vector over 360 bins into a pitch value in cents.

    The result is the salience-weighted average of the bin centres in a window
    of up to nine bins (four on either side) around `center`.

    Args:
        salience: 1-D array (one frame) or 2-D array (one row per frame).
        center: bin index around which to average a 1-D input. Defaults to the
            arg-max of `salience`. Ignored for 2-D input, where every row uses
            its own arg-max.

    Returns:
        A scalar for 1-D input, a 1-D array for 2-D input. The value is NaN
        when all salience values inside the window are zero.

    Raises:
        Exception: if `salience` is neither 1-D nor 2-D.
    """
    if not hasattr(convert_bin_to_local_average_cents, 'cents_mapping'):
        # the bin number-to-cents mapping: 20-cent steps starting at
        # 2051.14876287 cents, i.e. 32.7 Hz relative to 10 Hz
        convert_bin_to_local_average_cents.cents_mapping = (
                np.linspace(0, 7180, 360) + 2051.14876287)
    cents_mapping = convert_bin_to_local_average_cents.cents_mapping

    if salience.ndim == 1:
        # Honour an explicitly requested centre; it used to be overwritten unconditionally.
        if center is None:
            center = int(np.argmax(salience))
        start = max(0, center - 4)
        end = min(len(salience), center + 5)
        window = salience[start:end]
        product_sum = np.sum(window * cents_mapping[start:end])
        weight_sum = np.sum(window)
        return product_sum / weight_sum
    if salience.ndim == 2:
        return np.array([convert_bin_to_local_average_cents(salience[i, :]) for i in
                         range(salience.shape[0])])
    raise Exception("Label should be either 1d or 2d ndarray.")


def convert_bin_to_local_average_cents_lowest_maxima(salience, center=None, maxima_order=5, maxima_minval=0.2, tolerance=0.1):
    """Like convert_bin_to_local_average_cents, with a fallback for weak bin-0 peaks.

    When the arg-max is bin 0 but its salience is below 0.8, the relative
    maxima of the salience curve are searched instead. Among those lying in
    bins 51..217 the highest bin index is selected and the salience is replaced
    by a one-hot vector at that bin; if there is no such maximum, bin 0 is used.
    NOTE(review): the name says "lowest" maxima but the highest in-range index is picked - confirm intent.

    Args:
        salience: 1-D array (one frame) or 2-D array (one row per frame).
        center: forwarded to convert_bin_to_local_average_cents for 1-D input.
        maxima_order: `order` argument of scipy.signal.argrelextrema.
        maxima_minval: currently unused.
        tolerance: currently unused.

    Returns:
        A scalar for 1-D input, a 1-D array for 2-D input.

    Raises:
        Exception: if `salience` is neither 1-D nor 2-D.
    """
    if salience.ndim == 1:
        if np.argmax(salience) == 0 and salience[0] < 0.8:
            local_maxima = argrelextrema(salience, np.greater, order=maxima_order)[0]
            candidates = [int(idx) for idx in local_maxima if 51 <= idx <= 217]
            # No usable maximum (including "no relative maxima at all", which
            # used to crash inside np.max) falls back to bin 0.
            peak_bin = int(np.max(candidates)) if candidates else 0
            salience = __create_maximum_bin(peak_bin)
        return convert_bin_to_local_average_cents(salience, center=center)

    if salience.ndim == 2:
        # Forward the tuning parameters so every row is processed with the caller's settings.
        return np.array([
            convert_bin_to_local_average_cents_lowest_maxima(
                salience[i, :], maxima_order=maxima_order,
                maxima_minval=maxima_minval, tolerance=tolerance)
            for i in range(salience.shape[0])])

    raise Exception("Label should be either 1d or 2d ndarray.")


def __create_maximum_bin(index):
    """Return a 360-entry vector of zeros with a 1 at position `index`."""
    one_hot = np.zeros(360)
    one_hot[index] = 1
    return one_hot

def __try_find_f0_in_maxima_true_negativs(maxima):
    """Unfinished helper: sorts `maxima` in place by each entry's second element.

    The loop only reads values and nothing is returned.
    """
    maxima.sort(key=lambda entry: entry[1])
    for entry in maxima[:-1]:
        max_current = entry[1]


def __try_find_f0_in_maxima(maxima, tolerance=0.1):
    """Search for two neighbouring maxima where the second is about twice the first.

    `maxima` is sorted in place by each entry's second element. Neighbouring
    pairs are then compared: the pair matches when twice the lower value
    differs from the higher value by at most `tolerance`, relative to the
    higher value.

    Returns:
        (True, index_of_lower_entry) for the first matching pair,
        otherwise (False, None).
    """
    maxima.sort(key=lambda entry: entry[1])
    for position, (lower, upper) in enumerate(zip(maxima, maxima[1:])):
        deviation = abs(lower[1] * 2 - upper[1]) / upper[1]
        if deviation <= tolerance:
            return True, position
    return False, None

In [None]:
def convert_hz_to_cent(f,fref=10.0):
    """Convert a frequency in Hz to cents relative to `fref` (1200 cents per octave).

    NOTE: this redefines the mir_eval-based convert_hz_to_cent from the data-processing cell.
    """
    ratio = f/fref
    return 1200.0*np.log2(ratio)

def convert_cent_to_hz(c,fref=10.0):
    """Convert a value in cents relative to `fref` back to a frequency in Hz."""
    octaves = c/1200.0
    return fref*2**octaves
def calc_bin(freq, cents_per_bin = 20, lower_bound_freq=32.7):
    """Map a frequency in Hz to its (fractional, unclipped) bin position.

    NOTE: this redefines the calc_bin from the data-processing cell, which
    takes cents and clips the result.
    """
    offset_cent = convert_hz_to_cent(freq)-convert_hz_to_cent(lower_bound_freq)
    return offset_cent/cents_per_bin
def create_bin(c_true):
    """Return the 360-entry Gaussian-blurred vector centred on bin position `c_true`."""
    bin_indices = np.arange(360)
    return gaussian_blur(bin_indices, c_true)

def gauss_blur(ci, ctrue):
    """Unnormalised Gaussian of `ci` around `ctrue` with a standard deviation of 25.

    Same formula as gaussian_blur in the data-processing cell.
    """
    sigma = 25.0
    distance = ci - ctrue
    return np.exp(-distance**2 / (2.0 * sigma**2))

In [None]:
bin = calc_bin(np.array([60]))
bin

In [None]:
vector = np.load('crepe_pred_vector_1024_512.npy')
vector_gt = np.load('crepe_true_vector_1024_512.npy')

In [None]:
# Re-derive the predicted pitch in Hz from the saved salience vectors using the
# relative-maxima fallback. `math` and `converters` come from the import cell.
true_hz_new = np.array([])
pred_hz_new = []

for i, salience in enumerate(vector):
    pred = converters.convert_cent_to_hz(convert_bin_to_local_average_cents_lowest_maxima(salience))
    if math.isnan(pred):
        # Report frames whose salience window summed to zero.
        print(i)
    pred_hz_new.append(pred)
    

    



In [None]:
diff_new = true_hz - pred_hz_new

In [None]:
plt.figure()
plt.plot(vector[402])
plt.plot(vector_gt[402])
plt.show()

In [None]:
#x = argrelextrema(vector[8756], np.greater, order= 10)
x = convert_bin_to_local_average_cents_lowest_maxima(vector[402])
x

In [None]:
## Hertz filter
# NOTE(review): this cell uses true_hz, diff, true_cent and predicted_cent
# (assigned further down from prediction()) and histogram (defined in the
# "Plots" section), so it fails on a fresh top-to-bottom run.
combined = zip(true_hz, pred_hz_new)
# No Hz filtering here: every (true, predicted) pair is kept.
filtered = combined
# filtered = [x for x in list(combined) if x[0] > 60 and x[0] < 400 and x[1] > 60 and x[1] < 400]
# Row 0 holds the true values, row 1 the predicted values.
filtered_unzipped = np.array(list(zip(*filtered)))
diff_filtered = filtered_unzipped[0] - filtered_unzipped[1]
print(len(diff), len(diff_filtered))
## Cent filter
combined_cent = zip(true_cent, predicted_cent)
# Keep pairs where both values exceed 3101.95500087 cents (60 Hz relative to 10 Hz).
filtered_cent = [x for x in list(combined_cent) if x[0] > 3101.95500087 and x[1] > 3101.95500087]
filtered_c_unzipped = np.array(list(zip(*filtered_cent)))
diff_filtered_cent = filtered_c_unzipped[0] - filtered_c_unzipped[1]
plt.rcParams.update({'font.size': 22})
from rt_pie_lib import metrics
hz_metrics = metrics.get_hz_metrics(filtered_unzipped[0], filtered_unzipped[1], print_output=True)
rpa_cent = metrics.raw_pitch_accuracy_cent(filtered_c_unzipped[0], filtered_c_unzipped[1])
print(rpa_cent)
hist = histogram(diff_filtered)

In [None]:
from rt_pie_lib import converters
def prediction():
    """Run the model over `dataset_test` and collect predictions and targets.

    Batches are concatenated along the first axis, so a smaller final batch is
    handled correctly.

    Returns:
        (predicted_hz, true_hz, true_c, predicted_c, diff, inp_vector, predicted_vector)
        where `true_c` / `predicted_c` are pitches in cents, `*_hz` the same in
        Hz, `diff` is true_hz - predicted_hz, `inp_vector` holds the target
        vectors and `predicted_vector` the model outputs (both N x 360).
    """
    target_batches = []
    predicted_batches = []
    true_cent_batches = []
    predicted_cent_batches = []
    for inp, outp in dataset_test:
        predicted = model.predict(inp)
        predicted_batches.append(np.asarray(predicted).reshape(-1, 360))
        target_batches.append(np.asarray(outp).reshape(-1, 360))
        # atleast_1d: a one-example batch yields a scalar from the converters.
        true_cent_batches.append(
            np.atleast_1d(converters.convert_bin_to_local_average_cents(outp)))
        #predicted_cents = convert_bin_to_local_average_cents_lowest_maxima(np.squeeze(predicted))
        predicted_cent_batches.append(
            np.atleast_1d(convert_bin_to_local_average_cents(np.squeeze(predicted))))

    predicted_vector = np.concatenate(predicted_batches)
    inp_vector = np.concatenate(target_batches)

    true_c = np.concatenate(true_cent_batches)
    true_hz = converters.convert_cent_to_hz(true_c)
    predicted_c = np.concatenate(predicted_cent_batches)
    predicted_hz = converters.convert_cent_to_hz(predicted_c)
    diff = true_hz - predicted_hz
    return predicted_hz, true_hz, true_c, predicted_c, diff, inp_vector, predicted_vector

In [None]:
predicted_hz, true_hz, true_cent, predicted_cent, diff, inp_vector, predicted_vector = prediction()

In [None]:
np.save('crepe_predicted_hz_1024_512.npy', predicted_hz)
np.save('crepe_true_hz_1024_512.npy', true_hz)
np.save('crepe_true_vector_1024_512.npy', inp_vector)
np.save('crepe_pred_vector_1024_512.npy', predicted_vector)
np.save('crepe_diff_1024_512.npy', diff)


### Filter prediction

In [None]:
## Hertz filter
# Pair the ground truth with the post-processed predictions. The original
# referenced `predicted_hz_new`, which is never defined; the post-processing
# cell stores its result in `pred_hz_new`.
combined = zip(true_hz, pred_hz_new)
# Keep only pairs where both values lie strictly between 60 Hz and 400 Hz.
filtered = [x for x in list(combined) if x[0] > 60 and x[0] < 400 and x[1] > 60 and x[1] < 400]
# Row 0 holds the true values, row 1 the predicted values.
filtered_unzipped = np.array(list(zip(*filtered)))
diff_filtered = filtered_unzipped[0] - filtered_unzipped[1]
print(len(diff), len(diff_filtered))
## Cent filter
combined_cent = zip(true_cent, predicted_cent)
# Keep pairs where both values exceed 3101.95500087 cents (60 Hz relative to 10 Hz).
filtered_cent = [x for x in list(combined_cent) if x[0] > 3101.95500087 and x[1] > 3101.95500087]
filtered_c_unzipped = np.array(list(zip(*filtered_cent)))
diff_filtered_cent = filtered_c_unzipped[0] - filtered_c_unzipped[1]

## Zero Pitch Analysis

In [None]:
# Confusion counts for "zero pitch" detection. A value in [30, 35] Hz counts as
# "zero" (presumably because bin 0 maps to the 32.7 Hz lower bound - verify);
# anything above 35 Hz counts as pitched. Pairs with a value below 30 Hz (or
# NaN) fall into no category, exactly as in the per-sample loop this replaces.
true_arr = np.asarray(true_hz, dtype=float)
pred_arr = np.asarray(pred_hz_new, dtype=float)[:len(true_arr)]

true_is_zero = (true_arr >= 30) & (true_arr <= 35)
pred_is_zero = (pred_arr >= 30) & (pred_arr <= 35)
true_is_pitched = true_arr > 35
pred_is_pitched = pred_arr > 35

tp = int(np.count_nonzero(true_is_zero & pred_is_zero))
tn = int(np.count_nonzero(true_is_pitched & pred_is_pitched))
fn = int(np.count_nonzero(true_is_zero & pred_is_pitched))
fp = int(np.count_nonzero(true_is_pitched & pred_is_zero))


def _percent(numerator, denominator):
    """Return numerator / denominator in percent, or NaN when the denominator is zero."""
    return numerator / denominator * 100 if denominator else float('nan')


# Named n_counted rather than `sum` so the builtin sum() stays usable.
n_counted = tp + fp + tn + fn
percentage_zero_truth = _percent(tp + fn, n_counted)
percentage_zero_predicted = _percent(tp + fp, n_counted)
precision = _percent(tp, tp + fp)  # share of our zero estimates that are correct
recall = _percent(tp, tp + fn)  # share of the actual zeros that we caught
accuracy = _percent(tp + tn, n_counted)  # share of correct predictions
# F1 is undefined when precision + recall is zero or either one is NaN.
if precision + recall > 0:
    f1 = 2 * (precision * recall) / (precision + recall)
else:
    f1 = float('nan')

tn_percentage = _percent(tn, n_counted)
tp_percentage = _percent(tp, n_counted)
fp_percentage = _percent(fp, n_counted)
fn_percentage = _percent(fn, n_counted)

print("Sample size (test data set): ", n_counted)
print("0 - % in ground truth: ", "%.2f" % percentage_zero_truth)
print("0 - % in predictions: ",  "%.2f" % percentage_zero_predicted)
print("Accuarcy: ", "%.2f" % accuracy)
print("Precision: ", "%.2f" % precision)
print("Recall: ", "%.2f" % recall)
print("F1-Score", "%.2f" % f1)
print("True Negatives: ", "%.2f" % tn_percentage)
print("True Positives: ", "%.2f" % tp_percentage)
print("False Positives: ", "%.2f" % fp_percentage)
print("False Negatives: ", "%.2f" % fn_percentage)

## Hz Values with factor 2

In [None]:
# Plot target vs. predicted salience for every example whose prediction is more
# than 50 Hz below the ground truth, and collect those pairs so that the next
# cell can compute the mean predicted/true ratio (previously `trues` and
# `preds` were created but never filled).
plt.rcParams.update({'font.size': 12})
counter = 0
collected_trues = []
collected_preds = []
for i in range(len(diff)):
    if diff[i] > 50:
        plt.figure(i)
        plt.plot(inp_vector[i], 'r', label='ground truth')
        plt.plot(predicted_vector[i], 'b', label='predicted')
        # Mark the arg-max of both curves.
        plt.plot(np.argmax(inp_vector[i]),np.max(inp_vector[i]),'x')
        plt.plot(np.argmax(predicted_vector[i]),np.max(predicted_vector[i]),'x')
        plt.xlabel("Vector")
        plt.ylabel("Pitch Confidence")
        plt.legend()
        plt.show()

        print('True Hz: ', true_hz[i], 'Predicted Hz: ',predicted_hz[i], i)
        collected_trues.append(true_hz[i])
        collected_preds.append(predicted_hz[i])
        counter += 1
trues = np.array(collected_trues)
preds = np.array(collected_preds)
print(counter)

In [None]:
divided = np.divide(preds, trues)
np.mean(divided)


## Metrics evaluation

In [None]:
plt.rcParams.update({'font.size': 22})
from rt_pie_lib import metrics
hz_metrics = metrics.get_hz_metrics(filtered_unzipped[0], filtered_unzipped[1], print_output=True)
rpa_cent = metrics.raw_pitch_accuracy_cent(filtered_c_unzipped[0], filtered_c_unzipped[1])
print(rpa_cent)
hist = histogram(diff_filtered)

## Plots

In [None]:
def histogram(diff, n_bins=250, y_max=30000):
    """Plot a histogram of pitch errors with median, mean and 5% / 95% quantile markers.

    Args:
        diff: sequence of errors in Hz.
        n_bins: number of histogram bins.
        y_max: upper limit of the y axis.

    Returns:
        None; the figure is shown with plt.show().
    """
    # The function no longer reads the global `true_hz`; the value was never used.
    x = diff

    plt.figure(figsize=[16,9])
    plt.hist(x, bins=n_bins)
    #plt.xlim([-200, 200])
    plt.ylim([0, y_max])
    plt.axvline(np.median(x), color='k', linestyle='dashed', linewidth=2, label='MED')
    plt.axvline(np.mean(x), color='k', linestyle='solid', linewidth=2, label='MEAN')
    plt.axvline(np.quantile(x, 0.05), color='k', linestyle='dotted', linewidth=2, label='5% quantile')
    plt.axvline(np.quantile(x, 0.95), color='k', linestyle='dashdot', linewidth=2, label='95% quantile')
    plt.xlabel("Error in Hertz")
    plt.ylabel("Number of Errors")
    plt.legend()
    plt.show()

# histo = histogram(diff)
# histo_true = histogram([x[0] - x[1] for x in filtered])

## Debug

In [None]:
err = np.array([x[0]-x[1] for x in filtered])
gt = np.array([x[0] for x in filtered])
est = np.array([x[1] for x in filtered])

plt.figure()
plt.scatter(gt,err)

In [None]:
print(len(gt[err<-50]))

In [None]:
plt.figure()
plt.scatter(gt[err < -50],est[err < -50])

## Prediction Metrics With Time Component

In [None]:
def prediction():
    """Run the model over `dataset_test` and return flattened predictions and targets.

    Every batch is flattened before concatenation, so a smaller final batch
    (or a batch of one) no longer breaks the reshaping.

    NOTE: this redefines the prediction() from the "Predictions" section.

    Returns:
        (predicted_hz, true_hz, diff): 1-D arrays with diff = true_hz - predicted_hz.
    """
    true_batches = []
    predicted_batches = []
    for inp, outp in dataset_test:
        predicted = model.predict(inp)
        true_batches.append(np.ravel(outp))
        predicted_batches.append(np.ravel(predicted))
    true_hz = np.concatenate(true_batches)
    predicted_hz = np.concatenate(predicted_batches)
    diff = true_hz - predicted_hz
    return predicted_hz, true_hz, diff

In [None]:
predicted_hz, true_hz, diff = prediction()

In [None]:
combined = zip(true_hz, predicted_hz)
filtered = [x for x in list(combined) if x[0] > 0]
filtered_unzipped = np.array(list(zip(*filtered)))
diff_filtered = filtered_unzipped[0] - filtered_unzipped[1]

In [None]:
true_cent = mir_eval.melody.hz2cents(filtered_unzipped[0])
predicted_cent = mir_eval.melody.hz2cents(filtered_unzipped[1])
std_dev_hz = np.std(diff_filtered)
mae_hz = mean_absolute_error_hz(true_hz=filtered_unzipped[0], predicted_hz=filtered_unzipped[1])
mean_hz = np.mean(diff_filtered)
median_hz = np.median(diff_filtered)
rpa_cent = raw_pitch_accuracy_cent(true_cent, predicted_cent)
quantile_05 = np.quantile(diff_filtered, 0.05)
quantile_95 = np.quantile(diff_filtered, 0.95)
min = np.min(diff_filtered)
max = np.max(diff_filtered)

histo_true = histogram([x[0] - x[1] for x in filtered])

In [None]:
print("Stdabweichung:", "%.2f" % std_dev_hz )
print("Avg in Hz:", "%.2f" % mean_hz)
print("MAE in Hz:", "%.2f" % mae_hz)
print("5% Quantil:", "%.2f" % quantile_05)
print("95% Quantil:", "%.2f" % quantile_95)
print("Median in Hz:", "%.2f" %median_hz)
print("Max in Hz:","%.2f" % max)
print("Min in Hz:", "%.2f" % min)
print("RPA in Cent:", "%.2f" % rpa_cent)

# Plots

In [None]:
i = 5
z = predicted[i]
y = outp[i]
plt.figure()
plt.plot(z)
plt.plot(y)
plt.plot(np.argmax(z),np.max(z),'x')
plt.plot(np.argmax(y),np.max(y),'x')
plt.ylim([0, 1.1])
plt.text(np.argmax(z)+10,np.max(z),f'max={np.max(z):.1f} @ bin {np.argmax(z)}')
plt.text(np.argmax(y)+10,np.max(y),f'max={np.max(y):.1f} @ bin {np.argmax(y)}')


In [None]:
z = outp[1]
plt.figure()
plt.plot(z)
plt.plot(np.argmax(z),np.max(z),'x')
plt.text(np.argmax(z)+10,np.max(z),f'max={np.max(z):.1f} @ bin {np.argmax(z)}')