# Tests of different melody metrics

In [14]:
import essentia.standard as estd
import numpy as np
import librosa
from scipy.signal import square
import pretty_midi

## Utils

In [15]:
# Function to generate synthetic melody using sine waves
def generate_melody_sine_wave(frequencies, durations, sample_rate=44100):
    """Synthesize a monophonic melody as back-to-back sine tones.

    Args:
        frequencies: Iterable of note frequencies in Hz.
        durations: Iterable of note durations in seconds, paired
            positionally with ``frequencies``. The pairing uses ``zip``, so
            surplus entries in the longer iterable are ignored.
        sample_rate: Sampling rate in Hz.

    Returns:
        1-D NumPy array holding the notes in order, each with peak
        amplitude 0.5 and starting at phase 0. An empty melody yields an
        empty array.
    """
    # Start from an empty float array so an empty melody still returns an
    # array, then join all notes once instead of re-copying the growing
    # signal after every note.
    segments = [np.array([])]
    for freq, duration in zip(frequencies, durations):
        t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
        segments.append(0.5 * np.sin(2 * np.pi * freq * t))
    return np.concatenate(segments)

# Function to generate synthetic melody using square waves
def generate_melody_square_wave(frequencies, durations, sample_rate=44100):
    """Synthesize a monophonic melody as back-to-back square-wave tones.

    Args:
        frequencies: Iterable of note frequencies in Hz.
        durations: Iterable of note durations in seconds, paired
            positionally with ``frequencies``. The pairing uses ``zip``, so
            surplus entries in the longer iterable are ignored.
        sample_rate: Sampling rate in Hz.

    Returns:
        1-D NumPy array holding the notes in order, each a square wave
        scaled to amplitude 0.5. An empty melody yields an empty array.
    """
    # Start from an empty float array so an empty melody still returns an
    # array, then join all notes once instead of re-copying the growing
    # signal after every note.
    segments = [np.array([])]
    for freq, duration in zip(frequencies, durations):
        t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
        segments.append(0.5 * square(2 * np.pi * freq * t))
    return np.concatenate(segments)

# Mean Squared Error (MSE) between two signals
def mse(s1, s2):
    """Return the mean of the element-wise squared difference of ``s1`` and ``s2``.

    The subtraction follows NumPy broadcasting rules, and the mean is taken
    over all elements of the result.
    """
    residual = s1 - s2
    squared_error = residual ** 2
    return np.mean(squared_error)

# Sample rate, defined before synthesis so the generated audio and the
# metrics that read SR are guaranteed to agree
SR = 44100

# Note frequencies (Hz) and durations (s); the two melodies differ only in
# their third note
MELODY1_FREQS = [440, 554, 660, 440]
MELODY2_FREQS = [440, 554, 622, 440]
NOTE_DURATIONS = [0.5, 0.5, 0.5, 0.5]

# Generate synthetic melodies using sine waves
audio_melody1_sine = generate_melody_sine_wave(MELODY1_FREQS, NOTE_DURATIONS, sample_rate=SR)
audio_melody2_sine = generate_melody_sine_wave(MELODY2_FREQS, NOTE_DURATIONS, sample_rate=SR)

# Generate synthetic melodies using square waves
audio_melody1_square = generate_melody_square_wave(MELODY1_FREQS, NOTE_DURATIONS, sample_rate=SR)
audio_melody2_square = generate_melody_square_wave(MELODY2_FREQS, NOTE_DURATIONS, sample_rate=SR)

## Melody metric from the original paper

In [18]:
def normalize_power(audio):
    """Scale a signal so that its root-mean-square value is (approximately) 1.

    Args:
        audio: Array-like audio signal. Floating-point input keeps its
            dtype; any other input (e.g. int16 PCM, plain lists) is promoted
            to float64 first.

    Returns:
        The signal divided by its RMS value plus machine epsilon. The
        epsilon keeps an all-zero signal from dividing by zero.
    """
    if not hasattr(audio, 'dtype'):
        audio = np.asarray(audio)
    # np.finfo only accepts inexact dtypes, and squaring narrow integer
    # samples would overflow, so promote anything that is not already
    # floating point.
    if not np.issubdtype(audio.dtype, np.inexact):
        audio = audio.astype(np.float64)
    rms = np.sqrt(np.mean(audio ** 2))
    return audio / (rms + np.finfo(audio.dtype).eps)

def get_pitches(audio, input_sr=None):
    """Extract the MIDI pitches detected in each analysis frame of a signal.

    The signal is resampled to 8 kHz, passed through Essentia's
    ``EqualLoudness`` filter, lightly dithered, and analysed with
    ``MultiPitchMelodia``.

    Args:
        audio: 1-D audio signal sampled at ``input_sr``.
        input_sr: Sampling rate of ``audio`` in Hz. When omitted, the
            notebook-level ``SR`` is used.

    Returns:
        A list with one entry per frame returned by ``MultiPitchMelodia``;
        each entry is a list of integer MIDI note numbers.
    """
    if input_sr is None:
        input_sr = SR
    sr = 8000  # Need to resample because of EqualLoudness
    audio = estd.Resample(inputSampleRate=input_sr, outputSampleRate=sr)(audio)
    audio = estd.EqualLoudness(sampleRate=sr)(audio)
    # Seed the noise from the signal itself so repeated calls on the same
    # audio add exactly the same dither.
    rng = np.random.default_rng(seed=(audio > 0).sum())
    audio = rng.normal(loc=audio, scale=1e-4).astype(audio.dtype)  # To prevent Melodia from crashing
    pitches = estd.MultiPitchMelodia(sampleRate=sr)(audio)
    # Drop values close to 0 (presumably "no pitch" markers -- confirm
    # against the Essentia docs), convert Hz to MIDI note numbers and round
    # half up to the nearest integer.
    return [
        [int(pretty_midi.utilities.hz_to_note_number(p) + 0.5)
         for p in pl if not np.isclose(0, p)]
        for pl in pitches
    ]

def eval_example_content(output, reference, sr=None):
    """Compute the mean per-frame Jaccard distance between detected pitch sets.

    Args:
        output: Audio signal to evaluate.
        reference: Audio signal to compare against; it must yield the same
            number of pitch frames as ``output``.
        sr: Sampling rate of both signals in Hz. When omitted, the
            notebook-level ``SR`` is used.

    Returns:
        ``{'pitch_jaccard': d}`` where ``d`` is the average over frames of
        ``1 - |A & B| / |A | B|``. A frame in which neither signal has any
        pitch contributes 0.

    Raises:
        AssertionError: If the two signals yield different frame counts.
    """
    pitches_output = get_pitches(output, input_sr=sr)
    pitches_reference = get_pitches(reference, input_sr=sr)
    assert len(pitches_output) == len(pitches_reference), (
        f"frame count mismatch: {len(pitches_output)} vs {len(pitches_reference)}"
    )
    jaccard = []
    for pl_output, pl_reference in zip(pitches_output, pitches_reference):
        set_output, set_reference = set(pl_output), set(pl_reference)
        total = len(set_output | set_reference)
        if total == 0:
            # Both frames are pitchless: treat them as identical.
            jaccard.append(0)
        else:
            matches = len(set_output & set_reference)
            jaccard.append(1 - matches / total)
    jaccard = np.mean(jaccard)
    return {'pitch_jaccard': jaccard}

## Melody metric based on chromagrams

In [21]:
def generate_chromagram(audio, sample_rate=44100, hop_length=512, n_fft=2048):
    """Compute an STFT-based chromagram of an audio signal with librosa.

    Args:
        audio: Audio time series.
        sample_rate: Sampling rate of ``audio`` in Hz.
        hop_length: Hop between successive STFT frames, in samples.
        n_fft: FFT window size, in samples.

    Returns:
        The array returned by ``librosa.feature.chroma_stft``.
    """
    # The audio must be passed as ``y=``: librosa >= 0.10 made it
    # keyword-only, so a positional call raises TypeError there.
    chroma = librosa.feature.chroma_stft(y=audio, sr=sample_rate, hop_length=hop_length, n_fft=n_fft)
    return chroma

# Mean Squared Error (MSE) between two chromagrams
def chroma_mse(chroma1, chroma2):
    """Return the MSE of two chromagrams after trimming them to a common length.

    Both inputs are cut along their second axis to the shorter of the two
    lengths, so that they can be compared element-wise.
    """
    n_common = min(chroma1.shape[1], chroma2.shape[1])
    return mse(chroma1[:, :n_common], chroma2[:, :n_common])

def eval_example_content_custom(output, reference, sr=44100):
    """Chromagram-based melody distance between two audio signals.

    Each signal is turned into a chromagram at sampling rate ``sr``, and the
    mean squared error between the two chromagrams is returned.
    """
    chroma_output = generate_chromagram(output, sample_rate=sr)
    chroma_reference = generate_chromagram(reference, sample_rate=sr)
    return chroma_mse(chroma_output, chroma_reference)

## Evaluation

In [23]:
from itertools import product

# Each case is (label, first signal, second signal). Both metrics are
# distances, so identical audio scores 0.
cases = [
    # Sanity check
    ("Same audio", audio_melody1_sine, audio_melody1_sine),
    # Different melodies, same timbre
    ("Same timbre, diff melodies (sine)", audio_melody1_sine, audio_melody2_sine),
    ("Same timbre, diff melodies (square)", audio_melody1_square, audio_melody2_square),
    # Different melodies, different timbre
    ("Diff timbre, diff melodies 1", audio_melody1_sine, audio_melody2_square),
    ("Diff timbre, diff melodies 2", audio_melody1_square, audio_melody2_sine),
    # Same melody, different timbre
    ("Same melody, diff timbre 1", audio_melody1_sine, audio_melody1_square),
    ("Same melody, diff timbre 2", audio_melody2_sine, audio_melody2_square),
]

# Report both metrics for every pair of signals
for name, a1, a2 in cases:
    original = eval_example_content(a1, a2)['pitch_jaccard']
    custom = eval_example_content_custom(a1, a2)
    print(
        f"{name}:",
        f"    Original: {original}",
        f"    Custom  : {custom}",
        sep="\n",
    )

Same audio:
    Original: 0.0
    Custom  : 0.0
Same timbre, diff melodies (sine):
    Original: 0.20634920634920634
    Custom  : 0.024996413211660296
Same timbre, diff melodies (square):
    Original: 0.25396825396825395
    Custom  : 0.026707178677720746
Diff timbre, diff melodies 1:
    Original: 0.25396825396825395
    Custom  : 0.025914413357136037
Diff timbre, diff melodies 2:
    Original: 0.25396825396825395
    Custom  : 0.031775932941804454
Same melody, diff timbre 1:
    Original: 0.047619047619047616
    Custom  : 0.0041432371282374035
Same melody, diff timbre 2:
    Original: 0.047619047619047616
    Custom  : 0.0029856399720020583
