## Evaluate similarity-based retrieval paradigm

**Query**: 45 patterns, each lasting around 10 s, from the MIREX dataset. See `MIREX Dataset.ipynb` for more details on how the 45 patterns were chosen from the original MIREX dataset.

**Candidate**: The candidate dataset is a mixture of (a) 5 transformations of each of these patterns, with slight differences in tempo or <font color=blue>pitch</font>, and (b) randomly sampled segments from either the MIREX or the MAESTRO dataset.

For each query, we rank the candidates by their distance to the query and assess the recall rate of that query's 5 transformations.

In [1]:
import random
import numpy as np
import pandas as pd
from numpy.linalg import norm

### Make query set from selected MIREX patterns

In [2]:
# MIREX patterns: metadata table and the matching periodicity spectra
mirex_df = pd.read_csv("./data/mirex/pattern_split.csv")
mirex_spectra = np.load("./data/mirex/periodicity_spectra_pattern.npy")
# Each row gets the label "<piece>_<version>_<pattern>"
mirex_df['label'] = mirex_df['piece'].str.cat([mirex_df['version'], mirex_df['pattern']], sep='_')

### From MIREX

In [3]:
# Additional MIREX spectra (presumably the non-pattern segments — confirm against the
# data-preparation notebook); every one of them gets the generic 'mirex_other' label.
mirex_spectra_ = np.load("./data/mirex/periodicity_spectra.npy")
mirex_label_ = ['mirex_other'] * len(mirex_spectra_)

### From MAESTRO

In [4]:
# Load MAESTRO Dataset
maestro_df = pd.read_csv("./metadata/maestro_selected.csv")

# One frame index per entry of 'p_f'. The indices were originally drawn at random while
# avoiding the first and last window, e.g.
#     indices = [random.randint(1, len(s)-2) for s in maestro_spectra]
# and are read from disk here so that the same segments are used on every run.
indices = list(np.load("./data/eval/maestro_frame_idx.npy"))

# NOTE(review): allow_pickle=True can execute arbitrary code when given a malicious file;
# only load this archive from a trusted source.
# Read just the 'p_f' entry and close the archive as soon as it has been read.
with np.load("./data/periodicity_spectra/maestro_selected.npz", allow_pickle=True) as maestro_npz:
    maestro_spectra_all = maestro_npz['p_f']

# Keep the selected frame of each entry and tag all of them with the 'maestro' label.
maestro_spectra = np.array([maestro_spectra_all[i][idx] for i, idx in enumerate(indices)])
maestro_label = ['maestro'] * len(indices)

### Combine Dataset to make training set/database

In [5]:
from sklearn import preprocessing

# Integer-encode every MIREX pattern label plus the two distractor labels
le = preprocessing.LabelEncoder()
le.fit(np.concatenate([mirex_df['label'].to_numpy(), ['maestro'], ['mirex_other']]))

# Query set: MIREX rows whose split is 'test', scaled to unit L2 norm per row
mirex_val_idx = mirex_df.index[mirex_df['split'] == 'test'].to_list()
X_query = mirex_spectra[mirex_val_idx]
X_query = X_query / norm(X_query, axis=1, keepdims=True)
y_query = le.transform(mirex_df.loc[mirex_val_idx, 'label'].to_list())


# Database: MIREX rows whose split is 'train', mixed with distractor segments
mirex_train_idx = mirex_df.index[mirex_df['split'] == 'train'].to_list()

# Database variant 1: train patterns + the other MIREX segments
X_mirex = np.vstack([mirex_spectra[mirex_train_idx], mirex_spectra_])
X_mirex = X_mirex / norm(X_mirex, axis=1, keepdims=True)
y_mirex = le.transform(mirex_df.loc[mirex_train_idx, 'label'].to_list() + mirex_label_)

# Database variant 2: train patterns + the MAESTRO segments
X_maestro = np.vstack([mirex_spectra[mirex_train_idx], maestro_spectra])
X_maestro = X_maestro / norm(X_maestro, axis=1, keepdims=True)
y_maestro = le.transform(mirex_df.loc[mirex_train_idx, 'label'].to_list() + maestro_label)

## Evaluation Metrics

In [6]:
def top_K_hit_rate(sim_score, y_train, y_test, K=1):
    """Fraction of queries whose own label appears among their K best-scoring candidates.

    Args:
        sim_score: array of shape [n_query, n_candidate]; larger means more similar.
        y_train: labels of the candidates, length n_candidate.
        y_test: labels of the queries, length n_query.
        K: number of top-ranked candidates inspected per query (must be >= 1).

    Returns:
        Hit rate in [0, 1].

    Raises:
        ValueError: if K is smaller than 1.
    """
    if K < 1:
        # With K == 0 the slice [-0:] would silently select *all* candidates.
        raise ValueError("K must be a positive integer")

    y_train = np.asarray(y_train)
    y_test = np.asarray(y_test)

    # argsort is ascending, so the last K columns hold the K highest scores.
    top_idx = np.argsort(sim_score, axis=1)[:, -K:]
    top_labels = y_train[top_idx]

    # Compare against the y_test argument (not a notebook-level variable) so the
    # function works for any query set passed in.
    hit = (top_labels == y_test[:, np.newaxis]).any(axis=1)
    return np.sum(hit) / len(y_test)

In [7]:
def top_K_overlap_rate(sim_score, y_train, y_test, K=1):
    """Average share of each query's K best-scoring candidates that carry the query's label.

    Args:
        sim_score: array of shape [n_query, n_candidate]; larger means more similar.
        y_train: array of candidate labels, length n_candidate.
        y_test: query labels, length n_query.
        K: number of top-ranked candidates inspected per query.

    Returns:
        Number of label matches within the top-K lists divided by K and by n_query.
    """
    num_queries = len(y_test)

    # Ascending argsort: the K right-most columns are the K largest scores.
    ranking = np.argsort(sim_score, axis=1)
    retrieved_labels = y_train[ranking[:, -K:]]

    # Tile each query label K times so it lines up with its retrieved labels.
    expected_labels = np.repeat(np.expand_dims(y_test, axis=1), K, axis=1)
    matches = np.sum(expected_labels == retrieved_labels)
    return matches / K / num_queries

# Cosine Similarity

In [8]:
def cosine_similarity(x, y, normalize=False):
    """Pairwise cosine similarity between the rows of ``x`` and the rows of ``y``.

    Args:
        x: array of shape [n_x, dim_feature].
        y: array of shape [n_y, dim_feature].
        normalize: when False (default) the rows are assumed to already have unit
            L2 norm and the plain dot product is returned; when True every row is
            scaled to unit L2 norm first, so arbitrary inputs yield a true cosine.

    Returns:
        Array of shape [n_x, n_y] whose entry (i, j) is the similarity of x[i] and y[j].
    """
    x, y = np.asarray(x), np.asarray(y)
    if normalize:
        x = x / norm(x, axis=1, keepdims=True)
        y = y / norm(y, axis=1, keepdims=True)
    return np.dot(x, y.T)

In [10]:
# Score every query against the MIREX database with cosine similarity
mirex_cos_score = cosine_similarity(X_query, X_mirex)

# Top-1 / top-5 hit rates and the top-5 overlap rate
mirex_cos_hit_rate_top_1, mirex_cos_hit_rate_top_5 = [
    top_K_hit_rate(mirex_cos_score, y_mirex, y_query, K=k) for k in (1, 5)
]
mirex_cos_overlap_rate = top_K_overlap_rate(mirex_cos_score, y_mirex, y_query, K=5)

print(
    "Hit Rate on MIREX",
    f"TOP 1: {mirex_cos_hit_rate_top_1:.4f}, TOP 5: {mirex_cos_hit_rate_top_5:.4f}",
    "---------",
    "Overlap Rate on MIREX",
    f"TOP 5: {mirex_cos_overlap_rate:.4f}",
    sep="\n",
)

Hit Rate on MIREX
TOP 1: 1.0000, TOP 5: 1.0000
---------
Overlap Rate on MIREX
TOP 5: 0.5867


In [11]:
# Score every query against the MAESTRO database with cosine similarity
maestro_cos_score = cosine_similarity(X_query, X_maestro)

# Top-1 / top-5 hit rates and the top-5 overlap rate
maestro_cos_hit_rate_top_1, maestro_cos_hit_rate_top_5 = [
    top_K_hit_rate(maestro_cos_score, y_maestro, y_query, K=k) for k in (1, 5)
]
maestro_cos_overlap_rate = top_K_overlap_rate(maestro_cos_score, y_maestro, y_query, K=5)


print(
    "Hit Rate on MAESTRO",
    f"TOP 1: {maestro_cos_hit_rate_top_1:.4f}, TOP 5: {maestro_cos_hit_rate_top_5:.4f}",
    "---------",
    "Overlap Rate on MAESTRO",
    f"TOP 5: {maestro_cos_overlap_rate:.4f}",
    sep="\n",
)

Hit Rate on MAESTRO
TOP 1: 1.0000, TOP 5: 1.0000
---------
Overlap Rate on MAESTRO
TOP 5: 0.5911


# Euclidean Distance

In [12]:
def euclidean_similiarity(x, y):
    """Pairwise similarity defined as the reciprocal of the Euclidean distance.

    Args:
        x: array of shape [m, dim_feature].
        y: array of shape [n, dim_feature].

    Returns:
        Array of shape [m, n]; entry (i, j) is 1 / ||x[i] - y[j]||. Identical rows
        have distance 0 and therefore similarity +inf.
    """
    # Broadcasting (m, 1, d) against (1, n, d) gives all pairwise differences without
    # building repeated copies of x and y.
    diff = np.expand_dims(x, 1) - np.expand_dims(y, 0)
    dist = norm(diff, axis=-1)

    # A zero distance is a legitimate perfect match: let it become +inf quietly so it
    # ranks first, instead of emitting a divide-by-zero warning.
    with np.errstate(divide='ignore'):
        return 1 / dist

In [13]:
# Score every query against the MIREX database with inverse Euclidean distance
mirex_euc_score = euclidean_similiarity(X_query, X_mirex)

# Top-1 / top-5 hit rates and the top-5 overlap rate
mirex_euc_hit_rate_top_1, mirex_euc_hit_rate_top_5 = [
    top_K_hit_rate(mirex_euc_score, y_mirex, y_query, K=k) for k in (1, 5)
]
mirex_euc_overlap_rate = top_K_overlap_rate(mirex_euc_score, y_mirex, y_query, K=5)

print(
    "Hit Rate on MIREX",
    f"TOP 1: {mirex_euc_hit_rate_top_1:.4f}, TOP 5: {mirex_euc_hit_rate_top_5:.4f}",
    "---------",
    "Overlap Rate on MIREX",
    f"TOP 5: {mirex_euc_overlap_rate:.4f}",
    sep="\n",
)

Hit Rate on MIREX
TOP 1: 1.0000, TOP 5: 1.0000
---------
Overlap Rate on MIREX
TOP 5: 0.5867


In [14]:
# Score every query against the MAESTRO database with inverse Euclidean distance.
# (This section evaluates the Euclidean measure, so it must not call cosine_similarity.)
# X_query and X_maestro are L2-normalised when they are built, and for unit vectors
# ||a - b||^2 = 2 - 2 * a.b, so the ranking is expected to agree with the cosine ranking.
maestro_euc_score = euclidean_similiarity(X_query, X_maestro)
maestro_euc_hit_rate_top_1 = top_K_hit_rate(maestro_euc_score, y_maestro, y_query, K=1)
maestro_euc_hit_rate_top_5 = top_K_hit_rate(maestro_euc_score, y_maestro, y_query, K=5)
maestro_euc_overlap_rate = top_K_overlap_rate(maestro_euc_score, y_maestro, y_query, K=5)


print("Hit Rate on MAESTRO")
print(f"TOP 1: {maestro_euc_hit_rate_top_1:.4f}, TOP 5: {maestro_euc_hit_rate_top_5:.4f}")
print("---------")
print("Overlap Rate on MAESTRO")
print(f"TOP 5: {maestro_euc_overlap_rate:.4f}")

Hit Rate on MAESTRO
TOP 1: 1.0000, TOP 5: 1.0000
---------
Overlap Rate on MAESTRO
TOP 5: 0.5911


# DPW Distance

In [None]:
# Note: it takes around 2 hrs to calculate the DPW distances for each dataset.
from src.dpw import DPW_distance

def cal_dpw_dist(X_train, X_test):
    """Compute the DPW distance between every test item and every train item.

    Args:
        X_train: sequence of database items (indexed along the columns of the result).
        X_test: sequence of query items (indexed along the rows of the result).

    Returns:
        Float array of shape (len(X_test), len(X_train)) where entry (i, j) is
        DPW_distance(X_test[i], X_train[j]).
    """
    # Start from +inf everywhere; every cell is overwritten by the loops below.
    dist = np.full((len(X_test), len(X_train)), np.inf)

    for row in range(dist.shape[0]):
        query = X_test[row]
        for col in range(dist.shape[1]):
            dist[row, col] = DPW_distance(query, X_train[col])
    return dist

In [30]:
# Pre-computed DPW distances for the MIREX database, turned into similarity scores
# (smaller distance -> larger score).
# NOTE(review): np.argsort places NaN last, i.e. a NaN score would be ranked as the best
# match, and a zero distance becomes +inf. Confirm the cached distances contain neither,
# given the recorded TOP 1 hit rate of 0.0000 for this cell.
mirex_dpw_dist = np.load("./data/eval/eval_dpw_dist_mirex.npy")
mirex_dpw_score = 1.0 / mirex_dpw_dist

# Top-1 / top-5 hit rates and the top-5 overlap rate
mirex_dpw_hit_rate_top_1, mirex_dpw_hit_rate_top_5 = [
    top_K_hit_rate(mirex_dpw_score, y_mirex, y_query, K=k) for k in (1, 5)
]
mirex_dpw_overlap_rate = top_K_overlap_rate(mirex_dpw_score, y_mirex, y_query, K=5)

print(
    "Hit Rate on MIREX",
    f"TOP 1: {mirex_dpw_hit_rate_top_1:.4f}, TOP 5: {mirex_dpw_hit_rate_top_5:.4f}",
    "---------",
    "Overlap Rate on MIREX",
    f"TOP 5: {mirex_dpw_overlap_rate:.4f}",
    sep="\n",
)

Hit Rate on MIREX
TOP 1: 0.0000, TOP 5: 0.8889
---------
Overlap Rate on MIREX
TOP 5: 0.1911


In [32]:
# Pre-computed DPW distances for the MAESTRO database, turned into similarity scores
# (smaller distance -> larger score).
maestro_dpw_dist = np.load("./data/eval/eval_dpw_dist_maestro.npy")
maestro_dpw_score = 1/maestro_dpw_dist

maestro_dpw_hit_rate_top_1 = top_K_hit_rate(maestro_dpw_score, y_maestro, y_query, K=1)
maestro_dpw_hit_rate_top_5 = top_K_hit_rate(maestro_dpw_score, y_maestro, y_query, K=5)
maestro_dpw_overlap_rate = top_K_overlap_rate(maestro_dpw_score, y_maestro, y_query, K=5)

# These are the MAESTRO results, so the headers must say MAESTRO (not MIREX).
print("Hit Rate on MAESTRO")
print(f"TOP 1: {maestro_dpw_hit_rate_top_1:.4f}, TOP 5: {maestro_dpw_hit_rate_top_5:.4f}")
print("---------")
print("Overlap Rate on MAESTRO")
print(f"TOP 5: {maestro_dpw_overlap_rate:.4f}")

Hit Rate on MIREX
TOP 1: 0.8444, TOP 5: 0.9333
---------
Overlap Rate on MIREX
TOP 5: 0.2000
