In [1]:
import numpy as np
import pandas as pd
import os
from collections import Counter
from sklearn import metrics

Function to sort the tags in the training set (Jamendo+Music4All) by frequency.

In [2]:
def _read_split_tags(split_path, first_tag_column):
    """Collect every tag listed below the header row of a tab-separated split file."""
    tags = []
    # The context manager guarantees the file handle is released even if a
    # row fails to parse.
    with open(split_path, 'r') as split_file:
        for row_number, line in enumerate(split_file):
            if row_number == 0:
                # First row is the column header, not a track.
                continue
            tags.extend(line.strip().split('\t')[first_tag_column:])
    return tags


def get_sorted_tags(add_m4a = True):
    """Rank the training-set tags from most to least frequent.

    Tags are read from the Jamendo training split and, optionally, from the
    Music4All training split. The first row of each file is treated as a
    header and skipped.

    Parameters
    ----------
    add_m4a : bool, default True
        When True, the tags in ``../example_splits/m4a_moodtheme-train.tsv``
        are counted together with the Jamendo tags.

    Returns
    -------
    sorted_train_tags : list of str
        Tag names with everything up to the last ``'---'`` removed, ordered
        by decreasing frequency.
    sorted_counter_values : list of int
        Number of occurrences of each tag, aligned with ``sorted_train_tags``.
    """
    # Jamendo rows carry their tags from the third column onwards.
    train_tags = _read_split_tags('../example_splits/jamendo_moodtheme-train.tsv', 2)

    # Add the music4all instances to the Jamendo training set
    if add_m4a:
        # Music4All rows carry their tags from the second column onwards.
        train_tags.extend(_read_split_tags('../example_splits/m4a_moodtheme-train.tsv', 1))

    # most_common() already pairs every tag with its count, so the counts are
    # taken from there instead of being looked up again through a rebuilt key.
    ranked_tags = Counter(train_tags).most_common()

    sorted_train_tags = [tag.split('---')[-1] for tag, _ in ranked_tags]
    sorted_counter_values = [count for _, count in ranked_tags]

    return sorted_train_tags, sorted_counter_values

Function to calculate PR-AUC and ROC-AUC scores for each label:

In [3]:
def breakdown_class_performance(gt, predictions):
    """Score each label separately with PR-AUC and ROC-AUC.

    Parameters
    ----------
    gt : array
        Ground-truth array, indexed as ``gt[:, i]`` for label ``i``.
    predictions : array
        Predicted scores, indexed the same way as ``gt``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``LABELS`` (used as the index) with the columns
        ``'PR-AUC'`` and ``'ROC-AUC'``.
    """
    # NOTE(review): column i of both arrays is scored as LABELS[i]; this
    # assumes the arrays were written in this label order -- TODO confirm.
    LABELS = [
        'action', 'adventure', 'advertising', 'background', 'ballad', 'calm',
        'children', 'christmas', 'commercial', 'cool', 'corporate', 'dark',
        'deep', 'documentary', 'drama', 'dramatic', 'dream', 'emotional',
        'energetic', 'epic', 'fast', 'film', 'fun', 'funny', 'game', 'groovy',
        'happy', 'heavy', 'holiday', 'hopeful', 'inspiring', 'love',
        'meditative', 'melancholic', 'melodic', 'motivational', 'movie',
        'nature', 'party', 'positive', 'powerful', 'relaxing', 'retro',
        'romantic', 'sad', 'sexy', 'slow', 'soft', 'soundscape', 'space',
        'sport', 'summer', 'trailer', 'travel', 'upbeat', 'uplifting',
    ]

    # One (PR-AUC, ROC-AUC) pair per label column, computed in label order.
    per_label_scores = [
        (
            metrics.average_precision_score(gt[:, column], predictions[:, column]),
            metrics.roc_auc_score(gt[:, column], predictions[:, column]),
        )
        for column in range(len(LABELS))
    ]

    return pd.DataFrame(per_label_scores, index=LABELS, columns=['PR-AUC', 'ROC-AUC'])

Function to evaluate the performance of a model given an array of predictions and a ground truth array.

In [4]:
def evaluate(groundtruth, predictions):
    """Print overall and per-frequency-group ROC-AUC / PR-AUC scores.

    The macro- and micro-averaged scores are printed first. The labels are
    then split by training-set frequency into a head, middle and tail group,
    and the mean per-label score of each group is printed.

    Parameters
    ----------
    groundtruth : array
        Ground-truth label array.
    predictions : array
        Predicted scores, laid out like ``groundtruth``.

    Returns
    -------
    None
        All results are written to standard output.
    """
    results = {}
    for average in ('macro', 'micro'):
        results['ROC-AUC-' + average] = metrics.roc_auc_score(groundtruth, predictions, average=average)
        results['PR-AUC-' + average] = metrics.average_precision_score(groundtruth, predictions, average=average)

    for metric, value in results.items():
        print(metric,'=', value)

    # Split the tags into head, middle, and tail groups to further evaluate performance.
    # Only the ranking is needed here; the counts are discarded.
    sorted_train_tags, _ = get_sorted_tags()
    label_splits = {
        'head': sorted_train_tags[:14],
        'middle': sorted_train_tags[14:41],
        'tail': sorted_train_tags[41:],
    }

    class_df = breakdown_class_performance(groundtruth, predictions)

    print('\n')
    for group_name, group_tags in label_splits.items():
        # Rows of the per-label table that belong to this frequency group.
        group_scores = class_df.loc[group_tags]
        print(group_name+':')
        print('PR-AUC:',np.mean(group_scores['PR-AUC']))
        print('ROC-AUC:',np.mean(group_scores['ROC-AUC']))

    print()

Training data:
Jamendo + Music4All + MillionSongDataset

Sampling: 
Class-aware resampling

In [5]:
# Get the model predictions and the ground truth array.

predictions_path = '../predictions/all_training_data/class_aware_resampling'

# The same load-and-evaluate pass runs once per split. When the loop ends,
# `split`, `ground_truth`, `bce`, `focal`, `cb` and `db` stay bound to the
# test split.
for split, banner in [('validation', 'Validation split:\n'),
                      ('test', '\nTest split:\n')]:
    print(banner)

    ground_truth = np.load('../predictions/'+ split +'_ground_truth.npy')

    split_dir = os.path.join(predictions_path, split)
    bce = np.load(os.path.join(split_dir, 'bce_predictions.npy'))
    focal = np.load(os.path.join(split_dir, 'focal_loss_predictions.npy'))
    cb = np.load(os.path.join(split_dir, 'cb_focal_loss_predictions.npy'))
    db = np.load(os.path.join(split_dir, 'db_focal_loss_predictions.npy'))

    # Evaluate individual model performance:
    for model_title, model_predictions in [('BCE:\n', bce),
                                           ('Focal loss:\n', focal),
                                           ('CB Focal loss:\n', cb),
                                           ('DB Focal loss:\n', db)]:
        print(model_title)
        evaluate(ground_truth, model_predictions)

Validation split:

BCE:

ROC-AUC-macro = 0.7438425478910712
PR-AUC-macro = 0.11918333999238269
ROC-AUC-micro = 0.7691079033761603
PR-AUC-micro = 0.12449050886011888


head:
PR-AUC: 0.16789564663609804
ROC-AUC: 0.7494754299190404
middle:
PR-AUC: 0.10933731605911336
ROC-AUC: 0.7364935752762615
tail:
PR-AUC: 0.09144136353813319
ROC-AUC: 0.7518133420382912

Focal loss:

ROC-AUC-macro = 0.74385916691003
PR-AUC-macro = 0.11931738888595755
ROC-AUC-micro = 0.7755937972019219
PR-AUC-micro = 0.12205291663665638


head:
PR-AUC: 0.16747985418212882
ROC-AUC: 0.7455882166205428
middle:
PR-AUC: 0.10652248164160451
ROC-AUC: 0.7299556082583826
tail:
PR-AUC: 0.09739658764936647
ROC-AUC: 0.7672717927531832

CB Focal loss:

ROC-AUC-macro = 0.7418012871789789
PR-AUC-macro = 0.11633239323733725
ROC-AUC-micro = 0.7787111870158601
PR-AUC-micro = 0.12254027869586102


head:
PR-AUC: 0.1628802390216409
ROC-AUC: 0.7454891564662888
middle:
PR-AUC: 0.10672670055916299
ROC-AUC: 0.7366850759787315
tail:
PR-AUC: 0.090

Training data:
Jamendo + Music4All + MillionSongDataset

Sampling: 
Standard sampling

In [6]:
# Get the model predictions and the ground truth array.

predictions_path = '../predictions/all_training_data/standard_sampling'

# The same load-and-evaluate pass runs once per split. When the loop ends,
# `split`, `ground_truth`, `bce`, `focal`, `cb` and `db` stay bound to the
# test split.
for split, banner in [('validation', 'Validation split:\n'),
                      ('test', '\nTest split:\n')]:
    print(banner)

    ground_truth = np.load('../predictions/'+ split +'_ground_truth.npy')

    split_dir = os.path.join(predictions_path, split)
    bce = np.load(os.path.join(split_dir, 'bce_predictions.npy'))
    focal = np.load(os.path.join(split_dir, 'focal_loss_predictions.npy'))
    cb = np.load(os.path.join(split_dir, 'cb_focal_loss_predictions.npy'))
    db = np.load(os.path.join(split_dir, 'db_focal_loss_predictions.npy'))

    # Evaluate individual model performance:
    for model_title, model_predictions in [('BCE:\n', bce),
                                           ('Focal loss:\n', focal),
                                           ('CB Focal loss:\n', cb),
                                           ('DB Focal loss:\n', db)]:
        print(model_title)
        evaluate(ground_truth, model_predictions)

Validation split:

BCE:

ROC-AUC-macro = 0.7482499064649113
PR-AUC-macro = 0.12382037205503996
ROC-AUC-micro = 0.7780083280731263
PR-AUC-micro = 0.13407422714862569


head:
PR-AUC: 0.17782289429400758
ROC-AUC: 0.7503896921470677
middle:
PR-AUC: 0.11791856107372602
ROC-AUC: 0.749013978451403
tail:
PR-AUC: 0.08404127773170196
ROC-AUC: 0.7448774435858794

Focal loss:

ROC-AUC-macro = 0.7530997642135144
PR-AUC-macro = 0.12297864187890421
ROC-AUC-micro = 0.787295326915569
PR-AUC-micro = 0.13687406865289184


head:
PR-AUC: 0.17804579484506072
ROC-AUC: 0.748570113396192
middle:
PR-AUC: 0.11917888964612187
ROC-AUC: 0.7506981641898473
tail:
PR-AUC: 0.07842218646283297
ROC-AUC: 0.761650318352283

CB Focal loss:

ROC-AUC-macro = 0.7494903498189812
PR-AUC-macro = 0.1223528433536589
ROC-AUC-micro = 0.7888203556294552
PR-AUC-micro = 0.13521827593650904


head:
PR-AUC: 0.16897877465654032
ROC-AUC: 0.745088417978656
middle:
PR-AUC: 0.1133868125653233
ROC-AUC: 0.7474385558936372
tail:
PR-AUC: 0.0949741

Ensemble the four models trained using standard sampling and save the predictions:

In [7]:
# NOTE(review): `bce`, `focal`, `cb`, `db` and `ground_truth` are whatever the
# most recently executed loading cell left in the kernel -- confirm that cell
# was the standard-sampling one before trusting the numbers below.
member_predictions = (bce, focal, cb, db)

# Unweighted average of the member predictions, accumulated left to right.
ensemble_predictions = sum(member_predictions[1:], member_predictions[0]) / len(member_predictions)

evaluate(ground_truth, ensemble_predictions)

np.save('ensemble_predictions.npy', ensemble_predictions)

ROC-AUC-macro = 0.7812286699703872
PR-AUC-macro = 0.1609607384174659
ROC-AUC-micro = 0.8197803684777436
PR-AUC-micro = 0.1866509359262184


head:
PR-AUC: 0.18156240462417053
ROC-AUC: 0.7669046356293754
middle:
PR-AUC: 0.17924425900300184
ROC-AUC: 0.7776673708795513
tail:
PR-AUC: 0.10882217957057705
ROC-AUC: 0.8010081070521696

