# MOS calculation

This Jupyter notebook is used to assess the subjective quality of the generated recordings by collecting Mean Opinion Scores (MOS) for each model.

In [1]:
import pandas as pd
import os

# NOTE(review): presumably the audio sampling rate in Hz; no visible cell reads it — confirm it is still needed
SAMPLE_RATE = 16_000

In [2]:
# Root directory that holds the test set and the outputs of every model.
# Set the MOS_TEST_ROOT environment variable to run the notebook on another
# machine; the original location is kept as the default.
TEST_ROOT = os.environ.get('MOS_TEST_ROOT', '/home/wojtekk23/Muzyka/magisterka_test/')
TEST_SET_GROUND_AUDIOS = os.path.join(TEST_ROOT, 'test_set_ground_audios/')
TEST_SET = os.path.join(TEST_ROOT, 'test_set.csv')

# The test set is a header-less, tab-separated file with three path columns.
test_set = pd.read_csv(TEST_SET, header=None, names=['content', 'style', 'ground_truth'], delimiter='\t')
# Keep only the file name of each ground-truth path and re-root it under the local audio folder.
test_set['ground_truth'] = test_set['ground_truth'].map(lambda path: os.path.join(TEST_SET_GROUND_AUDIOS, os.path.basename(path)))
# File name of the generated audio for each row: 'vqvae' + zero-padded row position.
# Assigned positionally (plain list) so it does not depend on the frame's index labels.
test_set['vqvae_basename'] = [f'vqvae{ix:04}.wav' for ix in range(len(test_set))]
test_set.head()

Unnamed: 0,content,style,ground_truth,vqvae_basename
0,/mnt/vdb/random_audios_patch_16k//data_169_035...,/mnt/vdb/random_audios_patch_16k//data_984_014...,/home/wojtekk23/Muzyka/magisterka_test/test_se...,vqvae0000.wav
1,/mnt/vdb/random_audios_patch_16k//data_745_014...,/mnt/vdb/random_audios_patch_16k//data_908_029...,/home/wojtekk23/Muzyka/magisterka_test/test_se...,vqvae0001.wav
2,/mnt/vdb/random_audios_patch_16k//data_454_025...,/mnt/vdb/random_audios_patch_16k//data_149_057...,/home/wojtekk23/Muzyka/magisterka_test/test_se...,vqvae0002.wav
3,/mnt/vdb/random_audios_patch_16k//data_672_060...,/mnt/vdb/random_audios_patch_16k//data_497_090...,/home/wojtekk23/Muzyka/magisterka_test/test_se...,vqvae0003.wav
4,/mnt/vdb/random_audios_patch_16k//data_457_034...,/mnt/vdb/random_audios_patch_16k//data_588_019...,/home/wojtekk23/Muzyka/magisterka_test/test_se...,vqvae0004.wav


In [3]:
# Fixed seed so that re-running the notebook draws the same evaluation subset.
SUBSET_SEED = 42
# Number of test-set rows drawn for the listening test.
N_MOS_SAMPLES = 50

# Shuffle all rows, then keep the first N_MOS_SAMPLES of them.
shuffled_test_set = test_set.sample(frac=1, random_state=SUBSET_SEED).iloc[:N_MOS_SAMPLES]
len(shuffled_test_set)

50

In [4]:
# Names of the model variants compared in the listening test.
MODELS = (
    # leaky-relu variants
    'model-leaky-relu-finetuned-style-pretraining-15-11-2023',
    'model-leaky-relu-frozen-style-pretraining-15-11-2023',
    'model-leaky-relu-no-style-pretraining-13-11-2023',
    # original variants
    'model-original-finetuned-style-pretraining-22-11-2023',
    'model-original-frozen-style-pretraining-21-11-2023',
    'model-original-no-style-pretraining-19-11-2023',
)

# Model name -> directory with that model's generated audio:
# <TEST_ROOT>/<model>/outputs/<model>
MODELS_PATHS = dict(
    (model_name, os.path.join(TEST_ROOT, model_name, 'outputs', model_name))
    for model_name in MODELS
)

In [5]:
# Fixed seed for the presentation order, so a regenerated listening test is reproducible.
PRESENTATION_SEED = 42

# One row per (sampled test-set row, model) pair; each copy keeps the original
# row label so the source row can still be identified after shuffling.
new_rows = []
for _row_label, row in shuffled_test_set.iterrows():
    for model, path in MODELS_PATHS.items():
        new_row = row.copy()
        new_row['model'] = model
        new_row['model_path'] = path
        new_rows.append(new_row)

# Shuffle so that the rater does not hear the models in a fixed order.
expanded_test_set = pd.DataFrame(new_rows).sample(frac=1, random_state=PRESENTATION_SEED)

# Save the test set for MOS scores (RUN ONLY IF REGENERATION IS NEEDED)
# expanded_test_set.to_csv('mos_scores/expanded_test_set.csv', encoding='utf-8')

## Evaluate scores

In [6]:
# Index of the next sample to score. Set it to where the previous session
# stopped (the largest N among the mos_scores/models_scores_up_to_N.csv files).
expanded_test_set_ix = 251

# Empty table that collects one row of ratings per scored sample.
models_scores = pd.DataFrame(
    columns=[
        'content_class',
        'style_class',
        'melody_id',
        'model_name',
        'inference_audio_path',
        'melody_score',
        'style_score',
        'overall_score',
    ]
)

# Load the previously saved (already shuffled) test set for MOS scores
expanded_test_set = pd.read_csv('mos_scores/expanded_test_set.csv')
expanded_test_set.head()

Unnamed: 0.1,Unnamed: 0,content,style,ground_truth,vqvae_basename,model,model_path
0,1071,/mnt/vdb/random_audios_patch_16k//data_675_005...,/mnt/vdb/random_audios_patch_16k//data_971_056...,/home/wojtekk23/Muzyka/magisterka_test/test_se...,vqvae1071.wav,model-leaky-relu-finetuned-style-pretraining-1...,/home/wojtekk23/Muzyka/magisterka_test/model-l...
1,46,/mnt/vdb/random_audios_patch_16k//data_765_039...,/mnt/vdb/random_audios_patch_16k//data_111_077...,/home/wojtekk23/Muzyka/magisterka_test/test_se...,vqvae0046.wav,model-original-frozen-style-pretraining-21-11-...,/home/wojtekk23/Muzyka/magisterka_test/model-o...
2,509,/mnt/vdb/random_audios_patch_16k//data_319_101...,/mnt/vdb/random_audios_patch_16k//data_513_024...,/home/wojtekk23/Muzyka/magisterka_test/test_se...,vqvae0509.wav,model-original-no-style-pretraining-19-11-2023,/home/wojtekk23/Muzyka/magisterka_test/model-o...
3,227,/mnt/vdb/random_audios_patch_16k//data_794_073...,/mnt/vdb/random_audios_patch_16k//data_570_073...,/home/wojtekk23/Muzyka/magisterka_test/test_se...,vqvae0227.wav,model-leaky-relu-finetuned-style-pretraining-1...,/home/wojtekk23/Muzyka/magisterka_test/model-l...
4,954,/mnt/vdb/random_audios_patch_16k//data_574_017...,/mnt/vdb/random_audios_patch_16k//data_207_012...,/home/wojtekk23/Muzyka/magisterka_test/test_se...,vqvae0954.wav,model-leaky-relu-frozen-style-pretraining-15-1...,/home/wojtekk23/Muzyka/magisterka_test/model-l...


In [7]:
import ipywidgets as widgets
from IPython.display import Audio, display, clear_output
import ipywidgets
from ipywidgets import VBox, HBox, Button

# Output area rendered below the scoring form by the display(...) call at the end of this cell.
widget_output = widgets.Output()


def get_test_tuple_from_ix(ix):
    """Collect the audio paths and labels for one row of ``expanded_test_set``.

    Args:
        ix: Positional (``iloc``) index of the row to read.

    Returns:
        Tuple ``(ground_audio_path, inference_audio_path, model_name,
        melody_id, style_class, content_class)``. The last three items are
        strings cut out of the underscore-separated audio file names.

    Raises:
        ValueError: If a file name does not split into exactly three
            underscore-separated fields.
    """
    sample = expanded_test_set.iloc[ix]
    vqvae_basename, ground_audio_path, content_audio_path, model_name, model_path = (
        sample[column]
        for column in ('vqvae_basename', 'ground_truth', 'content', 'model', 'model_path')
    )
    # The generated audio lives in the model's output directory under the vqvae file name.
    inference_audio_path = os.path.join(model_path, vqvae_basename)

    def name_fields(audio_path):
        # Cut the trailing '.wav' off, keep the file name and split it on underscores.
        return os.path.basename(audio_path[:-len('.wav')]).split('_', 3)

    # Ground-truth name: second field is the melody id, third the style class.
    _, melody_id, style_class = name_fields(ground_audio_path)
    # Content name: only the third field (the content class) is needed.
    _, _, content_class = name_fields(content_audio_path)

    return (
        ground_audio_path,
        inference_audio_path,
        model_name,
        melody_id,
        style_class,
        content_class,
    )


# Paths and labels of the sample the scoring session starts from.
(
    ground_audio_path,
    inference_audio_path,
    model_name,
    melody_id,
    style_class,
    content_class,
) = get_test_tuple_from_ix(expanded_test_set_ix)

# Widget for Melody Score: integer slider from 1 to 5, starting at 3
melody_score = widgets.IntSlider(
    value=3, min=1, max=5, step=1,
    description=f'Melody Score for sample {expanded_test_set_ix}:',
    layout=widgets.Layout(width='50%'),
    style={'description_width': 'initial'},
)
def update_melody_score(melody_score, ix):
    """Reset the melody slider to 3 and relabel it for sample ``ix``."""
    label = f'Melody Score for sample {ix}:'
    melody_score.value = 3
    melody_score.description = label

# Widget for Style Score: integer slider from 1 to 5, starting at 3
style_score = widgets.IntSlider(
    value=3, min=1, max=5, step=1,
    description=f'Style Score for sample {expanded_test_set_ix}:',
    layout=widgets.Layout(width='50%'),
    style={'description_width': 'initial'},
)
def update_style_score(style_score, ix):
    """Reset the style slider to 3 and relabel it for sample ``ix``."""
    label = f'Style Score for sample {ix}:'
    style_score.value = 3
    style_score.description = label

# Widget for Overall Score: integer slider from 1 to 5, starting at 3
overall_score = widgets.IntSlider(
    value=3, min=1, max=5, step=1,
    description=f'Overall Score for sample {expanded_test_set_ix}:',
    layout=widgets.Layout(width='50%'),
    style={'description_width': 'initial'},
)
def update_overall_score(overall_score, ix):
    """Reset the overall slider to 3 and relabel it for sample ``ix``."""
    label = f'Overall Score for sample {ix}:'
    overall_score.value = 3
    overall_score.description = label

# Audio widgets
def _read_audio_bytes(audio_path):
    """Return the raw bytes of the file at ``audio_path`` and close the file afterwards."""
    with open(audio_path, 'rb') as audio:
        return audio.read()


# Player for the ground-truth recording of the current sample.
ground_audio_playback = widgets.Audio(
    value=_read_audio_bytes(ground_audio_path),
    format='wav',
    controls=True,
    loop=False,
    autoplay=False,
)
# Player for the audio generated by the model under evaluation.
inference_audio_playback = widgets.Audio(
    value=_read_audio_bytes(inference_audio_path),
    format='wav',
    controls=True,
    loop=False,
    autoplay=False,
)
def update_audio_widget(audio_widget, audio_path):
    """Load the file at ``audio_path`` and set its bytes as ``audio_widget.value``."""
    with open(audio_path, mode='rb') as wav_file:
        wav_bytes = wav_file.read()
    audio_widget.value = wav_bytes

# Submit Button: green button, half the container width
submit_button = widgets.Button(
    description="Submit Scores",
    button_style='success',
    layout={'width': '50%'},
)
def on_submit(_):
    """Store the current slider values and load the next sample into the widgets.

    Click handler of ``submit_button``. It appends one row to ``models_scores``
    for the sample at ``expanded_test_set_ix``, advances that index, and then
    refreshes the sliders and both audio players for the next sample. Once
    every row of ``expanded_test_set`` has been scored, the button is disabled
    and further clicks are ignored, so the last sample cannot be recorded twice.

    Args:
        _: The clicked button (unused).
    """
    # Only names that are rebound below need a ``global`` declaration.
    global models_scores
    global expanded_test_set_ix
    global content_class, style_class, melody_id, inference_audio_path, model_name

    # Nothing left to score: ignore stray clicks instead of re-recording the last sample.
    if expanded_test_set_ix >= len(expanded_test_set):
        return

    model_score = {
        'content_class': int(content_class),
        'style_class': int(style_class),
        'melody_id': int(melody_id),
        'inference_audio_path': inference_audio_path,
        'model_name': model_name,
        'melody_score': melody_score.value,
        'style_score': style_score.value,
        'overall_score': overall_score.value
    }
    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    # The first row replaces the empty table outright, so its integer dtypes are kept.
    new_row = pd.DataFrame([model_score], columns=models_scores.columns)
    if models_scores.empty:
        models_scores = new_row
    else:
        models_scores = pd.concat([models_scores, new_row], ignore_index=True)

    # Move on to the next sample
    expanded_test_set_ix += 1
    if expanded_test_set_ix >= len(expanded_test_set):
        # That was the last sample: stop accepting submissions.
        submit_button.disabled = True
        with widget_output:
            print('All samples have been scored.')
        return

    # ground_audio_path stays local: it is only needed to refresh the player below.
    ground_audio_path, inference_audio_path, model_name, melody_id, style_class, content_class = get_test_tuple_from_ix(expanded_test_set_ix)
    update_melody_score(melody_score, expanded_test_set_ix)
    update_style_score(style_score, expanded_test_set_ix)
    update_overall_score(overall_score, expanded_test_set_ix)
    update_audio_widget(ground_audio_playback, ground_audio_path)
    update_audio_widget(inference_audio_playback, inference_audio_path)
    

# Register the click callback on the submit button
submit_button.on_click(on_submit)

# Stack the two labelled players, the three sliders and the button vertically
scores_box_children = [
    widgets.Label(value="Ground audio playback:"),
    ground_audio_playback,
    widgets.Label(value="Inference audio playback:"),
    inference_audio_playback,
    melody_score,
    style_score,
    overall_score,
    submit_button,
]
scores_box = VBox(children=scores_box_children)
display(scores_box, widget_output)

VBox(children=(Label(value='Ground audio playback:'), Audio(value=b'RIFF\x90\x82\x03\x00WAVEfmt \x10\x00\x00\x…

Output()

In [9]:
# Preview the first five ratings collected in this session
models_scores.head(5)

Unnamed: 0,content_class,style_class,melody_id,model_name,inference_audio_path,melody_score,style_score,overall_score
0,82,67,404,model-leaky-relu-no-style-pretraining-13-11-2023,/home/wojtekk23/Muzyka/magisterka_test/model-l...,3,1,1
1,81,62,780,model-leaky-relu-no-style-pretraining-13-11-2023,/home/wojtekk23/Muzyka/magisterka_test/model-l...,5,1,2
2,7,73,179,model-original-finetuned-style-pretraining-22-...,/home/wojtekk23/Muzyka/magisterka_test/model-o...,5,5,5
3,85,98,117,model-leaky-relu-frozen-style-pretraining-15-1...,/home/wojtekk23/Muzyka/magisterka_test/model-l...,2,1,1
4,63,98,423,model-leaky-relu-frozen-style-pretraining-15-1...,/home/wojtekk23/Muzyka/magisterka_test/model-l...,4,1,1


In [10]:
# Checkpoint the scores collected so far; the file name records the sample index reached
scores_csv_path = f'mos_scores/models_scores_up_to_{expanded_test_set_ix}.csv'
models_scores.to_csv(scores_csv_path)

In [11]:
# Per-model averages for this session. The path column is dropped first:
# averaging a string column only produces a warning on older pandas and
# raises TypeError on pandas >= 2.0.
models_scores.drop(columns=['inference_audio_path']).groupby('model_name').mean()

  models_scores.groupby('model_name').mean()


Unnamed: 0_level_0,content_class,style_class,melody_id,melody_score,style_score,overall_score
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
model-leaky-relu-finetuned-style-pretraining-15-11-2023,32.875,74.75,423.0,3.875,1.375,1.375
model-leaky-relu-frozen-style-pretraining-15-11-2023,69.125,55.5,405.75,2.75,1.125,1.125
model-leaky-relu-no-style-pretraining-13-11-2023,72.625,62.125,535.5,3.125,1.375,1.5
model-original-finetuned-style-pretraining-22-11-2023,43.0,49.636364,474.909091,4.181818,4.181818,4.090909
model-original-frozen-style-pretraining-21-11-2023,58.555556,52.222222,594.555556,1.333333,2.222222,1.333333
model-original-no-style-pretraining-19-11-2023,48.0,44.6,578.6,2.8,2.8,2.4


# Combine all MOS results

In [6]:
from pathlib import Path


def checkpoint_index(path):
    """Return N for a checkpoint file named ``models_scores_up_to_N.csv``.

    Args:
        path: ``pathlib.Path`` of a checkpoint file.

    Returns:
        The integer after the last underscore of the file stem.
    """
    return int(path.stem.rsplit('_', 1)[-1])


# glob() yields files in arbitrary, filesystem-dependent order; sort by the
# numeric suffix so the checkpoints are always combined in scoring order.
mos_files = sorted(Path('mos_scores/').glob('models_scores_up_to_*.csv'), key=checkpoint_index)
mos_files

[PosixPath('mos_scores/models_scores_up_to_50.csv'),
 PosixPath('mos_scores/models_scores_up_to_100.csv'),
 PosixPath('mos_scores/models_scores_up_to_151.csv'),
 PosixPath('mos_scores/models_scores_up_to_201.csv'),
 PosixPath('mos_scores/models_scores_up_to_251.csv'),
 PosixPath('mos_scores/models_scores_up_to_300.csv')]

In [7]:
# Read every checkpoint and concatenate them once, instead of growing a
# DataFrame inside a loop starting from an empty frame.
combined_mos = pd.concat([pd.read_csv(mos_path) for mos_path in mos_files], ignore_index=True)

# Keep only the rating columns; this drops the unnamed index column that each CSV was saved with.
combined_mos = combined_mos[['content_class', 'style_class', 'melody_id', 'model_name', 'inference_audio_path', 'melody_score', 'style_score', 'overall_score']]
combined_mos.describe()

Unnamed: 0,content_class,style_class,melody_id,melody_score,style_score,overall_score
count,300.0,300.0,300.0,300.0,300.0,300.0
mean,53.62,53.88,495.36,3.123333,2.37,2.35
std,29.402675,33.228694,261.984842,1.188626,1.38546,1.229854
min,5.0,1.0,102.0,1.0,1.0,1.0
25%,27.0,20.0,241.0,2.0,1.0,1.0
50%,56.0,61.0,499.5,3.0,2.0,2.0
75%,83.0,81.0,760.0,4.0,3.0,3.0
max,101.0,102.0,996.0,5.0,5.0,5.0


In [8]:
# Save the combined MOS scores (RUN ONLY IF REGENERATION IS NEEDED)
# combined_mos.to_csv('mos_scores/combined_mos.csv')

In [9]:
# Sanity check: number of collected ratings per model
combined_mos.groupby(by='model_name').count()

Unnamed: 0_level_0,content_class,style_class,melody_id,inference_audio_path,melody_score,style_score,overall_score
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
model-leaky-relu-finetuned-style-pretraining-15-11-2023,50,50,50,50,50,50,50
model-leaky-relu-frozen-style-pretraining-15-11-2023,50,50,50,50,50,50,50
model-leaky-relu-no-style-pretraining-13-11-2023,50,50,50,50,50,50,50
model-original-finetuned-style-pretraining-22-11-2023,50,50,50,50,50,50,50
model-original-frozen-style-pretraining-21-11-2023,50,50,50,50,50,50,50
model-original-no-style-pretraining-19-11-2023,50,50,50,50,50,50,50


In [10]:
# Per-model averages over all sessions. The path column is dropped first:
# averaging a string column only produces a warning on older pandas and
# raises TypeError on pandas >= 2.0.
combined_mos.drop(columns=['inference_audio_path']).groupby('model_name').mean()

  combined_mos.groupby('model_name').mean()


Unnamed: 0_level_0,content_class,style_class,melody_id,melody_score,style_score,overall_score
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
model-leaky-relu-finetuned-style-pretraining-15-11-2023,53.62,53.88,495.36,3.14,1.46,1.7
model-leaky-relu-frozen-style-pretraining-15-11-2023,53.62,53.88,495.36,3.14,1.6,1.82
model-leaky-relu-no-style-pretraining-13-11-2023,53.62,53.88,495.36,3.38,1.64,1.94
model-original-finetuned-style-pretraining-22-11-2023,53.62,53.88,495.36,4.1,4.36,4.06
model-original-frozen-style-pretraining-21-11-2023,53.62,53.88,495.36,1.72,1.96,1.58
model-original-no-style-pretraining-19-11-2023,53.62,53.88,495.36,3.26,3.2,3.0


### Check the instruments excluded from pretraining ("bowed strings")

In [20]:
# Ratings whose style class lies in 41..48 (both ends included)
is_bowed_string = combined_mos['style_class'].between(41, 48)
combined_mos[is_bowed_string]

Unnamed: 0,content_class,style_class,melody_id,model_name,inference_audio_path,melody_score,style_score,overall_score
