# MOS calculation (Val2)

This jupyter notebook serves to assess subjective quality of the generated zero-shot validation set recordings.

In [1]:
import pandas as pd
import os

SAMPLE_RATE = 16_000

In [4]:
TEST_ROOT = '/mnt/vdb/'
TEST_SET_GROUND_AUDIOS = os.path.join(TEST_ROOT, 'random_audios_patch_16k/')
TEST_SET = os.path.join(TEST_ROOT, 'validation_set_2.csv')

# test_set_ground_truths = pd.read_csv(TEST_SET_GROUND_TRUTHS, header=None, names=['ground_truth'])
# test_set_ground_truths['ground_truth'] = test_set_ground_truths['ground_truth'].map(lambda path: os.path.join(TEST_SET_GROUND_AUDIOS, os.path.basename(path)))

test_set = pd.read_csv(TEST_SET, header=None, names=['content', 'style', 'ground_truth'], delimiter='\t')
# test_set['ground_truth'] = test_set['ground_truth'].map(lambda path: os.path.join(TEST_SET_GROUND_AUDIOS, os.path.basename(path)))
test_set['vqvae_basename'] = pd.Series([f'vqvae{ix:03}.wav' for ix in range(len(test_set))])
test_set.head()

Unnamed: 0,content,style,ground_truth,vqvae_basename
0,/mnt/vdb/random_audios_patch_16k//data_719_084...,/mnt/vdb/random_audios_patch_16k//data_850_042...,/mnt/vdb/random_audios_patch_16k//data_719_042...,vqvae000.wav
1,/mnt/vdb/random_audios_patch_16k//data_952_023...,/mnt/vdb/random_audios_patch_16k//data_212_045...,/mnt/vdb/random_audios_patch_16k//data_952_045...,vqvae001.wav
2,/mnt/vdb/random_audios_patch_16k//data_378_040...,/mnt/vdb/random_audios_patch_16k//data_361_042...,/mnt/vdb/random_audios_patch_16k//data_378_042...,vqvae002.wav
3,/mnt/vdb/random_audios_patch_16k//data_984_015...,/mnt/vdb/random_audios_patch_16k//data_712_049...,/mnt/vdb/random_audios_patch_16k//data_984_049...,vqvae003.wav
4,/mnt/vdb/random_audios_patch_16k//data_558_068...,/mnt/vdb/random_audios_patch_16k//data_582_044...,/mnt/vdb/random_audios_patch_16k//data_558_044...,vqvae004.wav


In [5]:
shuffled_test_set = test_set.sample(frac=1)
shuffled_test_set = shuffled_test_set.iloc[:50]
# shuffled_test_set.head()
len(shuffled_test_set)

50

In [6]:
MODELS = (
    'model-leaky-relu-finetuned-style-pretraining-15-11-2023',
    'model-leaky-relu-frozen-style-pretraining-15-11-2023',
    'model-leaky-relu-no-style-pretraining-13-11-2023',
    'model-original-finetuned-style-pretraining-22-11-2023',
    'model-original-frozen-style-pretraining-21-11-2023',
    'model-original-no-style-pretraining-19-11-2023',
)

MODELS_PATHS = {model: os.path.join('/home/wojtekk23/ss-vq-vae/experiments/outputs/', model, 'val2/outputs', model, 'val2') for model in MODELS}

In [8]:
expanded_test_set = pd.DataFrame()
new_rows = []

for index, row in shuffled_test_set.iterrows():
    for model, path in MODELS_PATHS.items():
        new_row = row.copy()
        new_row['model'] = model
        new_row['model_path'] = path
        new_rows.append(new_row)

expanded_test_set = pd.DataFrame(new_rows).sample(frac=1)

# Save the test set for MOS scores (RUN ONLY IF REGENERATION IS NEEDED)
# expanded_test_set.to_csv('mos_scores/val2_expanded_test_set.csv', encoding='utf-8')

## Evaluate scores

In [29]:
models_scores = pd.DataFrame(columns=['content_class', 'style_class', 'melody_id', 'model_name', 'inference_audio_path', 'melody_score', 'style_score', 'overall_score'])

# Set the index to whatever you ended up on (the biggest index in the mos_scores csv folder)
expanded_test_set_ix = 240

# Load the test set for MOS scores
expanded_test_set = pd.read_csv('mos_scores/val2_expanded_test_set.csv')
expanded_test_set.head()

Unnamed: 0.1,Unnamed: 0,content,style,ground_truth,vqvae_basename,model,model_path
0,753,/mnt/vdb/random_audios_patch_16k//data_890_034...,/mnt/vdb/random_audios_patch_16k//data_199_044...,/mnt/vdb/random_audios_patch_16k//data_890_044...,vqvae753.wav,model-original-no-style-pretraining-19-11-2023,/home/wojtekk23/ss-vq-vae/experiments/outputs/...
1,82,/mnt/vdb/random_audios_patch_16k//data_613_029...,/mnt/vdb/random_audios_patch_16k//data_494_046...,/mnt/vdb/random_audios_patch_16k//data_613_046...,vqvae082.wav,model-original-no-style-pretraining-19-11-2023,/home/wojtekk23/ss-vq-vae/experiments/outputs/...
2,782,/mnt/vdb/random_audios_patch_16k//data_902_032...,/mnt/vdb/random_audios_patch_16k//data_767_043...,/mnt/vdb/random_audios_patch_16k//data_902_043...,vqvae782.wav,model-leaky-relu-finetuned-style-pretraining-1...,/home/wojtekk23/ss-vq-vae/experiments/outputs/...
3,569,/mnt/vdb/random_audios_patch_16k//data_626_017...,/mnt/vdb/random_audios_patch_16k//data_773_046...,/mnt/vdb/random_audios_patch_16k//data_626_046...,vqvae569.wav,model-original-frozen-style-pretraining-21-11-...,/home/wojtekk23/ss-vq-vae/experiments/outputs/...
4,633,/mnt/vdb/random_audios_patch_16k//data_929_092...,/mnt/vdb/random_audios_patch_16k//data_236_045...,/mnt/vdb/random_audios_patch_16k//data_929_045...,vqvae633.wav,model-leaky-relu-frozen-style-pretraining-15-1...,/home/wojtekk23/ss-vq-vae/experiments/outputs/...


In [30]:
import ipywidgets as widgets
from IPython.display import Audio, display, clear_output
import ipywidgets
from ipywidgets import VBox, HBox, Button

widget_output = ipywidgets.Output()


def get_test_tuple_from_ix(ix):
    row = expanded_test_set.iloc[ix]
    vqvae_basename = row['vqvae_basename']
    ground_audio_path = row['ground_truth']
    content_audio_path = row['content']
    model_name = row['model']
    model_path = row['model_path']
    inference_audio_path = os.path.join(model_path, vqvae_basename)

    _, melody_id, style_class = os.path.basename(ground_audio_path[:-len('.wav')]).split('_', 3)
    _, _, content_class = os.path.basename(content_audio_path[:-len('.wav')]).split('_', 3)
    
    return ground_audio_path, inference_audio_path, model_name, melody_id, style_class, content_class


ground_audio_path, inference_audio_path, model_name, melody_id, style_class, content_class = get_test_tuple_from_ix(expanded_test_set_ix)

# Widget for Melody Score
melody_score = widgets.IntSlider(
    value=3,
    min=1,
    max=5,
    step=1,
    description=f'Melody Score for sample {expanded_test_set_ix}:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%')
)
def update_melody_score(melody_score, ix):
    melody_score.value = 3
    melody_score.description = f'Melody Score for sample {ix}:'

# Widget for Style Score
style_score = widgets.IntSlider(
    value=3,
    min=1,
    max=5,
    step=1,
    description=f'Style Score for sample {expanded_test_set_ix}:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%')
)
def update_style_score(style_score, ix):
    style_score.value = 3
    style_score.description = f'Style Score for sample {ix}:'

# Widget for Overall Score
overall_score = widgets.IntSlider(
    value=3,
    min=1,
    max=5,
    step=1,
    description=f'Overall Score for sample {expanded_test_set_ix}:',
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%')
)
def update_overall_score(overall_score, ix):
    overall_score.value = 3
    overall_score.description = f'Overall Score for sample {ix}:'

# Audio widgets
ground_audio_playback = widgets.Audio(
    value=open(ground_audio_path, 'rb').read(),
    format='wav',
    controls=True,
    loop=False,
    autoplay=False,
)
inference_audio_playback = widgets.Audio(
    value=open(inference_audio_path, 'rb').read(),
    format='wav',
    controls=True,
    loop=False,
    autoplay=False,
)
def update_audio_widget(audio_widget, audio_path):
    with open(audio_path, 'rb') as audio:
        audio_widget.value = audio.read()

# Submit Button
submit_button = Button(description="Submit Scores",
                       button_style='success',
                       layout={'width': '50%'})
def on_submit(_):
    global models_scores
    global expanded_test_set_ix
    global ground_audio_playback
    global inference_audio_playback
    global melody_score
    global style_score
    global overall_score
    global content_class, style_class, melody_id, inference_audio_path, model_name
    
    model_score = {
        'content_class': int(content_class),
        'style_class': int(style_class),
        'melody_id': int(melody_id),
        'inference_audio_path': inference_audio_path,
        'model_name': model_name,
        'melody_score': melody_score.value,
        'style_score': style_score.value,
        'overall_score': overall_score.value
    }
    models_scores = models_scores.append(model_score, ignore_index=True)

    # Update the scores
    expanded_test_set_ix += 1
    ground_audio_path, inference_audio_path, model_name, melody_id, style_class, content_class = get_test_tuple_from_ix(expanded_test_set_ix)
    update_melody_score(melody_score, expanded_test_set_ix)
    update_style_score(style_score, expanded_test_set_ix)
    update_overall_score(overall_score, expanded_test_set_ix)
    update_audio_widget(ground_audio_playback, ground_audio_path)
    update_audio_widget(inference_audio_playback, inference_audio_path)
    

# Attach the event handler to the submit button
submit_button.on_click(on_submit)

# Layout the widgets
scores_box = VBox([
    widgets.Label(value="Ground audio playback:"),
    ground_audio_playback, 
    widgets.Label(value="Inference audio playback:"),
    inference_audio_playback, 
    melody_score,
    style_score,
    overall_score,
    submit_button
])
display(scores_box, widget_output)

VBox(children=(Label(value='Ground audio playback:'), Audio(value=b'RIFF\xe4&\x04\x00WAVEfmt \x10\x00\x00\x00\…

Output()

In [31]:
models_scores.head()

Unnamed: 0,content_class,style_class,melody_id,model_name,inference_audio_path,melody_score,style_score,overall_score
0,67,46,903,model-original-no-style-pretraining-19-11-2023,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,2,2,2
1,101,47,254,model-leaky-relu-no-style-pretraining-13-11-2023,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,2,1,1
2,98,44,348,model-original-frozen-style-pretraining-21-11-...,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,1,2,1
3,16,46,940,model-leaky-relu-finetuned-style-pretraining-1...,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,1,1,1
4,104,44,219,model-leaky-relu-no-style-pretraining-13-11-2023,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,4,1,2


In [32]:
# Save the last calculated mos scores
models_scores.to_csv(f'mos_scores/val2_models_scores_up_to_{expanded_test_set_ix}.csv')

In [33]:
models_scores.groupby('model_name').mean()

  models_scores.groupby('model_name').mean()


Unnamed: 0_level_0,content_class,style_class,melody_id,melody_score,style_score,overall_score
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
model-leaky-relu-finetuned-style-pretraining-15-11-2023,60.5,45.375,823.25,3.0,1.25,1.5
model-leaky-relu-frozen-style-pretraining-15-11-2023,60.545455,45.363636,705.636364,3.0,1.545455,1.636364
model-leaky-relu-no-style-pretraining-13-11-2023,55.222222,44.333333,516.555556,3.888889,1.333333,1.555556
model-original-finetuned-style-pretraining-22-11-2023,51.444444,44.777778,723.777778,4.0,3.444444,3.222222
model-original-frozen-style-pretraining-21-11-2023,77.727273,45.181818,543.090909,1.272727,1.818182,1.181818
model-original-no-style-pretraining-19-11-2023,66.833333,45.25,531.0,3.083333,2.25,2.166667


# Combine all MOS results

In [34]:
from pathlib import Path

mos_files = list(Path('mos_scores/').glob('val2_models_scores_up_to_*.csv'))
mos_files

[PosixPath('mos_scores/val2_models_scores_up_to_240.csv'),
 PosixPath('mos_scores/val2_models_scores_up_to_300.csv'),
 PosixPath('mos_scores/val2_models_scores_up_to_111.csv'),
 PosixPath('mos_scores/val2_models_scores_up_to_51.csv'),
 PosixPath('mos_scores/val2_models_scores_up_to_178.csv')]

In [36]:
combined_mos = pd.DataFrame()

for f in mos_files:
    mos_file = pd.read_csv(f)
    combined_mos = pd.concat([combined_mos, mos_file], ignore_index=True)

combined_mos = combined_mos[['content_class', 'style_class', 'melody_id', 'model_name', 'inference_audio_path', 'melody_score', 'style_score', 'overall_score']]
combined_mos.describe()

Unnamed: 0,content_class,style_class,melody_id,melody_score,style_score,overall_score
count,300.0,300.0,300.0,300.0,300.0,300.0
mean,57.94,45.34,566.78,3.196667,1.996667,2.01
std,30.725939,2.654787,269.453873,1.289781,1.052147,1.039311
min,8.0,41.0,116.0,1.0,1.0,1.0
25%,31.0,44.0,348.0,2.0,1.0,1.0
50%,65.0,46.0,545.0,4.0,2.0,2.0
75%,87.0,47.0,796.0,4.0,3.0,3.0
max,104.0,50.0,985.0,5.0,5.0,5.0


In [38]:
# combined_mos.to_csv('mos_scores/val2_combined_mos.csv')

In [39]:
combined_mos.groupby('model_name').count()

Unnamed: 0_level_0,content_class,style_class,melody_id,inference_audio_path,melody_score,style_score,overall_score
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
model-leaky-relu-finetuned-style-pretraining-15-11-2023,50,50,50,50,50,50,50
model-leaky-relu-frozen-style-pretraining-15-11-2023,50,50,50,50,50,50,50
model-leaky-relu-no-style-pretraining-13-11-2023,50,50,50,50,50,50,50
model-original-finetuned-style-pretraining-22-11-2023,50,50,50,50,50,50,50
model-original-frozen-style-pretraining-21-11-2023,50,50,50,50,50,50,50
model-original-no-style-pretraining-19-11-2023,50,50,50,50,50,50,50


In [40]:
combined_mos.groupby('model_name').mean()

  combined_mos.groupby('model_name').mean()


Unnamed: 0_level_0,content_class,style_class,melody_id,melody_score,style_score,overall_score
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
model-leaky-relu-finetuned-style-pretraining-15-11-2023,57.94,45.34,566.78,3.44,1.32,1.54
model-leaky-relu-frozen-style-pretraining-15-11-2023,57.94,45.34,566.78,3.36,1.62,1.84
model-leaky-relu-no-style-pretraining-13-11-2023,57.94,45.34,566.78,3.88,1.66,1.96
model-original-finetuned-style-pretraining-22-11-2023,57.94,45.34,566.78,3.96,3.32,3.28
model-original-frozen-style-pretraining-21-11-2023,57.94,45.34,566.78,1.38,1.76,1.2
model-original-no-style-pretraining-19-11-2023,57.94,45.34,566.78,3.16,2.3,2.24


### Check the instruments excluded from pretraining ("bowed strings")

In [41]:
combined_mos[(combined_mos['style_class'] >= 41) & (combined_mos['style_class'] <= 48)]

Unnamed: 0,content_class,style_class,melody_id,model_name,inference_audio_path,melody_score,style_score,overall_score
0,78,48,729,model-leaky-relu-frozen-style-pretraining-15-1...,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,5,1,2
1,79,42,472,model-original-no-style-pretraining-19-11-2023,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,4,2,2
2,92,47,513,model-original-finetuned-style-pretraining-22-...,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,2,1,1
3,53,41,777,model-original-frozen-style-pretraining-21-11-...,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,1,3,1
4,8,48,895,model-original-no-style-pretraining-19-11-2023,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,1,3,1
...,...,...,...,...,...,...,...,...
295,37,45,272,model-leaky-relu-finetuned-style-pretraining-1...,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,3,1,1
296,23,41,397,model-original-frozen-style-pretraining-21-11-...,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,1,2,1
297,14,48,258,model-leaky-relu-no-style-pretraining-13-11-2023,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,5,1,2
298,23,41,397,model-leaky-relu-no-style-pretraining-13-11-2023,/home/wojtekk23/ss-vq-vae/experiments/outputs/...,4,3,3
