# Dataset analyses

## Random audios (MIDI Dataset)

In [11]:
import pandas as pd

def get_split_stats(file_path):
    """Summarise one split manifest file.

    The file is read as a header-less, tab-separated table whose first three
    columns are used:

    * column 0 -- content file name; the three digits in ``data_<ddd>_`` are
      taken as the melody ID,
    * column 1 -- style file name; the three digits in ``_<ddd>.wav`` are
      taken as the instrument ID,
    * column 2 -- result file name.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the manifest.

    Returns:
        A tuple ``(n_result_files, content_melodies, style_instruments)``:
        the number of distinct values in column 2, the distinct melody IDs and
        the distinct instrument IDs. IDs are kept as zero-padded strings, so
        ``min()``/``max()`` on them compare lexicographically.
    """
    data = pd.read_csv(file_path, delimiter='\t', header=None)
    # A name that does not match the pattern extracts to NaN. Drop those rows
    # so NaN is neither counted as an ID nor mixed into the string comparisons
    # callers perform with min()/max().
    content_melodies = data[0].str.extract(r'data_(\d{3})_')[0].dropna().unique()
    style_instruments = data[1].str.extract(r'_(\d{3})\.wav')[0].dropna().unique()
    result_files = data[2].unique()
    return len(result_files), content_melodies, style_instruments

# Display name of each split -> path of its tab-separated manifest file.
splits = dict(
    Train='../data/random_audios/splits/training_set.csv',
    Val1='../data/random_audios/splits/validation_set_1.csv',
    Val2='../data/random_audios/splits/validation_set_2.csv',
    Test='../data/random_audios/splits/test_set.csv',
)

# One summary row per split, filled in the order the splits are declared.
stats = {}
for name, manifest_path in splits.items():
    n_results, melodies, instruments = get_split_stats(manifest_path)

    # Hard-coded annotation (not derived from the data): only Val2 is labelled
    # as the split made of instruments 041-050.
    if name == 'Val2':
        instruments_note = 'Only 041-050'
    else:
        instruments_note = 'Excludes 041-050'

    row = {}
    row['Number of Samples'] = n_results
    row['Melody ID Range'] = "{}-{}".format(melodies.min(), melodies.max())
    row['Unique Melody IDs'] = len(melodies)
    row['Instrument ID Range'] = "{}-{}".format(instruments.min(), instruments.max())
    row['Unique Instrument IDs'] = len(instruments)
    row['Instruments Notes'] = instruments_note
    stats[name] = row

# Splits become the index, the metrics become the columns.
stats_df = pd.DataFrame.from_dict(stats, orient='index')

In [12]:
# Show the per-split summary table (one row per split, one column per metric).
stats_df

Unnamed: 0,Number of Samples,Melody ID Range,Unique Melody IDs,Instrument ID Range,Unique Instrument IDs,Instruments Notes
Train,35235,101-999,899,001-104,94,Excludes 041-050
Val1,838,101-999,554,001-104,94,Excludes 041-050
Val2,799,101-997,545,041-050,10,Only 041-050
Test,1643,101-999,757,001-104,94,Excludes 041-050


In [13]:
# Print the transposed summary (metrics as rows, splits as columns) as LaTeX source.
print(stats_df.transpose().to_latex())

\begin{tabular}{lllll}
\toprule
{} &             Train &              Val1 &          Val2 &              Test \\
\midrule
Number of Samples     &             35235 &               838 &           799 &              1643 \\
Melody ID Range       &           101-999 &           101-999 &       101-997 &           101-999 \\
Unique Melody IDs     &               899 &               554 &           545 &               757 \\
Instrument ID Range   &           001-104 &           001-104 &       041-050 &           001-104 \\
Unique Instrument IDs &                94 &                94 &            10 &                94 \\
Instruments Notes     &  Excludes 041-050 &  Excludes 041-050 &  Only 041-050 &  Excludes 041-050 \\
\bottomrule
\end{tabular}



  print(stats_df.T.to_latex())
