In [1]:
import pandas as pd
import mido
import os
from tqdm import tqdm
import matplotlib.pyplot as plt

In [2]:
# Root folder of the MAESTRO v3.0.0 dataset. Later cells join the CSV's
# `midi_filename` values onto this folder, so it must be defined before they run.
# NOTE(review): assumes the MIDI paths are relative to the folder holding the CSV — confirm.
MAESTRO_FOLDER = 'maestro-v3.0.0'
maestro_csv = os.path.join(MAESTRO_FOLDER, 'maestro-v3.0.0.csv')
maestro_df = pd.read_csv(maestro_csv)
maestro_df

FileNotFoundError: [Errno 2] No such file or directory: 'maestro-v3.0.0/maestro-v3.0.0.csv'

# Get the pedal events for a single MIDI file

In [None]:
# Resolve the first performance listed in the metadata to a path on disk.
first_midi_name = maestro_df['midi_filename'].iloc[0]
example_midi = os.path.join(MAESTRO_FOLDER, first_midi_name)
print(example_midi)

for message in mido.MidiFile(example_midi):
    # Only controller-64 (sustain pedal) messages are of interest here.
    if not message.is_cc(64):
        continue
    assert message.channel == 0
    print(message)
    # `time` is the gap since the previous message, not how long this message lasts
    print(message.value, message.time)

## Get a subset of Maestro

In [None]:
# Draw a reproducible random 1/50 sample of the performances (fixed seed).
maestro_subset = maestro_df.sample(
    frac=1 / 50,
    random_state=42,
)

In [None]:
# process pedal events into a list
def add_pedal_events_col(df):
    """Return a copy of ``df`` with a ``pedal_events`` column added.

    Each entry of ``df['midi_filename']`` is joined onto the module-level
    ``MAESTRO_FOLDER`` and opened with ``mido``. For every controller-64
    (sustain pedal) message a ``(level, duration)`` row is recorded, where
    ``level`` is the message value and ``duration`` is the summed ``time`` of
    all messages after it, up to and including the next pedal message (or the
    end of the file for the last one).

    Notes:
        * Rows of each per-file table are in reverse file order, because the
          messages are scanned back-to-front and appended as they are found.
        * Time accumulated before the earliest pedal message is never
          recorded, so the durations need not sum to the full file length.

    Args:
        df: DataFrame with a ``midi_filename`` column.

    Returns:
        A new DataFrame (via ``DataFrame.assign``) whose ``pedal_events``
        column holds one DataFrame per row with columns ``level`` and
        ``duration``.
    """

    def _events_for(midi_path):
        # Walk the file backwards so the time until the *next* pedal message is
        # already known when each pedal message is reached.
        rows = []
        gap = 0
        for msg in list(mido.MidiFile(midi_path))[::-1]:
            is_sustain = msg.type == 'control_change' and msg.control == 64
            if is_sustain:
                rows.append((msg.value, gap))
                gap = 0  # start counting afresh for the preceding pedal message
            # A message's `time` is the gap before it, so it counts toward
            # whichever pedal message precedes it in the file.
            gap += msg.time
        return pd.DataFrame(rows, columns=['level', 'duration'])

    per_file_events = [
        _events_for(os.path.join(MAESTRO_FOLDER, midi_base))
        for midi_base in tqdm(df['midi_filename'])
    ]
    return df.assign(pedal_events=per_file_events)

# Attach the per-file pedal-event tables to the sampled performances.
maestro_subset = maestro_subset.pipe(add_pedal_events_col)
maestro_subset['pedal_events']

In [None]:
# Stack every sampled performance's pedal-event table into one long frame.
pedal_frames = list(maestro_subset['pedal_events'])
all_events = pd.concat(pedal_frames, ignore_index=True)
all_events

In [None]:
# Number of pedal events pooled across the sampled performances.
# (`level_times` was never defined in this notebook; `all_events` is the frame being plotted.)
print(len(all_events))
# One bin per integer pedal level 0..127.
counts, bin_edges, patches = plt.hist(all_events['level'], bins=range(129))
plt.xlabel("Pedal Level")
plt.ylabel("Count")
plt.title(f"Distribution of Pedal Level in Pedal Events on {len(maestro_subset)} random samples out of {len(maestro_df)}")
plt.show()

## Same Graph, weighted by time

In [None]:


def plot_pedal_level_time(all_events):
    """Plot total time spent at each pedal level and print summary fractions.

    Draws two side-by-side histograms of ``level`` weighted by ``duration``:
    the full 0-127 range, and the same data with levels 0 and 127 removed so
    the intermediate levels are visible. It then prints the share of the total
    duration spent at level 0, at level 127, and at levels 1-126, followed by
    the total duration.

    Args:
        all_events: DataFrame with a ``level`` column and a ``duration`` column.

    Returns:
        None. A figure is shown and the summary is printed to stdout.
    """
    fig, axes = plt.subplots(1, 2, figsize=(8, 4))  # 1 row, 2 columns
    cropped_events = all_events[(0 < all_events['level']) & (all_events['level'] < 127)]
    axes[0].hist(all_events['level'], bins=range(0, 129), weights=all_events['duration'])
    axes[1].hist(cropped_events['level'], bins=range(1, 128), weights=cropped_events['duration'])
    axes[0].set_title("Total Pedal Time per Pedal Level")
    axes[1].set_title("Same Plot, Cropped off 0 and 127")
    for ax in axes:
        ax.set_xlabel("Pedal Level")
        ax.set_ylabel("Total Duration")

    # Adjust layout
    plt.tight_layout()
    plt.show()
    events_0 = all_events[all_events['level'] == 0]
    events_127 = all_events[all_events['level'] == 127]

    total_duration = all_events['duration'].sum()
    if total_duration > 0:
        fraction_no_pedal = events_0['duration'].sum() / total_duration
        fraction_full_pedal = events_127['duration'].sum() / total_duration
    else:
        # No recorded pedal time (e.g. an empty event table): the fractions are
        # undefined, so report NaN instead of dividing by zero.
        fraction_no_pedal = fraction_full_pedal = float('nan')
    print(f"Fraction no pedal: {fraction_no_pedal:.3f}")
    print(f"Fraction full pedal: {fraction_full_pedal:.3f}")
    print(f"Fraction 1-126 pedal: {1-fraction_no_pedal-fraction_full_pedal:.3f}")
    print(f"Total duration: {total_duration:.3f}")

# Aggregate view: pedal events pooled from every sampled performance.
plot_pedal_level_time(all_events)

## Half Pedaling Peak
There's a sharp peak at exactly 64; we suspect this corresponds to pianists half-pedaling.

In [None]:
# Inspect the sampled performances together with their `pedal_events` column.
maestro_subset

In [None]:
# Count the pedal events recorded for each sampled performance.
maestro_subset['num_pedal_events'] = list(map(len, maestro_subset['pedal_events']))
maestro_subset['num_pedal_events'].describe()

In [None]:
# Normalise the event count by each performance's `duration` value.
maestro_subset['num_pedal_events_normalized'] = maestro_subset['num_pedal_events'].div(
    maestro_subset['duration']
)
maestro_subset['num_pedal_events_normalized'].describe()

In [None]:
# Widen text columns so long titles and file names are not truncated.
pd.options.display.max_colwidth = 150
# Performances with the most pedal events per unit of `duration` come first.
maestro_subset.sort_values('num_pedal_events_normalized', ascending=False)

In [None]:
# Select every performance of the Ballade, then keep the first one listed per year.
is_ballade = maestro_df['canonical_title'].eq('Ballade No. 1 in G Minor, Op. 23')
ballade_df = maestro_df.loc[is_ballade].drop_duplicates(subset=['year'], keep='first')
ballade_df

# Compare Ballade Performances

In [None]:
# Attach the per-file pedal-event tables to the selected Ballade performances.
ballade_df = ballade_df.pipe(add_pedal_events_col)

In [None]:
# Inspect the selected Ballade performances together with their `pedal_events` column.
ballade_df

In [None]:
# Plot the pedal-level profile of the first three Ballade performances.
for ind, row in ballade_df.head(3).iterrows():
    print("Performance ind:", ind)
    print("File:", row['midi_filename'])
    plot_pedal_level_time(row['pedal_events'])

# fig, axes = plt.subplots(1, 2, figsize=(8, 4))
# for _, row in list(ballade_df.iterrows())[:2]:
#     # check that pedal events duration matches up with duration given in the data
#     print(sum(row['pedal_events']['duration']), row['duration'])
#     pedal_events = row['pedal_events']
#     cropped_events = pedal_events[(pedal_events['level'] > 0) & (pedal_events['level'] < 127)]
#     axes[0].hist(pedal_events['level'], bins=range(129), weights=row['pedal_events']['duration'], alpha=0.5)
#     axes[1].hist(cropped_events['level'], alpha=0.2, bins=range(1, 128), weights=cropped_events['duration'])
# plt.show()