# Data Augmentation

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Install and load packages

In [2]:
!pip install pretty-midi

Collecting pretty-midi
  Downloading pretty_midi-0.2.10.tar.gz (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting mido>=1.1.16 (from pretty-midi)
  Downloading mido-1.3.3-py3-none-any.whl.metadata (6.4 kB)
Downloading mido-1.3.3-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.6/54.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pretty-midi
  Building wheel for pretty-midi (setup.py) ... [?25l[?25hdone
  Created wheel for pretty-midi: filename=pretty_midi-0.2.10-py3-none-any.whl size=5592286 sha256=3dfe73897cf7216e901db47f80018f054073c6ab4144bc493ad4b737c40d2e15
  Stored in directory: /root/.cache/pip/wheels/e6/95/ac/15ceaeb2823b04d8e638fd1495357adb8d26c00ccac9d7782e
Successfully built pretty-midi
Installing collected packages: mido, pretty-midi
Successf

## Load Files

In [3]:
import zipfile
import os

zip_path = '/content/drive/MyDrive/midiclassics.zip'
extract_path = '/content/'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

# Check folder structure
os.listdir(extract_path)

['.config',
 'Rothchlid Symphony Rmw12 3mov.mid',
 'Beethoven',
 'Clarke',
 'Prokofiev',
 'Clementi',
 'Paradisi',
 'Scarlatti',
 'Debussy Suite Bergamasque 1mov.mid',
 'Busser',
 'MacCunn',
 'Peterson-Berger',
 'Alkan',
 'MacBeth',
 "Rimsky Korsakov ''Flight Of the Bumblebee''.mid",
 'Tchaikovsky Lake Of The Swans Act 1 5mov.mid',
 'Borodin',
 'Rothchild Piano Sonata Rmw13 2mov.mid',
 'Tchaikovsky Lake Of The Swans Act 1 1mov.mid',
 'Herold',
 'Rothchild Symphony Rmw12 1mov.mid',
 'Diabelli Sonatina op151 n2 2mov.mid',
 'Vivaldi',
 'Komzak',
 'Tchaikovsky Lake Of The Swans Act 2 13mov.mid',
 'Liszt Bach Prelude Transcription.mid',
 'Handel',
 'Ravel',
 'Botsford',
 'Bizet Symphony in C 4mov.mid',
 'Holst, M',
 '.DS_Store',
 'Gershuin Rhapsody In Blue Piano Duet.mid',
 'Buxethude Buxwv138 Prelude.mid',
 'Strauss, J',
 'Debussy',
 'Chabrier',
 'Bellini',
 'Messager',
 'Sarasate',
 'Ginastera',
 'Bizet Symphony in C 2mov.mid',
 'Liszt Paganini Etude n3.mid',
 'Reger Burlesque op58 n3.mid

## Data Pre-Processing

### Augmentation

In [4]:
import os
import glob
import warnings

warnings.filterwarnings("ignore")

def gather_distinct_midis(folder_path):
    midi_path_set = set()
    distinct_midi_files = []

    midi_file_list = glob.glob(os.path.join(folder_path, '**', '*.mid'), recursive=True)

    for midi in midi_file_list:
        full_path = os.path.abspath(midi)
        if full_path not in midi_path_set:
            midi_path_set.add(full_path)
            distinct_midi_files.append(full_path)

    return distinct_midi_files

In [5]:
import pretty_midi
import os
import random

def augment_midi_folder(input_folder, output_folder, num_augments=5):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    midi_list = glob.glob(os.path.join(input_folder, '**', '*.mid'), recursive=True)
    for filename in midi_list:
        if filename.endswith(".mid") or filename.endswith(".midi"):
            # file_path = os.path.join(input_folder, filename)
            base_name = filename.split('/')[-1].split(".mid")[0]

            try:
                midi = pretty_midi.PrettyMIDI(filename)
            except Exception as e:
                print(f"Error reading {filename}: {e}")
                continue

            for i in range(num_augments):
                augmented = pretty_midi.PrettyMIDI()
                for instrument in midi.instruments:
                    new_instrument = pretty_midi.Instrument(program=instrument.program, is_drum=instrument.is_drum)
                    for note in instrument.notes:
                        # Pitch shift
                        pitch_shift = random.choice([-2, -1, 0, 1, 2])
                        new_pitch = min(max(note.pitch + pitch_shift, 0), 127)

                        # Velocity variation
                        velocity_variation = random.randint(-10, 10)
                        new_velocity = min(max(note.velocity + velocity_variation, 0), 127)

                        # Tempo scaling
                        time_scale = random.choice([0.9, 1.0, 1.1])
                        start = note.start * time_scale
                        end = note.end * time_scale

                        new_note = pretty_midi.Note(
                            velocity=new_velocity,
                            pitch=new_pitch,
                            start=start,
                            end=end
                        )
                        new_instrument.notes.append(new_note)

                    augmented.instruments.append(new_instrument)
                output_file = os.path.join(output_folder, f"{base_name}_aug{i}.mid")
                augmented.write(output_file)


In [6]:
augment_midi_folder("/content/Mozart", "/content/Mozart_augmented", num_augments=4)
augment_midi_folder("/content/Beethoven", "/content/Beethoven_augmented", num_augments=5)
augment_midi_folder("/content/Chopin", "/content/Chopin_augmented", num_augments=6)

Error reading /content/Mozart/Piano Sonatas/Nueva carpeta/K281 Piano Sonata n03 3mov.mid: Could not decode key with 2 flats and mode 2
Error reading /content/Beethoven/Anhang 14-3.mid: Could not decode key with 3 flats and mode 255


In [7]:
# Bach directory
bach_folder_path = '/content/Bach'
bach_midi_list = gather_distinct_midis(bach_folder_path)
print(f"Number of MIDI files found for Bach: {len(bach_midi_list)}")

Number of MIDI files found for Bach: 925


In [8]:
# Beethoven directory
beethoven_folder_path = '/content/Beethoven_augmented'
beethoven_midi_list = gather_distinct_midis(beethoven_folder_path)
print(f"Number of MIDI files found for Beethoven: {len(beethoven_midi_list)}")

Number of MIDI files found for Beethoven: 1055


In [9]:
# Chopin directory
chopin_folder_path = '/content/Chopin_augmented'
chopin_midi_list = gather_distinct_midis(chopin_folder_path)
print(f"Number of MIDI files found for Chopin: {len(chopin_midi_list)}")

Number of MIDI files found for Chopin: 816


In [10]:
# Mozart directory
mozart_folder_path = '/content/Mozart_augmented'
mozart_midi_list = gather_distinct_midis(mozart_folder_path)
print(f"Number of MIDI files found for Mozart: {len(mozart_midi_list)}")

Number of MIDI files found for Mozart: 1024


In [11]:
from music21 import converter, note, chord, tempo, meter
import numpy as np
from fractions import Fraction

# Utility Functions

def to_float(value):
    if isinstance(value, Fraction):
        return float(value)
    return float(value)

def complete_chord_row(chord_row, max_pitches=4, fill_note=0):
    start_time = to_float(chord_row[0])
    chord_pitches = chord_row[1:]
    chord_pitches += [fill_note] * (max_pitches - len(chord_pitches))
    return [start_time] + chord_pitches

def pad_feature_table(feature_rows, expected_length, fill_note=0):
    result = []
    for row in feature_rows:
        row = row + [fill_note] * (expected_length - len(row))
        result.append(row)
    return np.array(result, dtype=float)

In [12]:
# Feature Extraction

def extract_score_data(parsed_score, max_chord_count=4):
    note_data            = []
    chord_data           = []
    tempo_data           = []
    rhythm_pattern_data  = []
    time_signature_data  = []

    for item in parsed_score.flat:
        if isinstance(item, note.Note):
            note_data.append([
                to_float(item.offset),
                item.pitch.midi,
                to_float(item.quarterLength),
                item.volume.realized
            ])
            rhythm_pattern_data.append([
                to_float(item.offset),
                to_float(item.quarterLength)
            ])
        elif isinstance(item, chord.Chord):
            this_chord = [to_float(item.offset)] + [p.midi for p in item.pitches]
            chord_data.append(complete_chord_row(this_chord, max_pitches=max_chord_count))

    for t in parsed_score.flat.getElementsByClass(tempo.MetronomeMark):
        tempo_data.append([to_float(t.offset), t.number])

    for sig in parsed_score.flat.getElementsByClass(meter.TimeSignature):
        time_signature_data.append([to_float(sig.offset), sig.numerator, sig.denominator])

    max_col_notes      = max((len(r) for r in note_data), default=0)
    max_col_chords     = max((len(r) for r in chord_data), default=0)
    max_col_tempo      = max((len(r) for r in tempo_data), default=0)
    max_col_rhythm     = max((len(r) for r in rhythm_pattern_data), default=0)
    max_col_timesig    = max((len(r) for r in time_signature_data), default=0)

    expected_cols = max(max_col_notes, max_col_chords, max_col_tempo, max_col_rhythm, max_col_timesig)

    note_array         = pad_feature_table(note_data, expected_cols)
    chord_array        = pad_feature_table(chord_data, expected_cols)
    tempo_array        = pad_feature_table(tempo_data, expected_cols)
    rhythm_array       = pad_feature_table(rhythm_pattern_data, expected_cols)
    timesig_array      = pad_feature_table(time_signature_data, expected_cols)

    min_row_count = min(
        len(note_array), len(chord_array), len(tempo_array),
        len(rhythm_array), len(timesig_array)
    )

    note_array     = note_array[:min_row_count]
    chord_array    = chord_array[:min_row_count]
    tempo_array    = tempo_array[:min_row_count]
    rhythm_array   = rhythm_array[:min_row_count]
    timesig_array  = timesig_array[:min_row_count]

    def force_2d(matrix):
        if matrix.ndim == 1:
            return matrix.reshape(-1, 1)
        return matrix

    note_array     = force_2d(note_array)
    chord_array    = force_2d(chord_array)
    tempo_array    = force_2d(tempo_array)
    rhythm_array   = force_2d(rhythm_array)
    timesig_array  = force_2d(timesig_array)

    combined_array = np.hstack([note_array, chord_array, tempo_array, rhythm_array, timesig_array]) \
        if min_row_count > 0 else np.empty((0, 0))
    return combined_array


In [13]:
# Collect features for composers
from concurrent.futures import ProcessPoolExecutor
def process_single_midi(args):
    composer_title, midi_path = args
    try:
        music_score = converter.parse(midi_path)
        data_matrix = extract_score_data(music_score)
        return data_matrix, composer_title, midi_path
    except Exception:
        return None

    return np.array(all_composer_features, dtype=object), np.array(composer_label_list), np.array(midi_used_files)

def collect_composer_data(composer_title, midi_path_list, max_files=300):
    results = []
    args_list = [(composer_title, path) for path in midi_path_list[:max_files]]
    print(args_list)
    with ProcessPoolExecutor() as executor:
        for result in executor.map(process_single_midi, args_list):
            if result:
                results.append(result)

    if not results:
        return np.array([]), np.array([]), np.array([])

    features, labels, paths = zip(*results)
    return np.array(features, dtype=object), np.array(labels), np.array(paths)


In [14]:
bach_feature_arrays, bach_label_list, bach_file_list = collect_composer_data("Bach", bach_midi_list)
print(f"Bach dataset: Feature arrays extracted from {len(bach_feature_arrays)} files.")

[('Bach', '/content/Bach/Bwv0992 Capriccio.mid'), ('Bach', '/content/Bach/Bwv1014 Harpsicord and Violin Sonata 2mov.mid'), ('Bach', '/content/Bach/Bwv0566 Prelude and Fugue.mid'), ('Bach', '/content/Bach/Bwv0572 Fantasia.mid'), ('Bach', '/content/Bach/Bwv0811 English Suite n6 6mov.mid'), ('Bach', '/content/Bach/Bwv1014 Harpsicord and Violin Sonata 3mov.mid'), ('Bach', '/content/Bach/Bwv0997 Partita for Lute 3mov.mid'), ('Bach', '/content/Bach/Bwv0811 English Suite n6 1mov.mid'), ('Bach', '/content/Bach/Bwv0561 Fantasie and Fuga.mid'), ('Bach', '/content/Bach/Bwv802 Four Inventions (Duettos) Clavier-U"bung III n1.mid'), ('Bach', '/content/Bach/Bwv0811 English Suite n6 7mov.mid'), ('Bach', '/content/Bach/Piano version of Bachs two part inventions No.4.mid'), ('Bach', '/content/Bach/Toccata and Fugue in D minor, BWV 565. (Busoni Piano Arr.mid'), ('Bach', '/content/Bach/Bwv0806 English Suite n1 10mov .mid'), ('Bach', '/content/Bach/Bwv0811 English Suite n6 8mov.mid'), ('Bach', '/content/Ba

In [15]:
beethoven_feature_arrays, beethoven_label_list, beethoven_file_list = collect_composer_data("Beethoven", beethoven_midi_list)
print(f"Beethoven dataset: Feature arrays extracted from {len(beethoven_feature_arrays)} files.")

[('Beethoven', '/content/Beethoven_augmented/Piano Sonata n04_aug0.mid'), ('Beethoven', "/content/Beethoven_augmented/Lieder op48 n6 ''Busslied''_aug0.mid"), ('Beethoven', '/content/Beethoven_augmented/Sonata in Bflat Major Op.106_aug2.mid'), ('Beethoven', "/content/Beethoven_augmented/Lieder op48 n1 ''Bitten''_aug1.mid"), ('Beethoven', '/content/Beethoven_augmented/op119 Douze Bagatellas_aug0.mid'), ('Beethoven', '/content/Beethoven_augmented/Piano Sonata n10 2mov_aug2.mid'), ('Beethoven', '/content/Beethoven_augmented/Violin Concerto op61 1mov_aug3.mid'), ('Beethoven', '/content/Beethoven_augmented/Symphony n5 op67 3-4mov_aug4.mid'), ('Beethoven', "/content/Beethoven_augmented/Lieder op48 n2 ''Die Liebe Des Nachsten''_aug3.mid"), ('Beethoven', "/content/Beethoven_augmented/Symphony n3 3mov ''Eroica''_aug2.mid"), ('Beethoven', "/content/Beethoven_augmented/Overture ''Corolian'' op62_aug2.mid"), ('Beethoven', '/content/Beethoven_augmented/Bagatella op33 n6_aug1.mid'), ('Beethoven', '/c

In [16]:
chopin_feature_arrays, chopin_label_list, chopin_file_list = collect_composer_data("Chopin", chopin_midi_list)
print(f"Chopin dataset: Feature arrays extracted from {len(chopin_feature_arrays)} files.")

[('Chopin', '/content/Chopin_augmented/Mazurka op30 n2 Drchew_aug0.mid'), ('Chopin', '/content/Chopin_augmented/Prelude op.28-15 Raindrop_aug3.mid'), ('Chopin', "/content/Chopin_augmented/Etude op25 n12 ''The Ocean''_aug4.mid"), ('Chopin', '/content/Chopin_augmented/Nocturne op09 n3_aug0.mid'), ('Chopin', '/content/Chopin_augmented/Etude No.1_aug0.mid'), ('Chopin', '/content/Chopin_augmented/Scherzo n1 op20_aug5.mid'), ('Chopin', '/content/Chopin_augmented/Nocturne op37 01_aug5.mid'), ('Chopin', '/content/Chopin_augmented/Etude No.7 in C Major Opus.10, No.7_aug1.mid'), ('Chopin', '/content/Chopin_augmented/19 Polish Songs, for Solo Voice and Piano accomplements, No.9_aug3.mid'), ('Chopin', "/content/Chopin_augmented/Prelude n17 op28 ''Scene On the Place of Notre Dame_aug4.mid"), ('Chopin', '/content/Chopin_augmented/Nocturne No.18 in E Major Op62_aug5.mid'), ('Chopin', '/content/Chopin_augmented/Sonata op35 n1 _aug3.mid'), ('Chopin', '/content/Chopin_augmented/Mazurka op30 n2 Drchew_au

In [None]:
mozart_feature_arrays, mozart_label_list, mozart_file_list = collect_composer_data("Mozart", mozart_midi_list)
print(f"Mozart dataset: Feature arrays extracted from {len(mozart_feature_arrays)} files.")

[('Mozart', '/content/Mozart_augmented/Symphony n25 K183 4mov_aug1.mid'), ('Mozart', "/content/Mozart_augmented/Vesperae solemnes de confessore ''Vespers'' K339 1 Dixit Dominus_aug1.mid"), ('Mozart', '/content/Mozart_augmented/K191 Bassoon Concerto 2mov_aug0.mid'), ('Mozart', '/content/Mozart_augmented/K333 Piano Sonata n13 3mov_aug3.mid'), ('Mozart', '/content/Mozart_augmented/Early Pieces n20 Presto_aug0.mid'), ('Mozart', '/content/Mozart_augmented/K546 Adagio & Fugue for Strings_aug2.mid'), ('Mozart', "/content/Mozart_augmented/Symphony n41 K551 1mov ''Jupiter''_aug3.mid"), ('Mozart', '/content/Mozart_augmented/Symphony n37 K444 1mov_aug1.mid'), ('Mozart', '/content/Mozart_augmented/Early Pieces n16 Minuet_aug1.mid'), ('Mozart', '/content/Mozart_augmented/Piano Sonata n11 K331_aug3.mid'), ('Mozart', "/content/Mozart_augmented/K525 Serenade 3mov ''Eine Kleine Natchmusik''_aug2.mid"), ('Mozart', '/content/Mozart_augmented/Piano Concerto n24 K491 3mov_aug3.mid'), ('Mozart', '/content/M