# MIDI-Trained Chord Recognition Model

## Data Preprocessing

### 1. Load and Extract from midi_folder

In [1]:
import os
import json
import pretty_midi
import pandas as pd
import numpy as np
from collections import defaultdict
import mido
import io

# define chord type templates: intervals relative to root
CHORD_TEMPLATES = {
    "Major":         {0, 4, 7},
    "Minor":         {0, 3, 7},
    "Dominant 7th":  {0, 4, 7, 10},
    "Diminished":    {0, 3, 6},
    "Augmented":     {0, 4, 8},
}

PITCH_CLASS_NAMES = ['C', 'C#', 'D', 'D#', 'E', 'F',
                     'F#', 'G', 'G#', 'A', 'A#', 'B']

# normalize chord, removing octave transpositions 
def normalize_chord(chord_tuple):
    normalized_chord = {note % 12 for note in chord_tuple}  # keep only unique notes modulo 12
    return tuple(sorted(normalized_chord))

# identify and name chords 
def identify_named_chord(chord_tuple):
    if not chord_tuple:
        return "Unknown"

    pitch_classes = sorted({p % 12 for p in chord_tuple})
    for root in pitch_classes:
        transposed = sorted({(p - root) % 12 for p in pitch_classes})
        for label, template in CHORD_TEMPLATES.items():
            if set(transposed) == template:
                root_name = PITCH_CLASS_NAMES[root]
                return f"{root_name} {label}"
    return "Unknown"

# fixed mapping for chord vocab: all 12 roots * templates
def create_fixed_chord_vocab():
    ALL_CHORDS = [
        f"{pitch} {chord_type}"
        for pitch in PITCH_CLASS_NAMES
        for chord_type in CHORD_TEMPLATES.keys()
    ]
    chord_to_index = {chord: idx for idx, chord in enumerate(ALL_CHORDS)}
    return chord_to_index

# extract chord sequence
def midi_to_chord_sequence(midi_file, merge_threshold=0.3):
    #midi_data = pretty_midi.PrettyMIDI(midi_file)
    
    raw = mido.MidiFile(midi_file, clip=True)
    merged = mido.MidiFile() 
    merged.ticks_per_beat = raw.ticks_per_beat
    merged_track = mido.merge_tracks(raw.tracks)
    merged.tracks.append(merged_track)
    
    # dump to memory buffer
    buf = io.BytesIO()
    merged.save(file=buf)
    buf.seek(0)

    midi_data = pretty_midi.PrettyMIDI(buf)

    events = []
    # for each note, add two events: on/off
    for instrument in midi_data.instruments:
        if instrument.is_drum:
            continue
        for note in instrument.notes:
            events.append((note.start, 'on', note.pitch))
            events.append((note.end, 'off', note.pitch))
    

    events.sort(key=lambda x: x[0])

    active_notes = set()  # track notes that are in use
    chords = []  # final list
    previous_chord = None
    chord_start_time = None
    last_event_time = 0

    # if note is starting, add to active set
    # if note ending, remove it from active set
    for time, action, pitch in events:
        if action == 'on':
            active_notes.add(pitch)
        elif action == 'off':
            active_notes.discard(pitch)

        current_chord = normalize_chord(active_notes) if active_notes else None
        chord_label = identify_named_chord(current_chord) if current_chord else None

        # if chord changed
        if chord_label != previous_chord:
            if previous_chord is not None and chord_start_time is not None:
                if time - chord_start_time >= merge_threshold:
                    chords.append((round(chord_start_time, 3), round(time, 3), previous_chord))
            chord_start_time = time
            previous_chord = chord_label

        last_event_time = time

    # capture final chord if any
    if previous_chord is not None and chord_start_time is not None:
        chords.append((round(chord_start_time, 3), round(midi_data.get_end_time(), 3), previous_chord))

    return chords, midi_data

# timeframe-level feature extraction and align with chord labels
def extract_frame_level_data(chords, midi_data, chord_to_index, frame_hop=1):
    end_time = midi_data.get_end_time()
    frame_times = np.arange(0, end_time, frame_hop)

    chroma = midi_data.get_chroma(fs=int(1 / frame_hop))
    chroma = chroma.T  # transpose to shape (frames, 12)

    data = []

    for i, t in enumerate(frame_times):
        frame_feature = chroma[i] if i < len(chroma) else np.zeros(12)
        label = None
        for start, end, chord in chords:
            if start <= t < end:
                if chord in chord_to_index:
                    label = chord_to_index[chord]
                break
        if label is not None:
            data.append((t, frame_feature, label))
    return data


# process all midi files in the folder, save to CSV
def process_midi_folder(input_folder, chord_csv, frame_csv, frame_hop=1):
    chord_rows = []
    frame_rows = []
    chord_to_index = create_fixed_chord_vocab()

    for root, _, files in os.walk(input_folder):
        for fname in files:
            if not fname.lower().endswith(('.mid','.midi')): continue
            path = os.path.join(root, fname)
            rel = os.path.relpath(path, input_folder)
            try:
                chords, midi = midi_to_chord_sequence(path)
                # chord-level
                for st, ed, ch in chords:
                    chord_rows.append([rel, st, ed, ch])
                # frame-level
                frames = extract_frame_level_data(chords, midi, chord_to_index, frame_hop)
                for t, feat, lbl in frames:
                    frame_rows.append([rel, t, *feat, lbl])

            except Exception as e:
                print(f"[ERROR] {rel}: {e}")

    # save to csv
    chord_df = pd.DataFrame(chord_rows, columns=["filename","start_time","end_time","chord"])
    chord_df.to_csv(chord_csv, index=False)
    cols = [f"chroma_{i}" for i in range(12)]
    frame_df = pd.DataFrame(frame_rows, columns=["filename","time", *cols, "label"])
    frame_df.to_csv(frame_csv, index=False)

    print(f"✔ Saved chords to: {chord_csv}")
    print(f"✔ Saved frames to: {frame_csv}")
    return chord_to_index


# def process_midi_folder(midi_folder, chord_output_csv, frame_output_csv, frame_hop=1):
#     chord_data = []
#     frame_data = []

#     chord_to_index = create_fixed_chord_vocab()

#     for midi_file in os.listdir(midi_folder):
#         if midi_file.endswith(".mid") or midi_file.endswith(".midi"):
#             file_path = os.path.join(midi_folder, midi_file)
#             try:
#                 chords, midi_data = midi_to_chord_sequence(file_path)
#                 for timestamp_start, timestamp_end, chord in chords:
#                     chord_data.append([midi_file, timestamp_start, timestamp_end, chord])
#             except Exception as e:
#                 print(f"Error processing {midi_file}: {e}")

#     # second pass to align frame-wise data using finalized vocab
#     for midi_file in os.listdir(midi_folder):
#         if midi_file.endswith(".mid") or midi_file.endswith(".midi"):
#             file_path = os.path.join(midi_folder, midi_file)
#             try:
#                 chords, midi_data = midi_to_chord_sequence(file_path)
#                 frame_entries = extract_frame_level_data(chords, midi_data, chord_to_index, frame_hop)
#                 for t, feat, label in frame_entries:
#                     frame_data.append([midi_file, round(t, 3)] + list(feat) + [label])
#             except Exception as e:
#                 print(f"Error processing {midi_file} for frame-level: {e}")

#     # save chord segment CSV
#     chord_df = pd.DataFrame(chord_data, columns=["filename", "start_time", "end_time", "chord"])
#     chord_df.to_csv(chord_output_csv, index=False)

#     # save frame-level CSV
#     feat_cols = [f"chroma_{i}" for i in range(12)]
#     frame_df = pd.DataFrame(frame_data, columns=["filename", "time"] + feat_cols + ["label"])
#     frame_df.to_csv(frame_output_csv, index=False)

#     print(f"Chord segments saved to {chord_output_csv}")
#     print(f"Frame-level data saved to {frame_output_csv}")
    
#     return chord_to_index

### 2. Extract and Combine to csv file

In [2]:
# paths
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

folder_to_process = 'midi_folder' # test use; change to 'lakh-midi-clean' for actual experiments

base = os.path.basename(folder_to_process.rstrip(os.sep))
chord_csv = os.path.join(output_dir, f"chord_dataset.csv")
frame_csv = os.path.join(output_dir, f"timeframe_dataset.csv")
vocab_json = os.path.join(output_dir, f"chord_vocab.json")

chord_to_index = process_midi_folder(folder_to_process, chord_csv, frame_csv)

with open(vocab_json, 'w') as f:
    json.dump(chord_to_index, f, indent=2)
    

✔ Saved chords to: output/chord_dataset.csv
✔ Saved frames to: output/timeframe_dataset.csv


### 3. One-hot Encoding

In [3]:
# one-hot encoding 
import pandas as pd
import numpy as np
import os
import json

output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

frame_csv_path = "output/timeframe_dataset.csv"
chord_vocab_path = "output/chord_vocab.json"
output_onehot_csv_path = os.path.join(output_dir, "timeframe_onehot.csv")


# load from JSON file
with open(chord_vocab_path, "r") as f:
    chord_to_index = json.load(f)

# reverse
chord_to_index = {str(k): v for k, v in chord_to_index.items()}


def one_hot_encode_labels(label_indices, num_classes):
    return np.eye(num_classes)[label_indices]

# load original timeframe-level dataset
df = pd.read_csv(frame_csv_path)

# get label col
label_indices = df["label"].astype(int).values

# one-hot encoding 
num_classes = len(chord_to_index)
one_hot = one_hot_encode_labels(label_indices, num_classes)

# create DataFrame 
one_hot_columns = [f"class_{i}" for i in range(num_classes)]
one_hot_df = pd.DataFrame(one_hot, columns=one_hot_columns)

# combine with filename + time 
minimal_df = df[["filename", "time"]].reset_index(drop=True)
result_df = pd.concat([minimal_df, one_hot_df], axis=1)

result_df.to_csv(output_onehot_csv_path, index=False)

print(f"One-hot encoded data saved to {output_onehot_csv_path}")

One-hot encoded data saved to output/timeframe_onehot.csv


## Baseline Model: SVM

In [4]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

frame_csv_path = "output/timeframe_dataset.csv"
df = pd.read_csv(frame_csv_path)

# split to train and test dataset
feature_cols = [f"chroma_{i}" for i in range(12)]

X = df[feature_cols].values
y = df["label"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# RBF kernel 
svm_model = SVC()
svm_model.fit(X_train_scaled, y_train)

y_pred = svm_model.predict(X_test_scaled)

# print confusion metrics with zero_division fix
print("Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.71      0.57       185
           1       0.50      0.38      0.43        24
           2       0.47      0.37      0.41        19
           3       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         2
           5       0.55      0.68      0.61        31
           6       0.75      0.46      0.57        13
           7       0.00      0.00      0.00         4
           8       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         2
          10       0.59      0.74      0.66       133
          11       0.53      0.47      0.49        45
          12       0.57      0.29      0.38        14
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         1
          15       0.69      0.74      0.71        65
          16       0.00      0.00      0.00         5
    

## Deep Learning Models

### Reorganize timing data

In [14]:
# build_sequence_tensor(df, seq_len)  ——>  X_seq, y_seq_onehot

import numpy as np
import pandas as pd
from tensorflow.keras.utils import to_categorical

def build_sequence_tensor(frame_df: pd.DataFrame,
                          seq_len: int = 64,
                          num_feat: int = 12,
                          num_classes: int = 24):
    """
    Convert frame-level dataframe -> (N_song, seq_len, num_feat) & one-hot encoded labels
    """
    # pre-allocate arrays
    by_song = frame_df.groupby("filename")
    n_song  = len(by_song)
    X_seq   = np.zeros((n_song, seq_len, num_feat), dtype=np.float32)
    y_seq   = np.zeros((n_song, seq_len), dtype=np.int32)

    # fill or truncate sequences
    for idx, (_, group) in enumerate(by_song):
        # Ensure data is sorted by time
        group = group.sort_values("time")
        x = group[[f"chroma_{i}" for i in range(num_feat)]].to_numpy()
        y = group["label"].to_numpy()

        # Pad or truncate to fixed sequence length
        pad = max(seq_len - len(x), 0)
        x = np.pad(x, ((0, pad), (0, 0)), "constant")[:seq_len]
        y = np.pad(y, (0, pad), "constant")[:seq_len]

        X_seq[idx] = x
        y_seq[idx] = y

    # one-hot encode labels
    y_seq_ohe = to_categorical(y_seq, num_classes=num_classes)
    return X_seq, y_seq_ohe

frame_df = pd.read_csv("output/timeframe_dataset.csv")

seq_len = 64         # typical length for 2–4 bars; can be tuned
num_feat = 12
num_classes = frame_df["label"].max() + 1

X_seq, y_seq_ohe = build_sequence_tensor(frame_df, seq_len, num_feat, num_classes)
print("X_seq:", X_seq.shape, "y_seq_ohe:", y_seq_ohe.shape)

# train / test split
from sklearn.model_selection import train_test_split
X_tr, X_te, y_tr, y_te = train_test_split(
    X_seq, y_seq_ohe, test_size=0.2, random_state=42, shuffle=True
)


X_seq: (213, 64, 12) y_seq_ohe: (213, 64, 59)


### 1. CNN Model

### 2. RNN Model

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, TimeDistributed, Dense

tf.keras.backend.clear_session()

batch_size = 16
epochs     = 30
es = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

rnn_model = Sequential([
    SimpleRNN(64, return_sequences=True, input_shape=X_tr.shape[1:]),  # (seq_len, num_feat)
    TimeDistributed(Dense(y_tr.shape[-1], activation="softmax"))
])

rnn_model.compile(
    loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

rnn_model.summary()

hist_rnn = rnn_model.fit(
    X_tr, y_tr,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[es],
    verbose=1
)

print("✓ RNN training done!")


  super().__init__(**kwargs)


Epoch 1/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 88ms/step - accuracy: 0.0308 - loss: 4.3436 - val_accuracy: 0.0551 - val_loss: 3.7984
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - accuracy: 0.1450 - loss: 3.8008 - val_accuracy: 0.6121 - val_loss: 2.8565
Epoch 3/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.4855 - loss: 3.0216 - val_accuracy: 0.6912 - val_loss: 2.0129
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5573 - loss: 2.3389 - val_accuracy: 0.7426 - val_loss: 1.4647
Epoch 5/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.6259 - loss: 1.8792 - val_accuracy: 0.7656 - val_loss: 1.2577
Epoch 6/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.6534 - loss: 1.6602 - val_accuracy: 0.7693 - val_loss: 1.1649
Epoch 7/30
[1m10/10[0m [32m━━━━

### 3. LSTM Model

In [None]:
# bidirectional LSTM model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, TimeDistributed, Dense

tf.keras.backend.clear_session()

batch_size = 16
epochs     = 30
es = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

lstm_model = Sequential([
    Bidirectional(
        LSTM(64, return_sequences=True),
        input_shape=X_tr.shape[1:]
    ),
    TimeDistributed(Dense(y_tr.shape[-1], activation="softmax"))
])

lstm_model.compile(
    loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]
)

lstm_model.summary()

hist_lstm = lstm_model.fit(
    X_tr, y_tr,
    validation_split=0.1,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[es],
    verbose=1
)

print("✓ LSTM training done!")


  super().__init__(**kwargs)


Epoch 1/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 287ms/step - accuracy: 0.2686 - loss: 4.0663 - val_accuracy: 0.6314 - val_loss: 3.8054
Epoch 2/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 120ms/step - accuracy: 0.5795 - loss: 3.6366 - val_accuracy: 0.6811 - val_loss: 2.4185
Epoch 3/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 64ms/step - accuracy: 0.6301 - loss: 2.2844 - val_accuracy: 0.7004 - val_loss: 1.2886
Epoch 4/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 66ms/step - accuracy: 0.6302 - loss: 1.6763 - val_accuracy: 0.7114 - val_loss: 1.1283
Epoch 5/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 62ms/step - accuracy: 0.6480 - loss: 1.5429 - val_accuracy: 0.7353 - val_loss: 1.0456
Epoch 6/30
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - accuracy: 0.7200 - loss: 1.2291 - val_accuracy: 0.7518 - val_loss: 0.9859
Epoch 7/30
[1m10/10[0m [32m━

### 4. CNN + LSTM Model

## Evaluation

In [19]:
import numpy as np
import mir_eval

# helpers
def ints_to_chords(int_array):
    return [to_mireval(idx2label[i]) if 0 <= i < len(idx2label) else "N"
            for i in int_array]

def evaluate_chord_model(name, model, X, y_ohe, frame_rate=1.0):
    """Evaluate model and print all available mir_eval metrics."""
    y_pred_int = np.argmax(model.predict(X, verbose=0), axis=-1).flatten()
    y_true_int = np.argmax(y_ohe,                    axis=-1).flatten()

    est_labels = ints_to_chords(y_pred_int)
    ref_labels = ints_to_chords(y_true_int)

    n = len(y_true_int)
    intervals = np.column_stack([np.arange(n)/frame_rate,
                                 (np.arange(n)+1)/frame_rate])

    result = mir_eval.chord.evaluate(
        intervals, ref_labels, intervals, est_labels
    )

    # normalize output to (names, scores)
    if isinstance(result, dict):
        score_names = list(result.keys())
        scores      = list(result.values())

    elif isinstance(result, (list, tuple)) and len(result) == 2:
        # (names, scores)  OR  (scores, names)  depending on version
        a, b = result
        if isinstance(a[0], str):
            score_names, scores = a, b
        else:
            scores, score_names = a, b

    else:                                            # list/tuple of floats
        scores = list(result)
        # default names for the 7-metric variant
        score_names = ["root", "majmin", "thirds",
                       "triads", "sevenths", "tetrads", "mirex"][:len(scores)]


    print(f"\n=== {name} ===")
    for nm, sc in zip(score_names, scores):
        print(f"{nm:>10}: {sc:.4f}")

    frame_acc = np.mean(y_true_int == y_pred_int)
    print(f"{'frame_acc':>10}: {frame_acc:.4f}")


evaluate_chord_model("RNN Model",  rnn_model,  X_te, y_te)
evaluate_chord_model("LSTM Model", lstm_model, X_te, y_te)



=== RNN Model ===
    thirds: 0.7918
thirds_inv: 0.7918
    triads: 0.7918
triads_inv: 0.7918
   tetrads: 0.7602
tetrads_inv: 0.7602
      root: 0.8187
     mirex: 0.7918
    majmin: 0.7944
majmin_inv: 0.7944
  sevenths: 0.7627
sevenths_inv: 0.7627
  underseg: 0.9186
   overseg: 0.9117
       seg: 0.9117
 frame_acc: 0.7602

=== LSTM Model ===
    thirds: 0.8169
thirds_inv: 0.8169
    triads: 0.8169
triads_inv: 0.8169
   tetrads: 0.7914
tetrads_inv: 0.7914
      root: 0.8278
     mirex: 0.8169
    majmin: 0.8195
majmin_inv: 0.8195
  sevenths: 0.7940
sevenths_inv: 0.7940
  underseg: 0.9012
   overseg: 0.9302
       seg: 0.9012
 frame_acc: 0.7914
