In [2]:
from pathlib import Path

midi_root = Path("../data/raw_midi/lmd_full")
midi_root


WindowsPath('../data/raw_midi/lmd_full')

In [3]:
midi_files = list(midi_root.rglob("*.mid"))
print("Total MIDI files found:", len(midi_files))
midi_files[:5]


Total MIDI files found: 178561


[WindowsPath('../data/raw_midi/lmd_full/0/00000ec8a66b6bd2ef809b0443eeae41.mid'),
 WindowsPath('../data/raw_midi/lmd_full/0/0000799e8672292fe6f0fff08554ca40.mid'),
 WindowsPath('../data/raw_midi/lmd_full/0/00012722c199ae2a628ebb792ccc617a.mid'),
 WindowsPath('../data/raw_midi/lmd_full/0/000203a04a64ad57329a058f11e235cb.mid'),
 WindowsPath('../data/raw_midi/lmd_full/0/00032fb2047d3cdd0394b89349d858b4.mid')]

In [4]:
from music21 import converter

test_midi = midi_files[0]
test_midi


WindowsPath('../data/raw_midi/lmd_full/0/00000ec8a66b6bd2ef809b0443eeae41.mid')

In [5]:
score = converter.parse(str(test_midi))
score


<music21.stream.Score 0x1d6c3ef4d90>

In [None]:
from pathlib import Path
import subprocess

midi_root = Path("../data/raw_midi/lmd_full")
abc_root = Path("../data/abc")
abc_root.mkdir(parents=True, exist_ok=True)

MIDI2ABC = r"C:\Program Files\abcmidi\midi2abc.exe"

def midi_to_abc_file(midi_path: Path, out_path: Path):
    """
    Use the external midi2abc tool to convert one MIDI file to ABC.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)


    cmd = [MIDI2ABC, str(midi_path), "-o", str(out_path)]
    result = subprocess.run(cmd, capture_output=True, text=True)

    if result.returncode != 0:
        raise RuntimeError(
            f"midi2abc failed for {midi_path} with code {result.returncode}\n"
            f"STDERR:\n{result.stderr}"
        )


In [7]:
midi_files = list(midi_root.rglob("*.mid"))
len(midi_files), midi_files[0]


(178561,
 WindowsPath('../data/raw_midi/lmd_full/0/00000ec8a66b6bd2ef809b0443eeae41.mid'))

In [8]:
test_midi = midi_files[0]
test_abc = abc_root / (test_midi.stem + ".abc")

midi_to_abc_file(test_midi, test_abc)
print(test_abc, test_abc.exists())
print(test_abc.read_text()[:500])


..\data\abc\00000ec8a66b6bd2ef809b0443eeae41.abc True
X: 1
T: from ..\data\raw_midi\lmd_full\0\00000ec8a66b6bd2ef809b0443eeae41.mid
M: 4/4
L: 1/8
Q:1/4=148
K:C % 0 sharps
V:1
^f/2F/2^a/2F/2 ^g/2F/2c/2F/2 ^c/2F/2=c/2^c/2 F/2=f/2^F/2A/2| \
^D/2D/2^a/2D/2 ^g/2D/2c/2D/2 ^c/2D/2=c/2^c/2 D/2d/2D/2f/2| \
^A/2A,/2a/2A,/2 ^g/2A,/2c/2A,/2 ^c/2A,/2=c/2^c/2 A,/2f/2A,/2A/2| \
^A,/2A,/2a/2A,/2 ^g/2A,/2c'/2A,/2 ^c'/2A,/2=c'/2^c'/2 A,/2g/2A,/2f/2|
^f/2F/2^a/2F/2 ^g/2F/2c/2F/2 ^c/2F/2=c/2^c/2 F/2=f/2^F/2A/2| \
^D/2D/2^a/2D/2 ^g/2D/2c/2D/2 ^c/2D/2=c/2^c/2 D/2d/2D/2f


In [None]:
from tqdm import tqdm

failed = []
n_ok = 0

for midi_path in tqdm(midi_files):  
    out_path = abc_root / (midi_path.stem + ".abc")
    if out_path.exists():
        n_ok += 1
        continue

    try:
        midi_to_abc_file(midi_path, out_path)
        n_ok += 1
    except Exception as e:
        failed.append((str(midi_path), repr(e)))

n_ok, len(failed)


100%|████████████████████████████████████████████████████████████████████████| 178561/178561 [1:00:56<00:00, 48.84it/s]


(174612, 3949)

In [10]:
len(failed)

3949

In [11]:
def clean_abc_data(abc_content):
    """
    Input: Raw content of a single .abc file
    Output: Cleaned string containing only music data and structural headers,
            or None if the file is too short/corrupted.
    """
    if not abc_content:
        return None

    lines = abc_content.splitlines()
    cleaned_lines = []
    
    
    
    keep_headers = ('X:', 'M:', 'L:', 'K:', 'V:', 'P:')
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        
        if len(line) > 1 and line[1] == ':':
            if line.startswith(keep_headers):
                cleaned_lines.append(line)
            
            
            
        
        
        elif not line.startswith('%'):
            cleaned_lines.append(line)
            
    
    result = "\n".join(cleaned_lines)
    
    
    if len(result) < 50:  
        return None
        
    return result

In [12]:
from pathlib import Path
from tqdm import tqdm

def process_batch(file_paths, output_filename):
    """
    Reads a list of paths, cleans them, and appends them to a single output file.
    Returns the total number of characters (tokens) written.
    """
    total_chars = 0
    valid_count = 0
    
    
    with open(output_filename, 'w', encoding='utf-8') as f_out:
        for path in tqdm(file_paths, desc=f"Writing {output_filename.name}"):
            
            raw_text = path.read_text(encoding='utf-8', errors='ignore')
            
            
            clean_text = clean_abc_data(raw_text)
            
            
            if clean_text:
                
                f_out.write(clean_text + "\n\n")
                total_chars += len(clean_text)
                valid_count += 1
                
                
    print(f"Finished {output_filename.name}: {valid_count} songs, {total_chars} characters.")
    return total_chars

In [None]:
import random


abc_source_dir = Path("../data/abc")  
output_dir = Path("../data/processed")
output_dir.mkdir(parents=True, exist_ok=True)


all_files = list(abc_source_dir.glob("*.abc"))
print(f"Total files found: {len(all_files)}")


random.seed(42)
random.shuffle(all_files)


n_total = len(all_files)
idx_train = int(n_total * 0.98)
idx_val = int(n_total * 0.99) 


train_files = all_files[:idx_train]
val_files = all_files[idx_train:idx_val]
test_files = all_files[idx_val:]

print(f"Split sizes: Train={len(train_files)}, Val={len(val_files)}, Test={len(test_files)}")



train_chars = process_batch(train_files, output_dir / "train.txt")
val_chars = process_batch(val_files, output_dir / "val.txt")
test_chars = process_batch(test_files, output_dir / "test.txt")

print("Data Pipeline Complete.")

Total files found: 178561
Split sizes: Train=174989, Val=1786, Test=1786


Writing train.txt:  28%|███████████████▎                                       | 48563/174989 [07:22<11:59, 175.76it/s]

In [None]:
total_tokens = build_corpus(SOURCE_DIR, DEST_FILE)