### Test HuggingFace dataset usage

In [1]:
from datasets import load_dataset

# Load the MidiCaps dataset by its HuggingFace Hub id; `load_dataset` downloads it
# and caches it automatically.
ds = load_dataset("amaai-lab/MidiCaps")

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 100%|██████████| 168385/168385 [00:01<00:00, 148497.76 examples/s]


In [2]:
# Show the split layout, then one example row from the training split
print(ds, ds['train'][0], sep="\n")

DatasetDict({
    train: Dataset({
        features: ['location', 'caption', 'genre', 'genre_prob', 'mood', 'mood_prob', 'key', 'time_signature', 'tempo', 'tempo_word', 'duration', 'duration_word', 'chord_summary', 'chord_summary_occurence', 'instrument_summary', 'instrument_numbers_sorted', 'all_chords', 'all_chords_timestamps', 'test_set'],
        num_rows: 168385
    })
})
{'location': 'lmd_full/1/1a0751ad20e2f82957410a7510a1b13e.mid', 'caption': 'A melodic electronic composition with classical influences, featuring a string ensemble, trumpet, brass section, synth strings, and drums. Set in F# minor with a 4/4 time signature, it moves at an Allegro tempo. The mood evokes a cinematic, spacious, and epic atmosphere while maintaining a sense of relaxation.', 'genre': ['electronic', 'classical'], 'genre_prob': [0.3596, 0.2367], 'mood': ['melodic', 'film', 'space', 'epic', 'relaxing'], 'mood_prob': [0.1228, 0.1114, 0.0917, 0.0828, 0.079], 'key': 'F# minor', 'time_signature': '4/4', 'tem

In [8]:
from torch.utils.data import Dataset

class MidiCapsDataset(Dataset):
    """PyTorch ``Dataset`` adapter around an indexable collection of MidiCaps rows.

    Each row is expected to be a mapping with at least the keys ``"location"``
    and ``"caption"``; all other fields are ignored.
    """

    def __init__(self, hf_dataset):
        """Keep a reference to ``hf_dataset``; nothing is copied or loaded here."""
        self.data = hf_dataset

    def __len__(self):
        """Return the number of rows in the wrapped collection."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` reduced to a dict with keys ``"input"`` and ``"caption"``."""
        record = self.data[idx]
        # "input" carries the raw `location` value as-is; the MIDI file itself
        # is not opened or processed at this point.
        midi_location = record["location"]
        caption_text = record["caption"]
        return dict(input=midi_location, caption=caption_text)

In [9]:
from torch.utils.data import DataLoader

# Wrap the HuggingFace "train" split in the PyTorch dataset adapter
train_dataset = MidiCapsDataset(ds["train"])
# Batch 16 samples at a time, with shuffling enabled
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [10]:
from torch.utils.data import DataLoader

# Smoke test: pull a single batch from the already-defined `train_loader`
# and show what it contains.
for batch in train_loader:
    print("Batch keys:", batch.keys())  # e.g. ['input', 'caption']
    for label, field in (("Input example:", "input"), ("Caption example:", "caption")):
        print(label, batch[field][0])  # first sample of the batch
    break  # one batch is enough here; drop this to iterate over the whole loader

Batch keys: dict_keys(['input', 'caption'])
Input example: lmd_full/1/157e84c82fc923cb4e0ed7ada2fb4df6.mid
Caption example: A cheerful pop song with a touch of rock, featuring clean electric guitar, piano, flute, electric bass, and string ensemble, all coming together to create a melodic and motivational atmosphere. Set in the key of E minor with a moderate tempo of 92 beats per minute, this song maintains a 4/4 time signature throughout its duration, with the chords E and A/E providing the harmonic foundation.


### Test REMI+ preprocessing

In [None]:
from miditok import REMI, TokenizerConfig  # here we choose to use REMI

# Tokenizer settings for the first trial: default tokens plus the REMI+ options
TOKENIZER_PARAMS = dict(
    pitch_range=(21, 109),
    beat_res={(0, 4): 8, (4, 12): 4},
    num_velocities=32,
    special_tokens=["PAD", "BOS", "EOS", "MASK"],
    use_chords=True,
    use_rests=False,
    use_tempos=True,
    use_time_signatures=True,  # REMI+
    use_programs=True,  # REMI+
    num_tempos=32,  # number of tempo bins
    tempo_range=(40, 250),  # (min, max)
    one_token_stream_for_programs=True,
)
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Build the REMI tokenizer from that configuration
tokenizer = REMI(config)

  super().__init__(tokenizer_config, params)


In [16]:
from pathlib import Path

# Encode one MIDI file. Calling the tokenizer dispatches on its argument
# (Score objects, file paths, or tokens are detected automatically).
sample_midi_path = "/home/yihsin/dataset/midicaps/lmd_full/1/157e84c82fc923cb4e0ed7ada2fb4df6.mid"
tokens = tokenizer(sample_midi_path)

# Round trip: feed the tokens back through the tokenizer to get a MIDI object
# (MidiTok also accepts PyTorch/NumPy/TensorFlow tensors here), then write it out.
generated_midi = tokenizer(tokens)
generated_midi.dump_midi("./decoded_midi.mid")

In [None]:
# Display the tokenizer vocabulary (token string -> integer id).
# NOTE(review): this cell only shows the mapping; nothing is written to a .pkl
# file here — TODO add the save step if it is still wanted.
tokenizer.vocab

{'PAD_None': 0,
 'BOS_None': 1,
 'EOS_None': 2,
 'MASK_None': 3,
 'Bar_None': 4,
 'Pitch_21': 5,
 'Pitch_22': 6,
 'Pitch_23': 7,
 'Pitch_24': 8,
 'Pitch_25': 9,
 'Pitch_26': 10,
 'Pitch_27': 11,
 'Pitch_28': 12,
 'Pitch_29': 13,
 'Pitch_30': 14,
 'Pitch_31': 15,
 'Pitch_32': 16,
 'Pitch_33': 17,
 'Pitch_34': 18,
 'Pitch_35': 19,
 'Pitch_36': 20,
 'Pitch_37': 21,
 'Pitch_38': 22,
 'Pitch_39': 23,
 'Pitch_40': 24,
 'Pitch_41': 25,
 'Pitch_42': 26,
 'Pitch_43': 27,
 'Pitch_44': 28,
 'Pitch_45': 29,
 'Pitch_46': 30,
 'Pitch_47': 31,
 'Pitch_48': 32,
 'Pitch_49': 33,
 'Pitch_50': 34,
 'Pitch_51': 35,
 'Pitch_52': 36,
 'Pitch_53': 37,
 'Pitch_54': 38,
 'Pitch_55': 39,
 'Pitch_56': 40,
 'Pitch_57': 41,
 'Pitch_58': 42,
 'Pitch_59': 43,
 'Pitch_60': 44,
 'Pitch_61': 45,
 'Pitch_62': 46,
 'Pitch_63': 47,
 'Pitch_64': 48,
 'Pitch_65': 49,
 'Pitch_66': 50,
 'Pitch_67': 51,
 'Pitch_68': 52,
 'Pitch_69': 53,
 'Pitch_70': 54,
 'Pitch_71': 55,
 'Pitch_72': 56,
 'Pitch_73': 57,
 'Pitch_74': 58,
 'Pitc

In [None]:
# Display the token strings of the MIDI file tokenized above.
# NOTE(review): this cell only shows the tokens; storing them to a .pkl file
# for training is not done here — TODO.
tokens.tokens

['Bar_None',
 'TimeSig_4/4',
 'Position_0',
 'Tempo_94.19',
 'Program_0',
 'Pitch_62',
 'Velocity_79',
 'Duration_2.3.8',
 'Program_0',
 'Pitch_69',
 'Velocity_95',
 'Duration_1.0.8',
 'Program_0',
 'Pitch_71',
 'Velocity_87',
 'Duration_2.3.8',
 'Program_33',
 'Pitch_43',
 'Velocity_91',
 'Duration_1.3.8',
 'Program_4',
 'Pitch_69',
 'Velocity_91',
 'Duration_2.3.8',
 'Program_4',
 'Pitch_71',
 'Velocity_83',
 'Duration_2.3.8',
 'Program_4',
 'Pitch_62',
 'Velocity_91',
 'Duration_2.3.8',
 'Program_27',
 'Pitch_71',
 'Velocity_75',
 'Duration_2.3.8',
 'Program_27',
 'Pitch_62',
 'Velocity_79',
 'Duration_2.3.8',
 'Program_27',
 'Pitch_69',
 'Velocity_87',
 'Duration_1.0.8',
 'Position_8',
 'Program_0',
 'Pitch_67',
 'Velocity_83',
 'Duration_2.3.8',
 'Program_27',
 'Pitch_67',
 'Velocity_83',
 'Duration_2.3.8',
 'Position_12',
 'Program_33',
 'Pitch_43',
 'Velocity_95',
 'Duration_0.4.8',
 'Position_16',
 'Program_33',
 'Pitch_43',
 'Velocity_87',
 'Duration_1.3.8',
 'Position_20',
 '

### Clean midicaps dataset

In [30]:
# define tokenizer
from miditok import REMI, TokenizerConfig  # here we choose to use REMI

# Tokenizer settings for the first trial: default tokens plus the REMI+ options.
# NOTE(review): this repeats the configuration from the "Test REMI+ preprocessing"
# section verbatim; keep the two copies in sync (or define it once).
TOKENIZER_PARAMS = {
    "pitch_range": (21, 109),
    "beat_res": {(0, 4): 8, (4, 12): 4},
    "num_velocities": 32,
    "special_tokens": ["PAD", "BOS", "EOS", "MASK"],
    "use_chords": True,
    "use_rests": False,
    "use_tempos": True,
    "use_time_signatures": True,  # REMI+ option
    "use_programs": True,         # REMI+ option
    "num_tempos": 32,  # number of tempo bins
    "tempo_range": (40, 250),  # (min, max)
    "one_token_stream_for_programs": True
}
config = TokenizerConfig(**TOKENIZER_PARAMS)

# Creates the tokenizer
tokenizer = REMI(config)
# Token string -> integer id mapping, used below to encode token sequences
event2idx = tokenizer.vocab

  super().__init__(tokenizer_config, params)


In [32]:
# Where the raw MIDI files live
base_dir = Path("/home/yihsin/dataset/midicaps")

# Destination folders for the encoded train / validation arrays
output_train = Path("/home/yihsin/midicaps-mini-parsed/train")
output_valid = Path("/home/yihsin/midicaps-mini-parsed/valid")

# Create both destinations (and any missing parents); existing folders are fine
for out_dir in (output_train, output_valid):
    out_dir.mkdir(parents=True, exist_ok=True)

In [38]:
# get 1/10 of the dataset
import random
import numpy as np
from pathlib import Path

# Base directory containing subfolders
base_dir = Path(base_dir)

# Recursively find all .mid files. The order in which rglob yields paths is not
# guaranteed, so sort them to give the seeded sampling below a stable population.
all_mid_files = sorted(base_dir.rglob("*.mid"))
print(f"Found {len(all_mid_files)} MIDI files.")
if not all_mid_files:
    # Fail with a clear message instead of random.sample's generic
    # "sample larger than population" error.
    raise FileNotFoundError(f"No .mid files found under {base_dir}")

# Randomly pick 1/10 of them (at least one). A dedicated, seeded generator makes
# the subset identical across kernel restarts without touching the global RNG.
SAMPLE_SEED = 42
sample_size = max(1, len(all_mid_files) // 10)
sampled_files = random.Random(SAMPLE_SEED).sample(all_mid_files, sample_size)

# Convert to list of strings (paths)
sampled_file_paths = [str(p) for p in sampled_files]

Found 168385 MIDI files.


In [40]:
# Work on the sampled subset of MIDI paths (strings)
all_mid_files = sampled_file_paths
print(f"Found {len(all_mid_files)} MIDI files.")

# Hold out the first 1% of the sampled list (at least one file) for validation;
# the list order comes from random.sample, so this is a random hold-out.
num_valid = max(1, len(all_mid_files) // 100)
valid_set = set(all_mid_files[:num_valid])

num_failed = 0
for i, mid_path in enumerate(all_mid_files):
    # Derive unique name: <subfolder>_<id>.npy
    # `parent.name` is the immediate containing folder; unlike indexing the
    # relative path parts, it cannot raise IndexError for a file that sits
    # directly under base_dir.
    pth_obj = Path(mid_path)
    subfolder = pth_obj.parent.name
    stem = pth_obj.stem  # filename without .mid
    filename = f"{subfolder}_{stem}.npy"

    try:
        tokens = tokenizer(mid_path).tokens
        # Explicit integer dtype: without it an empty token list would be
        # saved as a float64 array.
        arr = np.array([event2idx[e] for e in tokens], dtype=np.int64)
    except Exception as e:
        # Best effort: report the file and keep going with the rest
        num_failed += 1
        print(f"[!] Failed to process {mid_path}: {e}")
        continue

    # Choose folder
    out_dir = output_valid if mid_path in valid_set else output_train
    np.save(out_dir / filename, arr)

    if i % 100 == 0:
        print(f"Processed {i}/{len(all_mid_files)}")

print("✅ Done.")
if num_failed:
    print(f"[!] {num_failed}/{len(all_mid_files)} files failed and were skipped.")

Found 16838 MIDI files.
Processed 0/16838
Processed 100/16838
Processed 200/16838
Processed 300/16838
Processed 400/16838
Processed 500/16838
Processed 600/16838
Processed 700/16838
Processed 800/16838
Processed 900/16838
Processed 1000/16838
Processed 1100/16838
Processed 1200/16838
Processed 1300/16838
Processed 1400/16838
Processed 1500/16838
Processed 1600/16838
Processed 1700/16838
Processed 1800/16838
Processed 1900/16838
Processed 2000/16838
Processed 2100/16838
Processed 2200/16838
Processed 2300/16838
Processed 2400/16838
Processed 2500/16838
Processed 2600/16838
Processed 2700/16838
Processed 2800/16838
Processed 2900/16838
Processed 3000/16838
Processed 3100/16838
Processed 3200/16838
Processed 3300/16838
Processed 3400/16838
Processed 3500/16838
Processed 3600/16838
Processed 3700/16838
Processed 3800/16838
Processed 3900/16838
Processed 4000/16838
Processed 4100/16838
Processed 4200/16838
Processed 4300/16838
Processed 4400/16838
Processed 4500/16838
Processed 4600/16838
P

In [41]:
# Sanity check: load one of the saved arrays back and display it
np.load("/home/yihsin/midicaps-mini-parsed/train/f_fd7b815a42b30bb3fd3f64230698e7a6.npy")

array([  4, 529, 189, ...,  59, 114, 172], shape=(14635,))