In [1]:
import os
import sys
from pathlib import Path
from tqdm import tqdm
from miditok import REMI
from datasets import Dataset

# Configuration
INPUT_ROOT = "dataset_for_phase2"
OUTPUT_ROOT = "processed_datasets"
PRETRAINED_TOK_DIR = os.path.join(OUTPUT_ROOT, "maestro_tok")
TARGET_STYLES = ['Baroque', 'Romantic']
BLOCK_SIZE = 512

# 1. Load Pretrained Tokenizer
print("Loading Tokenizer...")
if not os.path.exists(PRETRAINED_TOK_DIR):
    print(f"Error: Directory not found: {PRETRAINED_TOK_DIR}")
    sys.exit(1)

try:
    tokenizer = REMI.from_pretrained(PRETRAINED_TOK_DIR)
    print(f"Tokenizer loaded. Vocab Size: {len(tokenizer)}")
    if len(tokenizer) < 1000:
        print("Warning: Vocabulary size is small. Verify BPE model.")
except Exception as e:
    print(f"Tokenizer load failed: {e}")
    sys.exit(1)

# 2. Processing Function
def process_style_folder(style_name):
    input_dir = os.path.join(INPUT_ROOT, style_name)
    midi_paths = list(Path(input_dir).glob("*.mid"))
    
    if not midi_paths:
        print(f"Error: No .mid files in {input_dir}")
        return None

    print(f"Processing {style_name} ({len(midi_paths)} files)...")
    
    all_chunks = []
    error_counts = 0
    
    for path in tqdm(midi_paths, desc=f"Tokenizing {style_name}"):
        try:
            tokens_result = tokenizer.encode(path)
            flat_tokens = []
            
            if isinstance(tokens_result, list):
                for seq in tokens_result:
                    flat_tokens.extend(seq.ids)
            else:
                flat_tokens = tokens_result.ids

            if not flat_tokens: continue

            # Chunking
            for i in range(0, len(flat_tokens), BLOCK_SIZE):
                chunk = flat_tokens[i : i + BLOCK_SIZE]
                if len(chunk) == BLOCK_SIZE:
                    all_chunks.append({"input_ids": chunk})
                    
        except Exception:
            error_counts += 1
            continue
            
    if error_counts > 0:
        print(f"Errors in {style_name}: {error_counts}/{len(midi_paths)}")
        
    return all_chunks

# 3. Main Execution
def main():
    if not os.path.exists(INPUT_ROOT):
        print(f"Error: Input root '{INPUT_ROOT}' missing.")
        return
    
    total_chunks = 0
    
    for style in TARGET_STYLES:
        chunks = process_style_folder(style)
        
        if chunks:
            print(f" -> {style}: {len(chunks)} chunks.")
            hf_dataset = Dataset.from_list(chunks)
            save_path = os.path.join(OUTPUT_ROOT, f"{style}_train")
            hf_dataset.save_to_disk(save_path)
            total_chunks += len(chunks)
        else:
            print(f" -> {style}: No data generated.")

    print(f"\nTotal chunks generated: {total_chunks}")

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
  return cls(**input_dict, **kwargs)


Loading Tokenizer...
Tokenizer loaded. Vocab Size: 20000
Processing Baroque (1624 files)...


Tokenizing Baroque: 100%|██████████| 1624/1624 [01:04<00:00, 25.00it/s]


Errors in Baroque: 1/1624
 -> Baroque: 11749 chunks.


Saving the dataset (1/1 shards): 100%|██████████| 11749/11749 [00:00<00:00, 73633.87 examples/s]


Processing Romantic (2195 files)...


Tokenizing Romantic: 100%|██████████| 2195/2195 [01:31<00:00, 23.98it/s]


Errors in Romantic: 29/2195
 -> Romantic: 12927 chunks.


Saving the dataset (1/1 shards): 100%|██████████| 12927/12927 [00:00<00:00, 96834.16 examples/s]


Total chunks generated: 24676



