In [1]:
import os
import re
from datasets import load_dataset
from tqdm import tqdm

# Configuration
# Root directory that receives one sub-folder per exported style.
OUTPUT_ROOT = "dataset_for_phase2"

# Maps a style keyword to the sub-folder (under OUTPUT_ROOT) its files go to.
# main() lower-cases each keyword and looks for it as a substring of a
# sample's 'style' field; the first keyword that matches, in this order, wins.
TARGET_STYLES = dict(
    Baroque='Baroque',
    Romantic='Romantic',
    Renaissance='Renaissance',
)

# Upper bound on exported files per style; a falsy value (None) means no limit.
MAX_FILES_PER_STYLE = None

def clean_filename(text) -> str:
    """Return a filesystem-safe name derived from *text*.

    The value is converted with ``str()``, surrounding whitespace is
    stripped, and every character other than an ASCII letter or digit is
    replaced by an underscore.

    Args:
        text: Any value; typically a composer name.

    Returns:
        The sanitized name, or ``"Unknown"`` when *text* is falsy or
        contains nothing but whitespace.
    """
    if not text:
        return "Unknown"
    sanitized = re.sub(r'[^a-zA-Z0-9]', '_', str(text).strip())
    # Whitespace-only input is truthy but strips down to an empty string;
    # fall back to the placeholder so callers never get an empty name.
    return sanitized or "Unknown"

def main():
    """Export the MIDI files of the targeted styles into per-style folders.

    Creates one directory per entry of ``TARGET_STYLES`` under
    ``OUTPUT_ROOT``, loads the ``train`` split of the
    ``TiMauzi/imslp-midi-by-sa`` dataset, and for every sample whose
    ``style`` field contains one of the target keywords (case-insensitive
    substring match) writes the sample's ``midi`` payload to
    ``<OUTPUT_ROOT>/<style folder>/<composer>_<index>.mid``.

    Samples whose payload is missing or not bytes, and samples whose write
    fails, are reported and skipped without leaving an empty or truncated
    file behind. A summary with the number of files per style is printed
    at the end.
    """
    # 1. Setup directories
    for folder in TARGET_STYLES.values():
        os.makedirs(os.path.join(OUTPUT_ROOT, folder), exist_ok=True)

    print("Loading dataset...")
    dataset = load_dataset('TiMauzi/imslp-midi-by-sa', split='train')

    print("Extracting files...")
    counts = {style: 0 for style in TARGET_STYLES}

    for sample in tqdm(dataset):
        # Once every style has reached the cap nothing more can be written,
        # so there is no point in scanning the rest of the dataset.
        if MAX_FILES_PER_STYLE and all(
            n >= MAX_FILES_PER_STYLE for n in counts.values()
        ):
            break

        raw_style = str(sample.get('style', '')).lower()
        if not raw_style:
            continue

        # Identify target style (first keyword found in the style string)
        target_style = next(
            (k for k in TARGET_STYLES if k.lower() in raw_style), None
        )
        if not target_style:
            continue

        # Check limit (a falsy MAX_FILES_PER_STYLE means "no limit")
        if MAX_FILES_PER_STYLE and counts[target_style] >= MAX_FILES_PER_STYLE:
            continue

        # Prepare path; the per-style counter keeps names unique in a folder
        composer = clean_filename(sample.get('composer', 'Unknown'))
        filename = f"{composer}_{counts[target_style]}.mid"
        save_path = os.path.join(
            OUTPUT_ROOT, TARGET_STYLES[target_style], filename
        )

        # Validate the payload *before* opening the file: opening in 'wb'
        # mode creates the file, so a bad payload would otherwise leave an
        # empty .mid on disk.
        midi_data = sample.get('midi')
        if not isinstance(midi_data, (bytes, bytearray)):
            print(
                f"Error saving {filename}: expected MIDI bytes, "
                f"got {type(midi_data).__name__}"
            )
            continue

        # Write file
        try:
            with open(save_path, 'wb') as f:
                f.write(midi_data)
        except OSError as e:
            print(f"Error saving {filename}: {e}")
            # Best-effort cleanup of a partially written file.
            try:
                os.remove(save_path)
            except OSError:
                pass
        else:
            counts[target_style] += 1

    print(f"\nExtraction complete. Output: {os.path.abspath(OUTPUT_ROOT)}")
    for style, count in counts.items():
        print(f"{style:<12}: {count} files")

# Run the extraction only when this code is executed directly (as a script or
# in a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


Loading dataset...
Extracting files...


100%|██████████| 5593/5593 [00:14<00:00, 375.52it/s]


Extraction complete. Output: /home/dengjian/dataset_for_phase2
Baroque     : 1624 files
Romantic    : 2195 files
Renaissance : 688 files



