In [8]:
import os
import json
import random 
import shutil


## Make small batch

In [9]:
# Goal: build a random sample of the dataset that is at most 1 GB in size

# Config variables
FOLDER_SIZE = 1024 * 1024 * 1024  # 1 GB expressed in bytes
MAIN_FOLDER_PATH = r"D:\Nsynth dataset\train\nsynth-train"
BATCH_TARGET_FOLDER = r"D:\Nsynth dataset\batch\train"


def _list_wav_files(folder):
    """Return the names of the regular ``.wav`` files located directly in *folder*."""
    return [
        name
        for name in os.listdir(folder)
        if name.lower().endswith(".wav")
        and os.path.isfile(os.path.join(folder, name))
    ]


def make_small_batch(main_folder_path: str = MAIN_FOLDER_PATH, folder_size: int = FOLDER_SIZE, batch_target_folder: str = BATCH_TARGET_FOLDER) -> None:
    """
    Creates a randomized, size-limited audio batch from a larger dataset folder,
    including both the audio files and their corresponding metadata.

    Parameters:
    ----------
    main_folder_path : str
        Path to the dataset directory containing the "audio" folder and "examples.json".

    folder_size : int
        Maximum size (in bytes) of the .wav files in the output batch folder.
        Default is 1GB. Files that are already present in the batch folder count
        towards this limit, so calling the function again does not grow the
        batch beyond it.

    batch_target_folder : str
        Path where the batch folder will be created. It will include:
            - audio/: copied .wav files
            - examples.json: metadata for the .wav files in audio/

    Function Workflow:
    ------------------
    1. Loads `examples.json` and lists the .wav files of the input folder
       (other entries such as sub-directories or text files are ignored).
    2. Shuffles the .wav files and copies them one by one; stops at the first
       file that would push the batch over `folder_size`.
    3. Filters the metadata to the entries whose key equals the name (without
       extension) of a .wav file in the batch folder.
    4. Writes the filtered metadata to `examples.json` in the batch folder.

    Outputs:
    --------
    - Copies audio files to `batch_target_folder/audio`
    - Writes a filtered `examples.json` to `batch_target_folder`
    - Prints:
        - Each copied file
        - Total batch size (MB)
        - Number of audio files in the batch
        - Whether the number of audio files matches the number of labels

    Raises:
    -------
    FileNotFoundError
        If `examples.json` or the "audio" folder is missing from `main_folder_path`.
    """
    main_audio_folder = os.path.join(main_folder_path, "audio")
    main_audio_json = os.path.join(main_folder_path, "examples.json")
    target_audio_folder = os.path.join(batch_target_folder, "audio")
    target_audio_json = os.path.join(batch_target_folder, "examples.json")

    # Read every input up front so that a missing or broken dataset is reported
    # before any file has been copied.
    with open(main_audio_json, "r", encoding="utf-8") as f:
        audio_metadata = json.load(f)
    all_files = _list_wav_files(main_audio_folder)
    random.shuffle(all_files)

    # Creates batch_target_folder as well, since it is the parent directory.
    os.makedirs(target_audio_folder, exist_ok=True)

    # Files left over from an earlier run are part of the batch: they use up
    # size budget and do not need to be copied again.
    already_in_batch = set(_list_wav_files(target_audio_folder))
    current_size = sum(
        os.path.getsize(os.path.join(target_audio_folder, name))
        for name in already_in_batch
    )

    for file in all_files:
        if file in already_in_batch:
            continue
        file_path = os.path.join(main_audio_folder, file)
        file_size = os.path.getsize(file_path)

        if current_size + file_size > folder_size:  # Folder size limit reached
            break
        dst = os.path.join(target_audio_folder, file)
        shutil.copy2(file_path, dst)  # copy2 also preserves file timestamps
        current_size += file_size

        print(f"Copied {file} to {dst}")
    print(f"✅ Batch created! Total size: {current_size / (1024**2):.2f} MB")

    # List the batch once and reuse it for both the count and the metadata filter.
    batch_files = _list_wav_files(target_audio_folder)
    batch_audio_folder_length = len(batch_files)
    print(f"Batch audio folder length: {batch_audio_folder_length}")

    # Metadata keys are matched against file names without their extension,
    # e.g. "keyboard_acoustic_004-060-025.wav" -> "keyboard_acoustic_004-060-025".
    batch_keys = (os.path.splitext(name)[0] for name in batch_files)
    json_dict = {key: audio_metadata[key] for key in batch_keys if key in audio_metadata}

    # Save filtered metadata to new examples.json
    with open(target_audio_json, "w", encoding="utf-8") as f:
        json.dump(json_dict, f)

    print(f"✅ Saved metadata for {len(json_dict)} files.")

    print("Labels in the batch equals audio batch" if len(json_dict) == batch_audio_folder_length else "Labels in the batch does not equal audio batch")

            


            
            


            



    
        



Copied flute_synthetic_002-079-127.wav to D:\Nsynth dataset\batch\train\audio\flute_synthetic_002-079-127.wav
Copied keyboard_electronic_015-076-025.wav to D:\Nsynth dataset\batch\train\audio\keyboard_electronic_015-076-025.wav
Copied vocal_synthetic_005-051-100.wav to D:\Nsynth dataset\batch\train\audio\vocal_synthetic_005-051-100.wav
Copied bass_synthetic_141-061-025.wav to D:\Nsynth dataset\batch\train\audio\bass_synthetic_141-061-025.wav
Copied organ_electronic_002-068-025.wav to D:\Nsynth dataset\batch\train\audio\organ_electronic_002-068-025.wav
Copied brass_acoustic_041-067-075.wav to D:\Nsynth dataset\batch\train\audio\brass_acoustic_041-067-075.wav
Copied bass_synthetic_053-025-075.wav to D:\Nsynth dataset\batch\train\audio\bass_synthetic_053-025-075.wav
Copied keyboard_synthetic_008-064-050.wav to D:\Nsynth dataset\batch\train\audio\keyboard_synthetic_008-064-050.wav
Copied bass_synthetic_132-090-100.wav to D:\Nsynth dataset\batch\train\audio\bass_synthetic_132-090-100.wav
Co