In [None]:
pip install datasets


Verify Dataset

In [None]:
from datasets import load_dataset

# Open the "train" split as a stream, so examples are fetched lazily while iterating.
# NOTE(review): no `revision` is passed here, so this reads the repository's default
# branch, not a Parquet-specific branch -- confirm whether
# revision="refs/convert/parquet" was intended.
dataset = load_dataset("takara-ai/sangyo_no_yume_industrial_dreams", split="train", streaming=True)

# Pull only the first example from the stream to inspect its structure
example = next(iter(dataset))

# Print the full content of that single example
print(example)

# Print the field names (columns) found in that example
print("Columns:", example.keys())


- read gallery_config.yaml
- clear existing images
- load each configured dataset (streaming)
- put images in gallery
- record URLs and prompts in gallery_images.yaml

In [None]:
import itertools
import os
import random
import shutil

import yaml
from datasets import load_dataset

def get_base_dir():
    """Return the directory that relative project paths are resolved against.

    When the module-level ``__file__`` name is defined (the code runs as a
    script), the absolute directory containing that file is returned.
    When it is not defined (``NameError``, e.g. in an interactive session),
    the current working directory is returned instead.
    """
    try:
        script_path = __file__
    except NameError:
        # No __file__ in this execution context: fall back to the CWD.
        return os.getcwd()
    return os.path.dirname(os.path.abspath(script_path))

def clear_directory(directory):
    """Delete everything inside ``directory`` while keeping the directory itself.

    Regular files and symbolic links are unlinked; sub-directories are removed
    recursively. Deletion is best-effort: a failure on one entry is reported
    with ``print`` and the remaining entries are still processed.

    Args:
        directory: Path of an existing directory. ``os.listdir`` raises if it
            cannot be listed; that error is not caught here.
    """
    for entry_name in os.listdir(directory):
        file_path = os.path.join(directory, entry_name)
        try:
            # Links are tested alongside files so that a symlink pointing at a
            # directory is unlinked rather than followed and emptied.
            is_file_or_link = os.path.isfile(file_path) or os.path.islink(file_path)
            if is_file_or_link:
                os.unlink(file_path)
                continue
            if os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')

# Build the gallery: read gallery_config.yaml, refresh one image folder per
# configured dataset, and record the saved images in gallery_images.yaml.

# Size of the candidate pool taken from the head of each stream: ten candidates
# per requested image, capped at 1000 examples so the in-memory buffer stays
# bounded. Because of the cap, at most MAX_CANDIDATES images can be selected
# per dataset.
CANDIDATES_PER_IMAGE = 10
MAX_CANDIDATES = 1000

base_dir = get_base_dir()
gallery_dir = os.path.abspath(os.path.join(base_dir, '..', 'env', 'assets', 'images', 'gallery'))

# The config is expected to hold a top-level 'datasets' list whose items each
# provide 'name' and 'num_images' (the keys read in the loop below).
config_path = os.path.join(base_dir, 'gallery_config.yaml')
with open(config_path, 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

gallery_images = []

for dataset_config in config['datasets']:
    dataset_name = dataset_config['name']
    num_images = dataset_config['num_images']

    output_dir = os.path.join(gallery_dir, dataset_name)

    # Make sure the folder exists, then empty it so images from earlier runs
    # do not linger. Both steps are safe to repeat on re-runs.
    os.makedirs(output_dir, exist_ok=True)
    clear_directory(output_dir)

    dataset = load_dataset(dataset_name, revision="refs/convert/parquet", split="train", streaming=True)

    # Buffer only the first `buffer_size` examples of the stream and sample
    # from that pool. The earlier hand-written loop stopped as soon as the
    # buffer was full, so its replacement branch never ran; islice expresses
    # the behaviour it actually had.
    buffer_size = min(num_images * CANDIDATES_PER_IMAGE, MAX_CANDIDATES)
    buffer = list(itertools.islice(dataset, buffer_size))

    # A stream shorter than the request yields fewer images rather than an error.
    selected_images = random.sample(buffer, min(num_images, len(buffer)))

    for example in selected_images:
        # Files are named after the example's 'seed' field; examples sharing a
        # seed would overwrite each other.
        image_filename = f"{example['seed']}.png"
        image_path = os.path.join(output_dir, image_filename)
        # presumably a PIL image exposing .save() -- TODO confirm against the dataset schema
        example['image'].save(image_path)

        gallery_images.append({
            'image_url': f'/assets/images/gallery/{dataset_name}/{image_filename}',
            'positive_prompt': example['positive_prompt'],
        })

output_file = os.path.join(base_dir, 'gallery_images.yaml')
with open(output_file, 'w', encoding='utf-8') as file:
    yaml.dump({'images': gallery_images}, file)

print(f"Generated {output_file} with randomly selected images from the streaming datasets.")
