In [None]:
pip install datasets joblib


Verify Dataset

In [None]:
from datasets import load_dataset

# Open the "train" split in streaming mode so records are fetched lazily.
# No `revision` is passed, so the dataset repository's default branch is used.
dataset = load_dataset("takara-ai/sangyo_no_yume_industrial_dreams", split="train", streaming=True)

# Load a single example to inspect the structure; fail with a clear message
# (instead of a bare StopIteration) if the stream yields nothing.
example = next(iter(dataset), None)
if example is None:
    raise RuntimeError("The 'train' split yielded no examples; check the dataset name and split.")

# Print the structure of the dataset
print(example)

# Print the available columns
print("Columns:", example.keys())


- read gallery_config.yaml
- clear existing images
- connect to the datasets listed in the config
- put images in gallery
- record URLs and prompts in gallery_images.yaml

# IMAGE DATASET RETRIEVER

In [None]:
import yaml
import os
import logging
from multiprocessing import Pool, cpu_count
from functools import partial
from dataset_processor import process_dataset, clear_directory

# Configure logging: INFO and above, formatted as "timestamp - level - message"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def get_base_dir():
    """Return the directory that relative project paths are resolved against.

    Uses the directory containing this file when ``__file__`` is defined;
    otherwise falls back to the current working directory.
    """
    try:
        script_path = __file__
    except NameError:
        # __file__ is not defined in this execution context
        return os.getcwd()
    return os.path.dirname(os.path.abspath(script_path))

def main():
    """Process the configured datasets in parallel and record the results.

    Reads ``gallery_config.yaml`` from the base directory, passes each entry
    of its ``datasets`` collection to ``process_dataset`` through a worker
    pool, and writes the combined results to ``gallery_images.yaml`` under
    the ``images`` key.

    Raises:
        ValueError: If the config does not contain a non-empty ``datasets``
            entry (including an empty or non-mapping config file).
    """
    logging.info("Script started")

    base_dir = get_base_dir()
    logging.info(f"Base directory: {base_dir}")

    gallery_dir = os.path.abspath(os.path.join(base_dir, '..', 'env', 'assets', 'images', 'gallery'))
    logging.info(f"Gallery directory: {gallery_dir}")

    config_path = os.path.join(base_dir, 'gallery_config.yaml')
    logging.info(f"Loading config from: {config_path}")
    # Read as UTF-8 explicitly so parsing does not depend on the platform's locale encoding
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)

    # An empty file parses to None, and an empty list would reach Pool(0),
    # which fails with an opaque "Number of processes must be at least 1".
    datasets = config.get('datasets') if isinstance(config, dict) else None
    if not datasets:
        raise ValueError(f"No datasets configured in {config_path}: expected a non-empty 'datasets' list")

    # Use multiprocessing to process datasets in parallel (never more workers than datasets)
    num_processes = min(cpu_count(), len(datasets))
    logging.info(f"Using {num_processes} processes for parallel processing")

    with Pool(num_processes) as pool:
        process_func = partial(process_dataset, gallery_dir=gallery_dir, base_dir=base_dir)
        results = pool.map(process_func, datasets)

    # Flatten the per-dataset results into one list
    # (assumes process_dataset returns an iterable of image records - TODO confirm)
    gallery_images = [image for sublist in results for image in sublist]

    output_file = os.path.join(base_dir, 'gallery_images.yaml')
    logging.info(f"Saving gallery images to: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as file:
        yaml.dump({'images': gallery_images}, file)

    logging.info(f"Generated {output_file} with {len(gallery_images)} randomly selected images from the streaming datasets.")
    logging.info("Script completed")

# Run the pipeline only when this code is executed directly (for example as a
# script or as an executed notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()
