In [6]:
import io
import os

import torch
from datasets import Dataset, Image as DatasetImage
from huggingface_hub import HfApi, create_repo
from huggingface_hub import DatasetCard
from huggingface_hub import notebook_login
from PIL import Image
from tqdm.auto import tqdm
from transformers import AutoProcessor, Blip2ForConditionalGeneration

# Log in to the Hugging Face Hub before anything touches it
notebook_login()

# Load the BLIP-2 captioning checkpoint in half precision and place it on the
# GPU when one is available, otherwise on the CPU
blip2_checkpoint = "Salesforce/blip2-opt-2.7b"
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
processor = AutoProcessor.from_pretrained(blip2_checkpoint)
model = Blip2ForConditionalGeneration.from_pretrained(
    blip2_checkpoint,
    torch_dtype=torch.float16,
)
model = model.to(device)

def generate_caption(image_path):
    """Caption one image file with the module-level BLIP-2 ``processor``/``model``.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        str: The first sequence returned by ``model.generate`` (at most 50 new
        tokens), decoded with special tokens skipped and surrounding
        whitespace stripped.
    """
    # Use a context manager so the underlying file handle is released as soon
    # as the RGB copy exists; convert() returns a new image that no longer
    # needs the file.
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB')

    # Cast the inputs to whatever dtype the model was loaded with instead of
    # hard-coding float16, so inputs and weights can never disagree.
    inputs = processor(images=image, return_tensors="pt").to(device, model.dtype)
    generated_ids = model.generate(**inputs, max_new_tokens=50)
    return processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

# Build a captioned image dataset from a local folder and publish it as a
# private dataset on the Hugging Face Hub, together with a dataset card.

# Prepare your local image directory
local_dir = "/workspace/data/"
dataset_name = "your-dataset-name"  # Replace with your desired dataset name

# Collect the image files up front. Sorting makes the row order reproducible
# (os.listdir returns entries in arbitrary order), and failing early avoids
# creating a repository when there is nothing to upload.
image_filenames = sorted(
    name for name in os.listdir(local_dir)
    if name.lower().endswith(('.png', '.jpg', '.jpeg'))
)
if not image_filenames:
    raise FileNotFoundError(f"No .png/.jpg/.jpeg images found in '{local_dir}'")

# Create a private repository for the dataset.
# exist_ok=True keeps this cell re-runnable: without it a second run fails
# because the repository already exists.
api = HfApi()
repo_url = create_repo(dataset_name, private=True, repo_type="dataset", exist_ok=True)
# Fully-qualified "<namespace>/<name>" id as resolved by the Hub; every later
# Hub call uses it so they all target exactly the repository created above.
repo_id = repo_url.repo_id

data = []
for filename in tqdm(image_filenames, desc="Captioning"):
    image_path = os.path.join(local_dir, filename)

    # Generate caption
    caption = generate_caption(image_path)

    # Enhance the caption for MMORPG assets
    enhanced_caption = f"a highly detailed MMORPG asset of {caption}, intricate design, 4k resolution, game art style"

    # Re-encode the image as RGB JPEG bytes. The `with` is on Image.open()
    # itself so the source file handle is closed; wrapping the result of
    # .convert() would only close the converted copy.
    # NOTE: "file_name" keeps the original extension even though the stored
    # bytes are always JPEG.
    with Image.open(image_path) as source_image:
        img_byte_arr = io.BytesIO()
        source_image.convert('RGB').save(img_byte_arr, format='JPEG')

    data.append({
        "file_name": filename,
        "image": {"bytes": img_byte_arr.getvalue()},
        "prompt": enhanced_caption
    })

# Create the dataset
dataset = Dataset.from_dict({
    "file_name": [item["file_name"] for item in data],
    "image": [item["image"] for item in data],
    "prompt": [item["prompt"] for item in data]
})

# Cast the image column to Image type
dataset = dataset.cast_column("image", DatasetImage())

# Push the dataset to the Hugging Face Hub
dataset.push_to_hub(repo_id, private=True)

# Create and add a dataset card (README.md)
dataset_card_content = f"""
# MMORPG Asset Advanced Dataset (Private)

This private dataset contains high-quality MMORPG asset images with detailed prompts for advanced fine-tuning of image generation models.

## Dataset Details
- **Size**: {len(dataset)} images
- **Format**: Images with corresponding detailed prompts
- **Use Case**: Fine-tuning image generation models for MMORPG asset creation

## Prompt Format
Each image is paired with a detailed prompt in the format:
"a highly detailed MMORPG asset of [generated description], intricate design, 4k resolution, game art style"

This format is designed to provide rich, consistent descriptions for MMORPG assets, enhancing the fine-tuning process for image generation models.
"""

# push_to_hub() has just written a README.md whose YAML header describes the
# dataset. Uploading a plain README over it would discard that header (the Hub
# then reports "empty or missing yaml metadata in repo card"), so load the
# generated card and replace only its text body.
dataset_card = DatasetCard.load(repo_id)
dataset_card.text = dataset_card_content
dataset_card.save("README.md")  # keep a local copy next to the notebook
dataset_card.push_to_hub(repo_id)

print(f"Dataset '{repo_id}' has been created and uploaded to Hugging Face Hub as a private dataset.")
print(f"You can now use this dataset in your training script with: --dataset_name='{repo_id}'")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/120 [00:00<?, ?it/s]

Expanding inputs for image tokens in BLIP-2 should be done in processing. Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Both `max_new_tokens` (=50) and `max_length`(=51) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=51) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=50) and `max_length`(=51) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/do

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

- empty or missing yaml metadata in repo card


In [None]:
!