In [3]:
# Start from a clean checkout: delete any previous clone, then fetch the repo.
# The options are placed before the operand so the command also works with
# non-GNU `rm` implementations, which stop option parsing at the first operand.
!rm -rf instruction-tuned-sd
!git clone https://github.com/huggingface/instruction-tuned-sd


Cloning into 'instruction-tuned-sd'...
remote: Enumerating objects: 170, done.[K
remote: Counting objects: 100% (107/107), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 170 (delta 61), reused 58 (delta 35), pack-reused 63[K
Receiving objects: 100% (170/170), 67.53 KiB | 7.50 MiB/s, done.
Resolving deltas: 100% (85/85), done.


In [4]:
# Enter the freshly cloned repository (path is relative to the current directory).
%cd instruction-tuned-sd

/kaggle/working/instruction-tuned-sd


In [5]:
# Move into the data preparation folder (path is relative to the current directory).
%cd data_preparation

/kaggle/working/instruction-tuned-sd/data_preparation


In [6]:
# `%pip` installs into the environment of the running kernel, whereas `!pip`
# runs whichever `pip` happens to be first on the shell PATH.
%pip install -r requirements.txt

[0m

In [7]:

import argparse
import hashlib
import os

import model_utils
import tensorflow as tf
import tensorflow_datasets as tfds
from PIL import Image
from tqdm import tqdm


def parse_args():
    """Parse the options of the data preparation step.

    Recognised flags are ``--model_id``, ``--dataset_id``,
    ``--max_num_samples`` and ``--data_root``. Any other command-line token is
    ignored instead of raising an error, because parsing is done with
    ``parse_known_args``.

    Returns:
        argparse.Namespace: the parsed known options.
    """
    # (flag, value type, default) for every supported option.
    option_specs = (
        ("--model_id", str, "sayakpaul/whitebox-cartoonizer"),
        ("--dataset_id", str, "imagenette"),
        ("--max_num_samples", int, 50),
        ("--data_root", str, "cartoonizer-dataset"),
    )

    cli = argparse.ArgumentParser(
        description="Prepare a dataset for InstructPix2Pix style training."
    )
    for flag, value_type, fallback in option_specs:
        cli.add_argument(flag, type=value_type, default=fallback)

    # The second element holds the unrecognised tokens, which are discarded.
    known, _unrecognised = cli.parse_known_args()
    return known


def load_dataset(dataset_id: str, max_num_samples: int) -> tf.data.Dataset:
    """Load the ``train`` split of a TFDS dataset, shuffle it and optionally cap it.

    Args:
        dataset_id: Name passed to ``tfds.load``.
        max_num_samples: Number of samples to keep. It is also used as the
            shuffle buffer size. When ``None`` the dataset is not truncated and
            a buffer size of 128 is used.

    Returns:
        The shuffled (and possibly truncated) dataset.
    """
    limit_requested = max_num_samples is not None
    buffer_size = max_num_samples if limit_requested else 128

    shuffled = tfds.load(dataset_id, split="train").shuffle(buffer_size)
    if not limit_requested:
        return shuffled

    print(f"Dataset will be restricted to {max_num_samples} samples.")
    return shuffled.take(max_num_samples)


def main(args):
    """Write (original, cartoonized) image pairs below ``args.data_root``.

    For every sample of the loaded dataset the image is run through the
    inference function obtained from ``model_utils`` and both versions are
    saved as PNG files in a directory named after the SHA-1 digest of the
    original image bytes.

    Args:
        args: Namespace providing ``dataset_id``, ``max_num_samples``,
            ``model_id`` and ``data_root``.
    """
    print("Loading initial dataset and the Cartoonizer model...")
    dataset = load_dataset(args.dataset_id, args.max_num_samples)
    inference_fn = model_utils.perform_inference(
        model_utils.load_model(args.model_id)
    )

    print("Preparing the image pairs...")
    output_root = args.data_root
    os.makedirs(output_root, exist_ok=True)
    for record in tqdm(dataset.as_numpy_iterator()):
        source_array = record["image"]
        # The returned object must expose ``.save(path)``; it is used below.
        cartoon = inference_fn(source_array)

        # One directory per sample, keyed by the digest of the raw pixel bytes,
        # so identical images map to the same directory.
        digest = hashlib.sha1(source_array.tobytes()).hexdigest()
        pair_dir = os.path.join(output_root, digest)
        os.makedirs(pair_dir, exist_ok=True)

        source_picture = Image.fromarray(source_array).convert("RGB")
        source_picture.save(os.path.join(pair_dir, "original_image.png"))
        cartoon.save(os.path.join(pair_dir, "cartoonized_image.png"))

    # Counts every entry found in the output directory.
    pair_count = len(os.listdir(output_root))
    print(f"Total generated image-pairs: {pair_count}.")


if __name__ == "__main__":
    # Entry point of the data preparation step: read the options, then
    # generate the image pairs.
    args = parse_args()
    # Disabled alternative kept from experimentation: parse with a bare
    # ArgumentParser, which would yield a namespace without any option.
#     args=argparse.ArgumentParser().parse_known_args()[0]
    main(args)

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Loading initial dataset and the Cartoonizer model...
[1mDownloading and preparing dataset 1.45 GiB (download: 1.45 GiB, generated: 1.46 GiB, total: 2.91 GiB) to ~/tensorflow_datasets/imagenette/full-size-v2/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/9469 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imagenette/full-size-v2/1.0.0.incomplete2XG8G5/imagenette-train.tfrecord*...: …

Generating validation examples...:   0%|          | 0/3925 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imagenette/full-size-v2/1.0.0.incomplete2XG8G5/imagenette-validation.tfrecord*…

[1mDataset imagenette downloaded and prepared to ~/tensorflow_datasets/imagenette/full-size-v2/1.0.0. Subsequent calls will reuse this data.[0m
Dataset will be restricted to 50 samples.


Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

Preparing the image pairs...


50it [00:31,  1.61it/s]


Total generated image-pairs: 50.


In [8]:
# Log in with your own access token: create one at
# https://huggingface.co/settings/tokens ("New token") and give it the "write" role.
# The token is read from the HF_TOKEN environment variable so that it is never
# stored in the notebook. A token that has been pasted into a shared notebook
# must be considered leaked and should be revoked on the settings page.
!huggingface-cli login --token "$HF_TOKEN" --add-to-git-credential

Token is valid.
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [9]:
# `%pip` installs into the environment of the running kernel, whereas `!pip`
# runs whichever `pip` happens to be first on the shell PATH.
%pip install datasets==2.11.0
import argparse
import os
from typing import List

import numpy as np
from datasets import Dataset, Features
from datasets import Image as ImageFeature
from datasets import Value

# Repository name handed to `push_to_hub` when the dataset is uploaded.
DS_NAME = "cartoonizer-dataset"


def parse_args():
    """Parse the options of the dataset upload step.

    Recognised flags are ``--data_root`` and ``--instructions_path``. Any
    other command-line token is ignored instead of raising an error, because
    parsing is done with ``parse_known_args``.

    Returns:
        argparse.Namespace: the parsed known options.
    """
    # (flag, default) for every supported option; both are plain strings.
    option_defaults = (
        ("--data_root", "cartoonizer-dataset"),
        ("--instructions_path", "instructions.txt"),
    )

    cli = argparse.ArgumentParser()
    for flag, fallback in option_defaults:
        cli.add_argument(flag, type=str, default=fallback)

    # The second element holds the unrecognised tokens, which are discarded.
    known, _unrecognised = cli.parse_known_args()
    return known


def load_instructions(instructions_path: str) -> List[str]:
    """Read the edit instructions, one per line, from a text file.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping are dropped, so a blank line in the file can never end up
    as an empty edit prompt. The file is decoded as UTF-8 regardless of the
    platform default encoding.

    Args:
        instructions_path: Path of the text file holding the instructions.

    Returns:
        The non-empty instructions in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(instructions_path, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        # Materialise the list while the file is still open.
        return [line for line in stripped_lines if line]


def generate_examples(data_paths: List[str], instructions: List[str]):
    """Build a generator function yielding one dataset row per image pair.

    Args:
        data_paths: Sequence whose items hold the original image path at
            index 0 and the cartoonized image path at index 1.
        instructions: Candidate edit prompts; one is drawn with
            ``np.random.choice`` for every row.

    Returns:
        A zero-argument function. Each call returns a fresh generator of
        dictionaries with the keys ``original_image``, ``edit_prompt`` and
        ``cartoonized_image``.
    """

    def fn():
        for pair in data_paths:
            # Filled key by key so the prompt is drawn between the two path lookups.
            row = {"original_image": {"path": pair[0]}}
            row["edit_prompt"] = np.random.choice(instructions)
            row["cartoonized_image"] = {"path": pair[1]}
            yield row

    return fn


def main(args):
    """Assemble the image-pair dataset from ``args.data_root`` and push it to the Hub.

    Every entry of ``args.data_root`` is treated as a sample directory that
    should contain ``original_image.png`` and ``cartoonized_image.png``.
    Entries missing either file (stray files, half-written samples) are
    reported and skipped instead of failing later while the dataset is built.

    Args:
        args: Namespace providing ``instructions_path`` and ``data_root``.
    """
    instructions = load_instructions(args.instructions_path)

    image_pairs = []
    skipped_entries = []
    # Sorted so the row order does not depend on the arbitrary order in which
    # os.listdir returns the entries.
    for entry in sorted(os.listdir(args.data_root)):
        sample_dir = os.path.join(args.data_root, entry)
        original_image = os.path.join(sample_dir, "original_image.png")
        cartoonized_image = os.path.join(sample_dir, "cartoonized_image.png")
        if os.path.isfile(original_image) and os.path.isfile(cartoonized_image):
            image_pairs.append((original_image, cartoonized_image))
        else:
            skipped_entries.append(sample_dir)

    if skipped_entries:
        print(
            f"Skipping {len(skipped_entries)} entries without a complete image pair: "
            f"{skipped_entries}"
        )

    generation_fn = generate_examples(image_pairs, instructions)
    print("Creating dataset...")
    ds = Dataset.from_generator(
        generation_fn,
        features=Features(
            original_image=ImageFeature(),
            edit_prompt=Value("string"),
            cartoonized_image=ImageFeature(),
        ),
    )

    print("Pushing to the Hub...")
    ds.push_to_hub(DS_NAME)


if __name__ == "__main__":
    # Entry point of the upload step: read the options, then build the dataset
    # and push it to the Hub.
    args = parse_args()
    main(args)

[0mCreating dataset...
Downloading and preparing dataset generator/default to /root/.cache/huggingface/datasets/generator/default-0854825ace59f3dd/0.0.0...


Generating train split: 0 examples [00:00, ? examples/s]

Dataset generator downloaded and prepared to /root/.cache/huggingface/datasets/generator/default-0854825ace59f3dd/0.0.0. Subsequent calls will reuse this data.
Pushing to the Hub...


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
#  Successfully pushed to https://huggingface.co/datasets/zhangbo2008/cartoonizer-dataset