In [10]:
# Start from a fresh checkout: delete any previous clone in the current directory, then clone again.
# NOTE(review): both commands are relative to the current working directory, so re-running this
# cell after the `cd` cells have executed clones the repository inside the old checkout
# (the nested path printed by the next cell shows this happening).
!rm instruction-tuned-sd -rf
!git clone https://github.com/huggingface/instruction-tuned-sd


Cloning into 'instruction-tuned-sd'...
remote: Enumerating objects: 170, done.[K
remote: Counting objects: 100% (107/107), done.[K
remote: Compressing objects: 100% (72/72), done.[K
remote: Total 170 (delta 61), reused 58 (delta 35), pack-reused 63[K
Receiving objects: 100% (170/170), 67.53 KiB | 4.82 MiB/s, done.
Resolving deltas: 100% (85/85), done.


In [11]:
cd instruction-tuned-sd

/kaggle/working/instruction-tuned-sd/data_preparation/instruction-tuned-sd


In [12]:
cd data_preparation

/kaggle/working/instruction-tuned-sd/data_preparation/instruction-tuned-sd/data_preparation


In [13]:
!pip install -r requirements.txt

[0m

In [None]:

import argparse
import hashlib
import os

import model_utils
import tensorflow as tf
import tensorflow_datasets as tfds
from PIL import Image
from tqdm import tqdm
# Number of samples to process: used as the default of --max_num_samples below and, in a later
# cell, appended to the name of the dataset that is pushed to the Hub.
num2=5000

def parse_args():
    """Parse the options of the data-preparation step.

    Options (all optional):
        --model_id: identifier passed to ``model_utils.load_model``.
        --dataset_id: name passed to ``tfds.load``.
        --max_num_samples: sample limit; defaults to the module-level ``num2``.
        --data_root: directory that receives the generated image pairs.

    Returns:
        The ``argparse.Namespace`` of recognised options. ``parse_known_args`` is used, so
        unrecognised entries in ``sys.argv`` are ignored instead of raising an error.
    """
    cli = argparse.ArgumentParser(
        description="Prepare a dataset for InstructPix2Pix style training."
    )
    option_specs = (
        ("--model_id", str, "sayakpaul/whitebox-cartoonizer"),
        ("--dataset_id", str, "imagenette"),
        ("--max_num_samples", int, num2),
        ("--data_root", str, "cartoonizer-dataset"),
    )
    for flag, value_type, default_value in option_specs:
        cli.add_argument(flag, type=value_type, default=default_value)

    known_args, _ignored = cli.parse_known_args()
    return known_args


def load_dataset(dataset_id: str, max_num_samples: int) -> tf.data.Dataset:
    """Load the ``train`` split of a TFDS dataset, shuffle it and optionally truncate it.

    Args:
        dataset_id: Name handed to ``tfds.load``.
        max_num_samples: Maximum number of samples to keep, or ``None`` for no limit.
            When given it is also used as the shuffle buffer size; otherwise the
            buffer size is 128.

    Returns:
        The shuffled dataset, limited to ``max_num_samples`` elements when a limit is set.
    """
    has_limit = max_num_samples is not None
    buffer_size = max_num_samples if has_limit else 128
    shuffled = tfds.load(dataset_id, split="train").shuffle(buffer_size)
    if not has_limit:
        return shuffled

    print(f"Dataset will be restricted to {max_num_samples} samples.")
    return shuffled.take(max_num_samples)


def main(args):
    """Cartoonize every sample of the dataset and store original/cartoonized pairs on disk.

    Each pair is written to ``<data_root>/<sha1 of the raw image bytes>/`` as
    ``original_image.png`` and ``cartoonized_image.png``. Because the directory name is
    derived from the image bytes, identical images map to the same directory and are
    overwritten rather than duplicated.

    Args:
        args: Namespace providing ``dataset_id``, ``max_num_samples``, ``model_id`` and
            ``data_root``.
    """
    print("Loading initial dataset and the Cartoonizer model...")
    samples = load_dataset(args.dataset_id, args.max_num_samples)
    cartoonizer = model_utils.load_model(args.model_id)
    cartoonize = model_utils.perform_inference(cartoonizer)

    print("Preparing the image pairs...")
    output_root = args.data_root
    os.makedirs(output_root, exist_ok=True)
    for record in tqdm(samples.as_numpy_iterator()):
        source_pixels = record["image"]
        # The result must expose `.save(path)`; presumably a PIL image — confirm in model_utils.
        cartoon = cartoonize(source_pixels)

        # Name the pair directory after the content hash of the source image.
        digest = hashlib.sha1(source_pixels.tobytes()).hexdigest()
        pair_dir = os.path.join(output_root, digest)
        os.makedirs(pair_dir, exist_ok=True)

        source_png = Image.fromarray(source_pixels).convert("RGB")
        source_png.save(os.path.join(pair_dir, "original_image.png"))
        cartoon.save(os.path.join(pair_dir, "cartoonized_image.png"))

    # Counts every entry under data_root, including pairs left over from earlier runs.
    pair_count = len(os.listdir(args.data_root))
    print(f"Total generated image-pairs: {pair_count}.")


if __name__ == "__main__":
    # Parse the options (defaults unless overridden on the command line) and run the
    # data-preparation pipeline; `args` stays bound at module level afterwards.
    args = parse_args()
    main(args)

Loading initial dataset and the Cartoonizer model...
[1mDownloading and preparing dataset 1.45 GiB (download: 1.45 GiB, generated: 1.46 GiB, total: 2.91 GiB) to ~/tensorflow_datasets/imagenette/full-size-v2/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/9469 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imagenette/full-size-v2/1.0.0.incomplete1U6N40/imagenette-train.tfrecord*...: …

Generating validation examples...:   0%|          | 0/3925 [00:00<?, ? examples/s]

Shuffling ~/tensorflow_datasets/imagenette/full-size-v2/1.0.0.incomplete1U6N40/imagenette-validation.tfrecord*…

[1mDataset imagenette downloaded and prepared to ~/tensorflow_datasets/imagenette/full-size-v2/1.0.0. Subsequent calls will reuse this data.[0m
Dataset will be restricted to 5000 samples.


Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

Preparing the image pairs...



0it [00:00, ?it/s][A
1it [00:15, 15.60s/it][A
2it [00:15,  6.59s/it][A
3it [00:16,  3.76s/it][A
4it [00:17,  2.73s/it][A
5it [00:17,  1.81s/it][A
6it [00:17,  1.25s/it][A
7it [00:18,  1.02s/it][A
8it [00:18,  1.24it/s][A
9it [00:18,  1.58it/s][A
10it [00:19,  1.83it/s][A
11it [00:19,  2.30it/s][A
12it [00:19,  2.74it/s][A
13it [00:20,  2.00it/s][A
14it [00:20,  2.20it/s][A
15it [00:22,  1.30it/s][A
16it [00:22,  1.72it/s][A
17it [00:22,  2.17it/s][A
18it [00:22,  2.57it/s][A
19it [00:23,  2.87it/s][A
20it [00:23,  3.38it/s][A
21it [00:24,  2.16it/s][A
22it [00:24,  2.59it/s][A
23it [00:27,  1.10s/it][A
24it [00:27,  1.22it/s][A
25it [00:27,  1.49it/s][A
27it [00:28,  1.63it/s][A
28it [00:29,  1.76it/s][A
29it [00:29,  1.78it/s][A
30it [00:29,  2.22it/s][A
31it [00:30,  1.79it/s][A
32it [00:30,  2.25it/s][A
33it [00:31,  1.59it/s][A
34it [00:34,  1.11s/it][A
35it [00:34,  1.18it/s][A
37it [00:36,  1.12it/s][A
38it [00:37,  1.02it/s][A
39it [00:37,  

In [None]:
!huggingface-cli login --token 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 --add-to-git-credential  # token里面是自己的配置. https://huggingface.co/settings/tokens 点newtoken_然后设置为write属性.

In [None]:
!pip install datasets==2.11.0
import argparse
import os
from typing import List

import numpy as np
from datasets import Dataset, Features
from datasets import Image as ImageFeature
from datasets import Value

# Name under which the dataset is pushed to the Hub. It embeds `num2`, which is defined in the
# data-preparation cell, so that cell must have been executed first.
DS_NAME = "cartoonizer-dataset"+str(num2)


def parse_args():
    """Parse the options of the dataset-upload step.

    Options (all optional):
        --data_root: directory containing one sub-directory per image pair.
        --instructions_path: text file with one edit instruction per line.

    Returns:
        The ``argparse.Namespace`` of recognised options. ``parse_known_args`` is used, so
        unrecognised entries in ``sys.argv`` are ignored instead of raising an error.
    """
    cli = argparse.ArgumentParser()
    string_options = (
        ("--data_root", "cartoonizer-dataset"),
        ("--instructions_path", "instructions.txt"),
    )
    for flag, default_value in string_options:
        cli.add_argument(flag, type=str, default=default_value)

    known_args, _ignored = cli.parse_known_args()
    return known_args


def load_instructions(instructions_path: str) -> List[str]:
    """Read the edit instructions from a text file, one instruction per line.

    Args:
        instructions_path: Path to a UTF-8 encoded text file.

    Returns:
        The instructions with surrounding whitespace removed. Blank lines are skipped so
        that an empty string can never be sampled as an edit prompt.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding keeps the result independent of the platform's default locale.
    with open(instructions_path, "r", encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]


def generate_examples(data_paths: List[str], instructions: List[str]):
    """Build a zero-argument generator function that yields one example per image pair.

    Args:
        data_paths: Sequence whose items are indexed with ``[0]`` (original image path)
            and ``[1]`` (cartoonized image path).
        instructions: Candidate edit prompts; one is drawn with ``np.random.choice``
            for every example at the moment it is yielded.

    Returns:
        A callable that, when invoked, lazily yields dicts with the keys
        ``original_image``, ``edit_prompt`` and ``cartoonized_image``.
    """

    def fn():
        yield from (
            {
                "original_image": {"path": pair[0]},
                "edit_prompt": np.random.choice(instructions),
                "cartoonized_image": {"path": pair[1]},
            }
            for pair in data_paths
        )

    return fn


def main(args):
    """Assemble the image pairs under ``args.data_root`` into a dataset and push it to the Hub.

    Every entry of ``args.data_root`` is treated as a sample directory that contains
    ``original_image.png`` and ``cartoonized_image.png``. Each pair is combined with a
    randomly chosen instruction and the result is uploaded under ``DS_NAME``.

    Args:
        args: Namespace providing ``instructions_path`` and ``data_root``.
    """
    instructions = load_instructions(args.instructions_path)

    root = args.data_root
    image_pairs = [
        (
            os.path.join(root, sample_name, "original_image.png"),
            os.path.join(root, sample_name, "cartoonized_image.png"),
        )
        for sample_name in os.listdir(root)
    ]

    generation_fn = generate_examples(image_pairs, instructions)
    print("Creating dataset...")
    schema = Features(
        original_image=ImageFeature(),
        edit_prompt=Value("string"),
        cartoonized_image=ImageFeature(),
    )
    ds = Dataset.from_generator(generation_fn, features=schema)

    print("Pushing to the Hub...")
    ds.push_to_hub(DS_NAME)


if __name__ == "__main__":
    # Parse the upload options and build/push the dataset; `args` stays bound at module level.
    args = parse_args()
    main(args)

In [None]:
# Successfully pushed to https://huggingface.co/datasets/zhangbo2008/cartoonizer-dataset

In [None]:
# Summary: this project runs every image of a dataset through the cartoonizer model and then pushes the results to the Hugging Face Hub. Quite simple.