In [3]:
import os
from tqdm.auto import tqdm

import s5cmdpy
import unibox as ub

In [4]:
v5c_path =  "/rmt/image_data/data_catalog/data_catalog/gold/dset_all_v5c.parquet"
v5c_df = ub.loads(v5c_path)

ub.peeks(v5c_df)

2024-07-29 19:34:08,461 [INFO] UniLogger: UniLoader.loads: .parquet LOADED from "/rmt/image_data/data_catalog/data_catalog/gold/dset_all_v5c.parquet" in 27.85s


(4688203, 34)
Index(['index_num', 'filename', 's3_uri', 'pixiv__pid', 'clip__clip_aesthetic',
       'pixiv__likeCount', 'pixiv__viewCount', 'pixiv__bookmarkCount',
       'pixiv__aiType', 'pixiv__compound_score', 'caption',
       'aigccls__aigc_pred', 'aigccls__is_likely-ai', 'dset__pixels',
       'dset__from', 'danbooru_id', 'danbooru__up_score',
       'danbooru__down_score', 'danbooru__fav_count', 'danbooru__is_banned',
       'danbooru__tag_string_artist', 'danbooru__tag_string_character',
       'clip__twitter_aesthetic_v2', 'comps__comp_score_pred',
       'resized__resized_s3_uri', 'clip_aesthetic_2_5',
       'danbooru__quality_tag', 'twitter__favorite_count', 'twitter__hashtags',
       'twitter__compound_score', 'twitter__quality_tag', 'dset__quality_tag',
       'dset__artist_occurence_count', 'dset__row_cap_dict'],
      dtype='object')


Unnamed: 0,index_num,filename,s3_uri,pixiv__pid,clip__clip_aesthetic,pixiv__likeCount,pixiv__viewCount,pixiv__bookmarkCount,pixiv__aiType,pixiv__compound_score,...,resized__resized_s3_uri,clip_aesthetic_2_5,danbooru__quality_tag,twitter__favorite_count,twitter__hashtags,twitter__compound_score,twitter__quality_tag,dset__quality_tag,dset__artist_occurence_count,dset__row_cap_dict
0,25,85967624_p0.jpg,s3://dataset-pixiv/artworks/85967624/85967624_...,85967624,5.768,667.0,5547.0,981.0,0.0,11.154647,...,,,,,,,,average quality,1722743,
1,56,85967667_p0.jpg,s3://dataset-pixiv/artworks/85967667/85967667_...,85967667,6.38851,3972.0,33782.0,6720.0,0.0,13.333247,...,s3://unidataset-danbooru/resized_1310720/image...,6.234648,average quality,,,,,good quality,105,
2,93,85967719_p0.jpg,s3://dataset-pixiv/artworks/85967719/85967719_...,85967719,6.14013,589.0,5614.0,949.0,0.0,12.345294,...,,,,,,,,average quality,1722743,


In [5]:
v5c_better_df = v5c_df[v5c_df["dset__quality_tag"].isin(["good quality", "best quality"])]
len(v5c_better_df)

1515328

In [6]:
sample_imgs = v5c_better_df.sample(1000)
sample_uris = sample_imgs["s3_uri"].tolist()
ub.peeks(sample_uris)

{'metadata': {'len': 1000, 'item_type': 'str'},
 'preview': ['s3://unidataset-danbooru/images/image_6253406/image_6253406.jpg',
  's3://unidataset-danbooru/images/image_3135690/image_3135690.jpg',
  's3://unidataset-danbooru/images/image_6249994/image_6249994.jpg']}

In [7]:
IMG_ROOT_DIR = "/lv0/kohya_datasets"


curr_img_dir = "small_test_1k"
_curr_img_dir = f"{IMG_ROOT_DIR}/{curr_img_dir}"
os.makedirs(_curr_img_dir, exist_ok=True)


subdir_name = "1"
_subdir_name = f"{_curr_img_dir}/{subdir_name}"
os.makedirs(_subdir_name, exist_ok=True)


s5cmdpy.download_from_s3_list(sample_uris, _subdir_name)

[interval=5] running s5cmd:   0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
!ls -l {_subdir_name} | wc -l

1001


In [9]:
_subdir_name

'/lv0/kohya_datasets/small_test_1k/1'

In [10]:
from dataproc4.utils.string_utils import safe_concat, safe_split_tag_string

def cap_dict_to_str_temp_character(cap_dict):
    """把一个dataproc4格式的caption dict转换成临时ft用的txt
    """
    tff = cap_dict.get("tags_front_fixed", "")  # artist
    tf = cap_dict.get("tags_front", "")         # character
    tm = cap_dict.get("tags_mid", "")           # general
    tm_alt = cap_dict.get("alternate_tags_mid", "")  # natural language
    tb = cap_dict.get("tags_back", "")          # quality

    tm_alt = tm_alt[0] if len(tm_alt) > 0 else ""

    tm_tags = safe_split_tag_string(tm)
    # max_tags = min(55, len(tm_tags))
    # sample_size = random.randint(5, max_tags) if len(tm_tags) >= 5 else len(tm_tags)
    # tm_tags = random.sample(tm_tags, sample_size)
    
    return safe_concat([tf], tm_tags, [tm_alt, tb])   # do not add artist tag

def write_captions_to_txt(cap_dict, output_dir):
    for k, v in tqdm(cap_dict.items(), desc=f"saving txt files to {output_dir}"):
        txt_content = cap_dict_to_str_temp_character(v)
        if not txt_content:
            print("NO CAPTION: ", k)
        txt_filename = os.path.join(output_dir, f"{k.split('.')[0]}.txt")

        os.makedirs(output_dir, exist_ok=True)
        ub.saves(txt_content, txt_filename, debug_print=False)

        

img_files = ub.traverses(_subdir_name, ub.IMG_FILES)


# create a txt for each img file
file_cap_dict = sample_imgs[["filename", "caption"]].set_index("filename").to_dict()["caption"]
write_captions_to_txt(file_cap_dict, _subdir_name)

                                          

saving txt files to /lv0/kohya_datasets/small_test_1k/1:   0%|          | 0/1000 [00:00<?, ?it/s]

In [11]:
txts = ub.traverses("/lv0/kohya_datasets/small_test_1k/1", [".txt"])
txts_content = ub.concurrent_loads(txts)

for txt in txts_content:
    if not txt:
        print("EMPTY")

                                          

Loading batches:   0%|          | 0/1000 [00:00<?, ?it/s]