In [148]:
import os
import json
import tqdm
import pandas as pd
import numpy as np

from PIL import Image

In [75]:
# Root directory of the Fashion200k data. It can be supplied through the
# FASHION200K_DIR environment variable so the notebook does not need editing
# per machine; the fallback is the original hard-coded value.
FASHION200K_DIR = os.environ.get("FASHION200K_DIR", "")
# JSON file whose top-level keys are read as the set of categories in use.
FASHION_200K_CATEGORIES = "../categories/fashion_200k_categories.json"

In [76]:
# loaded_images.txt is read as one entry per line and kept as a set for fast
# membership tests.
with open(f"{FASHION200K_DIR}/loaded_images.txt", "r") as images_file:
    loaded_images = set(images_file.read().split("\n"))

# The category file is a JSON object; only its keys are collected here.
with open(FASHION_200K_CATEGORIES, "r") as categories_file:
    category_mapping = json.loads(categories_file.read())
used_categories = {*category_mapping.keys()}

In [127]:
# Caption word -> garment category.
# Keys are matched against the individual words of each label line; the value
# is the category recorded for that image and later compared with the class
# name of each detection.
label_categories_mapping = {
    "dress": "dress",
    "skirt": "skirt",
    # Was misspelled "jackt", so captions containing "jacket" never matched.
    "jacket": "outwear",
    "outwear": "outwear",
    "sweater": "outwear",
    "blazer": "outwear",
    "shorts": "shorts",
    "pants": "pants",
    "trousers": "pants",
    "pant": "pants",
    "jeans": "pants",
    "blouse": "top",
    "top": "top",
    "shirt": "top",
}

In [110]:
# Directory holding the label (caption) files, one level below the dataset root.
labels_dir = "{}/labels".format(FASHION200K_DIR)

In [77]:
# Directory holding the detection files, one level below the dataset root.
detection_dir = "{}/detection".format(FASHION200K_DIR)

In [129]:
# Image path -> category, filled in from the label files.
filename2category: dict = dict()

In [132]:
# Walk every label file and record, for each image, the category of the last
# caption word that appears in label_categories_mapping.
for labels_file in os.listdir(labels_dir):
    with open(f"{labels_dir}/{labels_file}", "r") as f:
        rows = f.read().split("\n")
    for row in rows:
        tokens = row.split()
        if len(tokens) >= 2:
            image_name = tokens[0]
            # tokens[1] is skipped (presumably a score -- confirm against the
            # dataset docs); caption words start at index 2.
            hits = [
                label_categories_mapping[tok]
                for tok in tokens[2:]
                if tok in label_categories_mapping
            ]
            if hits:
                # A later word overrides an earlier one, as with repeated assignment.
                filename2category[image_name] = hits[-1]

In [78]:
def valid_bbox(bbox) -> bool:
    """Tell whether a box is usable for cropping.

    ``bbox`` is indexed as ``[0], [1], [2], [3]``; element 2 must be strictly
    greater than element 0, element 3 strictly greater than element 1, and no
    element may be negative.

    Returns:
        True when all of the conditions above hold, False otherwise.
    """
    # Degenerate or inverted extent along either axis.
    if bbox[2] <= bbox[0] or bbox[3] <= bbox[1]:
        return False
    # Any negative coordinate disqualifies the box.
    return not any(value < 0 for value in bbox)

In [136]:
# Crop every detection whose class equals the category derived from the image's
# label line, and collect parallel lists of (category, crop path) for the
# output table. Crops already on disk are not rewritten.
categories = []
crop_paths = []

for filename in os.listdir(detection_dir):
    # Only detection files whose names start with "women" are processed.
    if not filename.startswith("women"):
        continue
    with open(f"{detection_dir}/{filename}", "r") as f:
        detection_data = f.read()
    for line in tqdm.tqdm(detection_data.split("\n")):
        line = line.split()
        # Need an image path followed by at least one detection token.
        if len(line) < 2:
            continue
        image_path = line[0]
        if image_path not in filename2category:
            continue
        if image_path not in loaded_images:
            continue
        expected_cat = filename2category[image_path]
        # The context manager guarantees the underlying file is closed, even
        # for images where no detection matches and the pixel data is never
        # loaded (previously the handle was left for the garbage collector).
        with Image.open(os.path.join(FASHION200K_DIR, image_path)) as full_img:
            for i, det in enumerate(line[1:]):
                fields = det.split("_")
                cat = fields[0]
                if cat != expected_cat:
                    continue

                # The last four fields are fractions of the image size. Fields
                # 0 and 1 scale with the width, 2 and 3 with the height
                # (presumably x_min, x_max, y_min, y_max -- confirm against the
                # dataset docs); they are reordered into the
                # (left, upper, right, lower) box that Image.crop expects.
                bbox = fields[-4:]
                bbox = [float(bbox[0]) * full_img.width,
                        float(bbox[2]) * full_img.height,
                        float(bbox[1]) * full_img.width,
                        float(bbox[3]) * full_img.height]
                if not valid_bbox(bbox):
                    continue
                # i is the detection's position on the line, which keeps crop
                # names unique per image.
                crop_path = f"{FASHION200K_DIR}/{os.path.dirname(image_path)}/crop{i}_{os.path.basename(image_path)}"
                categories.append(cat)
                crop_paths.append(crop_path)
                if not os.path.exists(crop_path):
                    full_img.crop(bbox).save(crop_path)


  5%|▍         | 3555/72446 [00:00<00:03, 17842.33it/s]

100%|██████████| 72446/72446 [00:04<00:00, 18050.55it/s]
100%|██████████| 72377/72377 [00:05<00:00, 12180.99it/s]
100%|██████████| 74471/74471 [00:02<00:00, 28385.23it/s]
100%|██████████| 71126/71126 [00:00<00:00, 169382.40it/s]
100%|██████████| 47932/47932 [00:04<00:00, 11710.76it/s]


In [154]:
# One row per crop: its category and the path of the crop file.
df = pd.DataFrame(dict(category_name=categories, filenames=crop_paths))

In [155]:
# One uniform draw in [0, 1) per row, from a seeded generator so that the
# train/val assignment is reproducible across kernel restarts.
SPLIT_SEED = 42
rds = np.random.default_rng(SPLIT_SEED).random(len(df))

In [156]:
# Draws below 0.9 are labelled "train", the rest "val".
split = np.where(rds < 0.9, "train", "val").tolist()

In [157]:
# Attach the per-row split label as a "split" column (modifies df in place).
df["split"] = split

In [158]:
# Write the crop table next to the dataset, without the index column.
crop_csv_path = f"{FASHION200K_DIR}/crop_categories.csv"
df.to_csv(crop_csv_path, index=False)