In [1]:
import numpy as np
import pandas as pd
import os
import shutil
import pickle

In [6]:
# Convert the whitespace-delimited CelebA landmark annotation file to CSV.
with open("data/list_landmarks_celeba.txt", "r") as f:
    lines = f.readlines()

# Line 0 holds an integer count; it is parsed but not needed for the conversion.
n = int(lines[0])

# Line 1 holds the column names. Only the first ten are kept and an
# "image_id" column is prepended for the leading field of every data row.
# str.split() with no argument splits on any run of whitespace, so repeated
# or trailing blanks can never yield empty column names.
label_splited = lines[1].split()[:10]
label_splited.insert(0, "image_id")
labels = ",".join(label_splited)

with open("data/list_landmarks_celeba.csv", "w") as f:
    f.write(f"{labels}\n")
    for line in lines[2:]:
        splited = line.split()
        if not splited:
            # A blank line would otherwise be written as an empty CSV row.
            continue
        f.write(",".join(splited) + "\n")


In [7]:
# Convert the whitespace-delimited CelebA bounding-box annotation file to CSV.
with open("data/list_bbox_celeba.txt", "r") as f:
    lines = f.readlines()

# Line 0 holds an integer count; it is parsed but not needed for the conversion.
n = int(lines[0])

# Line 1 holds the column names and is used as the CSV header as-is.
# str.split() with no argument splits on any run of whitespace, so a trailing
# blank before the newline cannot produce an empty extra header column.
label_splited = lines[1].split()
labels = ",".join(label_splited)

with open("data/list_bbox_celeba.csv", "w") as f:
    f.write(f"{labels}\n")
    for line in lines[2:]:
        splited = line.split()
        if not splited:
            # A blank line would otherwise be written as an empty CSV row.
            continue
        f.write(",".join(splited) + "\n")


In [24]:
# Convert the whitespace-delimited partition list to CSV. Every input line is
# treated as a data row, so the header is supplied here.
with open("data/list_eval_partition.txt", "r") as f:
    lines = f.readlines()

labels = "image_id,partition"

with open("data/list_eval_partition.csv", "w") as f:
    f.write(f"{labels}\n")
    for line in lines:
        # str.split() with no argument splits on any run of whitespace and
        # drops the trailing newline, so no empty fields are produced.
        splited = line.split()
        if not splited:
            # A blank line would otherwise be written as an empty CSV row.
            continue
        f.write(",".join(splited) + "\n")


## CELEBA Preprocessing

In [25]:
# Locations of the CSV files written by the conversion cells above.
# Partition table with columns image_id, partition.
TRAIN_SPLIT_FILE_PATH = "data/list_eval_partition.csv"
# Landmark table with image_id plus the landmark coordinate columns.
LM_FILE_PATH = "data/list_landmarks_celeba.csv"
# Bounding-box table.
BBOX_FILE_PATH = "data/list_bbox_celeba.csv"

In [26]:
# Load the partition table and preview its first rows.
split_file = pd.read_csv(TRAIN_SPLIT_FILE_PATH)
split_file.head()

Unnamed: 0,image_id,partition
0,000001.jpg,0
1,000002.jpg,0
2,000003.jpg,0
3,000004.jpg,0
4,000005.jpg,0


In [27]:


# Load the landmark and bounding-box tables; both are module-level frames that
# annot_split() reads later. Only a preview of each is printed so the cell
# output stays small.
lm_file = pd.read_csv(LM_FILE_PATH)
print(lm_file.head())

bbox_file = pd.read_csv(BBOX_FILE_PATH)
print(bbox_file.head())

     image_id  lefteye_x  lefteye_y  righteye_x  righteye_y  nose_x  nose_y  \
0  000001.jpg        165        184         244         176     196     249   
1  000002.jpg        140        204         220         204     168     254   
2  000003.jpg        244        104         264         105     263     121   
3  000004.jpg        796        539         984         539     930     687   
4  000005.jpg        273        169         328         161     298     172   

   leftmouth_x  leftmouth_y  rightmouth_x  rightmouth_y  
0          194          271           266           260  
1          146          289           226           289  
2          235          134           251           140  
3          762          756           915           756  
4          283          208           323           207  
          image_id   x_1  y_1  width  height
0       000001.jpg    95   71    226     313
1       000002.jpg    72   94    221     306
2       000003.jpg   216   59     91     1

In [28]:
def train_eval_split(split_file_path=None):
    """Split the image ids of the partition table into train/valid/test.

    Args:
        split_file_path: Path of a CSV file with ``image_id`` and
            ``partition`` columns. When omitted, the module-level
            ``TRAIN_SPLIT_FILE_PATH`` is used.

    Returns:
        A tuple ``(train_paths, valid_paths, test_paths)`` of pandas Series
        holding the ``image_id`` values whose ``partition`` equals 0, 1 and 2
        respectively. Each Series keeps the row index of the source table.
    """
    # Resolve the default at call time so the constant may be defined (or
    # changed) after this function.
    if split_file_path is None:
        split_file_path = TRAIN_SPLIT_FILE_PATH

    split_file = pd.read_csv(split_file_path)

    path = split_file["image_id"]
    partition = split_file["partition"]

    train_paths = path[partition == 0]
    valid_paths = path[partition == 1]
    test_paths = path[partition == 2]

    return train_paths, valid_paths, test_paths

In [29]:
# Split the image ids by partition and preview the first training entries.
train, valid, test = train_eval_split()
print(train[:5])

0    000001.jpg
1    000002.jpg
2    000003.jpg
3    000004.jpg
4    000005.jpg
Name: image_id, dtype: object


In [30]:
def copy_celeba_data(dataset, cat, src_dir="data/img_celeba", dst_dir="data/celeba"):
    """Move the listed CelebA images into the folder of one partition.

    Despite the name, files are *moved* with ``shutil.move``, not copied, so
    they no longer exist in ``src_dir`` afterwards.

    Args:
        dataset: Iterable of image file names, relative to ``src_dir``.
        cat: Name of the partition sub-folder to fill, e.g. "train".
        src_dir: Directory currently holding the images.
        dst_dir: Root directory under which the partition folders live.

    Raises:
        FileNotFoundError: If an image exists neither in ``src_dir`` nor in
            the destination folder.
    """
    # Always make sure the partition folders exist. Creating them only when
    # dst_dir was missing broke the move whenever dst_dir existed without them.
    for partition in ("train", "valid", "test", cat):
        os.makedirs(os.path.join(dst_dir, partition), exist_ok=True)

    count = 0

    for filename in dataset:
        # Normalise to forward slashes so the paths look the same on Windows.
        frompath = os.path.join(src_dir, filename).replace("\\", "/")
        topath = os.path.join(dst_dir, cat, filename).replace("\\", "/")

        # A file that is gone from the source but present at the destination
        # was moved by an earlier run; count it and carry on so the cell can
        # be re-executed safely.
        already_moved = os.path.exists(topath) and not os.path.exists(frompath)
        if not already_moved:
            shutil.move(frompath, topath)

        count += 1

    print(f"Dataset counts: {count}")

In [31]:
# Relocate the images of each partition into its own folder. The files are
# moved (shutil.move), so this cell cannot simply be run twice.
copy_celeba_data(train, "train")
copy_celeba_data(valid, "valid")
copy_celeba_data(test, "test")

Dataset counts: 162770
Dataset counts: 19867
Dataset counts: 19962


In [32]:
def annot_split(train, valid, test):
    """Write one landmark CSV and one bounding-box CSV per partition.

    The module-level ``lm_file`` and ``bbox_file`` frames are filtered on
    their ``image_id`` column and saved under ``data/celeba``. The frame index
    is written as the first CSV column (``DataFrame.to_csv`` default).

    Args:
        train: Image ids belonging to the training partition.
        valid: Image ids belonging to the validation partition.
        test: Image ids belonging to the test partition.
    """
    # Output path -> (annotation table, image ids selecting its rows).
    plan = {
        "data/celeba/train_lm.csv": (lm_file, train),
        "data/celeba/valid_lm.csv": (lm_file, valid),
        "data/celeba/test_lm.csv": (lm_file, test),
        "data/celeba/train_bbox.csv": (bbox_file, train),
        "data/celeba/valid_bbox.csv": (bbox_file, valid),
        "data/celeba/test_bbox.csv": (bbox_file, test),
    }

    # Build every subset first, then write them in the declared order.
    subsets = [
        (out_path, table[table["image_id"].isin(image_ids)])
        for out_path, (table, image_ids) in plan.items()
    ]

    for out_path, subset in subsets:
        subset.to_csv(out_path)

In [33]:
# Write the per-partition landmark and bounding-box CSV files.
annot_split(train, valid, test)

## WIDER Preprocessing

In [40]:
def _read_wider_annotations(annot_path):
    """Parse a WIDER ground-truth text file into ``[(relative_path, bboxes)]``.

    Each record is an image path line, a face-count line, then one line per
    face whose first four values are read as ``x y w h`` floats.
    """
    records = []
    with open(annot_path, "r") as f:
        lines = iter(f)
        for line in lines:
            rel_path = line.strip()
            if not rel_path:
                # Tolerate blank lines between records or at the end of file.
                continue

            count_line = next(lines, None)
            if count_line is None:
                break  # file ends in the middle of a record: drop it
            num = int(count_line)

            # A record always occupies at least one line after the count:
            # with a count of zero one line is still consumed and ignored
            # (presumably the all-zero placeholder row of the WIDER format).
            box_lines = [next(lines, None) for _ in range(max(num, 1))]
            if None in box_lines:
                break  # file ends in the middle of a record: drop it

            bboxes = []
            for box_line in box_lines[:num]:
                x, y, w, h = map(float, box_line.split()[:4])
                bboxes.append((x, y, w, h))

            records.append((rel_path, bboxes))
    return records


def rearrange_wider(data_dir="data"):
    """Flatten the WIDER images into per-split folders and pickle the boxes.

    For the original "train" and "val" splits, every annotated image is moved
    from ``<data_dir>/WIDER_<split>/images/<event>/<file>`` into
    ``<data_dir>/wider/<train|valid>``, and a list of ``[file_name, bboxes]``
    samples is pickled to ``<data_dir>/wider/wider_<train|valid>.bin``.
    ``bboxes`` is a list of ``(x, y, w, h)`` float tuples. An empty
    ``<data_dir>/wider/test`` folder is created as well.

    Args:
        data_dir: Root directory holding the WIDER download and receiving the
            rearranged ``wider`` folder.
    """
    # exist_ok keeps the function re-runnable after a partial or full run.
    for split_dir in ("train", "valid", "test"):
        os.makedirs(f"{data_dir}/wider/{split_dir}", exist_ok=True)

    for orig_cat in ["train", "val"]:
        cat = "valid" if orig_cat == "val" else orig_cat
        wider_annot_path = f"{data_dir}/wider_face_split/wider_face_{orig_cat}_bbx_gt.txt"
        to_path = f"{data_dir}/wider/{cat}"

        dataset = []

        # Parse the whole file before touching any image so a malformed
        # annotation file cannot leave the images half-moved.
        for rel_path, bboxes in _read_wider_annotations(wider_annot_path):
            from_path = f"{data_dir}/WIDER_{orig_cat}/images/" + rel_path
            # Last path component: the name the file has after the move.
            filename = rel_path.rsplit("/", 1)[-1]

            # Skip the move only when an earlier run already relocated the file.
            moved_path = os.path.join(to_path, filename)
            if os.path.exists(from_path) or not os.path.exists(moved_path):
                shutil.move(from_path, to_path)

            dataset.append([filename, bboxes])

        print(f"{cat}: {len(dataset)}")

        with open(f"{data_dir}/wider/wider_{cat}.bin", "wb") as f:
            f.write(pickle.dumps(dataset))

In [41]:
# Move the WIDER images into per-split folders and write the annotation pickles.
rearrange_wider()

train: 12880
valid: 3226
