# Clean Dataset

Tiles are excluded based on a `qualitative` inspection of the ground truth (GT).

In [16]:
# Tiles to delete from the dataset, identified by PRE-event image title
# (file name without directory and extension).
# Every entry is unique; an earlier revision listed
# '10500500C4DD7000_0_33_56' twice and carried an empty-string placeholder.
noise_file_list = [
    # fold 0
    '10500500C4DD7000_0_15_67',
    '10500500C4DD7000_0_16_68',
    '10500500C4DD7000_0_25_62',
    '10500500C4DD7000_0_33_56',
    '105001001A0FFC00_0_21_14',
    '10500500C4DD7000_0_32_56',
    '10500500C4DD7000_0_32_57',
    '10500500C4DD7000_0_35_59',
    '10500500C4DD7000_0_39_60',
    '10500500C4DD7000_0_41_58',
    '105001001A0FFC00_0_17_19',
    '105001001A0FFC00_0_18_20',
    '105001001A0FFC00_0_24_21',
    '10400100684A4B00_1_3_94',
    '10400100684A4B00_1_4_104',
    '10400100684A4B00_1_9_101',
    '10400100684A4B00_1_12_106',
    '10400100684A4B00_1_15_79',
    '10400100684A4B00_1_24_70',
    '10400100684A4B00_1_25_70',
    # fold 1
    # (no entries yet)
]

## Fold

In [17]:
import glob
import os
import math
import argparse
import random
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold

import numpy as np

## Func

In [18]:
from re import I


def gather_images_masks(image_dir):
    """Collect image and mask rasters located directly inside ``image_dir``.

    Files are selected by the end of their file name (``*<kind>.tif``).

    Args:
        image_dir: Directory to search (not searched recursively).

    Returns:
        A tuple ``(images, masks)``. ``images`` holds two sorted path lists,
        for ``preimg`` and ``postimg`` files in that order. ``masks`` holds
        four sorted path lists, for ``flood``, ``building``, ``road`` and
        ``roadspeed`` files in that order.
    """
    def sorted_by_suffix(kind):
        # All "*<kind>.tif" files in image_dir, in lexicographic order.
        pattern = os.path.join(image_dir, f"*{kind}.tif")
        return sorted(glob.glob(pattern))

    images = [sorted_by_suffix(kind) for kind in ("preimg", "postimg")]
    masks = [
        sorted_by_suffix(kind)
        for kind in ("flood", "building", "road", "roadspeed")
    ]
    return images, masks

def make_train_val_csvs(image_dirs,
                        out_dir,
                        seed,
                        folds):
    """Build a cleaned, fold-annotated tile table and write per-fold CSVs.

    For every directory in ``image_dirs`` the annotation GeoJSONs, the mask
    rasters and the PRE-/POST-event images are globbed and paired up with
    ``match_im_label``. Each tile gets a fold id from a shuffled ``KFold``.
    Tiles flagged by ``get_claenflag_and_title`` are then dropped, and one
    train CSV plus one validation CSV is written to ``out_dir`` per fold.

    Fold ids are assigned before cleaning, so the folds can end up with
    different sizes once flagged tiles are removed.

    Args:
        image_dirs: Iterable of dataset directories, each containing
            ``annotations/``, ``annotations/masks/``, ``PRE-event/`` and
            ``POST-event/``.
        out_dir: Directory that receives the CSV files; created if missing.
        seed: ``random_state`` of the ``KFold`` split; also embedded in the
            CSV file names.
        folds: Number of folds.

    Returns:
        The cleaned DataFrame with columns ``preimg``, ``postimg``,
        ``flood``, ``building``, ``road``, ``roadspeed``, ``fold``,
        ``title`` and ``cleanflag``.

    Raises:
        ValueError: If no annotated tile is found in ``image_dirs``.
    """
    # Column name -> list of file paths, in the column order of the CSVs.
    columns = {
        'preimg': [],
        'postimg': [],
        'flood': [],
        'building': [],
        'road': [],
        'roadspeed': [],
    }
    # NOTE(review): glob.glob() returns paths in arbitrary (filesystem)
    # order, so the tile -> fold mapping may differ between machines even
    # with the same seed. Sorting here would fix that but would also change
    # splits that were already generated - confirm before changing.
    for d in image_dirs:
        mask_dir = os.path.join(d, "annotations", "masks")
        anno = glob.glob(os.path.join(d, "annotations", "*.geojson"))
        bldgs = glob.glob(os.path.join(mask_dir, "building*.tif"))
        # "road*.tif" also matches the roadspeed masks; match_im_label
        # narrows the candidates down by the "road_" marker.
        roads = glob.glob(os.path.join(mask_dir, "road*.tif"))
        flood = glob.glob(os.path.join(mask_dir, "flood*.tif"))
        roadspeed = glob.glob(os.path.join(mask_dir, "roadspeed*.tif"))
        pre = glob.glob(os.path.join(d, "PRE-event", "*.tif"))
        post = glob.glob(os.path.join(d, "POST-event", "*.tif"))
        _, bu, ro, fl, rs, preims, postims = match_im_label(
            anno, bldgs, roads, flood, roadspeed, pre, post)

        columns['preimg'].extend(preims)
        columns['postimg'].extend(postims)
        columns['flood'].extend(fl)
        columns['building'].extend(bu)
        columns['road'].extend(ro)
        columns['roadspeed'].extend(rs)

    if not columns['preimg']:
        # Fail early with a clear message instead of an opaque KFold error.
        raise ValueError(
            f"no annotated tiles found in image_dirs={list(image_dirs)!r}")

    df = pd.DataFrame(columns)

    # Assign every tile the index of the fold in which it is validation data.
    n_fold = np.zeros(len(df))
    kf = KFold(n_splits=folds, random_state=seed, shuffle=True)
    for k, (_, val_idx) in enumerate(kf.split(np.arange(len(df)))):
        n_fold[val_idx] = k

    df["fold"] = n_fold.astype(np.uint8)
    print("元データの画像枚数:", len(df["preimg"]))

    # Data cleaning: derive the tile title and drop the flagged tiles.
    df[['title', 'cleanflag']] = df.apply(
        get_claenflag_and_title, axis=1, result_type='expand')
    df = df[~df['cleanflag'].astype(bool)]

    print("削除後のデータの画像枚数:", len(df["preimg"]))

    # to_csv does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)
    for fold in range(folds):
        train_df = df[df['fold'] != fold].reset_index(drop=True)
        val_df = df[df['fold'] == fold].reset_index(drop=True)

        path_train = os.path.join(
            out_dir, f'clean-v1_fold{fold}_seed{seed}_train.csv')
        path_val = os.path.join(
            out_dir, f'clean-v1_fold{fold}_seed{seed}_val.csv')

        train_df.to_csv(path_train, index=False, header=True)
        val_df.to_csv(path_val, index=False, header=True)
        print(f'FOLD: {fold} train: {len(train_df)} val: {len(val_df)}')

    return df

    

def _first_match(candidates, tile_suffix, what, marker=None):
    """Return the first path containing ``tile_suffix`` (and ``marker``)."""
    for path in candidates:
        if (marker is None or marker in path) and tile_suffix in path:
            return path
    # IndexError keeps the exception type of the former ``[...][0]`` lookup,
    # but now says which tile and which kind of file is missing.
    raise IndexError(f"no {what} file matching '*{tile_suffix}' was found")


def match_im_label(anno, bldgs, roads, floods, roadspeeds, pre, post):
    """Pair every annotation file with its images and masks by tile id.

    The tile id is the annotation's base name up to the first ``.``. A file
    belongs to a tile when its path contains ``_<tileid>.tif``; mask files
    must additionally contain their kind marker (``building_``, ``road_``,
    ``flood_`` or ``roadspeed_``). When several candidates match, the first
    one in the given list wins.

    Args:
        anno: Annotation file paths; one output row is produced per entry.
        bldgs: Candidate building mask paths.
        roads: Candidate road mask paths (may also contain roadspeed masks).
        floods: Candidate flood mask paths.
        roadspeeds: Candidate roadspeed mask paths.
        pre: Candidate PRE-event image paths.
        post: Candidate POST-event image paths.

    Returns:
        Seven parallel lists, in this order: annotations, building masks,
        road masks, flood masks, roadspeed masks, PRE-event images and
        POST-event images.

    Raises:
        IndexError: If an annotation has no matching file in one of the
            candidate lists.
    """
    out_pre = []
    out_post = []
    out_anno = []
    out_bu = []
    out_ro = []
    out_fl = []
    out_rs = []
    for anno_path in anno:
        tileid = os.path.basename(anno_path).split('.')[0]
        suffix = f"_{tileid}.tif"

        # Resolve all files first so a missing one leaves the outputs aligned.
        pre_im = _first_match(pre, suffix, "PRE-event image")
        post_im = _first_match(post, suffix, "POST-event image")
        build = _first_match(bldgs, suffix, "building mask", "building_")
        road = _first_match(roads, suffix, "road mask", "road_")
        flood = _first_match(floods, suffix, "flood mask", "flood_")
        speed = _first_match(roadspeeds, suffix, "roadspeed mask", "roadspeed_")

        out_anno.append(anno_path)
        out_bu.append(build)
        out_ro.append(road)
        out_fl.append(flood)
        out_rs.append(speed)
        out_pre.append(pre_im)
        out_post.append(post_im)

    return out_anno, out_bu, out_ro, out_fl, out_rs, out_pre, out_post

def get_claenflag_and_title(row):
    """Return the tile title of a row and whether the tile must be removed.

    The title is the file name of ``row['preimg']`` up to its first ``.``.
    ``os.path.basename`` is used instead of splitting on ``'/'`` so that the
    title is also extracted correctly from paths using the platform's own
    separator.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a ``'preimg'`` path.

    Returns:
        ``(title, flag)`` where ``flag`` is ``True`` when ``title`` is listed
        in the module-level ``noise_file_list``.
    """
    filename = os.path.basename(str(row['preimg']))
    title = filename.split('.')[0]
    return title, title in noise_file_list


In [19]:
# Split configuration; both RNGs are seeded with the same value.
seed = 417
folds = 5
random.seed(seed)
np.random.seed(seed)

# Dataset location, the AOIs to include, and where the fold CSVs are written.
root_dir = '../../data/'
aois = [
    'Germany_Training_Public',
    'Louisiana-East_Training_Public',
]
out_dir = '../../data/folds/'
os.makedirs(out_dir, exist_ok=True)

# Random train/val split over all AOIs.
image_dirs = list(map(lambda aoi: os.path.join(root_dir, aoi), aois))
df = make_train_val_csvs(image_dirs, out_dir, seed=seed, folds=folds)

元データの画像枚数: 801
削除後のデータの画像枚数: 781
FOLD: 0 train: 640 val: 141
FOLD: 1 train: 621 val: 160
FOLD: 2 train: 621 val: 160
FOLD: 3 train: 621 val: 160
FOLD: 4 train: 621 val: 160


In [20]:
df

Unnamed: 0,preimg,postimg,flood,building,road,roadspeed,fold,title,cleanflag
0,../../data/Germany_Training_Public/PRE-event/1...,../../data/Germany_Training_Public/POST-event/...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,4,10500500C4DD7000_0_45_68,False
1,../../data/Germany_Training_Public/PRE-event/1...,../../data/Germany_Training_Public/POST-event/...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,4,10500500C4DD7000_0_35_63,False
2,../../data/Germany_Training_Public/PRE-event/1...,../../data/Germany_Training_Public/POST-event/...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,1,10500500C4DD7000_0_42_58,False
3,../../data/Germany_Training_Public/PRE-event/1...,../../data/Germany_Training_Public/POST-event/...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,3,10500500C4DD7000_0_37_69,False
4,../../data/Germany_Training_Public/PRE-event/1...,../../data/Germany_Training_Public/POST-event/...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,../../data/Germany_Training_Public/annotations...,4,10500500C4DD7000_0_34_61,False
...,...,...,...,...,...,...,...,...,...
796,../../data/Louisiana-East_Training_Public/PRE-...,../../data/Louisiana-East_Training_Public/POST...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,1,10300100AF395C00_2_23_45,False
797,../../data/Louisiana-East_Training_Public/PRE-...,../../data/Louisiana-East_Training_Public/POST...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,4,10400100684A4B00_1_7_85,False
798,../../data/Louisiana-East_Training_Public/PRE-...,../../data/Louisiana-East_Training_Public/POST...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,3,10400100684A4B00_1_10_97,False
799,../../data/Louisiana-East_Training_Public/PRE-...,../../data/Louisiana-East_Training_Public/POST...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,../../data/Louisiana-East_Training_Public/anno...,2,105001001A0FFC00_0_15_3,False
