In [1]:
import os
from shutil import copyfile
import numpy as np
import pandas as pd
from PIL import Image
import BDDDataSets as bdd
from bdd_make_datasets import pandas_to_bddjson
import re


In [2]:
# Location of the JSON config for dataset creation.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
cfg_name = '/home/till/projects/night-drive/config/config_bdd_make_datasets.json'
# Load the config through the project helper; later cells read its attributes
# (e.g. cfg.gan_augment_dict, cfg.destination_path).
cfg = bdd.GetConfig(cfg_name)

In [3]:
# Load the main annotation JSON into a DataFrame; every later cell works on `data`.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data = pd.read_json("/home/till/SharedFolder/CurrentDatasets/bdd100k_sorted/annotations/bdd100k_sorted_main.json")

In [4]:
# Swap whatever index came out of the JSON for a clean 0..n-1 range, then preview the table.
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,labels,name,scene,set_A,set_A_n_over,set_B,set_B_n_over,set_C,set_C_n_over,set_all,timeofday,weather
0,"[{'category': 'traffic sign', 'attributes': {'...",4feefa31-18e668bd.jpg,city street,test,0,test,0,test,0,test,night,clear
1,"[{'category': 'traffic sign', 'attributes': {'...",b35a415a-02526f57.jpg,city street,test,0,test,0,test,0,test,daytime,snowy
2,"[{'category': 'traffic light', 'attributes': {...",b2194b15-1825056a.jpg,city street,test,0,test,0,test,0,test,night,clear
3,"[{'category': 'car', 'attributes': {'occluded'...",28e9d4a5-e70efd65.jpg,undefined,test,0,test,0,test,0,test,night,clear
4,"[{'category': 'person', 'attributes': {'occlud...",7481b347-18b6baee.jpg,city street,test,0,test,0,test,0,test,daytime,snowy


In [17]:
cfg = bdd.GetConfig(cfg_name)
# GAN-AUGMENTED DATA SETS
# Collect, for every split listed in cfg.gan_augment_dict, the column names,
# folders and file names that the following cells need.
gan_info_dict = {"splits": cfg.gan_augment_dict.keys()}
c = 0
for c, split in enumerate(gan_info_dict["splits"], start=1):
    split_cfg = cfg.gan_augment_dict[split]
    filename_stem = cfg.destination_filename_stem

    # name of the matching train_dev split
    split_traindev = re.sub("train", "train_dev", split)
    # column holding the augmented split (split name without its first six characters)
    set_name = "set_" + split[6:]
    # column holding the split that is being augmented (last character of the base split)
    base_set_name = "set_" + split_cfg["base_split"][-1]

    info = {
        "split_traindev": split_traindev,
        "base_split": split_cfg["base_split"],
        "augment_fraction": split_cfg["augment_fraction"],
        "augment_mode": split_cfg["augment_mode"],
        "set": set_name,
        "aug_set_base": set_name + "_base",
        "aug_set_n_over_aug": set_name + "_n_over_aug",
        "aug_set_n_over_base": set_name + "_n_over_base",
        "base_set": base_set_name,
        "base_set_n_over": base_set_name + "_n_over",
    }

    # destination folder: one per split, or a single shared folder
    if cfg.do_make_dirs_gan:
        info["destination_path"] = os.path.join(cfg.destination_path, split)
        info["destination_path_traindev"] = os.path.join(cfg.destination_path, split_traindev)
    else:
        info["destination_path"] = cfg.destination_path

    # json file names for train and train_dev, each with and without over-samples
    info["destination_json_filename"] = filename_stem + split + ".json"
    info["destination_json_over_filename"] = filename_stem + split + "_over" + ".json"
    info["destination_json_filename_traindev"] = filename_stem + split_traindev + ".json"
    info["destination_json_over_filename_traindev"] = filename_stem + split_traindev + "_over" + ".json"

    # full json paths; all four live in the split's "destination_path"
    target_dir = info["destination_path"]
    info["destination_json_filepath"] = os.path.join(target_dir, info["destination_json_filename"])
    info["destination_json_over_filepath"] = os.path.join(target_dir, info["destination_json_over_filename"])
    info["destination_json_filepath_traindev"] = os.path.join(target_dir, info["destination_json_filename_traindev"])
    info["destination_json_over_filepath_traindev"] = os.path.join(target_dir, info["destination_json_over_filename_traindev"])

    gan_info_dict[split] = info


In [36]:
cfg = bdd.GetConfig(cfg_name)
# Decide, per split, which images are GAN-augmented and how often each image is
# over-sampled. For every split four columns are written to `data`:
#   <set>_base        copy of the base split assignment
#   <set>             -1 = not part of the split, 0 = original only,
#                      1 = augmented version only, 2 = original and augmented version
#   <set>_n_over_aug  number of extra copies of the augmented image
#   <set>_n_over_base number of extra copies of the original image
# Seed numpy's global generator so the random assignment is reproducible.
np.random.seed(1234)
valid_augment_modes = ("before_over", "after_over", "after_over_expand")
# the weather classes do not change inside the loops, so compute them once
weather_conditions = sorted(data["weather"].unique().tolist())
for split in gan_info_dict["splits"]:
    # unpack some helpers
    info = gan_info_dict[split]
    aug_set = info["set"]
    aug_set_base = info["aug_set_base"]
    aug_set_n_over_aug = info["aug_set_n_over_aug"]
    aug_set_n_over_base = info["aug_set_n_over_base"]
    base_set = info["base_set"]
    base_set_n_over = info["base_set_n_over"]
    augment_mode = info["augment_mode"]
    # Fail before touching `data`: the original `elif [ ... ]` tested a non-empty
    # list, which is always true, so any misspelled mode was silently accepted.
    if augment_mode not in valid_augment_modes:
        raise ValueError("Unknown augment_mode {!r} for split {}".format(augment_mode, split))
    # add columns to the dataframe indicating split association
    data[aug_set_base] = data[base_set]
    data[aug_set] = -1
    data[aug_set_n_over_aug] = 0
    data[aug_set_n_over_base] = 0
    #
    for tod, aug_frac in info["augment_fraction"].items():  # for each timeofday
        for wc in weather_conditions:  # for each weather condition
            for sub in ["train", "train_dev"]:
                # get indices of all elements of the base set and given class
                class_idx = data[(data[base_set].eq(sub) & data.timeofday.eq(tod) & data.weather.eq(wc))].index
                # get number of original samples and number of oversamples
                n_samples_total = len(class_idx)
                n_over_total = int(data.loc[class_idx, base_set_n_over].sum())

                if augment_mode == "before_over":
                    # set identifier to 0 for included originals
                    data.loc[class_idx, aug_set] = 0
                    # number of unique samples to augment
                    n_samples_aug = int(np.round(n_samples_total * aug_frac))
                    # NOTE(review): when nothing is augmented the class gets no
                    # over-samples at all (both n_over columns stay 0) -- confirm
                    # that this is intended.
                    if n_samples_aug > 0:
                        # randomly sample from those indices without replacement for augmentation
                        idx_aug = np.random.choice(class_idx, size=n_samples_aug, replace=False)
                        data.loc[idx_aug, aug_set] = 1

                        # The base pool is everything that was NOT augmented. The
                        # original subtracted the over-sampled draw (idx_over_aug),
                        # which left augmented images in the base pool.
                        idx_base = np.setdiff1d(class_idx, idx_aug)

                        # split the over-samples between augmented and base data
                        n_over_aug = int(np.round(n_over_total * aug_frac))
                        if len(idx_base) == 0:
                            # no originals left: all over-samples go to the augmented data
                            n_over_aug = n_over_total
                        n_over_base = n_over_total - n_over_aug

                        # randomly over-sample the augmented data
                        if n_over_aug > 0:
                            idx_over_aug = np.random.choice(idx_aug, n_over_aug, replace=True)
                            idx_uni, counts = np.unique(idx_over_aug, return_counts=True)
                            data.loc[idx_uni, aug_set_n_over_aug] = counts

                        # randomly over-sample the base data
                        if n_over_base > 0:
                            idx_over_base = np.random.choice(idx_base, n_over_base, replace=True)
                            idx_uni, counts = np.unique(idx_over_base, return_counts=True)
                            data.loc[idx_uni, aug_set_n_over_base] = counts

                else:  # "after_over" or "after_over_expand"
                    # number of augmented samples, counted on the over-sampled class size
                    n_samples_aug = int(np.round((n_samples_total + n_over_total) * aug_frac))
                    # NOTE(review): when nothing is augmented the class keeps
                    # identifier -1, i.e. it is left out of the split -- confirm.
                    if n_samples_aug > 0:
                        # randomly sample from those indices with replacement for augmentation
                        idx_aug = np.random.choice(class_idx, size=n_samples_aug, replace=True)
                        idx_uni_aug, counts = np.unique(idx_aug, return_counts=True)
                        data.loc[idx_uni_aug, aug_set] = 1
                        # the first draw is the image itself, every further draw is an over-sample
                        data.loc[idx_uni_aug, aug_set_n_over_aug] = np.maximum(0, counts - 1)

                        if augment_mode == "after_over_expand":
                            # set identifier to 0 for included originals, which here is all
                            data.loc[class_idx, aug_set] = np.maximum(0, data.loc[class_idx, aug_set])
                            # here, every sample that is augmented is automatically also kept as original
                            # (the original cell used the undefined name `idx_aug_uni`)
                            data.loc[idx_uni_aug, aug_set] = 2
                            # NOTE(review): <set>_n_over_base stays 0 in this mode --
                            # confirm the originals should not be over-sampled.
                        else:  # "after_over"
                            # originals fill whatever the augmented samples leave of the class size
                            n_samples_base = (n_samples_total + n_over_total) - n_samples_aug
                            # keep as many distinct originals as possible first
                            # (the original cell passed the undefined name `n_unique` as size)
                            n_unique_base = int(np.minimum(n_samples_base, n_samples_total))
                            idx_base = np.random.choice(class_idx, size=n_unique_base, replace=False)
                            # identifier is -1 or 1 at this point, so it becomes 0 or 2
                            data.loc[idx_base, aug_set] = data.loc[idx_base, aug_set] + 1
                            # randomly over-sample the base data
                            n_over_base = n_samples_base - n_unique_base
                            if n_over_base > 0:
                                idx_over_base = np.random.choice(idx_base, n_over_base, replace=True)
                                idx_uni_base, counts = np.unique(idx_over_base, return_counts=True)
                                data.loc[idx_uni_base, aug_set_n_over_base] = counts


In [7]:
# Inspect one generated column name. `split` is the variable left over from the
# last iteration of the loop in the previous cell, so this shows the last split.
gan_info_dict[split]["aug_set_base"]

'set_B_ganaug_025_base'

In [8]:
# List all columns to check that the per-split columns were added.
data.columns

Index(['labels', 'name', 'scene', 'set_A', 'set_A_n_over', 'set_B',
       'set_B_n_over', 'set_C', 'set_C_n_over', 'set_all', 'timeofday',
       'weather', 'set_A_ganaug_025_base', 'set_A_ganaug_025',
       'set_A_ganaug_025_n_over_aug', 'set_A_ganaug_025_n_over_base',
       'set_A_ganaug_050_base', 'set_A_ganaug_050',
       'set_A_ganaug_050_n_over_aug', 'set_A_ganaug_050_n_over_base',
       'set_B_ganaug_025_base', 'set_B_ganaug_025',
       'set_B_ganaug_025_n_over_aug', 'set_B_ganaug_025_n_over_base'],
      dtype='object')

In [9]:
# Spot-check the assignment for split A / 025 on a slice of rows.
preview_columns = [
    'set_A',
    'timeofday',
    'weather',
    'set_A_ganaug_025_base',
    'set_A_ganaug_025',
    'set_A_ganaug_025_n_over_aug',
    'set_A_ganaug_025_n_over_base',
]
data.loc[5050:5100, preview_columns].head(50)


Unnamed: 0,set_A,timeofday,weather,set_A_ganaug_025_base,set_A_ganaug_025,set_A_ganaug_025_n_over_aug,set_A_ganaug_025_n_over_base
5050,unassigned,night,clear,unassigned,-1,0,0
5051,unassigned,night,clear,unassigned,-1,0,0
5052,train,daytime,snowy,train,0,0,3
5053,unassigned,night,clear,unassigned,-1,0,0
5054,unassigned,night,clear,unassigned,-1,0,0
5055,train,daytime,clear,train,0,0,0
5056,unassigned,night,clear,unassigned,-1,0,0
5057,unassigned,night,clear,unassigned,-1,0,0
5058,train,daytime,cloudy,train,1,0,0
5059,unassigned,night,rainy,unassigned,-1,0,0


In [10]:
# Preview the image file names (bare names with extension, no directory part).
data["name"].head()

0    4feefa31-18e668bd.jpg
1    b35a415a-02526f57.jpg
2    b2194b15-1825056a.jpg
3    28e9d4a5-e70efd65.jpg
4    7481b347-18b6baee.jpg
Name: name, dtype: object

In [37]:
# Build the file name of the GAN-translated counterpart of every image:
# <stem><suffix of the target time of day><extension>.
# The extension is taken from the first row and reused for all rows.
_, ext = os.path.splitext(data.loc[0, "name"])
data["name_aug"] = ''
# night images use the "daytime" suffix and vice versa; other rows keep ''
for source_tod, target_tod in (("night", "daytime"), ("daytime", "night")):
    mask = data["timeofday"] == source_tod
    data.loc[mask, "name_aug"] = data.loc[mask, "name"].map(
        lambda name: os.path.splitext(name)[0] + cfg.gan_transform_suffix[target_tod] + ext
    )

In [40]:
def get_filepath(root, fname_search):
    """Return the path of the first file below ``root`` that matches ``fname_search``.

    Parameters
    ----------
    root : str
        Directory that is searched recursively with ``os.walk``.
    fname_search : str or collection of str
        A single file name, matched exactly, or a collection of acceptable
        file names, matched by membership.

    Returns
    -------
    str
        ``os.path.join`` of the containing directory and the matching file name.

    Raises
    ------
    Exception
        If no file below ``root`` matches.
    """
    # A plain string must be compared for equality: `fname in "xa.jpg"` is a
    # substring test and would also accept a file called "a.jpg".
    if isinstance(fname_search, str):
        wanted = (fname_search,)
    else:
        wanted = fname_search
    for dirpath, _, fnames in os.walk(root):
        for fname in fnames:
            if fname in wanted:
                return os.path.join(dirpath, fname)
    raise Exception("Couldn't load image {} from root {}".format(fname_search, root))

# Resolve every image name to its location on disk.
# Each root is walked once and indexed by file name. Walking the whole tree
# again for every row is too slow for a data set of this size.
def _index_files(root):
    """Map each file name below ``root`` to the first path ``os.walk`` yields for it."""
    index = {}
    for dirpath, _, fnames in os.walk(root):
        for fname in fnames:
            # setdefault keeps the first hit
            index.setdefault(fname, os.path.join(dirpath, fname))
    return index


def _resolve_paths(names, root):
    """Return a Series with the full path of every name in ``names``.

    Raises an Exception naming the first file that does not exist below ``root``.
    """
    index = _index_files(root)
    missing = names[~names.isin(list(index))]
    if not missing.empty:
        raise Exception("Couldn't load image {} from root {}".format(missing.iloc[0], root))
    return names.map(index)


data["path"] = _resolve_paths(data["name"], cfg.root_dir)
data["path_aug"] = _resolve_paths(data["name_aug"], cfg.root_dir_gan)


KeyboardInterrupt: 

In [61]:
cfg = bdd.GetConfig(cfg_name)
# Separate the data per split, copy the images, and write two JSON files per
# sub-split: one with the unique images only and one that also lists the
# over-samples.
for split in gan_info_dict["splits"]:  # for each split
    # unpack some helpers
    info = gan_info_dict[split]
    aug_set = info["set"]
    aug_set_base = info["aug_set_base"]
    aug_set_n_over_aug = info["aug_set_n_over_aug"]
    aug_set_n_over_base = info["aug_set_n_over_base"]
    destination_path = info["destination_path"]

    # Create the destination folder once per split. Inside the sub-split loop
    # this check always failed on "train_dev", because "train" had just created
    # the folder.
    if not os.path.exists(destination_path):
        os.makedirs(destination_path)
    elif cfg.do_make_dirs_gan:
        raise Exception("Destination folder(s) already exist.")

    # for each sub-split
    for sub in ["train", "train_dev"]:
        key_suffix = "_traindev" if sub == "train_dev" else ""
        destination_json_filepath = info["destination_json_filepath" + key_suffix]
        destination_json_over_filepath = info["destination_json_over_filepath" + key_suffix]

        ### get all elements associated with the current split into a separate data frame
        # originals that stay in the split (identifier 0 or 2)
        cur_file = data.query("({0}==@sub) & (({1}==0) | ({1}==2))".format(aug_set_base, aug_set)).reset_index(drop=True)
        # images whose augmented version is used (identifier 1 or 2)
        cur_aug = data.query("({0}==@sub) & (({1}==1) | ({1}==2))".format(aug_set_base, aug_set)).reset_index(drop=True)
        # let the augmented rows point at the augmented file and carry their own
        # over-sample count in the column that is evaluated below
        cur_aug["name"] = cur_aug["name_aug"]
        cur_aug["path"] = cur_aug["path_aug"]
        cur_aug[aug_set_n_over_base] = cur_aug[aug_set_n_over_aug]
        # now combine
        cur_file = pd.concat([cur_file, cur_aug], axis=0).reset_index(drop=True)

        ### save a json in bdd format containing only the unique original and augmented images
        if cfg.do_make_jsons_gan:
            print("Writing json to {}".format(destination_json_filepath))
            pandas_to_bddjson(cur_file.copy(), destination_json_filepath)

        ### copy the images associated with the current split into the new folder
        if cfg.do_copy_images_gan:
            print("Copying {} images to {}".format(cur_file.shape[0], destination_path))
            for img_path in cur_file["path"]:
                copyfile(img_path, os.path.join(destination_path, os.path.basename(img_path)))

        ### append the over-samples: every row is repeated n_over times
        # Repeating the index builds all copies in one step. Appending row by
        # row with a reset_index after each append is quadratic in the row count.
        n_over = cur_file[aug_set_n_over_base].astype(int).to_numpy()
        over_samples = cur_file.loc[cur_file.index.repeat(n_over)].copy()
        print("Over-sampling adds {} rows to {} entries.".format(over_samples.shape[0], cur_file.shape[0]))

        # make physical copies of the over-samples, if requested
        if cfg.do_oversample_physically and not over_samples.empty:
            # running number of the copy within each source row: 1, 2, ...
            copy_numbers = over_samples.groupby(level=0).cumcount() + 1
            copy_names = []
            copy_paths = []
            for src_path, src_name, copy_no in zip(over_samples["path"], over_samples["name"], copy_numbers):
                # rename by appending _copy1, _copy2, ... in front of the extension
                stem, extension = os.path.splitext(os.path.basename(src_name))
                name_copy = "{}_copy{}{}".format(stem, copy_no, extension)
                path_copy = os.path.join(destination_path, name_copy)
                print(src_path, '>>>\n', path_copy)
                # copy from the resolved path; "name" holds only the bare file name
                copyfile(src_path, path_copy)
                copy_names.append(name_copy)
                copy_paths.append(path_copy)
            # store the new name and location
            over_samples["name"] = copy_names
            over_samples["path"] = copy_paths

        cur_file = pd.concat([cur_file, over_samples], ignore_index=True)

        ### shuffle (sample() returns a new frame, so the result has to be kept)
        cur_file = cur_file.sample(frac=1.0, random_state=123).reset_index(drop=True)

        ### save a json in bdd format containing also the over-samples
        if cfg.do_make_jsons_gan:
            pandas_to_bddjson(cur_file.copy(), destination_json_over_filepath)


23029
7676
0    6fab7e57-962008ac_transfer_AtoB.jpg
1    8f31df58-fe857502_transfer_AtoB.jpg
2    be2fbf0b-5d71c655_transfer_AtoB.jpg
3    34efb108-f3aa66ad_transfer_AtoB.jpg
4    5432ec70-5798a8c2_transfer_AtoB.jpg
Name: name, dtype: object
Writing json to /home/till/SharedFolder/CurrentDatasets/bdd100k_sorted_test/bdd100k_sorted_train_A_ganaug_025.json
Over-sampling done for 0 of 30705 entries.


KeyboardInterrupt: 

In [None]:
# ==================================================================================================================
# WRITE MAIN JSON
# ==================================================================================================================
# Dump the full table, including the columns added for all splits, as the new main json.
# Config version 0 uses the pandas default layout, version 1 writes one record per row;
# any other version writes nothing.
# (optional, disabled: reduce the names to bare file names first)
# data.name = data.name.map(os.path.basename)
main_json_options = {0: {}, 1: {"orient": "records"}}
if cfg.version in main_json_options:
    main_json_path = os.path.join(cfg.destination_path, cfg.destination_filename_stem + "main" + ".json")
    data.to_json(main_json_path, **main_json_options[cfg.version])


In [56]:
# Read back the json written for split train_A_ganaug_025 to inspect the result.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
df = pd.read_json("/home/till/SharedFolder/CurrentDatasets/bdd100k_sorted_test/bdd100k_sorted_train_A_ganaug_025.json")