In [None]:
import os
import uuid

import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

from PIL import Image

In [None]:
# Root folder that holds the raw source datasets (relative to this notebook).
data = "../data"
# Names of the raw dataset folders below `data`.
# NOTE(review): only one entry is listed, yet the MRD cell reads `datasets[1]`
# and both the DeepGlobe and the CIL cells read `datasets[0]` while expecting
# different sub-folder layouts ("train" vs "training"). Presumably this list
# was edited between runs -- confirm the intended entries before a Run All.
datasets = ["CIL-dataset"]
# Sub-folders that make up the joint output dataset.
datasets_out = ["DeepGlobe", "MRD", "CIL"]
# Destination folder of the joint dataset.
dataset_out = "../data/joint-dataset"

In [None]:
# Create the output folder tree.
# `os.makedirs(..., exist_ok=True)` makes this cell idempotent and also
# repairs a partially created tree; a bare existence check on `dataset_out`
# would skip the sub-folders whenever the top-level folder is already there.
for ds in datasets_out:
    for subdir in ("images", "groundtruth"):
        os.makedirs(os.path.join(dataset_out, ds, subdir), exist_ok=True)

# The CIL preparation cell additionally writes test images to CIL/test.
os.makedirs(os.path.join(dataset_out, "CIL", "test"), exist_ok=True)

In [None]:
# Open one sample mask and show its size as a quick sanity check.
sample_mask_path = os.path.join(data, datasets[0], "train", "104_mask.png")
im = Image.open(sample_mask_path)
im.size

In [None]:
def get_patches(im, size, stride=None):
    """Cut an image into overlapping patches laid out on a regular grid.

    Args:
        im: Image-like object exposing ``size`` as ``(width, height)`` and
            ``crop((left, upper, right, lower))``, e.g. a ``PIL.Image.Image``.
        size: ``(patch_width, patch_height)`` of every patch.
        stride: Optional ``(step_x, step_y)`` between neighbouring patch
            origins. Defaults to half the patch size, i.e. 50% overlap.

    Returns:
        List of cropped patches with x as the outer and y as the inner loop.
        The list is empty when the image is smaller than ``size`` along
        either axis.
    """
    patch_w, patch_h = size
    if stride is None:
        # max(1, ...) keeps range() valid even for 1-pixel patches.
        stride = (max(1, patch_w // 2), max(1, patch_h // 2))
    step_x, step_y = stride
    width, height = im.size

    # "+ 1" makes the last valid origin (width - patch_w) inclusive. Without
    # it an image exactly one patch wide/high yields no patches at all, and
    # the final grid position is dropped whenever it falls on the stride.
    return [
        im.crop((left, top, left + patch_w, top + patch_h))
        for left in range(0, width - patch_w + 1, step_x)
        for top in range(0, height - patch_h + 1, step_y)
    ]

# Smoke test: tile the sample mask into 400x400 patches.
patches = get_patches(im, size=(400, 400))

# Generate Patches from DeepGlobe
This dataset can be downloaded here: https://www.kaggle.com/datasets/balraj98/deepglobe-road-extraction-dataset

In [None]:
# NOTE(review): the config cell lists only "CIL-dataset", while this section
# expects the DeepGlobe layout (train/<id>_sat.jpg) -- confirm that
# `datasets[0]` points at the DeepGlobe folder when running this section.
dataset = datasets[0]

# Collect the unique tile ids. Every tile is later read as "<id>_sat.jpg"
# plus "<id>_mask.png", so ids are derived from the satellite images only;
# this keeps stray directory entries from turning into bogus ids.
# Sorting makes the processing order reproducible across runs.
train_dir = os.path.join(data, dataset, "train")
fnames = sorted(
    {
        name.split("_")[0]
        for name in os.listdir(train_dir)
        if name.endswith("_sat.jpg")
    }
)

In [None]:
# Tile every DeepGlobe image/mask pair into 400x400 patches and keep only
# the patches whose mask contains at least one non-zero value.
deepglobe_in = os.path.join(data, dataset, "train")
deepglobe_images_out = os.path.join(dataset_out, "DeepGlobe", "images")
deepglobe_masks_out = os.path.join(dataset_out, "DeepGlobe", "groundtruth")

for fname in tqdm(fnames):
    # Context managers release both file handles as soon as the pair is done
    # instead of leaving them to the garbage collector.
    with Image.open(os.path.join(deepglobe_in, f"{fname}_sat.jpg")) as img, \
            Image.open(os.path.join(deepglobe_in, f"{fname}_mask.png")) as target:
        img_patches = get_patches(img, (400, 400))
        target_patches = get_patches(target, (400, 400))

        # One random id per source image; the patch index keeps names unique
        # and ties each image patch to its mask patch.
        idx = uuid.uuid4()
        for i, (img_patch, target_patch) in enumerate(zip(img_patches, target_patches)):
            # Skip patches whose mask is entirely zero.
            if np.sum(target_patch) > 0:
                img_patch.save(os.path.join(deepglobe_images_out, f"{idx}-{i}.jpg"))
                target_patch.save(os.path.join(deepglobe_masks_out, f"{idx}-{i}-mask.png"))

# Generate Patches from MRD
This dataset can be downloaded here: https://www.kaggle.com/datasets/balraj98/massachusetts-roads-dataset

In [None]:
# NOTE(review): the config cell lists a single dataset, so `datasets[1]`
# raises IndexError unless the MRD folder name is added there -- confirm.
dataset = f"{datasets[1]}/tiff"

mrd_images_out = os.path.join(dataset_out, "MRD", "images")
mrd_masks_out = os.path.join(dataset_out, "MRD", "groundtruth")

for split in ["train", "test", "val"]:
    split_dir = os.path.join(data, dataset, split)
    label_dir = os.path.join(data, dataset, f"{split}_labels")

    # Only ".tiff" entries are images; anything else in the folder is skipped
    # instead of failing when its label cannot be found.
    fnames = sorted(name for name in os.listdir(split_dir) if name.endswith(".tiff"))

    for fname in tqdm(fnames):
        # The label shares the image's stem but ends in ".tif" (one "f").
        # Building the name from the stem states that explicitly instead of
        # chopping the last character off the image file name.
        label_name = f"{os.path.splitext(fname)[0]}.tif"

        with Image.open(os.path.join(split_dir, fname)) as img, \
                Image.open(os.path.join(label_dir, label_name)) as target:
            # get patches of size (400, 400) from the image
            img_patches = get_patches(img, (400, 400))
            target_patches = get_patches(target, (400, 400))

            # One random id per source image; the patch index keeps names unique.
            idx = uuid.uuid4()
            for i, (img_patch, target_patch) in enumerate(zip(img_patches, target_patches)):
                # Count pixels whose channel mean exceeds 254 (near-white;
                # presumably blank/no-data regions -- TODO confirm). Assumes
                # a multi-channel image, since the mean runs over the last axis.
                n_white = np.sum(np.array(img_patch).mean(axis=-1) > 254)
                # Keep patches with fewer than 100 such pixels and a non-empty mask.
                if n_white < 100 and np.sum(target_patch) > 0:
                    img_patch.save(os.path.join(mrd_images_out, f"{idx}-{i}.jpg"))
                    target_patch.save(os.path.join(mrd_masks_out, f"{idx}-{i}-mask.png"))

# Prepare CIL Data

In [None]:
dataset = datasets[0]

cil_images_out = os.path.join(dataset_out, "CIL", "images")
cil_masks_out = os.path.join(dataset_out, "CIL", "groundtruth")
cil_test_out = os.path.join(dataset_out, "CIL", "test")

# Make sure every target folder exists (no-op when already present);
# saving into a missing folder raises FileNotFoundError.
for out_dir in (cil_images_out, cil_masks_out, cil_test_out):
    os.makedirs(out_dir, exist_ok=True)

# Training split: every image has a ground-truth mask of the same file name.
load_data = os.path.join(data, dataset, "training")
for fname in tqdm(os.listdir(os.path.join(load_data, "images"))):
    if not fname.endswith(".png"):
        continue
    stem = fname.split('.')[0]
    # Context managers close the source files once the copies are written.
    with Image.open(os.path.join(load_data, "images", fname)) as img, \
            Image.open(os.path.join(load_data, "groundtruth", fname)) as target:
        # Convert to RGB before saving: images are stored as JPEG, masks
        # stay PNG but get the "-mask" suffix used by the other datasets.
        img.convert("RGB").save(os.path.join(cil_images_out, f"{stem}.jpg"))
        target.convert("RGB").save(os.path.join(cil_masks_out, f"{stem}-mask.png"))

# Test split: images only, no ground truth.
load_data = os.path.join(data, dataset, "test")
for fname in tqdm(os.listdir(os.path.join(load_data, "images"))):
    if not fname.endswith(".png"):
        continue
    stem = fname.split('.')[0]
    with Image.open(os.path.join(load_data, "images", fname)) as img:
        # convert image to jpg
        img.convert("RGB").save(os.path.join(cil_test_out, f"{stem}.jpg"))

# Explore Datasets

In [None]:
# Collect, for every saved mask, how many array entries are non-zero.
# `dataset_out` is used directly rather than rebinding `data`, so the cells
# that read the raw datasets still work when re-run after this one.
df = []

for dataset in ["MRD", "DeepGlobe", "CIL"]:
    mask_dir = os.path.join(dataset_out, dataset, "groundtruth")
    masks = os.listdir(mask_dir)
    for mask in tqdm(masks):
        if not mask.endswith("-mask.png"):
            continue
        # Close each file right after counting instead of leaking the handle.
        with Image.open(os.path.join(mask_dir, mask)) as mask_img:
            # Counts array entries, so a multi-channel mask contributes one
            # count per non-zero channel value.
            n_pixels = np.sum(np.array(mask_img) > 0)
        df.append((mask, dataset, n_pixels))

In [None]:
# Turn the collected (filename, dataset, n_pixels) tuples into a table
# and preview the first rows.
mask_columns = ["filename", "dataset", "n_pixels"]
dataframe = pd.DataFrame(data=df, columns=mask_columns)
dataframe.head()

In [None]:
# One histogram of `n_pixels` per dataset, stacked vertically.
# The explicit fig/axes interface avoids the implicit pyplot state, and
# labelled axes let each panel stand on its own.
plot_datasets = ["MRD", "DeepGlobe", "CIL"]
fig, axes = plt.subplots(len(plot_datasets), 1, figsize=(10, 15))
for ax, dataset in zip(axes, plot_datasets):
    counts = dataframe.loc[dataframe["dataset"] == dataset, "n_pixels"]
    ax.hist(counts, bins=100, range=(0, 200000))
    ax.set(title=dataset, xlabel="n_pixels", ylabel="number of masks")
# Returns None, so the cell shows only the figure and no text repr.
fig.tight_layout()

In [None]:
# Per dataset, the number of masks whose `n_pixels` exceeds `count`.
count = 10000
is_large = dataframe["n_pixels"] > count
dataframe.loc[is_large].groupby("dataset").count()


# Create overview csv

In [None]:
# Build the overview table: one row per image, with paths relative to
# `dataset_out`. Only the CIL dataset is indexed here.
# `dataset_out` is used directly rather than rebinding `data`, and the rows
# are gathered in `records` so that `df` is only ever a DataFrame.
records = []

# Training images together with their ground-truth masks. Sorting the
# listing makes the row order (and thus the CSV) reproducible, matching
# the test listing below.
for dataset in ["CIL"]:
    images = sorted(os.listdir(os.path.join(dataset_out, dataset, "images")))
    for img in tqdm(images):
        fname = img.split(".")[0]
        fpath = os.path.join(dataset, "images", img)
        mask_path = os.path.join(dataset, "groundtruth", f"{fname}-mask.png")
        records.append((fname, dataset, fpath, mask_path, "train"))

# Test images have no mask, so the mask path is left empty.
for dataset in ["CIL"]:
    images = sorted(os.listdir(os.path.join(dataset_out, dataset, "test")))
    for img in tqdm(images):
        fname = img.split(".")[0]
        fpath = os.path.join(dataset, "test", img)
        records.append((fname, dataset, fpath, "", "test"))

df = pd.DataFrame(records, columns=["filename", "dataset", "fpath", 'mpath', "split"])
df.head()

In [None]:
# Write the overview table into the joint dataset folder, without the index.
overview_csv = os.path.join(dataset_out, "dataset.csv")
df.to_csv(overview_csv, index=False)