# Create Metadata for Datasets

In [1]:
import sys
sys.path.append('..')

In [2]:
import json
import logging


import numpy as np
from tqdm import tqdm

from src.datasets.data_utils import calculate_channel_mean_std
from src.utils.enums import DatasetEnum
from src.utils.io import list_dir, load_mask

In [3]:
# Configure the root logger once for the whole notebook: records at INFO
# level and above are rendered as "[<timestamp> - <LEVEL>] <message>".
LOG_FORMAT = "[%(asctime)s - %(levelname)s] %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

In [4]:
# Build a per-dataset summary (per-channel image mean/std and average mask
# area) and persist it to ../metadata.json.
metadata = {}

# The directory listings are the same for every dataset, so read them once
# instead of re-listing both directories on each loop iteration. list() makes
# the result reusable even if list_dir returns a one-shot iterator.
all_image_paths = list(list_dir("../data.nosync/processed/images"))
all_mask_paths = list(list_dir("../data.nosync/processed/masks"))

for dataset in DatasetEnum:
    # DatasetEnum.ALL gets no entry of its own; only the other members do.
    if dataset == DatasetEnum.ALL:
        continue

    logging.info("-" * 50)
    logging.info(f"Calculating metadata for {dataset.value} dataset...")

    # A path is attributed to a dataset when the dataset's value occurs
    # anywhere in the path string.
    image_paths = sorted(path for path in all_image_paths if dataset.value in path)
    mask_paths = sorted(path for path in all_mask_paths if dataset.value in path)

    # Without files the statistics are undefined: np.mean([]) yields NaN,
    # which json.dump would write as the non-standard token "NaN".
    if not image_paths or not mask_paths:
        logging.warning(
            f"No images or masks found for {dataset.value} dataset, skipping..."
        )
        continue

    img_mean, img_std = calculate_channel_mean_std(image_paths)

    # Calculate the average mask area: the mean of each mask's mean value.
    # NOTE(review): this is the foreground fraction only if load_mask returns
    # values in [0, 1] - confirm against load_mask.
    logging.info("Calculating avg mask area...")
    mask_areas = [load_mask(mask_path).mean() for mask_path in tqdm(mask_paths)]

    mean_mask_area = np.mean(mask_areas)
    logging.info(f"Mean mask area: {mean_mask_area}")

    # Convert to built-in floats so the values are JSON-serialisable
    # regardless of the dtype returned by the helper.
    metadata[dataset.value] = {
        "img_mean": [float(channel) for channel in img_mean],
        "img_std": [float(channel) for channel in img_std],
        "mean_mask_area": float(mean_mask_area),
    }

with open("../metadata.json", "w", encoding="utf-8") as f:
    json.dump(metadata, f)

[2023-03-31 16:17:11,688 - INFO] --------------------------------------------------
[2023-03-31 16:17:11,688 - INFO] Calculating metadata for cil dataset...
[2023-03-31 16:17:11,864 - INFO] Calculating mean and std of the dataset...
100%|██████████| 144/144 [00:01<00:00, 133.95it/s]
[2023-03-31 16:17:12,948 - INFO] Mean: [0.50962999 0.5205989  0.51775995]
[2023-03-31 16:17:12,949 - INFO] Std: [0.21070724 0.20197992 0.19757078]
[2023-03-31 16:17:12,949 - INFO] Calculating avg mask area...
100%|██████████| 144/144 [00:00<00:00, 1264.40it/s]
[2023-03-31 16:17:13,064 - INFO] Mean mask area: 0.177976953125
[2023-03-31 16:17:13,065 - INFO] --------------------------------------------------
[2023-03-31 16:17:13,065 - INFO] Calculating metadata for epfl dataset...
[2023-03-31 16:17:13,231 - INFO] Calculating mean and std of the dataset...
100%|██████████| 339/339 [00:02<00:00, 133.68it/s]
[2023-03-31 16:17:15,769 - INFO] Mean: [0.33429297 0.33173469 0.29726037]
[2023-03-31 16:17:15,769 - INFO]

In [5]:
# Log the collected metadata, one section per dataset, with each statistic
# on its own tab-indented line.
separator = "-" * 50
for dataset_name, stats in metadata.items():
    logging.info(separator)
    logging.info(f"Metadata for {dataset_name} dataset:")
    for field, entry in stats.items():
        logging.info(f"\t{field}: {entry}")

[2023-03-31 16:26:00,196 - INFO] --------------------------------------------------
[2023-03-31 16:26:00,197 - INFO] Metadata for cil dataset:
[2023-03-31 16:26:00,197 - INFO] 	img_mean: [0.509629987745085, 0.5205988956971458, 0.5177599513207918]
[2023-03-31 16:26:00,198 - INFO] 	img_std: [0.21070724183488873, 0.201979917650027, 0.1975707809885112]
[2023-03-31 16:26:00,198 - INFO] 	mean_mask_area: 0.177976953125
[2023-03-31 16:26:00,198 - INFO] --------------------------------------------------
[2023-03-31 16:26:00,199 - INFO] Metadata for epfl dataset:
[2023-03-31 16:26:00,199 - INFO] 	img_mean: [0.3342929740009161, 0.33173468910878934, 0.29726036663497735]
[2023-03-31 16:26:00,199 - INFO] 	img_std: [0.18086472904005996, 0.17581956578691596, 0.17715555982745668]
[2023-03-31 16:26:00,199 - INFO] 	mean_mask_area: 0.22136139380530975
[2023-03-31 16:26:00,199 - INFO] --------------------------------------------------
[2023-03-31 16:26:00,200 - INFO] Metadata for roadtracer dataset:
[2023-