# Cleanup dataset
The datasets released by the original authors do not comply with `torchvision` dataset conventions. This notebook will help you reformat them.

In [4]:
%mkdir workspace
%cd workspace

/mnt/d/courses/RANSAC-Flow/notebooks/workspace


In [178]:
import io
import zipfile
from pathlib import Path

import pandas as pd


## MegaDepth
Please download it from their [Google Drive](https://drive.google.com/file/d/1SikcOvCJ-zznOyCRJCTGtpKtTp01Jx5g/view?usp=sharing) and save it as `MegaDepth.zip` in the `workspace` directory. The archive includes the training (coarse-aligned and non-aligned), validation, and testing sets.

In [204]:
# Source archive as downloaded (opened read-only) and the cleaned archive to
# build. Mode "x" is exclusive creation: it raises FileExistsError instead of
# overwriting an existing MegaDepth_cleansed.zip.
src_zip = zipfile.ZipFile(file="MegaDepth.zip", mode="r")
dst_zip = zipfile.ZipFile(file="MegaDepth_cleansed.zip", mode="x")

# All source paths in this notebook are resolved relative to the top-level
# "MegaDepth" folder of the source archive.
src_root_path = zipfile.Path(root=src_zip, at="MegaDepth")

Reorganize training images from `<class>_[123].jpg` to `<class>/[123].jpg`. (Run time 3.5 min.)

In [205]:
# Training images are stored flat as "<class>_<name>"; regroup them into one
# folder per class under "train/" in the destination archive.
root_dir = Path("train")
src_path = src_root_path / "MegaDepth_Train"

for entry in src_path.iterdir():
    # Split on the first underscore only: the left part is the class, the
    # remainder is kept verbatim as the file name.
    class_name, image_name = entry.name.split("_", 1)
    dst_zip.writestr(
        str(root_dir / class_name / image_name),
        entry.read_bytes(),
        compress_type=zipfile.ZIP_DEFLATED,
    )


Move validation set. (Run time 1 min.)
- `images` (renamed from `img`) contains all images of the set; the class names here do *not* relate to those of the training set.
- `matches.csv` (renamed from `corr.csv`) contains all correspondences and matching image paths.
- `affine.pkl` (renamed from `coarse.pkl`) contains the affine transformation matrices for the image paths listed in `matches.csv`.

In [206]:
root_dir = Path("validate")
src_path = src_root_path / "Val"

# New names for the loose files directly under "Val"; a file name missing
# from this table raises KeyError.
rename = {"corr.csv": "matches.csv", "coarse.pkl": "affine.pkl"}

# copy the single files; the image directory is handled separately
for entry in src_path.iterdir():
    if not entry.is_dir():
        dst_zip.writestr(
            str(root_dir / rename[entry.name]),
            entry.read_bytes(),
            compress_type=zipfile.ZIP_DEFLATED,
        )

# Copy the images. They are already laid out as one directory per class, so
# only the class directory name is rewritten.
src_path = src_root_path / "Val" / "img"
for class_dir in src_path.iterdir():
    for image in class_dir.iterdir():
        # class_dir and image are zipfile.Path objects (not pathlib.Path);
        # their .name is a plain str. Round-tripping the class directory
        # name through int normalizes it (e.g. leading zeros are dropped).
        class_label = str(int(class_dir.name))
        dst_zip.writestr(
            str(root_dir / "images" / class_label / image.name),
            image.read_bytes(),
            compress_type=zipfile.ZIP_DEFLATED,
        )


Move testing set. (Run time 1.5 min.)
- `images` (renamed from `test1600Pairs`) contains all images of the set; the class names here do *not* relate to those of the training set.
- `matches.csv` (renamed from `test1600Pairs.csv`) contains all correspondences and matching image paths. The `scene` column is updated to hold the class names, and the scene prefix is removed from the `source_image` and `target_image` columns.

In [207]:
root_dir = Path("test")
src_path = src_root_path / "Test"

# New names for the loose files directly under "Test"; a file name missing
# from this table raises KeyError.
rename = {"test1600Pairs.csv": "matches.csv"}

for entry in src_path.iterdir():
    # the image directory is handled separately
    if entry.is_dir():
        continue
    dst_path = root_dir / rename[entry.name]

    # load the CSV straight from the archive into a DataFrame
    pairs = pd.read_csv(io.BytesIO(entry.read_bytes()))

    # image names look like "<scene>_<file>": split each on the first underscore
    source_parts = pairs["source_image"].str.split(pat="_", n=1, expand=True)
    source_parts.columns = ["scene", "source_image"]
    target_parts = pairs["target_image"].str.split(pat="_", n=1, expand=True)
    target_parts.columns = ["scene", "target_image"]

    # both images of a pair must come from the same scene
    scenes_differ = source_parts["scene"] != target_parts["scene"]
    if scenes_differ.any():
        raise RuntimeError("found source-target image pairs from different scenes")

    # keep a single scene (class name) column and the scene-less image names
    pairs[["scene", "source_image"]] = source_parts
    pairs["target_image"] = target_parts["target_image"]

    # integer dtype, so leading zeros are not written back
    pairs["scene"] = pairs["scene"].astype(int)

    # serialize back to CSV bytes in memory
    csv_buffer = io.BytesIO()
    pairs.to_csv(csv_buffer, index=False)
    dst_zip.writestr(
        str(dst_path), csv_buffer.getvalue(), compress_type=zipfile.ZIP_DEFLATED
    )

# Test images are stored flat as "<scene>_<name>"; file them under
# images/<scene>/ with the scene number normalized through int.
src_path = src_root_path / "Test" / "test1600Pairs"
for image in src_path.iterdir():
    # split on the first underscore only; the remainder is the file name
    scene, image_name = image.name.split("_", 1)
    dst_zip.writestr(
        str(root_dir / "images" / str(int(scene)) / image_name),
        image.read_bytes(),
        compress_type=zipfile.ZIP_DEFLATED,
    )


Make sure you execute this cell to save the final zip!

In [208]:
# Closing the destination archive is what finalizes it on disk, so make sure
# it is closed even if closing the source archive raises.
try:
    src_zip.close()
finally:
    dst_zip.close()