In [1]:
# Prerequisite: run the download_from_data_store.py script first.
# That script requires:
# > bash /cnvrg/install_gcloud.sh
# > pip install git+https://github.com/capeanalytics/data_store
import glob
import os
import pandas as pd
from tqdm import tqdm
import shutil

In [2]:
# Load the chip index and shuffle every row reproducibly (frac=1, fixed seed),
# so that a positional slice of the frame is a random sample.
df = (
    pd.read_parquet("/data/chips/chips/dataset.parquet")
    .sample(frac=1, random_state=42)
)

In [3]:
# Peek at the first three rows of the shuffled frame to check the columns.
df.head(3)

Unnamed: 0,geometry,geometry_id,geometry_labels,imagery_date,imagery_source,filename,entry_hash
2094991,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00)\x00\x0...",595743,fair,2022-05-26 00:00:00+00:00,sv2:nearmap_vertical_jpg:4146b212-eb57-11ec-80...,fair/8ead064573e07fa5d2c60fb822e5488e3412113d6...,8ead064573e07fa5d2c60fb822e5488e3412113d64b2b7...
1659637,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x18\x00...,160285,unknown,2022-03-23 00:00:00+00:00,sv2:nearmap_vertical_jpg:67e4dbe4-b059-11ec-a2...,unknown/20a02385b6afc7affaeec4f637c5d6fdd3330b...,20a02385b6afc7affaeec4f637c5d6fdd3330b3c7e7d3f...
450363,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00%\x00\x0...",450551,poor,2022-01-07 00:00:00+00:00,sv2:nearmap_vertical_jpg:a876c8fa-7951-11ec-a7...,poor/7f13e6001d3db7d018b26a11d4c318de727b47288...,7f13e6001d3db7d018b26a11d4c318de727b4728826588...


In [4]:
# Count the distinct index values. If this equals the row count in df.shape,
# the index is unique. That matters because the index value is written into
# every symlink name further down, so duplicates would collide.
df.index.drop_duplicates().shape

(2197170,)

In [5]:
# Rows x columns of the full dataset; compare the row count with the
# distinct-index count above.
df.shape

(2197170, 7)

In [6]:
# Hold out the first 20k rows of the shuffled frame for validation and keep
# the remainder for training. Each split becomes a list of row dicts, with
# the original frame index preserved under the "index" key.
n = 20000
dd = {
    split_name: part.reset_index(names="index").to_dict(orient="records")
    for split_name, part in (("val", df.iloc[:n]), ("train", df.iloc[n:]))
}

In [7]:
# Map each geometry label to its class id: "n" followed by the label's
# 1-based position, zero-padded to 8 digits. The ids are used as directory
# names and written to labels.txt, so the order of the labels below is
# significant.
lookup = {
    label: f"n{position:08d}"
    for position, label in enumerate(
        ("severe", "poor", "fair", "good", "excellent", "unknown"), start=1
    )
}

In [8]:
# Root of the generated dataset: the per-split folders and labels.txt are
# written underneath this directory.
data_dir = "/data/2m"

In [9]:
# DESTRUCTIVE: wipe any previous output tree so the run starts clean.
# os.symlink raises FileExistsError when the link already exists, so stale
# links from an earlier run would break the symlink loop.
# ignore_errors=True keeps this from failing when the directory does not
# exist yet; it also silences any other deletion error.
shutil.rmtree(data_dir, ignore_errors=True)

In [10]:
# Build a class-per-folder tree of symlinks for every split:
#   {data_dir}/{split}/{class_id}/{class_id}_{index}.png -> /data/chips/chips/{filename}
# Linking avoids copying the image files themselves.
for ds, rows in dd.items():

    split_dir = f"{data_dir}/{ds}"
    os.makedirs(split_dir, exist_ok=True)

    # Create every class folder up front, so a class with no rows in this
    # split still gets an (empty) directory. A plain loop is used because
    # this runs only for its side effect.
    for class_id in lookup.values():
        os.makedirs(f"{split_dir}/{class_id}", exist_ok=True)

    # tqdm infers the total from len(rows), since rows is a list.
    for row in tqdm(rows):

        # Raises KeyError if a row carries a label that is missing from lookup.
        label_id = lookup[row['geometry_labels']]

        input_filename = f"/data/chips/chips/{row['filename']}"

        # The frame index makes the link name unique within its class folder.
        # NOTE(review): the link always ends in .png regardless of the source
        # file's extension; confirm the downstream loader does not rely on it.
        output_filename = f"{split_dir}/{label_id}/{label_id}_{row['index']}.png"

        # Raises FileExistsError if the link already exists (e.g. the output
        # tree was not cleared before a re-run).
        os.symlink(input_filename, output_filename)

100% 20000/20000 [00:00<00:00, 26035.06it/s]
100% 2177170/2177170 [02:10<00:00, 16620.58it/s]


In [11]:
# Write labels.txt at the dataset root: one "<class_id>,<label>" line per
# entry of lookup, in lookup's insertion order.
with open(f"{data_dir}/labels.txt", "w+") as fp:
    fp.writelines(f"{class_id},{label}\n" for label, class_id in lookup.items())

In [12]:
!cat /data/2m/labels.txt

n00000001,severe
n00000002,poor
n00000003,fair
n00000004,good
n00000005,excellent
n00000006,unknown


In [14]:
import sys

# Put /cnvrg/ at the front of the import path so the dinov2 import in the
# following cell can be resolved (presumably the package lives there).
# The list is modified in place, and only when /cnvrg/ is not already first,
# so re-running this cell does not stack duplicate entries.
if sys.path[:1] != ['/cnvrg/']:
    sys.path.insert(0, '/cnvrg/')

In [15]:
# This import must run after /cnvrg/ has been added to sys.path.
from dinov2.data.datasets import Cape

# For every member of Cape.Split, instantiate the dataset on the generated
# tree and call dump_extra(). Both `root` and `extra` point at data_dir.
# NOTE(review): presumably dump_extra() writes per-split metadata files into
# the `extra` directory for later loading; confirm against the Cape class.
for split in Cape.Split:
    dataset = Cape(split=split, root=data_dir, extra=data_dir)
    dataset.dump_extra()