In [1]:
# Before running this notebook, run the download_from_data_store.py script,
# which requires:
# > bash /cnvrg/install_gcloud.sh
# > pip install git+https://github.com/capeanalytics/data_store
import glob
import os
import pandas as pd
from tqdm import tqdm
import shutil

In [2]:
# Read the chip metadata table, then shuffle every row (frac=1) with a fixed
# seed so that slicing the frame later gives a reproducible random split.
df = pd.read_parquet("/data/chips/chips/dataset.parquet")
df = df.sample(frac=1, random_state=42)

In [3]:
# Show the first three rows of the shuffled frame to eyeball its columns.
df.head(3)

Unnamed: 0,geometry,geometry_id,geometry_labels,imagery_date,imagery_source,filename,entry_hash
2094991,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00)\x00\x0...",595743,fair,2022-05-26 00:00:00+00:00,sv2:nearmap_vertical_jpg:4146b212-eb57-11ec-80...,fair/8ead064573e07fa5d2c60fb822e5488e3412113d6...,8ead064573e07fa5d2c60fb822e5488e3412113d64b2b7...
1659637,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x18\x00...,160285,unknown,2022-03-23 00:00:00+00:00,sv2:nearmap_vertical_jpg:67e4dbe4-b059-11ec-a2...,unknown/20a02385b6afc7affaeec4f637c5d6fdd3330b...,20a02385b6afc7affaeec4f637c5d6fdd3330b3c7e7d3f...
450363,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00%\x00\x0...",450551,poor,2022-01-07 00:00:00+00:00,sv2:nearmap_vertical_jpg:a876c8fa-7951-11ec-a7...,poor/7f13e6001d3db7d018b26a11d4c318de727b47288...,7f13e6001d3db7d018b26a11d4c318de727b4728826588...


In [4]:
# Number of distinct index values. The index value is later embedded in each
# output file name, so it should equal the row count shown by df.shape.
df.index.drop_duplicates().shape

(2197170,)

In [5]:
# (rows, columns) of the full frame, for comparison with the distinct-index count.
df.shape

(2197170, 7)

In [6]:
# Hold out the first 20k rows of the shuffled frame for validation; every
# remaining row goes to train. reset_index(names="index") keeps each row's
# original index as an "index" field in its record.
n = 20000
dd = {
    split_name: df.iloc[rows].reset_index(names="index").to_dict(orient="records")
    for split_name, rows in (("val", slice(None, n)), ("train", slice(n, None)))
}

In [7]:
# Map each label to its class id: "n" followed by the label's 1-based
# position, zero-padded to eight digits (n00000001 ... n00000006).
lookup = {
    label: f"n{position:08d}"
    for position, label in enumerate(
        ("severe", "poor", "fair", "good", "excellent", "unknown"),
        start=1,
    )
}

In [8]:
# Root of the generated output: the per-split symlink tree and labels.txt are written under it.
data_dir = "/data/2m"

In [9]:
# Remove any previous output tree so the symlinks are created from scratch.
# ignore_errors=True means a missing directory (first run) does not raise.
shutil.rmtree(data_dir, ignore_errors=True)

In [10]:
# Build the on-disk layout
#     <data_dir>/<split>/<class_id>/<class_id>_<index>.png
# where every entry is a symlink to the original chip under /data/chips/chips/.
for ds, rows in dd.items():

    split_dir = f"{data_dir}/{ds}"
    os.makedirs(split_dir, exist_ok=True)

    # Create every class directory up front, even if the split has no rows for it.
    for class_id in lookup.values():
        os.makedirs(f"{split_dir}/{class_id}", exist_ok=True)

    for row in tqdm(rows):
        label_id = lookup[row['geometry_labels']]

        # Link source: the chip file; link name: class id plus the row's original index.
        os.symlink(
            f"/data/chips/chips/{row['filename']}",
            f"{split_dir}/{label_id}/{label_id}_{row['index']}.png",
        )

100% 20000/20000 [00:00<00:00, 59874.71it/s]
100% 2177170/2177170 [00:58<00:00, 37308.75it/s]


In [11]:
# Write labels.txt: one "<class_id>,<label>" line per entry, in lookup order.
with open(f"{data_dir}/labels.txt", "w+") as fp:
    fp.writelines(f"{v},{k}\n" for k, v in lookup.items())

In [12]:
!cat /data/2m/labels.txt

n00000001,severe
n00000002,poor
n00000003,fair
n00000004,good
n00000005,excellent
n00000006,unknown


In [14]:
import sys

# Put /cnvrg/ at the front of the module search path so it is searched first
# (presumably where the dinov2 package imported below lives — confirm).
sys.path.insert(0, '/cnvrg/')

In [15]:
from dinov2.data.datasets import Cape

# For every member of Cape.Split, build the dataset with both `root` and
# `extra` pointing at data_dir, then call dump_extra() on it.
# NOTE(review): dump_extra() presumably writes per-split metadata files into
# `extra` — confirm against dinov2.data.datasets.Cape.
for split in Cape.Split:
    dataset = Cape(split=split, root=data_dir, extra=data_dir)
    dataset.dump_extra()

In [1]:
from data_store import DataStore
from data_store.data_containers import Experiments, Dataset
from data_store.storage import GCSStorage
import glob
import os
import yaml
import pandas as pd
import glob

import nest_asyncio
# NOTE(review): presumably needed so data_store's downloads can run inside the
# notebook's already-running event loop — confirm.
nest_asyncio.apply()



In [2]:
# Open the "dinov2" data store kept under gs://cape-ml-projects-data/data_stores.
# An alternative endpoint can be passed to GCSStorage, e.g.
# endpoint="http://0.0.0.0:4443" (presumably a local GCS emulator — confirm).
gs = GCSStorage(path="gs://cape-ml-projects-data/data_stores")
ds = DataStore(name="dinov2", storage=gs)

Populating queue...
Download started...
Queue fully populated. Found 1 files to download
Processed 1 files in 5.1 s. In queue: 219
Processed 1 files in 10.1 s. In queue: 128
Downloaded 1 files in 11.4 s


In [3]:
# Print the directory layout of the data store.
ds.tree()

data_stores/dinov2/
├── chips_raw_data/
│   ├── raw_data.gpkg
│   └── raw_data.parquet
├── experiments/
│   ├── a100x4/
│   │   ├── DIN-121/
│   │   │   ├── config.yaml
│   │   │   ├── training_metrics.json
│   │   │   └── eval/
│   │   │       ├── training_112499/
│   │   │       │   └── teacher_checkpoint.pth
│   │   │       ├── training_12499/
│   │   │       │   └── teacher_checkpoint.pth
│   │   │       ├── training_124999/
│   │   │       │   └── teacher_checkpoint.pth
│   │   │       ├── training_24999/
│   │   │       │   └── teacher_checkpoint.pth
│   │   │       ├── training_37499/
│   │   │       │   └── teacher_checkpoint.pth
│   │   │       ├── training_49999/
│   │   │       │   └── teacher_checkpoint.pth
│   │   │       ├── training_62499/
│   │   │       │   └── teacher_checkpoint.pth
│   │   │       ├── training_74999/
│   │   │       │   └── teacher_checkpoint.pth
│   │   │       ├── training_87499/
│   │   │       │   └── teacher_checkpoint.pth
│   │   │       └── tr

In [None]:
# Download the 'rcr_evaluation_imagenet' entry of the data store to the local
# directory /data/rcr_evaluation_imagenet (the log below reports ~90k files).
ds['rcr_evaluation_imagenet'].clone("/data/rcr_evaluation_imagenet")

Populating queue...
Download started...
Processed 0 files in 5.5 s. In queue: 17026
Processed 691 files in 10.7 s. In queue: 33303
Processed 7464 files in 15.8 s. In queue: 44262
Processed 25962 files in 20.9 s. In queue: 44037
Queue fully populated. Found 90008 files to download
Processed 43783 files in 25.9 s. In queue: 46447
Processed 60688 files in 30.9 s. In queue: 29542
Processed 83290 files in 36.0 s. In queue: 6938
Processed 90008 files in 41.0 s. In queue: 0
PID 2901382: Got exception HTTPSConnectionPool(host='storage.googleapis.com', port=443): Max retries exceeded with url: /download/storage/v1/b/cape-ml-projects-data/o/data_stores%2Fdinov2%2Frcr_evaluation_imagenet%2Ftrain%2Fn00000000%2Fn00000000_61628.png?alt=media (Caused by SSLError(SSLZeroReturnError(6, 'TLS/SSL connection has been closed (EOF) (_ssl.c:997)'))) when trying to download data_stores/dinov2/rcr_evaluation_imagenet/train/n00000000/n00000000_61628.png
PID 2901382: Got exception HTTPSConnectionPool(host='stora