In [1]:
import pandas as pd
# Load the per-vote index of each chip set and record, in a `chips_path`
# column, the directory the index was read from (later cells join that
# directory with each row's `filename`).
nearmap_df_1 = pd.read_parquet(
    "/data/finetune_mtmv_nearmap_rg/chipping/chips/dataset.parquet"
).assign(chips_path="/data/finetune_mtmv_nearmap_rg/chipping/chips")
nearmap_df_2 = pd.read_parquet(
    "/data/evaluate_mtmv_nearmap_rg/chips/dataset.parquet"
).assign(chips_path="/data/evaluate_mtmv_nearmap_rg/chips")

# /data/evaluate_mtmv_nearmap_rg and /data/finetune_mtmv_nearmap_rg are the old rg test and train sets (multiple worker votes) from these aws cnvrg datasets:
# gs://cape-ml-historical-data/202307_aws_cnvrg_datasets/evaluate_mtmv_nearmap_rg
# gs://cape-ml-historical-data/202307_aws_cnvrg_datasets/finetune_mtmv_nearmap_rg

In [2]:
import random

from collections import Counter

def majority_vote(vote_list):
    """Return the most frequent value in ``vote_list``, breaking ties at random.

    Args:
        vote_list: Iterable of hashable votes (for example label strings).

    Returns:
        The value with the highest count. When several values share the
        highest count, one of them is picked with ``random.choice``.

    Raises:
        IndexError: If ``vote_list`` contains no votes.
    """
    counter = Counter(vote_list)
    if not counter:
        # Same exception type that random.choice([]) used to raise here,
        # but with a message that points at the real problem.
        raise IndexError("majority_vote() requires at least one vote")

    # Compute the winning count once rather than once per distinct value.
    top_count = max(counter.values())
    # Counter preserves first-seen order, so the candidate list is stable.
    winners = [value for value, count in counter.items() if count == top_count]
    return random.choice(winners)

In [3]:
# Collapse the per-vote rows of the finetune frame into one row per
# (geometry, imagery_source, imagery_date, geometry_id) group and resolve each
# group's label by majority vote. The result is published as `X_train_nearmap`.
df = nearmap_df_1
columns_to_keep = ["geometry", "imagery_source", "imagery_date"]

groupby = "geometry_id"
if groupby is not None and groupby not in columns_to_keep:
    columns_to_keep.append(groupby)

# group votes, keep only required columns
if "geometry_labels" in df.columns:

    # Labeled dataset: gather every vote of a group into a list.
    cols_aggregations = {"geometry_labels": list}
    # If this dataset contains many votes, keep them
    if "label_vote_id" in df.columns:
        cols_aggregations["label_vote_id"] = list

    # Keep a few columns for bedrock datasets
    for col in ["cache_key", "dataset_path", "dataset_format", "filename", "chips_path"]:
        if col in df.columns:
            # Keep first
            cols_aggregations[col] = "first"  # type: ignore

    df = df.groupby(columns_to_keep)[list(cols_aggregations)].agg(cols_aggregations).reset_index()
    # NOTE(review): ties are broken through the `random` module and no seed is
    # set in the visible cells, so tied groups may get a different label on
    # each run.
    df["geometry_labels"] = df["geometry_labels"].apply(majority_vote)
    # `label_vote_id` is only aggregated when the source frame has it, so its
    # absence must not raise here.
    df = df.drop(columns="label_vote_id", errors="ignore")
X_train_nearmap = df

In [4]:
# Collapse the per-vote rows of the evaluate frame into one row per
# (geometry, imagery_source, imagery_date, geometry_id) group and resolve each
# group's label by majority vote. The result is published as `X_val_nearmap`.
df = nearmap_df_2
columns_to_keep = ["geometry", "imagery_source", "imagery_date"]

groupby = "geometry_id"
if groupby is not None and groupby not in columns_to_keep:
    columns_to_keep.append(groupby)

# group votes, keep only required columns
if "geometry_labels" in df.columns:

    # Labeled dataset: gather every vote of a group into a list.
    cols_aggregations = {"geometry_labels": list}
    # If this dataset contains many votes, keep them
    if "label_vote_id" in df.columns:
        cols_aggregations["label_vote_id"] = list

    # Keep a few columns for bedrock datasets
    for col in ["cache_key", "dataset_path", "dataset_format", "filename", "chips_path"]:
        if col in df.columns:
            # Keep first
            cols_aggregations[col] = "first"  # type: ignore

    df = df.groupby(columns_to_keep)[list(cols_aggregations)].agg(cols_aggregations).reset_index()
    # NOTE(review): ties are broken through the `random` module and no seed is
    # set in the visible cells, so tied groups may get a different label on
    # each run.
    df["geometry_labels"] = df["geometry_labels"].apply(majority_vote)
    # `label_vote_id` is only aggregated when the source frame has it, so its
    # absence must not raise here.
    df = df.drop(columns="label_vote_id", errors="ignore")
X_val_nearmap = df

In [5]:
# Row count of the training frame built above.
X_train_nearmap.shape[0]

43968

In [6]:
# Row count of the validation frame built above.
X_val_nearmap.shape[0]

11562

In [7]:
# Distinct label values present in the training frame.
rg_labels = X_train_nearmap["geometry_labels"].unique().tolist()

In [8]:
import os
# Create one directory per label under the train root first, then under the
# val root. Existing directories are left untouched.
for base_dir in ("/cnvrg/rg_evaluation/train", "/cnvrg/rg_evaluation/val"):
    for label in rg_labels:
        os.makedirs(os.path.join(base_dir, label), exist_ok=True)


In [9]:
# Inspect the columns of the validation frame; the symlink cells below read
# `filename`, `geometry_labels` and `chips_path` from it.
X_val_nearmap.columns

Index(['geometry', 'imagery_source', 'imagery_date', 'geometry_id',
       'geometry_labels', 'filename', 'chips_path'],
      dtype='object')

In [10]:
# Link every validation chip to
# /cnvrg/rg_evaluation/val/<label>/<chip file name>.
X = X_val_nearmap

dataset_path = "/cnvrg/rg_evaluation/val"

for single_file, pred, chips_dir in zip(X.filename, X.geometry_labels, X.chips_path):
    image_full_path = os.path.join(chips_dir, single_file)
    filename = os.path.basename(image_full_path)

    # The label directories are created from the training labels only, so a
    # label that occurs only in this split would otherwise have no directory.
    class_dir = os.path.join(dataset_path, pred)
    os.makedirs(class_dir, exist_ok=True)

    link_path = os.path.join(class_dir, filename)
    # Re-running the cell must not fail on links it already created. A
    # different chip that maps to the same link name still raises below.
    if os.path.islink(link_path) and os.readlink(link_path) == image_full_path:
        continue
    os.symlink(image_full_path, link_path)


In [11]:
# Link every training chip to
# /cnvrg/rg_evaluation/train/<label>/<chip file name>.
X = X_train_nearmap

dataset_path = "/cnvrg/rg_evaluation/train"

for single_file, pred, chips_dir in zip(X.filename, X.geometry_labels, X.chips_path):
    image_full_path = os.path.join(chips_dir, single_file)
    filename = os.path.basename(image_full_path)

    # Create the label directory here as well, so this cell does not depend
    # on the directory-creation cell having been run first.
    class_dir = os.path.join(dataset_path, pred)
    os.makedirs(class_dir, exist_ok=True)

    link_path = os.path.join(class_dir, filename)
    # Re-running the cell must not fail on links it already created. A
    # different chip that maps to the same link name still raises below.
    if os.path.islink(link_path) and os.readlink(link_path) == image_full_path:
        continue
    os.symlink(image_full_path, link_path)


In [12]:
import glob
import os
import shutil

# Source dataset location and the name of the ImageNet-layout copy derived
# from it.
root, dataset = "/cnvrg", "rg_evaluation"
out_dataset = dataset + "_imagenet"

# Start from a clean output tree; a missing directory is not an error.
shutil.rmtree(os.path.join(root, out_dataset), ignore_errors=True)

In [13]:
# List the training chips and take each file's parent directory name as its
# class label.
files = glob.glob(f"{root}/{dataset}/train/**/*.png", recursive=True)
class_names = sorted({path.split(os.path.sep)[-2] for path in files})

# Map ids of the form "n" + zero-padded 8-digit index to the alphabetically
# sorted class names, and build the reverse lookup.
all_labels = {f"n{index:08d}": name for index, name in enumerate(class_names)}
all_labels_i = {name: class_id for class_id, name in all_labels.items()}
all_labels

{'n00000000': 'flat',
 'n00000001': 'gable',
 'n00000002': 'hip',
 'n00000003': 'mixed_w_wind_credit',
 'n00000004': 'mixed_wo_wind_credit',
 'n00000005': 'unknown'}

In [14]:
# Make sure both split directories of the output dataset exist.
for split_name in ("train", "val"):
    os.makedirs(f"{root}/{out_dataset}/{split_name}", exist_ok=True)

# Write one "<class id>,<class name>" line per entry of `all_labels`.
with open(f"{root}/{out_dataset}/labels.txt", "w+") as fp:
    fp.writelines(
        f"{class_id},{class_name}\n" for class_id, class_name in all_labels.items()
    )

In [15]:
# Mirror the training chips into the output tree as
# <root>/<out_dataset>/train/<class id>/<class id>_<i>.png.
# The training files are listed again here because `files` is rebound by
# later cells; the cell would otherwise link whatever `files` last held.
files = glob.glob(f"{root}/{dataset}/train/**/*.png", recursive=True)
for i, f in enumerate(files):
    # The last three path components are <split>/<class label>/<file name>;
    # only the first two are needed.
    split, class_label = f.split(os.path.sep)[-3:-1]

    assert split in ['train', 'val', 'test'], f"{split} not a split"

    assert class_label in all_labels.values(), f"{class_label} not a known class"

    class_id = all_labels_i[class_label]
    d = f"{root}/{out_dataset}/{split}/{class_id}"

    os.makedirs(d, exist_ok=True)

    # The link name is the class id plus the running index, not the original
    # file name.
    os.symlink(f, f"{d}/{class_id}_{i}.png")

In [16]:
# Mirror the validation chips into the output tree as
# <root>/<out_dataset>/val/<class id>/<class id>_<index>.png.
files = glob.glob(f"{root}/{dataset}/val/**/*.png", recursive=True)
known_classes = all_labels.values()
for link_index, source_path in enumerate(files):
    # The last three path components are <split>/<class label>/<file name>.
    split, class_label, filename = source_path.split(os.path.sep)[-3:]

    assert split in ['train', 'val', 'test'], f"{split} not a split"
    assert class_label in known_classes

    class_id = all_labels_i[class_label]
    d = f"{root}/{out_dataset}/{split}/{class_id}"
    os.makedirs(d, exist_ok=True)

    os.symlink(source_path, f"{d}/{class_id}_{link_index}.png")

In [17]:
root = "/cnvrg"

# `dataset` must still be the source dataset name (a string) at this point.
out_dataset = f"{dataset}_imagenet"
from dinov2.data.datasets import ImageNet
for split in [ImageNet.Split.TRAIN, ImageNet.Split.VAL]:
    # Bind the ImageNet object to its own name. Rebinding `dataset` here would
    # turn the dataset-name string into an object, so every later evaluation
    # of f"{dataset}_imagenet" or f"{root}/{dataset}/..." would build a wrong
    # path.
    imagenet_dataset = ImageNet(split=split, root=f"{root}/{out_dataset}", extra=f"{root}/{out_dataset}")
    # NOTE(review): presumably writes the per-split metadata that dinov2 reads
    # from `extra` - confirm against the dinov2 ImageNet implementation.
    imagenet_dataset.dump_extra()

In [21]:
import os

# Count the files under each split of the ImageNet-layout dataset, walking
# the tree recursively, and print one line per split.
split_dirs = {
    'file count train:': r'/cnvrg/rg_evaluation_imagenet/train',
    'file count val:': r'/cnvrg/rg_evaluation_imagenet/val',
}
for message, split_dir in split_dirs.items():
    count = sum(len(names) for _, _, names in os.walk(split_dir))
    print(message, count)

file count train: 43968
file count val: 11562
