In [1]:
import pandas as pd
from data_science_tools.core import temporary_configuration
from data_science_tools.datasets import AttrGeomSet, GeomLabelSet
from data_science_tools.datasets import RawGeomVoteSet, GeomLabelSet

rcr_mnemonics = ["rcr_v3_1/20201110_v3_train",
"rcr_v3_1/20201110_v3_val",
"rcr_v3_1/20201201_nearmap_conus_al_1",
"rcr_v3_1/20201201_nearmap_aus_al_1",
"rcr_v3_1/20210111_al2_temp_nearmap_can",
"rcr_v3_1/20210111_al2_temp_nearmap_conus",
"rcr_v3_1/20201110_v3_test"                 
]
all_dfs = []
with temporary_configuration() as tmp_config:
    tmp_config["datasets"]["perform_checks"] = False
    for m in rcr_mnemonics:
        ds = RawGeomVoteSet.from_mnemonic(m)
        df = ds.to_pandas()
        # group votes, keep only required columns
        columns_to_keep = ["geometry", "imagery_source", "imagery_date"]
        # Labeled dataset. Let's aggregate:
        cols_aggregations = {"geometry_labels": list}
        # If this dataset contains many votes, keep them
        if "label_vote_id" in df.columns:
            cols_aggregations["label_vote_id"] = list
        
        # Keep a few columns for bedrock datasets
        for col in ["cache_key", "dataset_path", "dataset_format"]:
            if col in df.columns:
                # Keep first
                cols_aggregations[col] = "first"  # type: ignore
        
        df_agg = df.groupby(columns_to_keep)[list(cols_aggregations.keys())].agg(cols_aggregations).reset_index()

        all_dfs.append(df_agg)
concatenated_df = pd.concat(all_dfs, axis=0)

In [2]:
len(concatenated_df)

245285

In [43]:
df = concatenated_df.drop_duplicates(subset='geometry')

In [44]:
def most_frequent(l):
    return max(set(l), key = l.count)

df["geometry_label"] = df.geometry_labels.apply(lambda g: most_frequent(g))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["geometry_label"] = df.geometry_labels.apply(lambda g: most_frequent(g))


In [45]:
len(df)

242786

In [46]:
df = df.drop(columns=['geometry_labels', 'label_vote_id'])
df = df.rename(columns={'geometry_label': 'geometry_labels'})

In [47]:
df

Unnamed: 0,geometry,imagery_source,imagery_date,geometry_labels
0,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x00\x01...,sv2:nearmap_vertical_jpg:49085fd0-b968-11e7-a5...,2017-09-09 00:00:00+00:00,2_poor
1,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x00\x01...,sv2:nearmap_vertical_jpg:223cb98a-c008-11e8-b4...,2018-09-07 00:00:00+00:00,multiple_roofs
2,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x00\x01...,sv2:nearmap_vertical_jpg:4603d90e-1192-11e8-b0...,2018-01-24 00:00:00+00:00,5_excellent
3,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x00\x01...,sv2:nearmap_vertical_jpg:22f2e790-bb6d-11e8-b1...,2018-09-07 00:00:00+00:00,multiple_roofs
4,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x00\x01...,sv2:nearmap_vertical_jpg:35c9a9c4-e41e-11e8-85...,2018-10-25 00:00:00+00:00,4_good
...,...,...,...,...
9309,b'\x01\x06\x00\x00\x00\x07\x00\x00\x00\x01\x03...,sv2:nearmap_vertical_jpg:a2fde4d0-c8d8-11e8-be...,2018-09-23 00:00:00+00:00,no_roof
9310,b'\x01\x06\x00\x00\x00\x07\x00\x00\x00\x01\x03...,sv2:nearmap_vertical_jpg:85032e56-2e3f-11e8-8d...,2018-03-08 00:00:00+00:00,no_roof
9311,b'\x01\x06\x00\x00\x00\x08\x00\x00\x00\x01\x03...,sv2:nearmap_vertical_jpg:b78d907e-ad55-11e8-95...,2018-08-06 00:00:00+00:00,5_excellent
9312,b'\x01\x06\x00\x00\x00\x08\x00\x00\x00\x01\x03...,sv2:nearmap_vertical_jpg:a6cb78ea-c1ba-11e8-85...,2018-09-17 00:00:00+00:00,multiple_roofs


In [48]:
df.to_parquet("/cnvrg/rcr_df_for_chipping.parquet")

In [None]:
#################### DO chipping #############################

In [50]:
df = pd.read_parquet("/data/rcr_for_dino_chips/chips/dataset.parquet")

In [52]:
df.columns

Index(['geometry', 'imagery_source', 'imagery_date', 'geometry_labels',
       'filename', 'entry_hash'],
      dtype='object')

In [53]:
val_fraction = 0.4

# Split train, val, test (after grouping by datarow_id!)
from sklearn.model_selection import GroupShuffleSplit 

splitter = GroupShuffleSplit(test_size=val_fraction, n_splits=1, random_state = 7)
split = splitter.split(df, groups=df['geometry'])
train_inds, val_inds = next(split)


X_train = df.iloc[train_inds]
X_val = df.iloc[val_inds]


In [54]:
rcr_labels = list(X_train["geometry_labels"].unique())

In [57]:
import os
base_dir = "/cnvrg/rcr_evaluation/train"
for label in rcr_labels:
    os.makedirs(os.path.join(base_dir, label), exist_ok=True)

base_dir = "/cnvrg/rcr_evaluation/val"
for label in rcr_labels:
    os.makedirs(os.path.join(base_dir, label), exist_ok=True)

In [None]:
X = X_val

dataset_path = "/cnvrg/rcr_evaluation/val"
chips_dir = "/data/rcr_for_dino_chips/chips"
for single_file, pred in zip(X.filename, X.geometry_labels):
    image_full_path = os.path.join(chips_dir, single_file)
    filename = image_full_path.split("/")[-1]   
    os.symlink(image_full_path, os.path.join(dataset_path, pred, filename))

In [None]:
X = X_train

dataset_path = "/cnvrg/rcr_evaluation/train"
chips_dir = "/data/rcr_for_dino_chips/chips"
for single_file, pred in zip(X.filename, X.geometry_labels):
    image_full_path = os.path.join(chips_dir, single_file)
    filename = image_full_path.split("/")[-1]   
    os.symlink(image_full_path, os.path.join(dataset_path, pred, filename))