In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
# Load the dataset table from parquet.
df = pd.read_parquet(path="/cnvrg/dinov2_mtmv_ds/chips/dataset.parquet")

# Assign RCRv3 (roof condition rating 3.0) and RG (roof geometry) ground-truth (GT) labels to the samples

In [2]:

# Class names for the two tasks. The position of a name in its list is the
# index that np.argmax selects from the stored per-class scores.
rg_labels = ["flat", "gable", "hip", "mixed_w_wind_credit", "mixed_wo_wind_credit", "unknown", "invalid"]
rcr_labels = ["1_severe", "2_poor", "3_fair", "4_good", "5_excellent", "no_roof", "multiple_roofs", "rating_impossible"]


def _top_label(raw_scores, labels):
    """Return the entry of ``labels`` at the arg-max of the stored scores.

    Args:
        raw_scores: Value stored in the dataset column, or None when the
            sample has no scores. Presumably a stringified list of per-class
            scores -- TODO confirm against the dataset schema.
        labels: Class names, indexed by the arg-max position.

    Returns:
        The selected label, or None when ``raw_scores`` is None.
    """
    # Identity check: `!= None` is an equality test that array-like values
    # may override element-wise.
    if raw_scores is None:
        return None
    # NOTE(review): eval() executes whatever expression is stored in the
    # dataset file. If that file is not fully trusted, switch to
    # ast.literal_eval / json.loads after confirming the stored format.
    scores = eval(raw_scores)
    return labels[np.argmax(scores)]


def get_rcr_and_rg_prediction(row):
    """Derive the roof-condition and roof-geometry class names for one row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            "roof_geometry" and "roof_condition_rating_3.0".

    Returns:
        pd.Series with the entries "rcr_prediction" and "rg_prediction".
        An entry is None when the corresponding column value is None.
    """
    rg_prediction = _top_label(row["roof_geometry"], rg_labels)
    rcr_prediction = _top_label(row["roof_condition_rating_3.0"], rcr_labels)
    return pd.Series({"rcr_prediction": rcr_prediction, "rg_prediction": rg_prediction})

In [3]:
# Row-wise: derive both class names for every sample.
rcr_rg_results = df.apply(get_rcr_and_rg_prediction, axis=1)

In [4]:
# Attach the derived class columns next to the original columns.
df_with_class = pd.concat([df, rcr_rg_results], axis="columns")


In [5]:
# Columns carried over into the labelled dataset; everything else is dropped.
cols_to_keep = [
    "input_survey_id",
    "input_datatype",
    "aoi_type",
    "geometry",
    "imagery_source",
    "filename",
    "entry_hash",
    "vendor",
    "rcr_prediction",
    "rg_prediction",
]
df_with_class = df_with_class.loc[:, cols_to_keep]
df_with_class.head()


Unnamed: 0,input_survey_id,input_datatype,aoi_type,geometry,imagery_source,filename,entry_hash,vendor,rcr_prediction,rg_prediction
0,1626872,pnoa_midres_vertical_jpg,midres_roof,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00\x10\x00...,x:pnoa_midres_vertical_jpg:x:1626872,f3/f35f282f5ca665ca18a68814205d4f0e0f66abdaee6...,f35f282f5ca665ca18a68814205d4f0e0f66abdaee6faa...,pnoa_midres_vertical_jpg,,unknown
1,1391,nearmap_vertical_jpg,PL_roof,b'\x01\x03\x00\x00\x00\x01\x00\x00\x00A\x00\x0...,x:nearmap_vertical_jpg:x:1391,d6/d6ce5e1dd0c1acbec39bfda25b45c922a9748af0635...,d6ce5e1dd0c1acbec39bfda25b45c922a9748af0635169...,nearmap_vertical_jpg,4_good,mixed_wo_wind_credit
2,3609,nearmap_vertical_jpg,PL_roof,b'\x01\x06\x00\x00\x00\x02\x00\x00\x00\x01\x03...,x:nearmap_vertical_jpg:x:3609,2c/2ca44505a4be9c3091296ca1a304fcf569dec7debd0...,2ca44505a4be9c3091296ca1a304fcf569dec7debd0433...,nearmap_vertical_jpg,3_fair,gable
5,1583,nearmap_vertical_jpg,PL_roof,b'\x01\x06\x00\x00\x00\x04\x00\x00\x00\x01\x03...,x:nearmap_vertical_jpg:x:1583,49/49be83be8a1968c96e3470a6c838d3f07c729829c44...,49be83be8a1968c96e3470a6c838d3f07c729829c445c3...,nearmap_vertical_jpg,1_severe,flat
7,1625395,nearmap_vertical_jpg,PL_roof,"b""\x01\x03\x00\x00\x00\x01\x00\x00\x00$\x00\x0...",x:nearmap_vertical_jpg:x:1625395,63/63ddbb1d490fb00c3099be6a15a0e516dc8060fb388...,63ddbb1d490fb00c3099be6a15a0e516dc8060fb388223...,nearmap_vertical_jpg,3_fair,mixed_wo_wind_credit


In [6]:
# Persist the labelled table so later cells can start from this file.
df_with_class.to_parquet(path="/cnvrg/dinov2_mtmv_ds/chips/dataset_with_label.parquet")

In [7]:
# Reload the labelled table from disk.
df_with_class = pd.read_parquet(path="/cnvrg/dinov2_mtmv_ds/chips/dataset_with_label.parquet")

In [8]:
# Number of rows in the labelled table.
df_with_class.shape[0]

600983

In [9]:
# Keep one row per distinct geometry value (the first occurrence is retained).
df_with_class_unique_geometries = df_with_class.drop_duplicates(subset="geometry", keep="first")
df_with_class_unique_geometries.shape[0]

566724

# Split into train and val (no separate test split is created in this notebook)

In [18]:

# Share of the unique-geometry rows that is held out for validation.
val_fraction = 0.1

from sklearn.model_selection import train_test_split

# Fixed random_state so the split is the same on every run.
X_train, X_val = train_test_split(
    df_with_class_unique_geometries,
    test_size=val_fraction,
    random_state=42,
)




In [19]:
# Sizes of the train and val splits, one per line.
print(len(X_train), len(X_val), sep="\n")


510051
56673


# Create the new folder structure, assign GT classes, and create symlinks

In [20]:
import os

# Train split: one folder per label directly under the split directory
# (roof-geometry and roof-condition labels side by side).
base_dir = "/cnvrg/mtmv_livarea_garages/train"
for label in rg_labels + rcr_labels:
    os.makedirs(os.path.join(base_dir, label), exist_ok=True)

# Val split: label folders are nested under a per-task subfolder ("rg" / "rcr").
base_dir = "/cnvrg/mtmv_livarea_garages/val"
for task, task_labels in (("rg", rg_labels), ("rcr", rcr_labels)):
    for label in task_labels:
        os.makedirs(os.path.join(base_dir, task, label), exist_ok=True)


In [21]:
# Catch-all folders, one per split, for samples that are not filed under any label.
# (The val folder was previously created twice by a copy-pasted line.)
os.makedirs("/cnvrg/mtmv_livarea_garages/train/no_label", exist_ok=True)
os.makedirs("/cnvrg/mtmv_livarea_garages/val/no_label", exist_ok=True)

# folders for samples without label information

# Train and val set

In [22]:
# `X` aliases the training split; the cells that follow read from `X`.
X = X_train

In [23]:
# Number of samples without a roof-condition label.
X["rcr_prediction"].isna().sum()

224687

In [24]:
# Number of samples without a roof-geometry label.
X["rg_prediction"].isna().sum()

60096

In [25]:
# Root directory that the relative `filename` entries are resolved against.
# NOTE(review): `chips_dir` is not defined anywhere in the visible notebook (it
# presumably survived only as leftover kernel state), so a fresh
# Restart & Run All would stop here with a NameError. The value below is assumed
# from the directory that holds dataset.parquet -- TODO confirm.
chips_dir = "/cnvrg/dinov2_mtmv_ds/chips"
if not os.path.isdir(chips_dir):
    raise FileNotFoundError(f"chips_dir does not exist: {chips_dir}")

num_rcr_gt = 0
num_rg_gt = 0
# At most half of the split is filed under a roof-condition (RCR) label.
desired_num_rcr_gt = len(X) // 2
dataset_path = "/cnvrg/mtmv_livarea_garages/train"

for single_file, rcr_pred, rg_pred in zip(X.filename, X.rcr_prediction, X.rg_prediction):
    image_full_path = os.path.join(chips_dir, single_file)
    filename = os.path.basename(image_full_path)

    # A label counts as present only if it is a non-empty string. Missing values
    # can surface as None or NaN depending on the column dtype, and NaN is truthy.
    has_rcr = isinstance(rcr_pred, str) and rcr_pred != ""
    has_rg = isinstance(rg_pred, str) and rg_pred != ""

    if has_rcr and num_rcr_gt < desired_num_rcr_gt:
        # File under the RCR label while the RCR quota is not yet reached.
        num_rcr_gt += 1
        target_dir = os.path.join(dataset_path, rcr_pred)
    elif has_rg:
        # Otherwise fall back to the roof-geometry (RG) label.
        num_rg_gt += 1
        target_dir = os.path.join(dataset_path, rg_pred)
    else:
        target_dir = os.path.join(dataset_path, "no_label")

    link_path = os.path.join(target_dir, filename)
    # Skip links that already exist so the cell can be re-run; os.symlink
    # raises FileExistsError otherwise.
    if not os.path.lexists(link_path):
        os.symlink(image_full_path, link_path)

In [26]:
X = X_val

# Root directory that the relative `filename` entries are resolved against.
# NOTE(review): `chips_dir` is not defined anywhere in the visible notebook (it
# presumably survived only as leftover kernel state), so a fresh
# Restart & Run All would stop with a NameError. The value below is assumed
# from the directory that holds dataset.parquet -- TODO confirm.
chips_dir = "/cnvrg/dinov2_mtmv_ds/chips"
if not os.path.isdir(chips_dir):
    raise FileNotFoundError(f"chips_dir does not exist: {chips_dir}")

num_rcr_gt = 0
num_rg_gt = 0
# At most half of the split is filed under a roof-condition (RCR) label.
desired_num_rcr_gt = len(X) // 2
dataset_path = "/cnvrg/mtmv_livarea_garages/val"

for single_file, rcr_pred, rg_pred in zip(X.filename, X.rcr_prediction, X.rg_prediction):
    image_full_path = os.path.join(chips_dir, single_file)
    filename = os.path.basename(image_full_path)

    # A label counts as present only if it is a non-empty string. Missing values
    # can surface as None or NaN depending on the column dtype, and NaN is truthy.
    has_rcr = isinstance(rcr_pred, str) and rcr_pred != ""
    has_rg = isinstance(rg_pred, str) and rg_pred != ""

    if has_rcr and num_rcr_gt < desired_num_rcr_gt:
        # File under val/rcr/<label> while the RCR quota is not yet reached.
        num_rcr_gt += 1
        target_dir = os.path.join(dataset_path, "rcr", rcr_pred)
    elif has_rg:
        # Otherwise fall back to val/rg/<label>.
        num_rg_gt += 1
        target_dir = os.path.join(dataset_path, "rg", rg_pred)
    else:
        target_dir = os.path.join(dataset_path, "no_label")

    link_path = os.path.join(target_dir, filename)
    # Skip links that already exist so the cell can be re-run; os.symlink
    # raises FileExistsError otherwise.
    if not os.path.lexists(link_path):
        os.symlink(image_full_path, link_path)