# Load garage datasets

In [1]:
import pandas as pd

def _read_chips_dataset(chips_dir):
    """Load ``dataset.parquet`` from ``chips_dir`` and tag rows with that directory.

    Args:
        chips_dir: Directory that contains ``dataset.parquet``.

    Returns:
        The parquet contents as a DataFrame with a ``chips_path`` column set to
        ``chips_dir`` on every row.
    """
    frame = pd.read_parquet(f"{chips_dir}/dataset.parquet")
    frame["chips_path"] = chips_dir
    return frame


# Each directory is spelled exactly once, so chips_path cannot drift from the
# location the parquet file was actually read from.
df_train_1 = _read_chips_dataset("/cnvrg/pj_garages_train_3-3/chips")
df_train_2 = _read_chips_dataset("/cnvrg/garage_presence_train_chips_sampled_iteration_2/chips")
df_train_3 = _read_chips_dataset("/cnvrg/garage_presence_train_chips/chips")

df_test = _read_chips_dataset("/cnvrg/pj_garages_test_1-1/chips")


In [2]:
# Stack all loaded frames row-wise into one table.
# df_test is listed first; the later drop_duplicates call keeps the first
# occurrence by default, so on a duplicated key the df_test row is retained.
# NOTE(review): the test frame is pooled with the training frames and the pool
# is later split into train/val only - confirm no held-out test set is expected.
frames = [df_test, df_train_1, df_train_2, df_train_3]
df = pd.concat(frames)

In [3]:
# Total number of rows across the concatenated frames.
df.shape[0]

604302

In [4]:
# Keep a single row per (geometry, imagery_date, imagery_source) combination.
dedup_keys = ["geometry", "imagery_date", "imagery_source"]
df = df.drop_duplicates(subset=dedup_keys)

In [5]:
def map_garage_gt(garage_label):
    """Collapse a ';'-separated garage label into a single class name.

    The first field is the presence flag. When it equals "with_garage" and a
    second field exists, the result is "<flag>_<second field>". In every other
    case only the first field is returned; any further fields are ignored.

    Args:
        garage_label: Raw label string, fields separated by ';'.

    Returns:
        The simplified label string.
    """
    presence, *details = garage_label.split(";")
    if details and presence == "with_garage":
        return presence + "_" + details[0]
    return presence

#garage_labels = ['unknown','with_garage_unknown','with_garage_basement', 'with_garage_attached', 'no_garage', 'with_garage_carport', 'with_garage_built_in']

In [6]:
# Derive the simplified class name from each raw geometry label.
df["garage_gt_simplified"] = df["geometry_labels"].apply(map_garage_gt)


In [7]:
# Distinct simplified labels present in df, as a plain Python list.
garage_labels = df["garage_gt_simplified"].unique().tolist()
garage_labels

['unknown',
 'with_garage_built_in',
 'with_garage_carport',
 'with_garage_attached',
 'with_garage_basement',
 'no_garage',
 'with_garage_unknown']

In [8]:
# Number of rows left in df after drop_duplicates.
df.shape[0]

166540

In [9]:
# Show the column labels of df, including the added chips_path and
# garage_gt_simplified columns.
df.columns

Index(['attribute_geometry_id', 'contact_id', 'datarow_id', 'geometry',
       'geometry_id', 'geometry_labels', 'imagery_date', 'imagery_source',
       'label_vote_id', 'label_vote_status', 'filename', 'entry_hash',
       'chips_path', 'garage_gt_simplified'],
      dtype='object')

In [10]:

from sklearn.model_selection import GroupShuffleSplit

val_fraction = 0.1

# Single grouped shuffle split into train and validation only (no test split
# is produced here). Grouping by datarow_id keeps all rows that share a
# datarow_id on the same side of the split.
splitter = GroupShuffleSplit(n_splits=1, test_size=val_fraction, random_state=7)
split = splitter.split(df, groups=df["datarow_id"])
train_inds, val_inds = next(split)

# The splitter yields positional indices, hence iloc.
X_train, X_val = df.iloc[train_inds], df.iloc[val_inds]


In [11]:
# Number of rows assigned to the training split.
X_train.shape[0]

149887

In [12]:
# Number of rows assigned to the validation split.
X_val.shape[0]

16653

In [13]:
# Disabled cleanup snippet: it is wrapped in a bare string literal, so none of
# it executes; the cell merely echoes the string.
# NOTE(review): the snippet targets /cnvrg/dinov2_mtmv_livarea_garages, not the
# /cnvrg/mtmv_livarea_garages tree that the following cells populate, and it
# calls os.path.join although `import os` only appears in the next cell.
# Confirm both before re-enabling, or delete the cell.
"""import shutil
base_dir = "/cnvrg/dinov2_mtmv_livarea_garages/train"
for label in garage_labels:
    shutil.rmtree(os.path.join(base_dir, label))

base_dir = "/cnvrg/dinov2_mtmv_livarea_garages/val"
for label in garage_labels:
    shutil.rmtree(os.path.join(base_dir, "garages", label))

base_dir = "/cnvrg/dinov2_mtmv_livarea_garages/test"
for label in garage_labels:
    shutil.rmtree(os.path.join(base_dir, "garages", label))
"""

'import shutil\nbase_dir = "/cnvrg/dinov2_mtmv_livarea_garages/train"\nfor label in garage_labels:\n    shutil.rmtree(os.path.join(base_dir, label))\n\nbase_dir = "/cnvrg/dinov2_mtmv_livarea_garages/val"\nfor label in garage_labels:\n    shutil.rmtree(os.path.join(base_dir, "garages", label))\n\nbase_dir = "/cnvrg/dinov2_mtmv_livarea_garages/test"\nfor label in garage_labels:\n    shutil.rmtree(os.path.join(base_dir, "garages", label))\n'

In [14]:
import os
# Create one directory per garage label for each split. Train labels sit
# directly under the split root; val labels sit under a "garages" subfolder.
for base_dir, nested in (
    ("/cnvrg/mtmv_livarea_garages/train", ()),
    ("/cnvrg/mtmv_livarea_garages/val", ("garages",)),
):
    for label in garage_labels:
        os.makedirs(os.path.join(base_dir, *nested, label), exist_ok=True)



In [15]:
X = X_val

dataset_path = "/cnvrg/mtmv_livarea_garages/val"

# Symlink every validation chip to <dataset_path>/garages/<label>/<chip file name>.
for chip_file, label, chips_dir in zip(X.filename, X.garage_gt_simplified, X.chips_path):
    image_full_path = os.path.join(chips_dir, chip_file)
    filename = os.path.basename(image_full_path)
    link_path = os.path.join(dataset_path, "garages", label, filename)
    # A link left by a previous run of this cell is skipped so the cell can be
    # re-executed. Anything else occupying link_path (for example a chip with
    # the same file name from another chips directory) still raises
    # FileExistsError from os.symlink, exactly as before.
    if os.path.islink(link_path) and os.readlink(link_path) == image_full_path:
        continue
    os.symlink(image_full_path, link_path)


In [16]:
X = X_train

dataset_path = "/cnvrg/mtmv_livarea_garages/train"

# Symlink every training chip to <dataset_path>/<label>/<chip file name>.
for chip_file, label, chips_dir in zip(X.filename, X.garage_gt_simplified, X.chips_path):
    image_full_path = os.path.join(chips_dir, chip_file)
    filename = os.path.basename(image_full_path)
    link_path = os.path.join(dataset_path, label, filename)
    # A link left by a previous run of this cell is skipped so the cell can be
    # re-executed. Anything else occupying link_path (for example a chip with
    # the same file name from another chips directory) still raises
    # FileExistsError from os.symlink, exactly as before.
    if os.path.islink(link_path) and os.readlink(link_path) == image_full_path:
        continue
    os.symlink(image_full_path, link_path)