# Prepare Dataset for Fine-Tuning of the HEIG-VD Model

In [1]:
import os

# Root folder of the project; every data and config path in this notebook is built from it.
projsoilsroot = "/proj-soils"

# Both training configs sit in the same folder below the project root.
config_dir = os.path.join(projsoilsroot, "config/train")

# Config handed to the ground-truth (GT) preparation scripts.
config_train_gt = os.path.join(config_dir, "config-train_gt-10cm.yaml")
# Config handed to the scratch-imagery preparation scripts.
config_train_scratch = os.path.join(config_dir, "config-train_scratch-10cm.yaml")

## GT

### 1. Snap to 10cm

In [2]:
import os
import numpy as np
import geopandas as gpd

# Shift every raw ground-truth shapefile so that the lower-left corner of its
# bounding box falls on a multiple of 0.1 coordinate units (10 cm, assuming the
# layer CRS is metric as the section title implies), then save it as a GeoPackage.
root = os.path.join(projsoilsroot, "data/GT/20240216/0-0-0-raw/test")
out_folder = os.path.join(projsoilsroot, "data/GT/20240216/1-snapped")

if not os.path.exists(out_folder):
    os.makedirs(out_folder)
    print(f"The directory {out_folder} was created.")

for file in os.listdir(root):
    if not file.endswith(".shp"):
        continue

    gt = gpd.read_file(os.path.join(root, file))

    # total_bounds is (xmin, ymin, xmax, ymax); only the lower-left corner
    # determines the shift, so the whole layer moves rigidly.
    xmin, ymin = gt.total_bounds[:2]
    plus_xmin = np.round(xmin, 1) - xmin
    plus_ymin = np.round(ymin, 1) - ymin

    gt.geometry = gt.geometry.translate(plus_xmin, plus_ymin)

    # Swap only the trailing extension: str.replace would also rewrite a
    # ".shp" that happens to appear earlier in the file name.
    stem = os.path.splitext(file)[0]
    # The driver is named explicitly so the output format does not depend on
    # extension-based inference by the installed geopandas version.
    gt.to_file(os.path.join(out_folder, stem + ".gpkg"), driver="GPKG")


### 2. Assign classes

In [3]:
# Input: the snapped layers. Output: the folder that receives the class-annotated layers.
source_folder = os.path.join(projsoilsroot, "data/GT/20240216/1-snapped")
out_folder = os.path.join(projsoilsroot, "data/GT/20240216/2-cleaned")

# Create the output folder if needed and report it only when it was actually created.
out_folder_missing = not os.path.exists(out_folder)
if out_folder_missing:
    os.makedirs(out_folder)
    print(f"The directory {out_folder} was created.")

The directory /proj-soils/data/GT/20240216/2-cleaned was created.


In [4]:
# Raw CLASSE_SEN labels listed in ID order: the first label gets ID 1, the last ID 17.
class_names = [
    "batiment",
    "toit_vegetalise",
    "surface_non_beton",
    "surface_beton",
    "eau_bassin",
    "roche_dure_meuble",
    "eau_naturelle",
    "roseliere",
    "sol_neige",
    "sol_vegetalise",
    "surface_riparienne",
    "sol_divers",
    "sol_vigne",
    "sol_agricole",
    "sol_bache",
    "sol_serre_temporaire",
    "serre_permanente",
]

# Label -> CLASSE_SEN_ID; a missing label (NaN) is given ID 0.
class_mapping = {np.nan: 0}
class_mapping.update(
    (label, class_id) for class_id, label in enumerate(class_names, start=1)
)

# CLASSE_SEN_IDs that are flagged as soil in the boolean SOIL column.
soil_classes = [9, 10, 12, 13, 14, 15, 16]

# CLASSE_SEN_ID -> package_id; the trailing comment names the source class.
package_mapping = {
    0: 0,
    1: 1,    # batiment
    2: 1,    # toit_vegetalise
    3: 2,    # surface_non_beton
    4: 3,    # surface_beton
    5: 4,    # eau_bassin
    6: 5,    # roche_dure_meuble
    7: 6,    # eau_naturelle
    8: 6,    # roseliere
    9: 7,    # sol_neige
    10: 8,   # sol_vegetalise
    11: 8,   # surface_riparienne
    12: 8,   # sol_divers
    13: 9,   # sol_vigne
    14: 10,  # sol_agricole
    15: 10,  # sol_bache
    16: 10,  # sol_serre_temporaire
    17: 11,  # serre_permanente
}

# CLASSE_SEN_ID -> cl12_id (12 classes plus 0); "a -> b" marks a class merged into b.
cl12_mapping = {
    0: 0,
    1: 1,    # batiment
    2: 1,    # toit_vegetalise -> batiment
    3: 2,    # surface_non_beton
    4: 3,    # surface_beton
    5: 5,    # eau_bassin -> eau_naturelle
    6: 4,    # roche_dure_meuble
    7: 5,    # eau_naturelle
    8: 6,    # roseliere
    9: 7,    # sol_neige
    10: 8,   # sol_vegetalise
    11: 8,   # surface_riparienne -> sol_vegetalise
    12: 9,   # sol_divers
    13: 10,  # sol_vigne
    14: 11,  # sol_agricole
    15: 12,  # sol_bache
    16: 12,  # sol_serre_temporaire -> sol_bache
    17: 1,   # serre_permanente -> batiment
}


def _class_id(label):
    """Return the CLASSE_SEN_ID for a raw CLASSE_SEN label.

    Any missing value (None or a float NaN) resolves to the ID stored under the
    np.nan key of class_mapping, whichever NaN object the reader produced.
    Labels that are not in class_mapping raise KeyError.
    """
    if label is None or (isinstance(label, float) and np.isnan(label)):
        return class_mapping[np.nan]
    return class_mapping[label]


# Annotate every layer found below source_folder with numeric class columns,
# keep plain polygons only, and write the result to out_folder as a GeoPackage.
# Note: the output folder is flat, so equally named files from different
# sub-folders of source_folder would overwrite each other.
for folder, _subfolders, filenames in os.walk(source_folder):
    for file in filenames:
        if not file.endswith((".shp", ".gpkg")):
            continue

        gt = gpd.read_file(os.path.join(folder, file))

        # Derive the numeric class columns from the raw label.
        gt["CLASSE_SEN_ID"] = gt["CLASSE_SEN"].map(_class_id)
        gt["package_id"] = gt["CLASSE_SEN_ID"].map(package_mapping)
        gt["cl12_id"] = gt["CLASSE_SEN_ID"].map(cl12_mapping)
        gt["SOIL"] = gt["CLASSE_SEN_ID"].isin(soil_classes)

        # Geometries are written as read; no validity repair (make_valid) is applied.

        # Missing labels were mapped to ID 0 above, so this drops nothing in
        # practice; it is kept as a safeguard.
        gt = gt.dropna(subset=["CLASSE_SEN_ID"])

        # Split multi-part geometries into single parts and keep only polygons.
        gt = gt.explode(index_parts=False)
        gt = gt.loc[gt["geometry"].geom_type == 'Polygon']

        gt = gt[['CLASSE_SEN', 'CLASSE_SEN_ID', 'package_id', 'cl12_id', 'SOIL', 'geometry']]

        # splitext removes exactly the extension. str.rstrip(".shp") would strip
        # any trailing '.', 's', 'h' or 'p' characters ("maps.shp" -> "ma").
        stem = os.path.splitext(file)[0]
        gt.to_file(os.path.join(out_folder, stem + ".gpkg"), driver="GPKG")

In [5]:
# Preview the first rows of `gt`, i.e. the last layer handled by the loop in the previous cell.
gt.head()

Unnamed: 0,CLASSE_SEN,CLASSE_SEN_ID,package_id,cl12_id,SOIL,geometry
0,surface_beton,4,3,3,False,"POLYGON ((2578518.000 1183926.801, 2578518.000..."
1,surface_beton,4,3,3,False,"POLYGON ((2578620.400 1183830.493, 2578620.400..."
2,surface_non_beton,3,2,2,False,"POLYGON ((2578630.020 1183962.259, 2578625.904..."
3,surface_beton,4,3,3,False,"POLYGON ((2578620.400 1183944.070, 2578619.407..."
4,surface_non_beton,3,2,2,False,"POLYGON ((2578620.400 1183956.393, 2578620.400..."


### 3. Rasterize

In [6]:
# Run the rasterization script; {config_train_gt} is interpolated from the Python variable of that name.
! python utilities/rasterize_gt.py --config_file {config_train_gt}

The directory /proj-soils/data/GT/20240216/3-rasterized-12cl/10cm was created.
[32m2024-04-09 11:23:23.865[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m125[0m - [1mLOG_FILE = '/proj-soils/logs/rasterize_gt.log'[0m
[32m2024-04-09 11:23:23.867[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m126[0m - [1mPOLYGON_FOLDER = '/proj-soils/data/GT/20240216/2-cleaned'[0m
[32m2024-04-09 11:23:23.867[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m127[0m - [1mFIELD = 'cl12_id'[0m
[32m2024-04-09 11:23:23.867[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m128[0m - [1mOUT_TIFF_FOLDER = '/proj-soils/data/GT/20240216/3-rasterized-12cl/10cm'[0m
[32m2024-04-09 11:23:23.867[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m129[0m - [1mCLASS_MAPPING = None[0m
[32m2024-04-09 11:23:23.867[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m130[0m - [1mMASK_PATH = None[0m
[32m2024-

### 4. Cut TIFFs to Daniel's predefined grid

In [7]:
# Run the grid-cutting script with the GT config ({config_train_gt} is interpolated from Python).
! python utilities/cut_tiff_to_grid.py --config_file {config_train_gt}

[32m2024-04-09 11:23:52.638[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m262[0m - [1mTIFF_FOLDER = '/proj-soils/data/GT/20240216/3-rasterized-12cl/10cm'[0m
[32m2024-04-09 11:23:52.639[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m263[0m - [1mOUT_FOLDER = '/proj-soils/data/GT/20240216/4-cut-to-grid-12cl/10cm'[0m
[32m2024-04-09 11:23:52.639[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m264[0m - [1mGRID_PATH = '/proj-soils/data/grids/recursive_grids_max204-8m_51-2m.gpkg'[0m
[32m2024-04-09 11:23:52.640[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m265[0m - [1mGRID_QUERY = 'depth == 0'[0m
[32m2024-04-09 11:23:52.640[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m266[0m - [1mCELL_LENGTH = 512[0m
[32m2024-04-09 11:23:52.640[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m267[0m - [1mID_COLUMN = 'str_ids'[0m
[32m2024-04-09 11:23:52.640[0m | [1mINFO   

## Scratch

### 1. RGBI -> RGB

In [5]:
# Run the RGBI -> RGB conversion script with the scratch config ({config_train_scratch} is interpolated from Python).
! python utilities/rgbi2rgb.py --config_file {config_train_scratch}

[32m2024-04-09 13:13:13.847[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m77[0m - [1mSOURCE_FOLDER = '/proj-soils/data/scratch/0-AOIs/'[0m
[32m2024-04-09 13:13:13.847[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m78[0m - [1mTARGET_FOLDER = '/proj-soils/data/scratch/1-rgb'[0m
[32m2024-04-09 13:13:13.847[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m79[0m - [1mLOG_FILE = '/proj-soils/logs/rgbi2rgb.log'[0m
[32m2024-04-09 13:13:13.847[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m81[0m - [1mStarted Programm[0m
[32m2024-04-09 13:13:13.858[0m | [1mINFO    [0m | [36m__main__[0m:[36mrgbi2rgb[0m:[36m31[0m - [1mProcessing scratch_20200318_1318_12501_0_47-48_aoi25.tif[0m
[32m2024-04-09 13:13:23.290[0m | [1mINFO    [0m | [36m__main__[0m:[36mrgbi2rgb[0m:[36m31[0m - [1mProcessing scratch_20200319_1043_12501_0_49-50-51-53-54_aoi40.tif[0m
[32m2024-04-09 13:13:36.487[0m | [1mINFO  

### 2. Ensure every file has a resolution of 10 cm

In [3]:
# Run the rescaling script with the scratch config ({config_train_scratch} is interpolated from Python).
! python utilities/rescale_tif.py --config_file {config_train_scratch}

[32m2024-04-09 13:08:59.190[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m123[0m - [1mTIFF_FOLDER = '/proj-soils/data/scratch/1-rgb'[0m
[32m2024-04-09 13:08:59.191[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m124[0m - [1mOUT_FOLDER = '/proj-soils/data/scratch/2-10cm'[0m
[32m2024-04-09 13:08:59.191[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m125[0m - [1mTARGET_RES = 0.1[0m
[32m2024-04-09 13:08:59.191[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m126[0m - [1mLOG_FILE = '/proj-soils/logs/rescale.log'[0m
[32m2024-04-09 13:08:59.191[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m129[0m - [1mStarted Programm[0m
[32m2024-04-09 13:08:59.431[0m | [1mINFO    [0m | [36m__main__[0m:[36mrescale_tif[0m:[36m55[0m - [1mscratch_20200324_1038_12501_0_33-34-35_aoi37.tif is already at target resolution[0m
[32m2024-04-09 13:09:14.663[0m | [1mINFO    [0m | [36m__main__[

### 3. Cut tiff to grid

In [4]:
# Run the grid-cutting script with the scratch config ({config_train_scratch} is interpolated from Python).
! python utilities/cut_tiff_to_grid.py --config_file {config_train_scratch}

[32m2024-04-09 13:09:20.760[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m262[0m - [1mTIFF_FOLDER = '/proj-soils/data/scratch/2-10cm'[0m
[32m2024-04-09 13:09:20.762[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m263[0m - [1mOUT_FOLDER = '/proj-soils/data/scratch/3-cut-to-grid/10cm/512px'[0m
[32m2024-04-09 13:09:20.762[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m264[0m - [1mGRID_PATH = '/proj-soils/data/grids/recursive_grids_max204-8m_51-2m.gpkg'[0m
[32m2024-04-09 13:09:20.763[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m265[0m - [1mGRID_QUERY = 'depth == 0'[0m
[32m2024-04-09 13:09:20.763[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m266[0m - [1mCELL_LENGTH = 512[0m
[32m2024-04-09 13:09:20.763[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m267[0m - [1mID_COLUMN = 'str_ids'[0m
[32m2024-04-09 13:09:20.763[0m | [1mINFO    [0m | [36m__main__[

## Create Dataset

In [6]:
# Run the train/validation split script.
# NOTE(review): unlike the other steps, the config is given as a path relative to the
# working directory instead of via {config_train_gt}; confirm whether both refer to the same file.
! python utilities/random_split.py --config_file ../config/train/config-train_gt-10cm.yaml

[32m2025-03-26 16:02:48.635[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m160[0m - [1mSOURCE_IPT_FOLDER = '/proj-soils/data_vm_bis/aoi_train2/im/1-ipt-10cm'[0m
[32m2025-03-26 16:02:48.635[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m161[0m - [1mSOURCE_TGT_FOLDER = '/proj-soils/data_vm_bis/aoi_train2/gt/3-opt/'[0m
[32m2025-03-26 16:02:48.635[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m162[0m - [1mTARGET_ROOT = '/proj-soils/data_vm_bis/aoi_train2/dataset/'[0m
[32m2025-03-26 16:02:48.635[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m163[0m - [1mSEED = 6[0m
[32m2025-03-26 16:02:48.636[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m164[0m - [1mSPLIT_FILE = None[0m
[32m2025-03-26 16:02:48.636[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m165[0m - [1mLOG_FILE = '/proj-soils/logs/random_split.log'[0m
[32m2025-03-26 16:02:48.636[0m | [1mINFO    [0