# Prepare GT for quantitative evaluation

## 1. Clean GT

- Clean up spelling mistakes
- Merge classes


In [35]:
import geopandas as gpd

import difflib
import os
import numpy as np

from shapely.validation import explain_validity

In [36]:
source_dir = "../data/0-GT-BS/0-0-almost_raw/"
target_dir = "../data/0-GT-BS/0-cleaned"

if not os.path.exists(target_dir):
    os.makedirs(target_dir)
    print(f"The directory {target_dir} was created.")

The directory ../data/0-GT-BS/0-cleaned was created.


Get set (all_cats) of all the different class strings present

In [37]:
nancount = 0

all_cats = set()
for root, dir, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".gpkg"):
                continue
                
            gt = gpd.read_file(os.path.join(root, file))
            # for cat in list(gt["CLASSE_SEN"]):
            for row in gt.iterrows():
                cat = row[1]["CLASSE_SEN"]
                all_cats.add(cat)

                if not isinstance(cat, str):
                    nancount += 1
                    assert row[1]["geometry"] is None
                
                if not explain_validity(row[1]["geometry"]):
                    print(file)

                
print(f'{nancount = }')

nancount = 0


In [38]:
print(f'{all_cats = }')

all_cats = {'eau_naturelle', 'sol_divers', 'batiment', 'surface_non_beton', 'sol_vegetalise', 'surface_beton'}


### Get rid of typos and merge classes that are not present in IGN classes
Use difflibs get_close_matches() to get close matches to the "real" 
categories and store them in as dictionary {(false) string: true string}

In [14]:
real_cats = ["batiment", "eau_bassin", "eau_naturelle", "roche_dure_meuble",
    "roseliere", "serre_permanente", "sol_agricole", "sol_bache", "sol_divers",
    "sol_neige", "sol_serre_temporaire", "sol_vegetalise", "sol_vigne",
    "surface_beton", "surface_non_beton", "surface_riparienne", "toit_vegetalise"]

all_cats = [cat for cat in all_cats if isinstance(cat, str)] 
all_cats = list(all_cats)

if len(all_cats) == len(real_cats):
    print("Classses seem to be consistent, no string matching required") # CM: why this test insure that there is no typos ?
    clean_dic = {cat: cat for cat in all_cats}

else:
    # filter out nans (are floats)
    found_cats = []

    clean_dic = {}
    for real_cat in real_cats:
        
        matches = difflib.get_close_matches(real_cat, all_cats, n=5, cutoff=0.8)
        for match in matches:
            clean_dic[match] = real_cat

        found_cats.extend(matches)

    # assert that for all cats exactly one match has been found
    assert len(found_cats) == len(all_cats)

AssertionError: 

In [None]:
# merge_dic = {
#     "toit_vegetalise": "batiment",
#     "roseliere": "eau",
#     "terrain_de_sport": "terre_vegetalisee",
#     "infrastructure": "revetement_impermeable"
# }

# merge_dic = {
#     "toit_vegetalise": "batiment",
#     "roche_dure_meuble": "surface_non_beton",
#     "roseliere": "eau_naturelle",
#     "sol_serre_temporaire": "serre_permanente",
#     "sol_bache": "sol_agricole",
#     "surface_riparienne": "sol_vegetalise"
#     # "terrain_de_sport": "sol_vegetalise",
#     # "eau_bassin": "eau_naturelle",
# }

Add new column with correct strings and merged classes

In [39]:
# def clean(x):
#     new = []
#     for row in x["CLASSE_SEN"]:
#         if isinstance(row, float): # in case of nan (is a float)
#             new.append("unknown")
#             continue
#         new.append(clean_dic[row])
#     return new

# def merge(x):
#     new = []
#     # for row in x["CLEAN_CLASSE_SEN"]:
#     for row in x["CLASSE_SEN"]:
#         if row in merge_dic:
#             new.append(merge_dic[row])
#         else:
#             new.append(row)
#     return new

class_mapping = {
    np.nan: 0,
    "batiment": 1,
    "toit_vegetalise": 2,
    "surface_non_beton": 3,
    "surface_beton": 4,
    "eau_bassin": 5,
    "roche_dure_meuble": 6,
    "eau_naturelle": 7,
    "roseliere": 8,
    "sol_neige": 9,
    "sol_vegetalise": 10,
    "surface_riparienne": 11,
    "sol_divers": 12,
    "sol_vigne": 13,
    "sol_agricole": 14,
    "sol_bache": 15,
    "sol_serre_temporaire": 16,
    "serre_permanente": 17
}
package_mapping = {
    0: 0, 1: 1, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 6, 9: 7, 10: 8,
    11: 8, 12: 8, 13: 9, 14: 10, 15: 10, 16: 10, 17: 11
}

soil_classes = [9, 10, 12, 13, 14, 15, 16]

for root, dir, files in os.walk(source_dir):
        for file in files:
            if not file.endswith(".gpkg"):
                continue
            
            # print(os.path.join(root, file))
            gt = gpd.read_file(os.path.join(root, file))

            # gt["CLEAN_CLASSE_SEN"] = clean(gt)
            # gt["CLEAN_CLASSE_SEN"] = merge(gt)
            
            gt["CLASSE_SEN_ID"] = gt.apply(
                lambda x: class_mapping[x["CLASSE_SEN"]], axis=1)
            gt["SOIL"] = gt.apply(
                lambda x: x["CLASSE_SEN_ID"] in soil_classes, axis=1)
            gt["package_id"] = gt.apply(
                lambda x: package_mapping[x["CLASSE_SEN_ID"]], axis=1)

            
            gt["geometry"] = gt.make_valid()

            gt = gt.dropna(subset = ["CLASSE_SEN"])
            gt = gt.explode(index_parts=False)
            gt = gt.loc[gt["geometry"].geom_type=='Polygon']
            gt.to_file(os.path.join(target_dir, file.rstrip(".shp")+".gpkg"))

In [40]:
gt.head()

Unnamed: 0,OBJID,ART_TXT,class_name,CLASSE_SEN,CLASSE_SEN_ID,SOIL,package_id,geometry
0,1,Building,batiment,batiment,1,False,1,"POLYGON ((2611035.276 1267869.836, 2611015.434..."
1,2,Vegetated Soil,sol_vegetatlise,sol_vegetalise,10,True,8,"POLYGON ((2611071.409 1267894.394, 2611078.314..."
2,3,Vegetated Soil,sol_vegetatlise,sol_vegetalise,10,True,8,"POLYGON ((2611084.509 1267900.341, 2611088.623..."
3,4,Building,batiment,batiment,1,False,1,"POLYGON ((2611105.582 1267891.213, 2611110.335..."
4,5,Vegetated Soil,sol_vegetatlise,sol_vegetalise,10,True,8,"POLYGON ((2611106.268 1267891.516, 2611107.127..."


# 2. Rasterize

In [41]:
! python utilities/rasterize_gt.py --config_file ../config/eval/config-eval_gt.yaml

[32m2024-10-08 14:12:54.110[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m125[0m - [1mLOG_FILE = '/proj-soils/logs/rasterize_gt.log'[0m
[32m2024-10-08 14:12:54.110[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m126[0m - [1mPOLYGON_FOLDER = '/proj-soils/data/0-GT-BS/0-cleaned/'[0m
[32m2024-10-08 14:12:54.110[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m127[0m - [1mFIELD = 'CLASSE_SEN'[0m
[32m2024-10-08 14:12:54.110[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m128[0m - [1mOUT_TIFF_FOLDER = '/proj-soils/data_vm_bis/'[0m
[32m2024-10-08 14:12:54.111[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m129[0m - [1mCLASS_MAPPING = {'batiment': 1, 'toit_vegetalise': 2, 'surface_non_beton': 3, 'surface_beton': 4, 'eau_bassin': 5, 'roche_dure_meuble': 6, 'eau_naturelle': 7, 'roseliere': 8, 'sol_neige': 9, 'sol_vegetalise': 10, 'surface_riparienne': 11, 'sol_divers': 12, 'sol_vigne': 1

# 3. Reclassify

In [None]:
# Reclassify carefully depending on what future evaluation has to be done !

In [51]:
! python utilities/reclassify.py --config_file ../config/eval/config-eval_gt.yaml

[32m2024-10-08 14:39:24.455[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m84[0m - [1mSOURCE_FOLDER = '/proj-soils/data_vm_bis/aoi_basel/1-rasterized/'[0m
[32m2024-10-08 14:39:24.456[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m85[0m - [1mTARGET_FOLDER = '/proj-soils/data_vm_bis/aoi_basel/2-reclassified/'[0m
[32m2024-10-08 14:39:24.456[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m86[0m - [1mMAPPING = {0: 0, 1: 1, 2: 1, 3: 2, 4: 3, 5: 5, 6: 4, 7: 5, 8: 6, 9: 7, 10: 8, 11: 8, 12: 9, 13: 10, 14: 11, 15: 12, 16: 12, 17: 1}[0m
[32m2024-10-08 14:39:24.456[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m87[0m - [1mLOG_FILE = '/proj-soils/logs/reclassify.log'[0m
[32m2024-10-08 14:39:24.456[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m89[0m - [1mStarted Programm[0m
[32m2024-10-08 14:39:24.456[0m | [1mINFO    [0m | [36m__main__[0m:[36mreclassify[0m:[36m38[0m - [1mRec

# 4. Cut tiff to predefined grid of Daniel

In [53]:
! python utilities/cut_tiff_to_grid.py --config_file ../config/eval/config-eval_gt.yaml

[32m2024-10-08 14:40:34.110[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m263[0m - [1mTIFF_FOLDER = '/proj-soils/data_vm_bis/aoi_basel/2-reclassified/'[0m
[32m2024-10-08 14:40:34.110[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m264[0m - [1mOUT_FOLDER = '/proj-soils/data_vm_bis/aoi_basel/3-cut/'[0m
[32m2024-10-08 14:40:34.111[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m265[0m - [1mGRID_PATH = '/proj-soils/data_vm_bis/aoi_basel/tiles_51.2_0_0.shp'[0m
[32m2024-10-08 14:40:34.111[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m266[0m - [1mGRID_QUERY = None[0m
[32m2024-10-08 14:40:34.111[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m267[0m - [1mCELL_LENGTH = 512[0m
[32m2024-10-08 14:40:34.111[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m268[0m - [1mID_COLUMN = 'RN'[0m
[32m2024-10-08 14:40:34.111[0m | [1mINFO    [0m | [36m__main__[0m:[36m<mo

# 5. Rescale GT ?

# 6. Addition by CM

In [54]:
! python utilities/cut_border.py --config_file ../config/config-utilities.yaml

In [6]:
! python utilities/calculate_metrics.py --config_file ../config/eval/config-eval_heigvd-145k_mixed-10cm.yaml

[32m2024-10-08 15:08:27.704[0m |                    [1mINFO    [0m | [36m<module>[0m                    :[36m617[0m - [1mPRED_FOLDER = '/proj-soils/data_vm_bis/aoi_basel/cut/'[0m
[32m2024-10-08 15:08:27.704[0m |                    [1mINFO    [0m | [36m<module>[0m                    :[36m618[0m - [1mGT_FOLDER = '/proj-soils/data_vm_bis/aoi_basel/4-cut-border/'[0m
[32m2024-10-08 15:08:27.705[0m |                    [1mINFO    [0m | [36m<module>[0m                    :[36m619[0m - [1mCONF_MATRIX_MODEL = ''[0m
[32m2024-10-08 15:08:27.705[0m |                    [1mINFO    [0m | [36m<module>[0m                    :[36m620[0m - [1mCLASSES = ['batiment', 'surface_non_beton', 'surface_beton', 'roche_dure_meuble', 'eau_naturelle', 'roseliere', 'sol_neige', 'sol_vegetalise', 'sol_divers', 'sol_vigne', 'sol_agricole', 'sol_bache'][0m
[32m2024-10-08 15:08:27.705[0m |                    [1mINFO    [0m | [36m<module>[0m                    :[36m621[0m -