# Prepare GT for quantitative evaluation

## 1. Clean GT

- Clean up spelling mistakes
- Merge classes


In [None]:
import geopandas as gpd

import difflib
import os
import numpy as np

from shapely.validation import explain_validity

In [None]:
# Folder tree that is walked for the almost-raw ground-truth shapefiles.
source_dir = "../data/GT/20231004/0-0-almost_raw/"
# Folder the cleaned layers are written to.
target_dir = "../data/GT/20231004/0-cleaned"

# Create the output folder on first use and say so; stay silent otherwise.
target_is_missing = not os.path.exists(target_dir)
if target_is_missing:
    os.makedirs(target_dir)
    print(f"The directory {target_dir} was created.")

Collect the set (`all_cats`) of all distinct class strings present in the source shapefiles.

In [None]:
# Scan every shapefile below source_dir and collect
#   * all_cats: the distinct raw values of the "CLASSE_SEN" column
#     (missing values included),
#   * nancount: the number of features whose class is not a string.
# Files containing an invalid geometry are reported together with the reason
# given by shapely.
nancount = 0
all_cats = set()

for root, _dirs, files in os.walk(source_dir):
    for file in files:
        if not file.endswith(".shp"):
            continue

        gt = gpd.read_file(os.path.join(root, file))
        for cat, geom in zip(gt["CLASSE_SEN"], gt["geometry"]):
            all_cats.add(cat)

            if not isinstance(cat, str):
                nancount += 1
                # Sanity check on the data: features without a class label
                # are expected to carry no geometry either.
                assert geom is None, (
                    f"{file}: feature without CLASSE_SEN has a geometry"
                )

            # explain_validity() returns a non-empty message even for valid
            # geometries, so its result cannot be used as a boolean; test
            # is_valid and only use the message for the report. Features
            # without geometry are skipped.
            if geom is not None and not geom.is_valid:
                print(f"{file}: {explain_validity(geom)}")

print(f'{nancount = }')

In [None]:
# Show every distinct raw class value that was collected (name = repr).
print(f"all_cats = {all_cats!r}")

### Get rid of typos and merge classes that are not present in IGN classes
Use `difflib.get_close_matches()` to match the class strings found in the data
against the "real" categories, and store the result as a dictionary `{found (possibly misspelled) string: real string}`.

In [None]:
# Reference class names; every label found in the data must map onto one.
real_cats = ["batiment", "eau_bassin", "eau_naturelle", "roche_dure_meuble",
    "roseliere", "serre_permanente", "sol_agricole", "sol_bache", "sol_divers",
    "sol_neige", "sol_serre_temporaire", "sol_vegetalise", "sol_vigne",
    "surface_beton", "surface_non_beton", "surface_riparienne", "toit_vegetalise"]

# Keep only genuine labels; missing values are not strings and are dropped.
all_cats = [cat for cat in all_cats if isinstance(cat, str)]

# clean_dic maps every label found in the data to its reference spelling.
if set(all_cats) <= set(real_cats):
    # Every found label is literally one of the reference names. Comparing
    # only the number of labels would not prove this: a typo can replace a
    # correct name without changing the count.
    print("Classses seem to be consistent, no string matching required")
    clean_dic = {cat: cat for cat in all_cats}

else:
    clean_dic = {}
    unmatched = []

    for cat in all_cats:
        # Ask for the single best reference name per found label. Matching
        # the other way round (reference -> several found labels) lets
        # near-identical names such as "surface_beton" and
        # "surface_non_beton" claim each other's labels.
        best = difflib.get_close_matches(cat, real_cats, n=1, cutoff=0.8)
        if best:
            clean_dic[cat] = best[0]
        else:
            unmatched.append(cat)

    # Labels for which a reference name was found (kept for later inspection).
    found_cats = list(clean_dic)

    # Every found label must have been matched to exactly one reference name.
    assert not unmatched, f"No close match among real_cats for: {unmatched}"

In [None]:
# merge_dic = {
#     "toit_vegetalise": "batiment",
#     "roseliere": "eau",
#     "terrain_de_sport": "terre_vegetalisee",
#     "infrastructure": "revetement_impermeable"
# }

# merge_dic = {
#     "toit_vegetalise": "batiment",
#     "roche_dure_meuble": "surface_non_beton",
#     "roseliere": "eau_naturelle",
#     "sol_serre_temporaire": "serre_permanente",
#     "sol_bache": "sol_agricole",
#     "surface_riparienne": "sol_vegetalise"
#     # "terrain_de_sport": "sol_vegetalise",
#     # "eau_bassin": "eau_naturelle",
# }

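Add new columns with the numeric class id (`CLASSE_SEN_ID`), the soil flag (`SOIL`) and the merged class id (`package_id`), then write each cleaned layer to the target directory as a GeoPackage.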

In [None]:
# def clean(x):
#     new = []
#     for row in x["CLASSE_SEN"]:
#         if isinstance(row, float): # in case of nan (is a float)
#             new.append("unknown")
#             continue
#         new.append(clean_dic[row])
#     return new

# def merge(x):
#     new = []
#     # for row in x["CLEAN_CLASSE_SEN"]:
#     for row in x["CLASSE_SEN"]:
#         if row in merge_dic:
#             new.append(merge_dic[row])
#         else:
#             new.append(row)
#     return new

# Class label -> numeric class id. The id is the position of the label in the
# sequence below; the NaN entry (id 0) stands for a missing label.
class_mapping = {
    label: class_id
    for class_id, label in enumerate((
        np.nan,
        "batiment",
        "toit_vegetalise",
        "surface_non_beton",
        "surface_beton",
        "eau_bassin",
        "roche_dure_meuble",
        "eau_naturelle",
        "roseliere",
        "sol_neige",
        "sol_vegetalise",
        "surface_riparienne",
        "sol_divers",
        "sol_vigne",
        "sol_agricole",
        "sol_bache",
        "sol_serre_temporaire",
        "serre_permanente",
    ))
}

# Numeric class id -> package id. Each inner list holds the class ids that
# share one package; the package id is the position of that list.
package_mapping = {
    class_id: package_id
    for package_id, members in enumerate([
        [0],
        [1, 2],
        [3],
        [4],
        [5],
        [6],
        [7, 8],
        [9],
        [10, 11, 12],
        [13],
        [14, 15, 16],
        [17],
    ])
    for class_id in members
}

# Class ids that set the SOIL flag, listed by label.
soil_classes = [
    class_mapping[label]
    for label in (
        "sol_neige",
        "sol_vegetalise",
        "sol_divers",
        "sol_vigne",
        "sol_agricole",
        "sol_bache",
        "sol_serre_temporaire",
    )
]

# Convert every shapefile below source_dir into a cleaned GeoPackage in
# target_dir: add the numeric class id, the soil flag and the package id,
# repair geometries, drop unlabeled features and keep only Polygon parts.

# Id used for features without a usable class label. Looking up np.nan itself
# always succeeds, whereas a missing value read from a file may be None or a
# different NaN object and would raise KeyError as a dictionary key.
unknown_id = class_mapping[np.nan]

for root, _dirs, files in os.walk(source_dir):
    for file in files:
        if not file.endswith(".shp"):
            continue

        gt = gpd.read_file(os.path.join(root, file))

        # NOTE: labels are looked up as-is; the clean/merge helpers that are
        # commented out at the top of this cell are not applied, so an
        # unknown label raises KeyError here.
        gt["CLASSE_SEN_ID"] = [
            class_mapping[label] if isinstance(label, str) else unknown_id
            for label in gt["CLASSE_SEN"]
        ]
        gt["SOIL"] = gt["CLASSE_SEN_ID"].isin(soil_classes)
        gt["package_id"] = [
            package_mapping[class_id] for class_id in gt["CLASSE_SEN_ID"]
        ]

        gt["geometry"] = gt.make_valid()

        gt = gt.dropna(subset=["CLASSE_SEN"])
        # Split multi-part geometries into single parts, then keep polygons.
        gt = gt.explode(index_parts=False)
        gt = gt.loc[gt["geometry"].geom_type == 'Polygon']

        # str.rstrip(".shp") strips any trailing '.', 's', 'h', 'p' characters
        # (e.g. "maps.shp" -> "ma"); splitext removes just the extension.
        stem = os.path.splitext(file)[0]
        gt.to_file(os.path.join(target_dir, stem + ".gpkg"))

In [None]:
# Preview the first rows of `gt`, the most recently loaded layer.
gt.head()

## 2. Rasterize

In [None]:
# Shell command: run utilities/rasterize_gt.py with the eval-GT config file.
! python utilities/rasterize_gt.py --config_file ../config/config-eval_gt.yaml

## 3. Reclassify

In [None]:
# Shell command: run utilities/reclassify.py with the eval-GT config file.
! python utilities/reclassify.py --config_file ../config/config-eval_gt.yaml

## 4. Cut the TIFF to Daniel's predefined grid

In [None]:
# Shell command: run utilities/cut_tiff_to_grid.py with the eval-GT config file.
! python utilities/cut_tiff_to_grid.py --config_file ../config/config-eval_gt.yaml

## 5. Rescale GT