In [None]:
## Load data

In [None]:
import geopandas as gpd
import numpy as np
import pickle

# Load handcrafted features (one row per mesh cell).
mesh = gpd.read_file("/path/to/your/data/data.gpkg")

# Label assignment.
# A cell is a labelling candidate when it passes the vegetation / ghsl / osm
# thresholds below. Among candidates, favelas > 0.9 -> label 1 and
# favelas == 0 -> label 0; every other cell gets NaN and is discarded.
candidate = (
    (mesh["vegetation"] <= 0.95)
    & (mesh["ghsl"] >= 0.5)
    & (mesh["osm"] <= 0.5)
)
mesh["label"] = np.where(
    candidate & (mesh["favelas"] > 0.9),
    1,
    np.where(candidate & (mesh["favelas"] == 0), 0, np.nan),
)

# Filter valid labeled samples
dataset = mesh[mesh["label"].notna()].copy()

# Load zones shapefile
zones = gpd.read_file("/path/to/your/data/zones.shp")

# Compute centroids and perform spatial join with zones
dataset["centroid"] = dataset.geometry.centroid
points_zones = gpd.sjoin(
    dataset.set_geometry("centroid"),
    zones[["fid", "geometry"]],
    how="left",
    predicate="within",
)
# A left join emits one row per match, so a centroid lying inside several
# (overlapping) zones would appear more than once with the same index and make
# the index-aligned assignment below fail. Keep the first match per cell.
points_zones = points_zones[~points_zones.index.duplicated(keep="first")]
dataset["zone"] = points_zones["fid"]
dataset = dataset.drop(columns=["centroid"])
# .copy() so that the column assignment further down writes to an independent
# frame instead of a slice (avoids SettingWithCopyWarning).
dataset = dataset[dataset["zone"].notna()].copy()

# Load precomputed deep features
# NOTE(security): pickle.load can execute arbitrary code; only open files that
# come from a trusted source.
with open("/path/to/your/data/deep_features.pkl", "rb") as f:
    features, ids = pickle.load(f)

# Ensure ID type consistency: cast the pickled ids to the type used in the
# "id" column so that the lookup below can match them.
id_type = type(dataset["id"].iloc[0])
ids = [id_type(i) for i in ids]

# Map deep learning features to dataset entries
id_to_features = dict(zip(ids, features))
dataset["dl_features"] = dataset["id"].map(id_to_features)

# Cells whose id is absent from the pickle map to NaN; stacking those with real
# feature vectors later would fail, so drop them here. If nothing matched at
# all, the ids are most likely inconsistent and continuing would be pointless.
has_dl_features = dataset["dl_features"].notna()
if not has_dl_features.any():
    raise ValueError(
        "No dataset id matched the ids stored in deep_features.pkl; "
        "check that both files use the same identifiers."
    )
dataset = dataset[has_dl_features].copy()

In [None]:
## Cross-validation: single view with deep features

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import f1_score, precision_score, recall_score, cohen_kappa_score

# Leave-one-zone-out cross-validation of a random forest trained on the deep
# features only. Each of the 10 repetitions draws a fresh class-balanced
# subsample per zone; every zone is then used once as the test fold.

# Seed the global NumPy RNG so the experiment is reproducible after a kernel
# restart: resample(), np.random.permutation() and RandomForestClassifier()
# all fall back to this global state when no random_state is given.
SEED = 42
np.random.seed(SEED)

zones = dataset['zone'].unique().tolist()
zones.reverse()

# One list of per-repetition scores for each zone (indexed like `zones`).
f1_scores = [[] for _ in range(len(zones))]
precision_scores = [[] for _ in range(len(zones))]
recall_scores = [[] for _ in range(len(zones))]
kappa_scores = [[] for _ in range(len(zones))]

for _ in range(10):
    # Build one balanced, shuffled fold per zone.
    folds = []
    for z in zones:
        dataset_zone = dataset[dataset['zone'] == z]
        X = np.array(dataset_zone['dl_features'].tolist())
        y = dataset_zone['label'].values

        class_0 = X[y == 0]
        class_1 = X[y == 1]

        # Down-sample the larger class to the size of the smaller one.
        n_per_class = min(len(class_0), len(class_1))
        if n_per_class == 0:
            # An empty fold would only fail later, inside predict(), with a
            # much less helpful message.
            raise ValueError(f"Zone {z} has no samples for one of the classes; cannot build a balanced fold.")
        class_0 = resample(class_0, replace=False, n_samples=n_per_class)
        class_1 = resample(class_1, replace=False, n_samples=n_per_class)

        X_balanced = np.vstack([class_0, class_1])
        y_balanced = np.hstack([np.zeros(n_per_class), np.ones(n_per_class)])

        p = np.random.permutation(len(y_balanced))
        folds.append([X_balanced[p], y_balanced[p]])

    # Hold out each zone in turn and train on all the others.
    for i in range(len(folds)):
        X_test, y_test = folds[i]

        X_train = np.vstack([fold[0] for j, fold in enumerate(folds) if j != i])
        y_train = np.hstack([fold[1] for j, fold in enumerate(folds) if j != i])

        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        precision_scores[i].append(precision_score(y_test, y_pred))
        recall_scores[i].append(recall_score(y_test, y_pred))
        f1_scores[i].append(f1_score(y_test, y_pred))
        kappa_scores[i].append(cohen_kappa_score(y_test, y_pred))

metrics = [
    ("Precision", precision_scores),
    ("Recall", recall_scores),
    ("F1-score", f1_scores),
    ("Kappa", kappa_scores),
]

# Per-zone mean and standard deviation over the repetitions.
for i in range(len(zones)):
    for name, scores in metrics:
        print(f"{name} zone {i+1}: {np.mean(scores[i]):.2f} +/- {np.std(scores[i]):.2f}")
    print()

# Overall figures: mean and standard deviation of the per-zone means.
for name, scores in metrics:
    zone_means = [np.mean(s) for s in scores]
    print(f"{name}: {np.mean(zone_means):.2f} +/- {np.std(zone_means):.2f}")

In [None]:
## Cross-validation: early fusion baseline

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import f1_score, precision_score, recall_score, cohen_kappa_score

# Leave-one-zone-out cross-validation of the early-fusion baseline: the
# handcrafted columns and the deep features are concatenated into a single
# feature vector and fed to one random forest. Each of the 10 repetitions draws
# a fresh class-balanced subsample per zone.

# Seed the global NumPy RNG so the experiment is reproducible after a kernel
# restart: resample(), np.random.permutation() and RandomForestClassifier()
# all fall back to this global state when no random_state is given.
SEED = 42
np.random.seed(SEED)

zones = dataset['zone'].unique().tolist()
zones.reverse()

# Handcrafted feature columns of the mesh.
f_cols = ['slope', 'profile_co', 'nodes', 'roads', 'mean_conne', 'min_connex', 'max_connex']

# One list of per-repetition scores for each zone (indexed like `zones`).
f1_scores = [[] for _ in range(len(zones))]
precision_scores = [[] for _ in range(len(zones))]
recall_scores = [[] for _ in range(len(zones))]
kappa_scores = [[] for _ in range(len(zones))]

for _ in range(10):
    # Build one balanced, shuffled fold per zone.
    folds = []
    for z in zones:
        dataset_zone = dataset[dataset['zone'] == z]
        # Early fusion: handcrafted columns first, deep features after.
        X = np.hstack(
            (dataset_zone[f_cols].values, np.array(dataset_zone['dl_features'].tolist()))
        )
        y = dataset_zone['label'].values

        class_0 = X[y == 0]
        class_1 = X[y == 1]

        # Down-sample the larger class to the size of the smaller one.
        n_per_class = min(len(class_0), len(class_1))
        if n_per_class == 0:
            # An empty fold would only fail later, inside predict(), with a
            # much less helpful message.
            raise ValueError(f"Zone {z} has no samples for one of the classes; cannot build a balanced fold.")
        class_0 = resample(class_0, replace=False, n_samples=n_per_class)
        class_1 = resample(class_1, replace=False, n_samples=n_per_class)

        X_balanced = np.vstack([class_0, class_1])
        y_balanced = np.hstack([np.zeros(n_per_class), np.ones(n_per_class)])

        p = np.random.permutation(len(y_balanced))
        folds.append([X_balanced[p], y_balanced[p]])

    # Hold out each zone in turn and train on all the others.
    for i in range(len(folds)):
        X_test, y_test = folds[i]

        X_train = np.vstack([fold[0] for j, fold in enumerate(folds) if j != i])
        y_train = np.hstack([fold[1] for j, fold in enumerate(folds) if j != i])

        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)

        precision_scores[i].append(precision_score(y_test, y_pred))
        recall_scores[i].append(recall_score(y_test, y_pred))
        f1_scores[i].append(f1_score(y_test, y_pred))
        kappa_scores[i].append(cohen_kappa_score(y_test, y_pred))

metrics = [
    ("Precision", precision_scores),
    ("Recall", recall_scores),
    ("F1-score", f1_scores),
    ("Kappa", kappa_scores),
]

# Per-zone mean and standard deviation over the repetitions.
for i in range(len(zones)):
    for name, scores in metrics:
        print(f"{name} zone {i+1}: {np.mean(scores[i]):.2f} +/- {np.std(scores[i]):.2f}")
    print()

# Overall figures: mean and standard deviation of the per-zone means.
for name, scores in metrics:
    zone_means = [np.mean(s) for s in scores]
    print(f"{name}: {np.mean(zone_means):.2f} +/- {np.std(zone_means):.2f}")

In [None]:
## Cross-validation: late fusion (MUSICA)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.metrics import f1_score, precision_score, recall_score, cohen_kappa_score

# Leave-one-zone-out cross-validation of the late-fusion model: one random
# forest is trained on the handcrafted columns and another on the deep
# features; their votes are combined per sample, weighted by how far each
# model's class probability is from 0.5. Each of the 10 repetitions draws a
# fresh class-balanced subsample per zone.

# Seed the global NumPy RNG so the experiment is reproducible after a kernel
# restart: resample(), np.random.permutation() and RandomForestClassifier()
# all fall back to this global state when no random_state is given.
SEED = 42
np.random.seed(SEED)

zones = dataset['zone'].unique().tolist()
zones.reverse()

# Handcrafted feature columns of the mesh.
f_cols = ['slope', 'profile_co', 'nodes', 'roads', 'mean_conne', 'min_connex', 'max_connex']

# One list of per-repetition scores for each zone (indexed like `zones`).
f1_scores = [[] for _ in range(len(zones))]
precision_scores = [[] for _ in range(len(zones))]
recall_scores = [[] for _ in range(len(zones))]
kappa_scores = [[] for _ in range(len(zones))]

# Per-zone, per-repetition hard predictions of each single-view model and the
# matching ground truth (consumed by the visualization cell).
h_preds = [[] for _ in range(len(zones))]
dl_preds = [[] for _ in range(len(zones))]
labels = [[] for _ in range(len(zones))]

for _ in range(10):
    h_folds = []
    dl_folds = []
    for z in zones:
        dataset_zone = dataset[dataset['zone'] == z]
        handcrafted_X = dataset_zone[f_cols].values
        dl_X = np.array(dataset_zone['dl_features'].tolist())
        y = dataset_zone['label'].values

        # Balance the classes by sampling ROW INDICES once and applying them to
        # both views, so the handcrafted and deep folds contain exactly the
        # same samples in the same order by construction.
        idx_0 = np.where(y == 0)[0]
        idx_1 = np.where(y == 1)[0]
        n_per_class = min(len(idx_0), len(idx_1))
        if n_per_class == 0:
            # An empty fold would only fail later, inside predict(), with a
            # much less helpful message.
            raise ValueError(f"Zone {z} has no samples for one of the classes; cannot build a balanced fold.")
        keep = np.concatenate([
            resample(idx_0, replace=False, n_samples=n_per_class),
            resample(idx_1, replace=False, n_samples=n_per_class),
        ])
        keep = np.random.permutation(keep)

        y_balanced = y[keep].astype(float)
        h_folds.append([handcrafted_X[keep], y_balanced])
        dl_folds.append([dl_X[keep], y_balanced])

    for i in range(len(h_folds)):
        # Both views share the same labels for fold i.
        y_test = h_folds[i][1]

        # Train handcrafted model
        X_test = h_folds[i][0]
        X_train = np.vstack([fold[0] for j, fold in enumerate(h_folds) if j != i])
        y_train = np.hstack([fold[1] for j, fold in enumerate(h_folds) if j != i])
        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)
        h_y_pred = clf.predict(X_test)
        # Column 0 of predict_proba is the probability of the first entry of
        # clf.classes_, i.e. label 0.
        h_y_probas = clf.predict_proba(X_test)[:, 0]

        # Train deep features model
        X_test = dl_folds[i][0]
        X_train = np.vstack([fold[0] for j, fold in enumerate(dl_folds) if j != i])
        y_train = np.hstack([fold[1] for j, fold in enumerate(dl_folds) if j != i])
        clf = RandomForestClassifier()
        clf.fit(X_train, y_train)
        dl_y_pred = clf.predict(X_test)
        dl_y_probas = clf.predict_proba(X_test)[:, 0]

        labels[i].append(y_test)
        h_preds[i].append(h_y_pred)
        dl_preds[i].append(dl_y_pred)

        # Late fusion
        # Each model's weight is its distance from an undecided 0.5
        # probability. epsilon is added OUTSIDE the absolute value so the
        # weights are strictly positive and the normalisation below can never
        # divide by zero.
        epsilon = 1e-10
        w_h = np.abs(0.5 - h_y_probas) + epsilon
        w_dl = np.abs(0.5 - dl_y_probas) + epsilon
        w_total = w_h + w_dl

        # Recode the hard predictions as votes in {-1, +1} without touching the
        # arrays stored in h_preds / dl_preds.
        h_vote = np.where(h_y_pred == 0, -1, 1)
        dl_vote = np.where(dl_y_pred == 0, -1, 1)
        pred_combination = (w_h * h_vote + w_dl * dl_vote) / w_total
        # A combined vote of exactly 0 (perfect tie) is resolved as class 1.
        y_pred = (pred_combination >= 0).astype(int)

        precision_scores[i].append(precision_score(y_test, y_pred))
        recall_scores[i].append(recall_score(y_test, y_pred))
        f1_scores[i].append(f1_score(y_test, y_pred))
        kappa_scores[i].append(cohen_kappa_score(y_test, y_pred))

metrics = [
    ("Precision", precision_scores),
    ("Recall", recall_scores),
    ("F1-score", f1_scores),
    ("Kappa", kappa_scores),
]

# Per-zone mean and standard deviation over the repetitions.
for i in range(len(zones)):
    for name, scores in metrics:
        print(f"{name} zone {i+1}: {np.mean(scores[i]):.2f} +/- {np.std(scores[i]):.2f}")
    print()

# Overall figures: mean and standard deviation of the per-zone means.
for name, scores in metrics:
    zone_means = [np.mean(s) for s in scores]
    print(f"{name}: {np.mean(zone_means):.2f} +/- {np.std(zone_means):.2f}")

In [None]:
## Visualization of prediction-label distributions

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

def _bubble_coords(preds, truth):
    """Collapse (prediction, label) pairs into unique points.

    Returns the x (prediction) and y (label) coordinates of every distinct
    pair together with a marker size equal to 300 times its frequency.
    """
    pairs, freq = np.unique(np.column_stack((preds, truth)), axis=0, return_counts=True)
    return pairs[:, 0], pairs[:, 1], freq * 300


# Left panel: zone index 1, right panel: zone index 0; in both cases the first
# repetition of the late-fusion run is shown. Only the left panel's scatter
# artists carry labels.
panel_specs = [
    (1, {'label': 'C'}, {'label': 'A'}),
    (0, {}, {}),
]

fig, axs = plt.subplots(1, 2, figsize=(8, 4))

for ax, (zone_idx, dl_extra, h_extra) in zip(axs, panel_specs):
    truth = labels[zone_idx][0]
    dl_x, dl_y, dl_sizes = _bubble_coords(dl_preds[zone_idx][0], truth)
    h_x, h_y, h_sizes = _bubble_coords(h_preds[zone_idx][0], truth)

    # Deep features: filled blue discs. Handcrafted features: hollow green rings.
    ax.scatter(dl_x, dl_y, s=dl_sizes, color='#1f77b4', alpha=1, edgecolors="#1f77b4", **dl_extra)
    ax.scatter(
        h_x, h_y, s=h_sizes,
        color='#2ca02c', facecolors='none', edgecolors="#2ca02c", linewidths=1.5,
        **h_extra,
    )

    ax.set_xticks([0, 1])
    ax.set_xticklabels(['F', 'I'], fontsize=16)
    ax.set_yticks([0, 1])
    ax.set_yticklabels(['F', 'I'], fontsize=16)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xlabel('Prediction', fontsize=16, labelpad=-10)
    ax.set_ylabel('Label', fontsize=16, labelpad=-10)
    ax.set_aspect('equal')
    ax.tick_params(axis="both", which="both", labelsize=16)

# Hand-built legend entries, so the legend markers do not depend on the data-driven sizes.
legend_elements = [
    Line2D([1], [1], marker='o', color='w', label='Deep features', markerfacecolor='#1f77b4', markersize=18),
    Line2D([1], [1], marker='o', color="#2ca02c", label='Handcrafted features', markerfacecolor='none', markersize=16, markeredgewidth=1.5),
]

fig.legend(handles=legend_elements, loc="lower center", bbox_to_anchor=(0.5, -0.15), fontsize=16, ncol=2)

plt.savefig("comp.png", dpi=300, bbox_inches="tight")
plt.show()