# **Photovoltaic panel segmentation on building facades**

Ayca Duran*, Pedram Mirabian, Panagiotis Karapiperis, Christoph Waibel,
Bernd Bickel and Arno Schlueter

## Step 9: Test Set Evaluation (Paper Results)

This notebook reproduces the results presented in the paper, specifically in Table 3, Table 4, Table 5 and Table 8.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# ----------------------------
# Global setup
# ----------------------------
from pathlib import Path

# Resolve the repository root. Inside Google Colab the project is read from
# the mounted Drive folder; anywhere else it is the parent of the current
# working directory.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable -> not running in Colab.
    repo_path = Path.cwd().parent
else:
    # Only the import is guarded: a failed or interrupted mount should surface
    # as an error instead of silently falling back to an unrelated local path.
    drive.mount('/content/drive', force_remount=False)
    repo_path = Path("/content/drive/MyDrive/PVFINDER")

# Result folders read by the evaluation cells below.
human_path = repo_path / "files" / "05_micro_results" / "human"
mrcnn_path = repo_path / "files" / "05_micro_results" / "maskrcnn"
segf_path = repo_path / "files" / "05_micro_results" / "segformer"

In [10]:
# ----------------------------
# Helpers
# ----------------------------
def safe_div(num, den, default=np.nan):
    """Divide ``num`` by ``den`` elementwise without divide-by-zero warnings.

    Args:
        num: Numerator; scalar or array-like, converted to a float array.
        den: Denominator; scalar or array-like, converted to a float array.
            ``num`` and ``den`` may have different shapes as long as they
            broadcast against each other.
        default: Value placed wherever ``den`` equals 0 (NaN by default).

    Returns:
        A float ndarray of the broadcast shape holding ``num / den``, with
        ``default`` at every position where ``den`` is 0.
    """
    num = np.asarray(num, dtype=float)
    den = np.asarray(den, dtype=float)
    # Size the output by the broadcast shape (not by ``num`` alone) so that a
    # scalar numerator combined with an array denominator works as well.
    out = np.full(np.broadcast(num, den).shape, default, dtype=float)
    # ``where`` skips the zero denominators, leaving ``default`` in those slots.
    np.divide(num, den, out=out, where=(den != 0))
    return out

def add_pixel_metrics(df):
    """Return a copy of ``df`` extended with per-row pixel metrics.

    Reads the ``TP_pixels``, ``FP_pixels``, ``FN_pixels`` and ``TN_pixels``
    columns and writes these columns on the copy:

      - ``gt_has_pv``   : TP + FN > 0
      - ``pred_has_pv`` : TP + FP > 0
      - ``PA``, ``Precision``, ``Recall``, ``F1``, ``IoU``

    Ratios with a zero denominator are NaN, with two exceptions: Precision is
    0 for rows where the ground truth has PV but the prediction has none, and
    IoU is 1 for rows where TP = FP = FN = 0.
    """
    tp, fp, fn, tn = (
        df[column].to_numpy(dtype=float)
        for column in ("TP_pixels", "FP_pixels", "FN_pixels", "TN_pixels")
    )

    gt_present = (tp + fn) > 0
    pred_present = (tp + fp) > 0

    # Missed PV images: precision would be 0/0, score it as 0 instead of NaN.
    precision = safe_div(tp, tp + fp)
    precision[gt_present & ~pred_present] = 0.0

    # Empty ground truth and empty prediction count as a perfect match.
    union = tp + fp + fn
    iou = np.where(union == 0, 1.0, safe_div(tp, union))

    out = df.copy()
    out["gt_has_pv"] = gt_present
    out["pred_has_pv"] = pred_present
    out["PA"] = safe_div(tp + tn, tp + tn + fp + fn)
    out["Precision"] = precision
    out["Recall"] = safe_div(tp, tp + fn)
    out["F1"] = safe_div(2 * tp, 2 * tp + fp + fn)
    out["IoU"] = iou
    return out

def image_level_confusion(df):
    """Count image-level TP/TN/FP/FN from per-row pixel counts.

    A row counts as ground-truth positive when TP + FN > 0 and as predicted
    positive when TP + FP > 0, i.e. when at least one PV pixel is present.

    Returns:
        pd.Series with integer entries ``TP``, ``TN``, ``FP``, ``FN``.
    """
    true_pos_px = df["TP_pixels"]
    has_gt = (true_pos_px + df["FN_pixels"]) > 0
    has_pred = (true_pos_px + df["FP_pixels"]) > 0

    cells = {
        "TP": has_gt & has_pred,
        "TN": ~has_gt & ~has_pred,
        "FP": ~has_gt & has_pred,
        "FN": has_gt & ~has_pred,
    }
    return pd.Series({label: int(mask.sum()) for label, mask in cells.items()})

def pixel_micro_metrics(df):
    """Compute micro (pooled) pixel metrics.

    The four pixel-count columns are summed over all rows of ``df`` first and
    the metrics are then derived from those totals.

    Returns:
        pd.Series with ``PA``, ``Precision``, ``Recall``, ``F1`` and ``IoU``;
        a metric whose denominator is 0 is NaN.
    """
    tp, fp, fn, tn = (
        float(df[column].sum())
        for column in ("TP_pixels", "FP_pixels", "FN_pixels", "TN_pixels")
    )

    def ratio(numerator, denominator):
        # A zero denominator yields NaN rather than raising.
        return numerator / denominator if denominator else np.nan

    return pd.Series({
        "PA": ratio(tp + tn, tp + tn + fp + fn),
        "Precision": ratio(tp, tp + fp),
        "Recall": ratio(tp, tp + fn),
        "F1": ratio(2 * tp, 2 * tp + fp + fn),
        "IoU": ratio(tp, tp + fp + fn),
    })
    
def iou_aggregations(df):
    """Summarise IoU in four ways, plus the image counts behind them.

    ``df`` must already carry an ``IoU`` column next to the pixel counts.
    "GT PV" rows are those with TP + FN > 0.

    Returns a pd.Series with:
      - mIoU_image_all:  mean of the per-row IoU over all rows
      - mIoU_image_gtpv: mean of the per-row IoU over GT PV rows (paper)
      - IoU_micro_all:   IoU of the pixel counts summed over all rows
      - IoU_micro_gtpv:  IoU of the pixel counts summed over GT PV rows
      - n_images_all / n_images_gtpv: number of distinct ``image`` values, or
        the row count when there is no ``image`` column
    """
    def pooled_iou(frame):
        # Sum the counts first, then take a single ratio (NaN if the union is 0).
        tp, fp, fn = (
            float(frame[column].sum())
            for column in ("TP_pixels", "FP_pixels", "FN_pixels")
        )
        union = tp + fp + fn
        return tp / union if union else np.nan

    def count_images(frame):
        if "image" in frame.columns:
            return int(frame["image"].nunique())
        return int(len(frame))

    gt_positive = df.loc[(df["TP_pixels"] + df["FN_pixels"]) > 0]

    summary = {
        "mIoU_image_all": df["IoU"].mean(),
        "mIoU_image_gtpv": gt_positive["IoU"].mean(),
        "IoU_micro_all": pooled_iou(df),
        "IoU_micro_gtpv": pooled_iou(gt_positive),
        "n_images_all": count_images(df),
        "n_images_gtpv": count_images(gt_positive),
    }
    return pd.Series(summary)

def evaluate_model(df):
    """Run the full evaluation on a table of per-image pixel counts.

    Returns a dict with:
      - ``confusion``:        image-level TP/TN/FP/FN counts
      - ``pixel_micro``:      metrics computed from summed pixel counts
      - ``pixel_macro_all``:  mean of PA/Precision/Recall/F1/IoU over all rows
      - ``pixel_macro_gtpv``: mean of Precision/Recall/F1/IoU over rows whose
        ground truth contains PV
      - ``iou_aggregations``: the IoU summaries from ``iou_aggregations``
      - ``n_images`` / ``n_images_gtpv``: distinct ``image`` values (row count
        when there is no ``image`` column) overall and among GT-PV rows
    """
    scored = add_pixel_metrics(df)
    gt_positive = scored[scored["gt_has_pv"]]

    def count_images(frame):
        if "image" in frame.columns:
            return frame["image"].nunique()
        return len(frame)

    # Macro over GT-PV rows only (the variant reported for Precision/Recall/F1/IoU).
    macro_gtpv = pd.Series({
        metric: gt_positive[metric].mean()
        for metric in ("Precision", "Recall", "F1", "IoU")
    })

    return {
        "confusion": image_level_confusion(scored),
        "pixel_micro": pixel_micro_metrics(scored),
        # Macro over all rows; kept mainly for PA / reference.
        "pixel_macro_all": scored[["PA", "Precision", "Recall", "F1", "IoU"]].mean(),
        "pixel_macro_gtpv": macro_gtpv,
        "iou_aggregations": iou_aggregations(scored),
        "n_images": count_images(scored),
        "n_images_gtpv": count_images(gt_positive),
    }

# Canonical pixel-count column names shared by every loader.
_PIXEL_COLUMNS = ["TP_pixels", "FP_pixels", "FN_pixels", "TN_pixels"]


def _load_pixel_counts(path, column_map):
    """Read a per-image result CSV and normalise it to the canonical columns.

    Args:
        path: Anything ``pd.read_csv`` accepts (path or file-like object).
        column_map: Mapping from the file's own column names to ``image`` and
            the four ``*_pixels`` names.

    Returns:
        DataFrame with exactly ``image`` plus the four pixel-count columns;
        counts that are missing or not numeric are replaced by 0.

    Raises:
        KeyError: If any expected column is absent after renaming.
    """
    df = pd.read_csv(path).rename(columns=column_map)

    wanted = ["image", *_PIXEL_COLUMNS]
    missing = [column for column in wanted if column not in df.columns]
    if missing:
        raise KeyError(f"{path}: missing expected column(s) after renaming: {missing}")

    # Ensure numeric (some rows may have empty strings); unparsable -> 0.
    for column in _PIXEL_COLUMNS:
        df[column] = pd.to_numeric(df[column], errors="coerce").fillna(0)

    return df[wanted]


def load_maskrcnn_csv(path):
    """Load a Mask R-CNN result CSV (``image_id``, ``tp_pixels``, ... columns)."""
    return _load_pixel_counts(path, {
        "image_id": "image",
        "tp_pixels": "TP_pixels",
        "fp_pixels": "FP_pixels",
        "fn_pixels": "FN_pixels",
        "tn_pixels": "TN_pixels",
    })


def load_segformer_csv(path):
    """Load a SegFormer result CSV (``pr_filename``, ``tp_green``, ... columns)."""
    return _load_pixel_counts(path, {
        "pr_filename": "image",
        "tp_green": "TP_pixels",
        "fp_orange": "FP_pixels",
        "fn_red": "FN_pixels",
        "tn_black": "TN_pixels",
    })

def check_for_nans(df):
    """Return the number of NaN entries in each metric column of ``df``."""
    metric_columns = ["Precision", "Recall", "F1", "IoU", "PA"]
    is_missing = df.loc[:, metric_columns].isna()
    return is_missing.sum()

def audit_empty_cases(df):
    """Count rows per TP/FP/FN pattern, as a sanity check on edge cases.

    Returns a pd.Series with integer entries:
      - ``empty_empty``: TP = FP = FN = 0
      - ``gt_only``:     TP = 0 and FN > 0
      - ``pred_only``:   TP = 0 and FP > 0
      - ``normal``:      TP > 0
      - ``total``:       number of rows

    A row with TP = 0, FP > 0 and FN > 0 is counted under both ``gt_only``
    and ``pred_only``, so the categories need not add up to ``total``.
    """
    no_overlap = df["TP_pixels"] == 0
    no_false_pos = df["FP_pixels"] == 0
    no_false_neg = df["FN_pixels"] == 0

    patterns = {
        "empty_empty": no_overlap & no_false_pos & no_false_neg,
        "gt_only": no_overlap & (df["FN_pixels"] > 0),
        "pred_only": (df["FP_pixels"] > 0) & no_overlap,
        "normal": df["TP_pixels"] > 0,
    }

    tally = {label: int(mask.sum()) for label, mask in patterns.items()}
    tally["total"] = int(len(df))
    return pd.Series(tally)

def evaluate_mean_annotator(df_workers: pd.DataFrame) -> dict:
    """Evaluate the "mean annotator" human baseline.

    Steps:
      1) add pixel metrics to every (image, worker) row;
      2) collapse to one row per image by averaging the four pixel counts
         across that image's workers, so each image counts equally;
      3) recompute the metrics from the averaged counts and run the same
         ``evaluate_model`` pipeline that is used for the models.

    Args:
        df_workers: One row per (image, worker) with columns
            ``image``, ``TP_pixels``, ``FP_pixels``, ``FN_pixels``, ``TN_pixels``.

    Returns:
        dict with ``per_image_df`` (the per-image table, handy for debugging)
        and ``results`` (same keys as ``evaluate_model``).
    """
    scored_rows = add_pixel_metrics(df_workers)

    # Mean of each pixel count within an image; gt_has_pv is identical for all
    # workers on the same image, so the first value is representative.
    aggregation = {
        column: (column, "mean")
        for column in ("TP_pixels", "FP_pixels", "FN_pixels", "TN_pixels")
    }
    aggregation["gt_has_pv"] = ("gt_has_pv", "first")
    averaged = scored_rows.groupby("image", as_index=False).agg(**aggregation)

    # Metrics are derived from the averaged counts to keep definitions consistent.
    per_image = add_pixel_metrics(averaged)

    return {
        "per_image_df": per_image,
        "results": evaluate_model(per_image),
    }
    
def print_results(name, res):
    """Print a text report for one ``evaluate_model`` result.

    Shows the image-level confusion counts, the GT-PV macro metrics and the
    micro metrics (both rounded to two decimals), then the image count.
    ``res["pixel_macro_all"]`` and ``res["iou_aggregations"]`` are not printed.
    """
    print(f"\n=== {name} ===")

    # (heading, key in ``res``, round to 2 decimals?)
    sections = (
        ("\nImage-level confusion matrix:", "confusion", False),
        ("\nPixel-level metrics (MACRO; per-image mean; GT-PV images only):",
         "pixel_macro_gtpv", True),
        ("\nPixel-level metrics (MICRO; global):", "pixel_micro", True),
    )
    for heading, key, rounded in sections:
        print(heading)
        table = res[key]
        if rounded:
            table = table.round(2)
        print(table.to_string())

    print("\n#images:", res["n_images"])

## HUMANS

### Human Best

In [None]:

# Read the annotator submissions (repairing a truncated column header) and
# keep every assignment that was not rejected.
df = pd.read_csv(human_path / "humans.csv").rename(
    columns={"nput.image_url": "input.image_url"}
)
df = df.loc[df["AssignmentStatus"] != "Rejected"]

# Per-row (image x worker) pixel metrics against the dataset ground truth.
df = add_pixel_metrics(df)

# ------------------------------------------------------------
# Human baseline: best-of-five (oracle) vs dataset GT
# ------------------------------------------------------------
# For every image, keep the single worker row with the highest IoU against
# the dataset GT.
best_row_per_image = df.groupby("image")["IoU"].idxmax()
best_oracle = df.loc[best_row_per_image].copy()

# Same evaluation pipeline as the models.
res_oracle = evaluate_model(best_oracle)

# ---- MAIN TEXT: macro metrics over GT-positive images only
print("\n=== Human baseline: Best-of-five (oracle) | MAIN TEXT metrics ===")
print(res_oracle["pixel_macro_gtpv"].round(2).to_string())
print(f"n_images_gtpv = {res_oracle['n_images_gtpv']} / n_images = {res_oracle['n_images']}")

# ---- APPENDIX: global micro metrics and image-level presence confusion
print("\n=== APPENDIX: micro/global pixel metrics (oracle) ===")
print(res_oracle["pixel_micro"].round(2).to_string())

print("\n=== APPENDIX: image-level presence confusion (oracle) ===")
print(res_oracle["confusion"].to_string())

# Optional: res_oracle["iou_aggregations"] holds further IoU summaries. If they
# are reported, IoU_micro_all (appendix) and mIoU_image_gtpv (main text) are the
# suggested ones; mIoU_image_all is inflated by empty-empty images (IoU = 1).



=== Human baseline: Best-of-five (oracle) | MAIN TEXT metrics ===
Precision    0.82
Recall       0.84
F1           0.81
IoU          0.75
n_images_gtpv = 53 / n_images = 105

=== APPENDIX: micro/global pixel metrics (oracle) ===
PA           0.98
Precision    0.90
Recall       0.83
F1           0.86
IoU          0.76

=== APPENDIX: image-level presence confusion (oracle) ===
TP    49
TN    52
FP     0
FN     4


In [6]:
# Display the current `df`. The captured output below shows one row per
# (image, worker) with the metric columns added by add_pixel_metrics.
# NOTE(review): cell execution counts are out of order - confirm which cell
# last assigned `df` before relying on this view.
df

Unnamed: 0,input.image_url,WorkerId,AssignmentStatus,image,worker,det_category,TP_pixels,FP_pixels,FN_pixels,TN_pixels,gt_has_pv,pred_has_pv,PA,Precision,Recall,F1,IoU
0,0_jpg.rf.e7fb12646e7f85bb25dbc8c4852d30a9.jpg,A1IZ4NX41GKU4X,Approved,0_jpg.rf.e7fb12646e7f85bb25dbc8c4852d30a9.jpg,A1IZ4NX41GKU4X,TN,0,0,0,1480000,False,False,1.000000,,,,1.0
1,0_jpg.rf.e7fb12646e7f85bb25dbc8c4852d30a9.jpg,A7V4CVENA0DYV,Approved,0_jpg.rf.e7fb12646e7f85bb25dbc8c4852d30a9.jpg,A7V4CVENA0DYV,TN,0,0,0,1480000,False,False,1.000000,,,,1.0
2,0_jpg.rf.e7fb12646e7f85bb25dbc8c4852d30a9.jpg,A2EI075XZT9Y2S,Approved,0_jpg.rf.e7fb12646e7f85bb25dbc8c4852d30a9.jpg,A2EI075XZT9Y2S,TN,0,0,0,1480000,False,False,1.000000,,,,1.0
3,0_jpg.rf.e7fb12646e7f85bb25dbc8c4852d30a9.jpg,A3TUMZ954ORSUC,Approved,0_jpg.rf.e7fb12646e7f85bb25dbc8c4852d30a9.jpg,A3TUMZ954ORSUC,TN,0,0,0,1480000,False,False,1.000000,,,,1.0
4,0_jpg.rf.e7fb12646e7f85bb25dbc8c4852d30a9.jpg,A2PYXFVGNJPPX0,Approved,0_jpg.rf.e7fb12646e7f85bb25dbc8c4852d30a9.jpg,A2PYXFVGNJPPX0,TN,0,0,0,1480000,False,False,1.000000,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,93_jpg.rf.f70be11a1a6715ae40845ab07e49e5a8.jpg,A1IZ4NX41GKU4X,Approved,93_jpg.rf.f70be11a1a6715ae40845ab07e49e5a8.jpg,A1IZ4NX41GKU4X,TN,0,0,0,1465000,False,False,1.000000,,,,1.0
543,93_jpg.rf.f70be11a1a6715ae40845ab07e49e5a8.jpg,ABVM2KJ7CRNZ0,Approved,93_jpg.rf.f70be11a1a6715ae40845ab07e49e5a8.jpg,ABVM2KJ7CRNZ0,FP,0,20732,0,1444268,False,True,0.985848,0.0,,0.0,0.0
544,93_jpg.rf.f70be11a1a6715ae40845ab07e49e5a8.jpg,A7V4CVENA0DYV,Approved,93_jpg.rf.f70be11a1a6715ae40845ab07e49e5a8.jpg,A7V4CVENA0DYV,TN,0,0,0,1465000,False,False,1.000000,,,,1.0
545,93_jpg.rf.f70be11a1a6715ae40845ab07e49e5a8.jpg,A3AA5G6HENO6VJ,Approved,93_jpg.rf.f70be11a1a6715ae40845ab07e49e5a8.jpg,A3AA5G6HENO6VJ,TN,0,0,0,1465000,False,False,1.000000,,,,1.0


### Majority vote

In [None]:
import pandas as pd
import numpy as np

# Majority-vote consensus file: one row per image.
maj = pd.read_csv(human_path / "human_pixel_majority.csv")

# Per-image pixel metrics, kept on `maj` for inspection.
maj = add_pixel_metrics(maj)

# Same evaluation pipeline as the models.
res_maj = evaluate_model(maj)

# ----------------------------
# MAIN TEXT: macro over GT-positive images only
# ----------------------------
print("\n=== Human baseline: Majority vote (3-of-5) | MAIN TEXT metrics ===")
print(res_maj["pixel_macro_gtpv"].round(2).to_string())
print(f"n_images_gtpv = {res_maj['n_images_gtpv']} / n_images = {res_maj['n_images']}")

# ----------------------------
# APPENDIX: micro/global metrics and image-level presence confusion
# ----------------------------
print("\n=== APPENDIX: micro/global pixel metrics (majority) ===")
print(res_maj["pixel_micro"].round(2).to_string())

print("\n=== APPENDIX: image-level presence confusion (majority) ===")
print(res_maj["confusion"].to_string())

# Optional: res_maj["iou_aggregations"] holds further IoU summaries;
# mIoU_image_gtpv aligns with the main-text IoU and IoU_micro_all with the
# micro/global IoU.



=== Human baseline: Majority vote (3-of-5) | MAIN TEXT metrics ===
Precision    0.75
Recall       0.89
F1           0.78
IoU          0.71
n_images_gtpv = 31 / n_images = 105

=== APPENDIX: micro/global pixel metrics (majority) ===
PA           0.96
Precision    0.46
Recall       0.93
F1           0.61
IoU          0.44

=== APPENDIX: image-level presence confusion (majority) ===
TP    31
TN    52
FP    22
FN     0


### Mean annotator

In [None]:
from collections import Counter

# Divisor used to turn assignment counts into "per annotator" counts. The
# baselines above are labelled best-of-five / 3-of-5, i.e. five annotators per
# image; rejected assignments are not compensated for.
N_ANNOTATORS = 5

# Load the annotator submissions (repairing a truncated column header).
df = pd.read_csv(human_path / "humans.csv").rename(
    columns={"nput.image_url": "input.image_url"}
)

# Drop rejected MTurk assignments, if the export carries a status column.
if "AssignmentStatus" in df.columns:
    df = df[df["AssignmentStatus"] != "Rejected"].copy()

out = evaluate_mean_annotator(df)
res_mean = out["results"]

# Mean annotator category distribution: number of assignments per detection
# category (in order of first appearance), divided by the annotator count.
det_cat = Counter(df["det_category"])
average_cat = pd.Series({cat: count / N_ANNOTATORS for cat, count in det_cat.items()})

print("\n=== MAIN TEXT: Mean annotator (GT-positive macro) ===")
print(res_mean["pixel_macro_gtpv"].round(2).to_string())
print(f"n_images_gtpv = {res_mean['n_images_gtpv']} / n_images = {res_mean['n_images']}")

print("\n=== APPENDIX: Mean annotator (micro/global) ===")
print(res_mean["pixel_micro"].round(2).to_string())

print("\n=== APPENDIX: image-level presence confusion (optional) ===")
# Single print: wrapping it in a second print() emitted a stray "None" line.
print(average_cat.round(0).astype(int).to_string())



=== MAIN TEXT: Mean annotator (GT-positive macro) ===
Precision    0.77
Recall       0.48
F1           0.55
IoU          0.42
n_images_gtpv = 53 / n_images = 105

=== APPENDIX: Mean annotator (micro/global) ===
PA           0.94
Precision    0.79
Recall       0.47
F1           0.59
IoU          0.41

=== APPENDIX: image-level presence confusion (optional) ===
TN    47
TP    31
FN    22
FP     5
None


## MODELS

In [None]:
# Evaluate every result CSV found in the two model folders. The result dicts
# are keyed by file stem and hold the evaluate_model output for that file.
# The files are visited in sorted order so that the report order below is
# reproducible across machines (glob itself guarantees no particular order).
maskrcnn_dir = mrcnn_path

maskrcnn_results = {}

for csv_path in sorted(maskrcnn_dir.glob("*.csv")):
    name = csv_path.stem
    df = load_maskrcnn_csv(csv_path)
    maskrcnn_results[name] = evaluate_model(df)

segformer_dir = segf_path

segformer_results = {}

for csv_path in sorted(segformer_dir.glob("*.csv")):
    name = csv_path.stem
    df = load_segformer_csv(csv_path)
    segformer_results[name] = evaluate_model(df)



In [16]:
# Print the report of every evaluated model: Mask R-CNN runs first, then SegFormer.
for family, family_results in (
    ("Mask R-CNN", maskrcnn_results),
    ("SegFormer", segformer_results),
):
    for name, res in family_results.items():
        print_results(f"{family} – {name}", res)



=== Mask R-CNN – MRCNN_stratAug_BestIoU_Model45_t85 ===

Image-level confusion matrix:
TP    46
TN    44
FP     8
FN     7

Pixel-level metrics (MACRO; per-image mean; GT-PV images only):
Precision    0.62
Recall       0.56
F1           0.52
IoU          0.43

Pixel-level metrics (MICRO; global):
PA           0.92
Precision    0.55
Recall       0.61
F1           0.58
IoU          0.41

#images: 105

=== Mask R-CNN – MRCNN_strat_Model43_t90 ===

Image-level confusion matrix:
TP    44
TN    46
FP     6
FN     9

Pixel-level metrics (MACRO; per-image mean; GT-PV images only):
Precision    0.62
Recall       0.47
F1           0.47
IoU          0.39

Pixel-level metrics (MICRO; global):
PA           0.93
Precision    0.61
Recall       0.55
F1           0.58
IoU          0.41

#images: 105

=== Mask R-CNN – MRCNN_stratAug_Model7_t95_BestPrecision ===

Image-level confusion matrix:
TP    42
TN    46
FP     6
FN    11

Pixel-level metrics (MACRO; per-image mean; GT-PV images only):
Precision  

MAIN TEXT:


Table 3 (Pixel-level segmentation quality (PRIMARY)): <br>
Macro-averaged IoU over GT-PV images <br>
Macro F1 over GT-PV images<br>
Macro Precision / Recall over GT-PV images<br>

Table 4 (Image-level detection performance)

APPENDIX:

Micro GLOBAL