# Statistical comparison of errors

In [1]:
import os
import warnings

import geopandas as gpd
import matplotlib.pyplot as plt
import momepy as mm
import numpy as np
import pandas as pd
import scipy.stats

from core import stats, utils

Create a folder for evaluation data.

In [2]:
# Create the output folder that the per-FUA evaluation grids are written to;
# exist_ok=True keeps the cell safe to re-run.
os.makedirs("../../evaluation-buildings/", exist_ok=True)

In [None]:
# which variables to evaluate (one grid column and one plot panel per entry)
eval_vars = [
    "edge_count",
    "edge_length",
    "node_count",
    "avg_degree",
    "stroke_count",
    "stroke_length_sum",
    "stroke_length_max",
    "coordinate_count",
]

# which methods to evaluate; every entry becomes a layer in the per-FUA
# GeoPackage. "revised_manual" is the baseline layer that the comparison cells
# read back as data["revised_manual"], so it must be listed under that name.
methods_to_evaluate = [
    "revised_manual",
    # "cityseer",
    "original",
    # "osmnx",
    # "parenx-voronoi",
    # "parenx-skeletonize",
    "neatnet",
    "neatnet-buildings",
]

# methods compared against the baseline; each must also appear in
# methods_to_evaluate ("original" is added by the cells that need it)
methods_to_compare = [
    # "cityseer",
    # "osmnx",
    # "parenx-voronoi",
    # "parenx-skeletonize",
    "neatnet",
    "neatnet-buildings",
]

Generate evaluation grids and fill them with evaluation variables. The Euclidean distance between each method and the manual baseline is computed from these grids further below.

In [4]:
for fua in utils.fua_city:
    print(f"Generating for {fua}")
    # base data: sample metadata and the original network of this FUA
    meta = utils.read_sample_data()
    is_fua = meta.eFUA_ID == fua
    geom = meta.loc[is_fua, "geometry"]
    city = meta.loc[is_fua, "eFUA_name"].values[0]

    gdf_orig = utils.read_original(fua)
    proj_crs = gdf_orig.crs

    # evaluation grid shared by every method of this FUA
    base_grid = utils.make_grid(fua, 9, proj_crs)

    # info on cells with revised data
    deltas = gpd.read_file(f"../../revision/{fua}/deltas_updated.gpkg")

    # per-method results, keyed by method name
    methods = {}

    for method in methods_to_evaluate:
        print(f"   Reading in results for {method}")
        lines = utils.read_results(fua, method, proj_crs)

        # drop geometries that are duplicated after normalisation, then build
        # the graph and derive its node/edge tables
        is_duplicate = lines.normalize().duplicated()
        lines = lines[~is_duplicate].copy().reset_index(drop=True)
        graph = mm.gdf_to_nx(lines, length="length", integer_labels=True)

        nodes, edges = mm.nx_to_gdf(graph)

        # node degrees
        nodes = stats.add_node_degree(nodes, graph)

        # stroke IDs via COINS; parenx-skeletonize edges are simplified first
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            coins_input = (
                edges.set_geometry(edges.simplify(0.5))
                if method == "parenx-skeletonize"
                else edges
            )
            coins = mm.COINS(coins_input, angle_threshold=120, flow_mode=True)
        edges["stroke_id"] = coins.stroke_attribute()
        stroke_gdf = coins.stroke_gdf()

        methods[method] = {
            "gdf": lines,
            "graph": graph,
            "nodes": nodes,
            "edges": edges,
        }

        # grid holding the statistics of this method only
        grid = base_grid.copy()

        # edge statistics per grid cell
        edge_stats = grid.apply(
            lambda cell: stats.get_edge_stats(edges, cell.geometry),  # noqa: B023
            axis=1,
            result_type="expand",
        )
        grid[["edge_count", "edge_length", "coordinate_count"]] = edge_stats

        # node statistics per grid cell
        node_stats = grid.apply(
            lambda cell: stats.get_node_stats(nodes, cell.geometry),  # noqa: B023
            axis=1,
            result_type="expand",
        )
        grid[["node_count", "node_degrees", "avg_degree"]] = node_stats

        # stroke statistics per grid cell
        stroke_stats = grid.apply(
            lambda cell: stats.get_stroke_stats(edges, stroke_gdf, cell.geometry),  # noqa: B023
            axis=1,
            result_type="expand",
        )
        grid[["stroke_count", "stroke_length_sum", "stroke_length_max"]] = stroke_stats

        # info on which cells still have to be verified
        grid["to_verify_total"] = deltas["to_verify_total"]

        # keep the grid in memory and write it as one layer per method
        methods[method]["grid"] = grid
        grid.to_file(f"../../evaluation-buildings/{fua}.gpkg", layer=method)

Generating for 1133
   Reading in results for revised_manual
   Reading in results for original
   Reading in results for neatnet
   Reading in results for neatnet-buildings
Generating for 869
   Reading in results for revised_manual
   Reading in results for original
   Reading in results for neatnet
   Reading in results for neatnet-buildings
Generating for 4617
   Reading in results for revised_manual
   Reading in results for original
   Reading in results for neatnet
   Reading in results for neatnet-buildings
Generating for 809
   Reading in results for revised_manual
   Reading in results for original
   Reading in results for neatnet
   Reading in results for neatnet-buildings
Generating for 1656
   Reading in results for revised_manual
   Reading in results for original
   Reading in results for neatnet
   Reading in results for neatnet-buildings
Generating for 4881
   Reading in results for revised_manual
   Reading in results for original
   Reading in results for neatnet
  

## Statistical comparison

Compare the results with the manually simplified networks using Chatterjee's xi correlation coefficient.

In [6]:
for fua in utils.fua_city:
    os.makedirs(f"../../plots/evaluation-buildings/{fua}", exist_ok=True)

    # one evaluation grid per method, read back from the per-FUA GeoPackage
    data = {}
    for method in methods_to_evaluate:
        data[method] = gpd.read_file(
            f"../../evaluation-buildings/{fua}.gpkg", layer=method
        )

    # one panel per evaluation variable (6 x 18 inches for the default 8 variables)
    fig, axs = plt.subplots(
        len(eval_vars), figsize=(6, 2.25 * len(eval_vars)), squeeze=False
    )

    for ax, eval_var in zip(axs.flat, eval_vars):
        # xi statistic and p-value of each compared method against the baseline
        stat_result = pd.DataFrame(columns=["statistics", "pvalue"])
        for method in methods_to_compare:
            result = scipy.stats.chatterjeexi(
                data[method][eval_var].fillna(0),
                data["revised_manual"][eval_var].fillna(0),
            )
            stat_result.loc[method] = [result.statistic, result.pvalue]

        # report any comparison whose p-value exceeds 0.01
        if (stat_result.pvalue > 0.01).any():
            print(fua, eval_var, "pvalue issue")
        stat_result.statistics.plot.barh(ax=ax)
        # dashed line marks the highest xi among the compared methods
        ax.axvline(stat_result.statistics.max(), color="coral", linestyle="--")
        ax.set_xlabel(f"xi for {eval_var}")

    # lay out, save and release the figure once, after all panels are drawn;
    # acting on `fig` explicitly avoids touching an implicit pyplot figure
    fig.tight_layout()
    fig.savefig(
        f"../../plots/evaluation-buildings/{fua}/xi_correlation.png",
        dpi=300,
        bbox_inches="tight",
    )
    plt.close(fig)

Getting tables.

In [7]:
# tables[fua][eval_var] -> DataFrame indexed by method with the xi statistic
# and p-value measured against the "revised_manual" layer
tables = {}
for fua in utils.fua_city:
    fua_tables = {}
    tables[fua] = fua_tables

    # evaluation grid of every method, read from the per-FUA GeoPackage
    layers = {
        method: gpd.read_file(
            f"../../evaluation-buildings/{fua}.gpkg", layer=method
        )
        for method in methods_to_evaluate
    }

    for eval_var in eval_vars:
        xi_table = pd.DataFrame(columns=["statistics", "pvalue"])
        # the compared methods first, then the original network
        for method in [*methods_to_compare, "original"]:
            xi = scipy.stats.chatterjeexi(
                layers[method][eval_var].fillna(0),
                layers["revised_manual"][eval_var].fillna(0),
            )
            xi_table.loc[method] = [xi.statistic, xi.pvalue]

        fua_tables[eval_var] = xi_table

Combining the tables.

In [8]:
# Flatten tables[fua][eval_var] into one row per (variable, case) with one xi
# column per method. Rows are collected as records and the frame is built in a
# single step, so values are matched to columns by method name and no
# mixed-type values are written into pre-cast float columns.
method_columns = ["original"] + methods_to_compare

records = []
for fua in utils.fua_city:
    for eval_var in eval_vars:
        xi_by_method = tables[fua][eval_var].statistics.to_dict()
        records.append({"variable": eval_var, "case": fua, **xi_by_method})

combined = pd.DataFrame(records, columns=["variable", "case"] + method_columns)
# the per-method statistics are numeric; "variable" and "case" keep their types
combined[method_columns] = combined[method_columns].astype(float)

  combined.loc[i, ["variable", "case"]] = [eval_var, fua]


Preparing the final output

In [9]:
# Replace each FUA identifier in "case" with the value utils.fua_city maps it to.
# NOTE(review): this overwrites the column in place; re-running the cell on
# already-mapped values would presumably yield NaN — run once per kernel.
combined["case"] = combined["case"].map(utils.fua_city)

In [10]:
# Index by (variable, case) and sort on the first level so that all cases of
# one variable are listed together; the bare name displays the table.
combined = combined.set_index(["variable", "case"]).sort_index(level=0)
combined

Unnamed: 0_level_0,Unnamed: 1_level_0,original,neatnet,neatnet-buildings
variable,case,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
avg_degree,Aleppo,0.682729,0.910745,0.91217
avg_degree,Auckland,0.439375,0.803478,0.827448
avg_degree,Bucaramanga,0.675186,0.911149,0.913228
avg_degree,Douala,0.578427,0.956306,0.957746
avg_degree,Liège,0.638495,0.849321,0.85662
avg_degree,Salt Lake City,0.396668,0.692184,0.696842
avg_degree,Wuhan,0.612158,0.728728,0.739966
coordinate_count,Aleppo,0.837659,0.915682,0.920385
coordinate_count,Auckland,0.727216,0.782729,0.82171
coordinate_count,Bucaramanga,0.826967,0.909425,0.921732


In [12]:
# Write the xi correlation table to the current working directory.
combined.to_csv("combined-buildings.csv")

Doing the same for Euclidean distance between distributions

In [13]:
# all_deltas[fua][eval_var][method] -> Euclidean distance between the grid
# values of the "revised_manual" layer and those of `method`
all_deltas = {}
for fua in utils.fua_city:
    per_variable = {}
    all_deltas[fua] = per_variable

    # evaluation grid of every method, read from the per-FUA GeoPackage
    layers = {
        method: gpd.read_file(
            f"../../evaluation-buildings/{fua}.gpkg", layer=method
        )
        for method in methods_to_evaluate
    }

    for eval_var in eval_vars:
        per_method = {}
        per_variable[eval_var] = per_method
        # the original network first, then the compared methods
        for method in ["original", *methods_to_compare]:
            squared_diff = (
                layers["revised_manual"][eval_var] - layers[method][eval_var]
            ) ** 2
            per_method[method] = np.sqrt(squared_diff.sum())

Plot the figures.

In [14]:
for fua in utils.fua_city:
    # make sure the target folder exists so the cell does not depend on an
    # earlier plotting cell having created it
    os.makedirs(f"../../plots/evaluation-buildings/{fua}", exist_ok=True)

    # one figure per FUA with one panel per evaluation variable
    # (6 x 18 inches for the default 8 variables)
    fig, axs = plt.subplots(
        len(eval_vars), figsize=(6, 2.25 * len(eval_vars)), squeeze=False
    )
    axs = axs.ravel()

    for ax, eval_var in zip(axs, eval_vars):
        s = pd.Series(all_deltas[fua][eval_var])
        s.plot.barh(ax=ax)
        # dashed line marks the smallest distance among the plotted methods
        ax.axvline(s.min(), color="coral", linestyle="--")
        ax.set_xlabel(eval_var)

    # only the last panel uses a logarithmic x axis
    axs[-1].set_xscale("log")

    # lay out, save and release the figure once per FUA
    fig.tight_layout()
    fig.savefig(
        f"../../plots/evaluation-buildings/{fua}/euclidean_distance.png",
        dpi=300,
        bbox_inches="tight",
    )
    plt.close(fig)

Combine all together to get a single table.

In [15]:
# Flatten all_deltas[fua][eval_var] into one row per (variable, case) with one
# distance column per method. Records are keyed by method name, so values are
# matched to columns by name instead of by dict insertion order, and the frame
# is built in a single step rather than grown row by row.
delta_method_columns = ["original"] + methods_to_compare

delta_records = [
    {"variable": eval_var, "case": fua, **all_deltas[fua][eval_var]}
    for fua in utils.fua_city
    for eval_var in eval_vars
]

combined_deltas = pd.DataFrame(
    delta_records, columns=["variable", "case"] + delta_method_columns
)
# the per-method distances are numeric; "variable" and "case" keep their types
combined_deltas[delta_method_columns] = combined_deltas[
    delta_method_columns
].astype(float)

In [16]:
# Replace each FUA identifier in "case" with the value utils.fua_city maps it to.
# NOTE(review): this overwrites the column in place; re-running the cell on
# already-mapped values would presumably yield NaN — run once per kernel.
combined_deltas["case"] = combined_deltas["case"].map(utils.fua_city)

In [17]:
# Index by (variable, case) and sort on the first level so that all cases of
# one variable are listed together.
combined_deltas = combined_deltas.set_index(["variable", "case"]).sort_index(level=0)

In [18]:
# Round the distances to two decimals; the bare name displays the table.
combined_deltas = combined_deltas.round(2)
combined_deltas

Unnamed: 0_level_0,Unnamed: 1_level_0,original,neatnet,neatnet-buildings
variable,case,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
avg_degree,Aleppo,85.59,22.85,22.38
avg_degree,Auckland,33.7,18.4,16.43
avg_degree,Bucaramanga,65.62,15.97,16.15
avg_degree,Douala,38.65,8.19,8.81
avg_degree,Liège,83.43,31.75,30.78
avg_degree,Salt Lake City,43.51,26.45,25.97
avg_degree,Wuhan,130.35,71.62,70.02
coordinate_count,Aleppo,1382.17,356.36,334.88
coordinate_count,Auckland,2454.29,634.07,468.56
coordinate_count,Bucaramanga,2164.92,631.22,509.44


In [19]:
# Write the rounded Euclidean distance table to the current working directory.
combined_deltas.to_csv("combined_deltas-buildings.csv")