# Statistical comparison of errors

In [1]:
import os
import warnings

import geopandas as gpd
import matplotlib.pyplot as plt
import momepy as mm
import numpy as np
import pandas as pd
import scipy.stats

from core import stats, utils

Create a folder for evaluation data.

In [2]:
# Folder that receives the per-FUA evaluation GeoPackages; nothing happens if it already exists.
evaluation_dir = "../../evaluation/"
os.makedirs(evaluation_dir, exist_ok=True)

In [None]:
# Per-cell variables that are compared against the manual baseline.
eval_vars = [
    "edge_count",
    "edge_length",
    "node_count",
    "avg_degree",
    "stroke_count",
    "stroke_length_sum",
    "stroke_length_max",
    "coordinate_count",
]

# Every network variant for which an evaluation grid is generated.
methods_to_evaluate = [
    "manual",
    "cityseer",
    "original",
    "osmnx",
    "parenx-voronoi",
    "parenx-skeletonize",
    "neatnet",
]

# The methods scored against "manual": everything evaluated except the two
# reference networks ("manual" itself and "original"), in the same order.
reference_methods = {"manual", "original"}
methods_to_compare = [
    name for name in methods_to_evaluate if name not in reference_methods
]

Generate evaluation grids and fill them with evaluation variables. At the same time, compute Euclidean distance between each method and a manual baseline.

In [4]:
# For every FUA and every method in `methods_to_evaluate`, build a grid whose
# cells carry edge, node and stroke statistics, and store it as one layer
# (named after the method) of ../../evaluation/{fua}.gpkg.
for fua in utils.fua_city:
    print(f"Generating for {fua}")

    # the original network provides the projected CRS used for this FUA
    gdf_orig = utils.read_original(fua)
    proj_crs = gdf_orig.crs

    # Make grid
    # NOTE(review): 9 is passed straight to utils.make_grid; presumably the
    # grid resolution - confirm against the helper.
    base_grid = utils.make_grid(fua, 9, proj_crs)

    # get info on cells with revised data
    deltas = gpd.read_file(f"../../revision/{fua}/deltas_updated.gpkg")

    # read results from all methods into dict (rebuilt from scratch for each FUA)
    methods = {}

    for method in methods_to_evaluate:
        print(f"   Reading in results for {method}")
        gdf = utils.read_results(fua, method, proj_crs)

        # drop geometries that are duplicates once normalized, then build the graph
        gdf = gdf[~gdf.normalize().duplicated()].copy().reset_index(drop=True)
        G = mm.gdf_to_nx(gdf, length="length", integer_labels=True)

        nodes, edges = mm.nx_to_gdf(G)

        # add node degrees
        nodes = stats.add_node_degree(nodes, G)

        # add stroke IDs; UserWarnings raised while computing COINS are silenced
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=UserWarning)
            if method == "parenx-skeletonize":
                # this method's edges are simplified (tolerance 0.5) before COINS
                coins_input = edges.set_geometry(edges.simplify(0.5))
            else:
                coins_input = edges
            coins = mm.COINS(coins_input, angle_threshold=120, flow_mode=True)
        edges["stroke_id"] = coins.stroke_attribute()
        stroke_gdf = coins.stroke_gdf()

        methods[method] = {
            "gdf": gdf,
            "graph": G,
            "nodes": nodes,
            "edges": edges,
        }

        ### grid with stats eval for this method only
        grid = base_grid.copy()

        # edge statistics per grid cell
        grid[["edge_count", "edge_length", "coordinate_count"]] = grid.apply(
            lambda x: stats.get_edge_stats(edges, x.geometry),  # noqa: B023
            axis=1,
            result_type="expand",
        )

        # node statistics per grid cell
        grid[["node_count", "node_degrees", "avg_degree"]] = grid.apply(
            lambda x: stats.get_node_stats(nodes, x.geometry),  # noqa: B023
            axis=1,
            result_type="expand",
        )

        # stroke statistics per grid cell
        grid[["stroke_count", "stroke_length_sum", "stroke_length_max"]] = grid.apply(
            lambda x: stats.get_stroke_stats(edges, stroke_gdf, x.geometry),  # noqa: B023
            axis=1,
            result_type="expand",
        )

        # add info on which cells have to be verified still
        # (assigned by index alignment; assumes `deltas` rows line up with the
        # grid cells - TODO confirm)
        grid["to_verify_total"] = deltas["to_verify_total"]

        # save grid to dict
        methods[method]["grid"] = grid

        # save to a file
        grid.to_file(f"../../evaluation/{fua}.gpkg", layer=method)

Generating for 1133
   Reading in results for revised_manual
   Reading in results for cityseer
   Reading in results for original
   Reading in results for osmnx
   Reading in results for parenx-voronoi
   Reading in results for parenx-skeletonize
   Reading in results for neatnet
Generating for 869
   Reading in results for revised_manual
   Reading in results for cityseer
   Reading in results for original
   Reading in results for osmnx
   Reading in results for parenx-voronoi
   Reading in results for parenx-skeletonize
   Reading in results for neatnet
Generating for 4617
   Reading in results for revised_manual
   Reading in results for cityseer
   Reading in results for original
   Reading in results for osmnx
   Reading in results for parenx-voronoi
   Reading in results for parenx-skeletonize
   Reading in results for neatnet
Generating for 809
   Reading in results for revised_manual
   Reading in results for cityseer
   Reading in results for original
   Reading in results 

## Statistical comparison

Compare the results with the manually simplified networks using xi correlation.

In [None]:
# One figure per FUA: for each evaluation variable, a horizontal bar chart of
# Chatterjee's xi between every compared method and the manual baseline.
for fua in utils.fua_city:
    data = {}
    for method in methods_to_evaluate:
        data[method] = gpd.read_file(f"../../evaluation/{fua}.gpkg", layer=method)

    # make sure the target folder exists before saving into it
    plot_dir = f"../../plots/evaluation/{fua}"
    os.makedirs(plot_dir, exist_ok=True)

    # one panel per evaluation variable; squeeze=False keeps `axs` an array
    # even if only a single variable is evaluated
    fig, axs = plt.subplots(len(eval_vars), 1, squeeze=False, figsize=(6, 18))

    for i, eval_var in enumerate(eval_vars):
        stat_result = pd.DataFrame(columns=["statistics", "pvalue"])
        for method in methods_to_compare:
            # missing cell values are treated as 0 on both sides
            result = scipy.stats.chatterjeexi(
                data[method][eval_var].fillna(0),
                data["manual"][eval_var].fillna(0),
            )
            stat_result.loc[method] = [result.statistic, result.pvalue]

        # report any variable where at least one method has p-value above 0.01
        if (stat_result.pvalue > 0.01).any():
            print(fua, eval_var, "pvalue issue")
        stat_result.statistics.plot.barh(ax=axs.flat[i])
        # dashed line marks the highest xi among the methods
        axs.flat[i].axvline(stat_result.statistics.max(), color="coral", linestyle="--")
        axs.flat[i].set_xlabel(f"xi for {eval_var}")

    # lay out, save and close once per figure, after all panels are drawn
    fig.tight_layout()
    fig.savefig(
        f"{plot_dir}/xi_correlation.png",
        dpi=300,
        bbox_inches="tight",
    )
    plt.close(fig)

Getting tables.

In [None]:
# tables[fua][eval_var] is a frame indexed by method ("original" included)
# holding Chatterjee's xi against the manual baseline and its p-value.
tables = {}
for fua in utils.fua_city:
    data = {
        method: gpd.read_file(f"../../evaluation/{fua}.gpkg", layer=method)
        for method in methods_to_evaluate
    }
    baseline = data["manual"]

    per_variable = {}
    for eval_var in eval_vars:
        stat_result = pd.DataFrame(columns=["statistics", "pvalue"])
        for method in [*methods_to_compare, "original"]:
            # missing cell values count as 0 for both the method and the baseline
            xi = scipy.stats.chatterjeexi(
                data[method][eval_var].fillna(0),
                baseline[eval_var].fillna(0),
            )
            stat_result.loc[method] = [xi.statistic, xi.pvalue]
        per_variable[eval_var] = stat_result

    tables[fua] = per_variable

Combining the tables.

In [7]:
# Flatten `tables` into one frame: one row per (FUA, variable) pair and one
# column of xi statistics per method. Rows are collected first and the frame is
# built once, instead of enlarging an empty frame row by row.
method_columns = ["original"] + methods_to_compare

records = []
for fua in utils.fua_city:
    for eval_var in eval_vars:
        xi_by_method = tables[fua][eval_var].statistics
        record = {"variable": eval_var, "case": fua}
        # pick values by method name so they cannot drift from the column order
        record.update({m: float(xi_by_method[m]) for m in method_columns})
        records.append(record)

# explicit column list keeps the layout stable even when there are no records
combined = pd.DataFrame(records, columns=["variable", "case"] + method_columns)

  combined.loc[i, ["variable", "case"]] = [eval_var, fua]


Preparing the final output

In [8]:
# translate the FUA identifiers in "case" through the utils.fua_city mapping
case_labels = combined["case"].map(utils.fua_city)
combined["case"] = case_labels

In [9]:
# move (variable, case) into the index, then sort via index level 0
combined = combined.set_index(["variable", "case"])
combined = combined.sort_index(level=0)
combined

Unnamed: 0_level_0,Unnamed: 1_level_0,original,cityseer,osmnx,parenx-voronoi,parenx-skeletonize,neatnet
variable,case,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
avg_degree,Aleppo,0.683894,0.81836,0.732887,0.851845,0.774428,0.912391
avg_degree,Auckland,0.439375,0.699569,0.498237,0.745444,0.709857,0.804845
avg_degree,Bucaramanga,0.676782,0.75894,0.697984,0.868573,0.834364,0.9119
avg_degree,Douala,0.58128,0.828115,0.664304,0.898912,0.794438,0.956817
avg_degree,Liège,0.641419,0.78027,0.662997,0.811241,0.7784,0.850784
avg_degree,Salt Lake City,0.391298,0.600047,0.4432,0.610153,0.444008,0.703854
avg_degree,Wuhan,0.613327,0.689933,0.641525,0.692901,0.610928,0.737205
coordinate_count,Aleppo,0.837659,0.883614,0.874824,0.815559,0.609463,0.915682
coordinate_count,Auckland,0.727216,0.801653,0.75388,0.631359,0.525575,0.782729
coordinate_count,Bucaramanga,0.826967,0.843693,0.829918,0.808682,0.601564,0.909425


In [10]:
# write the xi table to combined.csv in the current working directory
combined.to_csv(path_or_buf="combined.csv")

Doing the same for Euclidean distance between distributions

In [None]:
# all_deltas[fua][eval_var][method] is the Euclidean distance between the
# per-cell values of the manual baseline and those of `method`.
all_deltas = {}
for fua in utils.fua_city:
    data = {
        method: gpd.read_file(f"../../evaluation/{fua}.gpkg", layer=method)
        for method in methods_to_evaluate
    }
    baseline = data["manual"]

    # get euclidean distance between the distributions
    # NOTE(review): no fillna here, unlike the xi cells - with pandas' default
    # skipna, cells where either side is NaN drop out of the sum.
    all_deltas[fua] = {
        eval_var: {
            method: np.sqrt(((baseline[eval_var] - data[method][eval_var]) ** 2).sum())
            for method in ["original", *methods_to_compare]
        }
        for eval_var in eval_vars
    }

Plot the figures.

In [12]:
# One figure per FUA: for each evaluation variable, a horizontal bar chart of
# the Euclidean distance between every method and the manual baseline.
for fua in utils.fua_city:
    # make sure the target folder exists before saving into it
    plot_dir = f"../../plots/evaluation/{fua}"
    os.makedirs(plot_dir, exist_ok=True)

    # one panel per evaluation variable; squeeze=False keeps `axs` an array
    # even if only a single variable is evaluated
    fig, axs = plt.subplots(len(eval_vars), 1, squeeze=False, figsize=(6, 18))

    for i, eval_var in enumerate(eval_vars):
        s = pd.Series(all_deltas[fua][eval_var])
        s.plot.barh(ax=axs.flat[i])
        # dashed line marks the smallest distance among the methods
        axs.flat[i].axvline(s.min(), color="coral", linestyle="--")
        axs.flat[i].set_xlabel(eval_var)

    # only the last panel (last entry of eval_vars) uses a log axis
    # (presumably because its distances span orders of magnitude - confirm)
    axs.flat[-1].set_xscale("log")

    # lay out, save and close once per figure
    fig.tight_layout()
    fig.savefig(
        f"{plot_dir}/euclidean_distance.png",
        dpi=300,
        bbox_inches="tight",
    )
    plt.close(fig)

Combine all together to get a single table.

In [13]:
# Flatten `all_deltas` into one frame: one row per (FUA, variable) pair and one
# column of Euclidean distances per method. Rows are collected first and the
# frame is built once, instead of enlarging an empty frame row by row.
method_columns = ["original"] + methods_to_compare

records = []
for fua in utils.fua_city:
    for eval_var in eval_vars:
        distances = all_deltas[fua][eval_var]
        record = {"variable": eval_var, "case": fua}
        # look values up by method name rather than relying on dict order
        record.update({m: float(distances[m]) for m in method_columns})
        records.append(record)

# explicit column list keeps the layout stable even when there are no records
combined_deltas = pd.DataFrame(
    records, columns=["variable", "case"] + method_columns
)

In [14]:
# translate the FUA identifiers in "case" through the utils.fua_city mapping
case_labels = combined_deltas["case"].map(utils.fua_city)
combined_deltas["case"] = case_labels

In [15]:
# move (variable, case) into the index, then sort via index level 0
combined_deltas = combined_deltas.set_index(["variable", "case"])
combined_deltas = combined_deltas.sort_index(level=0)

In [16]:
# keep two decimals in the table that is displayed and exported
combined_deltas = combined_deltas.round(decimals=2)
combined_deltas

Unnamed: 0_level_0,Unnamed: 1_level_0,original,cityseer,osmnx,parenx-voronoi,parenx-skeletonize,neatnet
variable,case,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
avg_degree,Aleppo,85.59,31.67,58.65,32.94,35.89,22.85
avg_degree,Auckland,33.7,17.34,25.11,19.16,19.29,18.4
avg_degree,Bucaramanga,65.62,27.13,59.54,20.66,21.91,15.97
avg_degree,Douala,38.65,14.97,26.02,11.98,14.62,8.19
avg_degree,Liège,83.43,35.5,62.22,33.19,36.26,31.75
avg_degree,Salt Lake City,43.51,30.64,35.25,31.18,32.98,26.45
avg_degree,Wuhan,130.35,85.92,111.43,85.67,89.9,71.62
coordinate_count,Aleppo,1382.17,610.82,733.47,1280.78,93814.59,356.36
coordinate_count,Auckland,2454.29,577.0,777.37,816.03,33692.94,634.07
coordinate_count,Bucaramanga,2164.92,590.31,701.81,1139.08,144891.31,631.22


In [17]:
# write the distance table to combined_deltas.csv in the current working directory
combined_deltas.to_csv(path_or_buf="combined_deltas.csv")