In [1]:
import gc
import glob

import geopandas as gpd
import momepy as mm
import numpy as np
import pandas as pd
from libpysal.graph import Graph, read_parquet

# Root folder of the project data; the other locations are derived from it.
regions_datadir = "/data/uscuni-ulce/"
# Processed outputs (buildings, tessellations and neighbourhood graphs live here).
data_dir = f"{regions_datadir}processed_data/"
graph_dir = f"{data_dir}neigh_graphs/"
# Every path matching the eubucco_raw folder pattern.
eubucco_files = glob.glob(f"{regions_datadir}eubucco_raw/*")

In [2]:
# Region hull table; its index values are used as region ids further down.
region_hulls = gpd.read_parquet(f"{regions_datadir}regions/regions_hull.parquet")

In [3]:
# Region ids of interest:
#   12199      - hills, small test
#   69300      - prague, medium
#   226        - germany somewhere, largest cluster
#   106928 + 1 - big one in poland
#   106928     - small test
# Walk the hulls in index order and stop at the first id that is not below 226;
# `region_id` and `region_hull` keep the values of the row where the loop stopped.
for region_id, region_hull in region_hulls.iterrows():
    if not region_id < 226:
        break
region_id

226

In [4]:
# Buildings of the selected region, read from the processed-data folder.
df_buildings = gpd.read_parquet(f"{data_dir}buildings/buildings_{region_id}.parquet")
df_buildings.shape

(3642166, 3)

In [5]:
# Tessellation cells of the selected region. `data_dir` already ends with "/",
# so the sub-path is appended without a leading slash to avoid a doubled
# separator ("processed_data//tessellations/...") and to match how the
# buildings path is built.
df_tessellation = gpd.read_parquet(
    f"{data_dir}tessellations/tessellation_{region_id}.parquet"
)
df_tessellation.shape

(3981984, 2)

In [6]:
# First-order neighbourhood graph stored for this region's tessellation.
queen_1 = read_parquet(f"{graph_dir}tessellation_graph_{region_id}_knn1.parquet")
# Same graph restricted to the ids present in the buildings table.
bgraph = queen_1.subgraph(df_buildings.index.values)

In [8]:
# %%time
# queen_3 = queen_1.higher_order(k=3, lower_order=True, diagonal=True)

In [9]:
# queen_1._adjacency.shape, queen_3._adjacency.shape

In [10]:
# %%time
# queen_3 = queen_1.higher_order(k=3, lower_order=True, diagonal=True)
# bgraph3 = queen_3.subgraph(df_buildings.index.values)
# mibd = mm.mean_interbuilding_distance(df_buildings, bgraph, bgraph3)

In [11]:
# !conda install -c conda-forge graphblas-algorithms -y

In [7]:
import graphblas as gb
import graphblas_algorithms as ga
from graphblas_algorithms import all_pairs_shortest_path_length

In [8]:
# Tessellation ids in the order obtained by sorting on the geometry column.
# Passed below as `iteration_order`, so each chunk is a contiguous slice of
# this ordering.
# NOTE(review): presumably this gives spatially coherent chunks - confirm how
# geopandas orders geometries when sorting.
sorted_ids = df_tessellation.sort_values("geometry").index.values

In [100]:
def lazy_higher_order(graph, k, n_splits, iteration_order=None):
    """Yield higher-order neighbour graphs one chunk of focal ids at a time.

    The ids in ``iteration_order`` are split into ``n_splits`` chunks. For each
    chunk, shortest path lengths up to ``cutoff=k`` are computed from the chunk's
    nodes and turned into a graph with unit weights.

    Parameters
    ----------
    graph
        Graph exposing ``sparse`` and ``unique_ids``.
    k : int
        Cutoff passed to ``all_pairs_shortest_path_length``.
    n_splits : int
        Number of chunks passed to ``np.array_split``.
    iteration_order : array-like, optional
        Ids in the order they should be processed; defaults to the graph's ids.

    Yields
    ------
    Graph
        One graph per chunk. Ids that only appear as neighbours ("buffers") are
        appended as zero-weight self-links.
    """
    # convert to a graphblas graph
    G = ga.Graph(gb.io.from_scipy_sparse(graph.sparse))
    ids = graph.unique_ids.values
    id_to_numeric = pd.Series(np.arange(len(ids)), index=ids)

    order = ids if iteration_order is None else iteration_order

    for chunk in np.array_split(order, n_splits):
        lengths = all_pairs_shortest_path_length(
            G, nodes=id_to_numeric.loc[chunk], cutoff=k, expand_output=True
        )
        coo = gb.io.to_scipy_sparse(lengths).tocoo(copy=False)
        by_row = coo.row.argsort()
        head = ids[coo.row][by_row]
        tail = ids[coo.col][by_row]

        # ids that are reached from the chunk but are not focal themselves
        buffers = np.setdiff1d(np.unique(tail), np.unique(head), assume_unique=True)
        buffers.sort()

        ## buffers have to be added as their own focals, since the graph
        ## constructor drops them otherwise; they get weight 0, real links get 1
        graph_head = np.concatenate((head, buffers))
        graph_tail = np.concatenate((tail, buffers))
        graph_weights = np.concatenate((np.ones(len(head)), np.zeros(len(buffers))))

        yield Graph.from_arrays(graph_head, graph_tail, graph_weights)

In [11]:
%%time
# One slot per building; filled chunk by chunk, NaN until a chunk covers it.
res = pd.Series(np.nan, index=df_buildings.index.values)
for partial_higher in lazy_higher_order(
    queen_1, k=3, n_splits=50, iteration_order=sorted_ids
):
    # focal ids of the chunk: everything in the partial graph that is not an isolate
    partial_focals = np.setdiff1d(partial_higher.unique_ids, partial_higher.isolates)

    # only non-negative ids are looked up in the buildings table
    chunk_ids = partial_higher.unique_ids
    partial_buildings = df_buildings.loc[chunk_ids[chunk_ids >= 0]]
    building_ids = partial_buildings.index.values

    # first-order and higher-order graphs restricted to the chunk's buildings
    partial_bgraph = bgraph.subgraph(building_ids)
    partial_bgraph3 = partial_higher.subgraph(building_ids)

    partial_result = mm.mean_interbuilding_distance(
        df_buildings.loc[building_ids], partial_bgraph, partial_bgraph3
    )

    # store results for focal buildings only
    is_focal = partial_buildings.index.isin(partial_focals)
    partial_focal_buildings = partial_buildings.index[is_focal]
    res.loc[partial_focal_buildings] = partial_result.loc[partial_focal_buildings]

    # release the per-chunk objects before the next chunk is built
    del partial_bgraph, partial_bgraph3
    del partial_buildings, partial_focal_buildings
    del partial_higher, partial_result
    del chunk_ids, building_ids, is_focal
    gc.collect()

processed graph  (70967,)
processed graph  (72077,)
processed graph  (75049,)
processed graph  (72740,)
processed graph  (74576,)
processed graph  (75443,)
processed graph  (73830,)
processed graph  (75793,)
processed graph  (75656,)
processed graph  (74454,)
processed graph  (74914,)
processed graph  (75424,)
processed graph  (75262,)
processed graph  (73503,)
processed graph  (74093,)
processed graph  (75077,)
processed graph  (74485,)
processed graph  (74319,)
processed graph  (74802,)
processed graph  (74415,)
processed graph  (75822,)
processed graph  (73094,)
processed graph  (71452,)
processed graph  (75341,)
processed graph  (73024,)
processed graph  (60957,)
processed graph  (71187,)
processed graph  (66229,)
processed graph  (65101,)
processed graph  (71116,)
processed graph  (72650,)
processed graph  (71196,)
processed graph  (73668,)
processed graph  (73687,)
processed graph  (72527,)
processed graph  (73474,)
processed graph  (72736,)
processed graph  (72465,)
processed gr

In [12]:
# Persist the per-building result as a one-column parquet file, named by region
# id. The target is a relative "data/" folder, unlike the absolute input paths.
res.to_frame().to_parquet(f"data/mean_interbuilding_distance_{region_id}.parquet")

In [None]:
# Compare the chunked result with the eager reference `mibd`. That reference is
# only produced by the commented-out eager cell (higher_order followed by
# mean_interbuilding_distance), so on a fresh Restart & Run All it does not
# exist; skip the comparison then instead of failing with a NameError.
if "mibd" in globals():
    # equal_nan: entries that are NaN in both series count as matching
    assert np.allclose(res, mibd, equal_nan=True)
else:
    print("reference `mibd` not computed; skipping comparison")

1

In [14]:
# Size of the first-order graph's underlying adjacency table (private attribute).
# NOTE(review): presumably one entry per stored link - confirm.
queen_1._adjacency.shape

(25667238,)

### test function

In [120]:
import geopandas as gpd
import momepy as mm
import numpy as np
import pandas as pd
from pandas.testing import assert_series_equal

In [121]:
# Small bundled dataset used to check the chunked computation.
test_file_path = mm.datasets.get_path("bubenec")
df_tessellation = gpd.read_file(test_file_path, layer="tessellation")
df_tessellation["area"] = df_tessellation.geometry.area

# First-order graphs of three kinds, each with self-weights assigned.
cont_graph1 = Graph.build_contiguity(df_tessellation).assign_self_weight()
fuzzy_graph1 = Graph.build_fuzzy_contiguity(
    df_tessellation, buffer=1e-6
).assign_self_weight()
knn_graph1 = Graph.build_knn(df_tessellation.centroid, k=3).assign_self_weight()

# Eagerly built third-order counterparts, used as the expected reference.
cont_graph3 = cont_graph1.higher_order(k=3, lower_order=True, diagonal=True)
fuzzy_graph3 = fuzzy_graph1.higher_order(k=3, lower_order=True, diagonal=True)
knn_graph3 = knn_graph1.higher_order(k=3, lower_order=True, diagonal=True)

In [122]:
def lazy_higher_order(graph, k, n_splits, iteration_order=None):
    """Yield higher-order neighbour graphs one chunk of focal ids at a time.

    The ids in ``iteration_order`` are split into ``n_splits`` chunks. For each
    chunk the rows of the graph's ``"B"``-transformed sparse adjacency are
    expanded by repeated multiplication, so that a row ends up non-zero for
    every id reachable by a walk of 1 to ``k`` steps.

    Parameters
    ----------
    graph
        Graph exposing ``transform``, ``sparse`` and ``unique_ids``.
    k : int
        Maximum number of steps; ``k=1`` keeps the original neighbours.
    n_splits : int
        Number of chunks passed to ``np.array_split``.
    iteration_order : array-like, optional
        Ids in the order they should be processed; defaults to the graph's ids.
        Every id must be present in ``graph.unique_ids`` (``KeyError`` otherwise).

    Yields
    ------
    Graph
        One graph per non-empty chunk, with weight 1 on every link. Ids that only
        appear as neighbours ("buffers") are appended as zero-weight self-links.
    """
    A = graph.transform("B").sparse
    ids = graph.unique_ids.values
    id_to_numeric = pd.Series(np.arange(len(ids)), index=ids)

    if iteration_order is None:
        iteration_order = ids
    # positional indexing of each chunk below needs a plain ndarray
    iteration_order = np.asarray(iteration_order)

    for source in np.array_split(iteration_order, n_splits):
        # n_splits larger than the number of ids leaves some chunks empty
        if len(source) == 0:
            continue

        # row positions as an ndarray; the sparse matrix is indexed positionally
        nodes = id_to_numeric.loc[source].to_numpy()

        ## get higher order topological neighbours
        Q = A[nodes, :].copy()
        for _ in range(1, k):
            Q = Q + Q @ A

        sparray = Q.tocoo(copy=False)
        # stable sort keeps the existing order of links within a row
        sorter = sparray.row.argsort(kind="stable")
        # rows of Q are positions within the chunk, columns are global positions
        head = source[sparray.row][sorter]
        tail = ids[sparray.col][sorter]

        # sorted unique ids that are reached but are not focal in this chunk
        buffers = np.setdiff1d(tail, head)

        ## need to add buffers from tail to focals, since graph constructor drops them
        graph_head = np.append(head, buffers)
        graph_tail = np.append(tail, buffers)
        graph_weights = np.ones(len(graph_head))
        graph_weights[len(head) :] = 0

        yield Graph.from_arrays(graph_head, graph_tail, graph_weights)

In [123]:
# Neighbourhood order used in the checks.
# NOTE(review): the calls below pass the literal 3 rather than this variable.
higher_order_k = 3

In [148]:
def partial_apply(graph, higher_order_k, n_splits, func, **kwargs):
    """Apply ``func`` chunk by chunk over lazily built higher-order graphs.

    Parameters
    ----------
    graph
        Graph passed to ``lazy_higher_order``; its ``unique_ids`` index the output.
    higher_order_k : int
        Passed to ``lazy_higher_order`` as ``k``.
    n_splits : int
        Number of chunks.
    func : callable
        Called as ``func(focal_ids, partial_graph, **kwargs)``; must return an
        object supporting ``.loc[focal_ids]``.
    **kwargs
        Forwarded unchanged to ``func``.

    Returns
    -------
    pandas.Series
        One value per graph id; NaN where no chunk provided a value.
    """
    out = pd.Series(np.nan, index=graph.unique_ids)
    chunk_graphs = lazy_higher_order(graph, k=higher_order_k, n_splits=n_splits)
    for chunk_graph in chunk_graphs:
        # focal ids are the non-isolates of the partial graph
        focal_ids = np.setdiff1d(chunk_graph.unique_ids, chunk_graph.isolates)
        chunk_values = func(focal_ids, chunk_graph, **kwargs)
        out.loc[focal_ids] = chunk_values.loc[focal_ids]
    return out

In [149]:
def sum_area(partical_focals, partial_higher, y):
    """Return the ``"sum"`` statistic of ``y`` over a partial graph.

    ``partical_focals`` is accepted to match the calling convention of
    ``partial_apply`` but is not used. ``y`` is restricted to the ids of
    ``partial_higher`` before being described.
    """
    chunk_values = y.loc[partial_higher.unique_ids]
    summary = partial_higher.describe(chunk_values, statistics=["sum"])
    return summary["sum"]

In [150]:
# Reference: area sums over the eagerly built third-order contiguity graph.
expected = cont_graph3.describe(df_tessellation["area"], statistics=["sum"])["sum"]
# Same statistic computed in two chunks from the first-order graph.
res = partial_apply(
    graph=cont_graph1,
    higher_order_k=3,
    n_splits=2,
    func=sum_area,
    y=df_tessellation["area"],
)
assert_series_equal(res, expected, check_names=False)

In [151]:
# Reference: area sums over the eagerly built third-order fuzzy contiguity graph.
expected = fuzzy_graph3.describe(df_tessellation["area"], statistics=["sum"])["sum"]
# Same statistic computed in two chunks from the first-order graph.
res = partial_apply(
    graph=fuzzy_graph1,
    higher_order_k=3,
    n_splits=2,
    func=sum_area,
    y=df_tessellation["area"],
)
assert_series_equal(res, expected, check_names=False)

In [152]:
# Reference: area sums over the eagerly built third-order KNN graph.
expected = knn_graph3.describe(df_tessellation["area"], statistics=["sum"])["sum"]
# Same statistic computed in two chunks from the first-order graph.
res = partial_apply(
    graph=knn_graph1,
    higher_order_k=3,
    n_splits=2,
    func=sum_area,
    y=df_tessellation["area"],
)
assert_series_equal(res, expected, check_names=False)

In [153]:
### custom indices
# The chunked computation must also work when graph ids are not the default
# integer positions.


# string ids: same tessellation, index values converted with str()
string_tess = df_tessellation.set_index(map(str, df_tessellation.index.values))
graph1 = Graph.build_contiguity(string_tess, rook=False).assign_self_weight()
graph3 = graph1.higher_order(k=3, lower_order=True, diagonal=True)

# the eager result must not depend on the index labels (values compared only);
# `old_expected` is reused by the negative-index check in the next cell
old_expected = cont_graph3.describe(df_tessellation["area"], statistics=["sum"])["sum"]
new_expected = graph3.describe(string_tess["area"], statistics=["sum"])["sum"]
assert_series_equal(old_expected, new_expected, check_index=False)


# chunked result must match the eager result on the string-indexed data
res = partial_apply(
    graph=graph1, higher_order_k=3, n_splits=2, func=sum_area, y=string_tess["area"]
)
assert_series_equal(new_expected, res, check_names=False)

In [154]:
## negative ids: the first ten index values are replaced by -10..-1
# Work on a copy of the index values: writing into `.index.values` directly
# would modify the array backing df_tessellation's own index.
ii = df_tessellation.index.values.copy()
ii[:10] = np.arange(-10, 0)
neg_tess = df_tessellation.set_index(ii)
graph1 = Graph.build_contiguity(neg_tess, rook=False).assign_self_weight()
graph3 = graph1.higher_order(k=3, lower_order=True, diagonal=True)

# reference on the original index, recomputed here so the cell does not depend
# on a variable left behind by the string-index cell
old_expected = cont_graph3.describe(df_tessellation["area"], statistics=["sum"])["sum"]
new_expected = graph3.describe(neg_tess["area"], statistics=["sum"])["sum"]
# the eager result must not depend on the index labels (values compared only)
assert_series_equal(old_expected, new_expected, check_index=False)


# chunked result must match the eager result on the negative-indexed data
res = partial_apply(
    graph=graph1, higher_order_k=3, n_splits=2, func=sum_area, y=neg_tess["area"]
)
assert_series_equal(new_expected, res, check_names=False)

In [48]:
# m = df_tessellation.explore()
# # m = df_tessellation.iloc[[1]].explore(m=m, color='red')
# m = df_tessellation.iloc[knn_graph3[1].index.values].explore(m=m, color='red')
# m = df_tessellation.iloc[p[1].index.values].explore(m=m, color='green')
# m = knn_graph1.explore(df_tessellation, m=m)
# m