merge all into momepy

for each region

1. generate clean buildings
2. generate clean streets
3. generate elements, including nodes...
4. generate ngraphs
5. generate primary chars
6. generate context data

Clustering

7. generate regional clusters (to figure out)
8. generate final merged classification
9. finally evaluate clusters (to figure out)

Prediction model

10. generate prediction model data - first 6 steps are the same (TBC)
11. replace clustering steps with modeling steps (TBC)

In [1]:
import datetime
import gc
import glob

import geopandas as gpd
import momepy as mm
import numpy as np
import pandas as pd
from libpysal.graph import Graph

# Root folder of the project data and the folder holding per-region processed outputs.
regions_datadir = "/data/uscuni-ulce/"
data_dir = "/data/uscuni-ulce/processed_data/"
# Every entry under eubucco_raw/; read_region_buildings() iterates over this module-level list.
eubucco_files = glob.glob(regions_datadir + "eubucco_raw/*")

In [2]:
def read_region_buildings(typed_dict, region_ids, region_hull, region_id):
    """Collect the buildings of one region from all raw EUBUCCO files.

    Every path in the module-level ``eubucco_files`` list is read, restricted
    to the bounding box of the buffered region hull, and only the rows whose
    ``id`` belongs to ``region_id`` are kept.

    Parameters
    ----------
    typed_dict : pandas.Series
        Maps a building ``id`` (index) to an integer code (values).
    region_ids : mapping / pandas.Series
        ``region_ids[region_id]`` yields the building ids of that region.
    region_hull : geometry
        Hull of the region; must provide ``.buffer(...)`` and ``.bounds``.
    region_id : hashable
        Key of the region to extract.

    Returns
    -------
    The concatenation of the kept rows of every file (original row labels are
    preserved, as ``pd.concat`` is called without ``ignore_index``).

    Raises
    ------
    ValueError
        If ``eubucco_files`` is empty, i.e. there is nothing to read.
    """
    typed_region_buildings = typed_dict.loc[region_ids[region_id]].values
    # The read window is identical for every file, so compute it once.
    # NOTE(review): the 100 buffer is in CRS units - presumably metres; confirm.
    read_bounds = region_hull.buffer(100).bounds

    parts = []
    for filepath in eubucco_files:
        gdf = gpd.read_file(
            filepath,
            engine="pyogrio",
            columns=["id", "geometry"],
            bbox=read_bounds,
        )
        # Compare integer codes instead of the raw ids.
        typed_gdf_buildings = typed_dict.loc[gdf["id"].values].values
        # assume_unique=True requires ids to be unique within each array.
        to_keep = np.isin(
            typed_gdf_buildings, typed_region_buildings, assume_unique=True
        )
        parts.append(gdf[to_keep])

    if not parts:
        raise ValueError(
            f"No raw EUBUCCO files available; cannot read region {region_id}."
        )

    # Concatenate once instead of re-copying the accumulated frame per file.
    return pd.concat(parts)

In [3]:
# Table with (at least) the columns "id" and "region" - both are used below.
building_region_mapping = pd.read_parquet(
    regions_datadir + "regions/" + "id_to_region.parquet", engine="pyarrow"
)
# Building id -> consecutive integer (the row position in the mapping table).
typed_dict = pd.Series(
    np.arange(building_region_mapping["id"].values.shape[0]),
    index=building_region_mapping["id"].values,
)
# Region -> array of the unique building ids assigned to it.
region_ids = building_region_mapping.groupby("region")["id"].unique()
del building_region_mapping  # free the table; per the original note it takes roughly 2-3 GB

In [164]:
# Region table; later cells read its "convex_hull" column and use its index as region_id.
region_hulls = gpd.read_parquet(regions_datadir + "regions/" + "regions_hull.parquet")

In [165]:
# Development loop: skip every region except 12199, read its buildings, then stop.
# Leaves `region_id`, `region_hull` and `buildings` in the kernel for the cells below.
for region_id, region_hull in region_hulls.iterrows():
    # Replace the row with just its hull geometry.
    region_hull = region_hull["convex_hull"]

    if region_id != 12199:
        continue

    # NOTE(review): n_workers is not used by any visible cell.
    n_workers = -1

    print("----", "Processing region: ", region_id, datetime.datetime.now())
    buildings = read_region_buildings(typed_dict, region_ids, region_hull, region_id)

    break

---- Processing region:  12199 2024-05-22 17:49:44.331497


In [175]:
# Overwrites the `buildings` read from the raw files above with the processed per-region file.
# NOTE(review): data_dir already ends with "/", so this path contains "//".
buildings = gpd.read_parquet(data_dir + f"/buildings/buildings_{region_id}.parquet")

In [176]:
# Processed tessellation of the currently selected region.
tessellation = gpd.read_parquet(
    data_dir + f"tessellations/tessellation_{region_id}.parquet"
)

In [177]:
# Sanity check: how many building geometries there are of each geometry type.
buildings.geom_type.value_counts()

Polygon    7884
Name: count, dtype: int64

In [178]:
# Same check for the tessellation cells.
tessellation.geom_type.value_counts()

Polygon         8425
MultiPolygon     107
Name: count, dtype: int64

### generate graphs

In [21]:
def process_tessellation_graph(region_id):
    """Build and store the tessellation neighbourhood graphs of one region.

    Reads ``tessellations/tessellation_{region_id}.parquet`` from ``data_dir``,
    builds a contiguity graph (``rook=False``) with self-weights and writes it
    as ``..._knn1.parquet``; then derives the ``k=3`` higher-order graph with
    lower orders included and writes it as ``..._knn3.parquet``. Both files go
    to ``data_dir + "neigh_graphs/"``.

    Parameters
    ----------
    region_id
        Region identifier interpolated into the input and output file names.

    Returns
    -------
    None
    """
    out_dir = data_dir + "neigh_graphs/"
    cells = gpd.read_parquet(
        data_dir + f"tessellations/tessellation_{region_id}.parquet"
    )

    # First-order contiguity, every cell also being its own neighbour.
    order1 = Graph.build_contiguity(cells, rook=False).assign_self_weight()
    order1.to_parquet(out_dir + f"tessellation_graph_{region_id}_knn1.parquet")
    print("Build graph knn=1")

    # Everything reachable within three contiguity steps.
    order3 = order1.higher_order(k=3, lower_order=True)
    order3.to_parquet(out_dir + f"tessellation_graph_{region_id}_knn3.parquet")
    print("Build graph knn=3")

    # Drop both graphs explicitly and force a collection before returning.
    del order1, order3
    gc.collect()

In [22]:
def process_buildings_graph(region_id):
    """Build and store the building contiguity graph of one region.

    Reads the region's buildings parquet file, indexes it by the ``id`` column,
    builds a contiguity graph (``rook=False``) with self-weights and writes it
    to ``neigh_graphs/building_graph_{region_id}_knn1.parquet`` under
    ``data_dir``.

    Parameters
    ----------
    region_id
        Region identifier interpolated into the input and output file names.

    Returns
    -------
    None
    """
    footprints = gpd.read_parquet(
        data_dir + f"/buildings/buildings_{region_id}.parquet"
    )

    touching = Graph.build_contiguity(
        footprints.set_index("id"), rook=False
    ).assign_self_weight()

    out_dir = data_dir + "neigh_graphs/"
    touching.to_parquet(out_dir + f"building_graph_{region_id}_knn1.parquet")
    print("Build graph knn=1")

    # Release the frame and the graph before returning.
    del footprints, touching
    gc.collect()

In [27]:
def process_edges_graph(region_id):
    """Build and store the street-segment neighbourhood graphs of one region.

    Reads the region's streets parquet file, indexes it by ``osm_id``, builds a
    contiguity graph (``rook=False``) with self-weights and writes it as
    ``street_graph_{region_id}_knn1.parquet``; then writes the ``k=3``
    higher-order graph (lower orders included) as ``..._knn3.parquet``. Both
    files go to ``data_dir + "neigh_graphs/"``.

    Parameters
    ----------
    region_id
        Region identifier interpolated into the input and output file names.

    Returns
    -------
    None
    """
    out_dir = data_dir + "neigh_graphs/"
    segments = gpd.read_parquet(data_dir + f"/streets/streets_{region_id}.parquet")

    # Segments that touch each other, each segment also linked to itself.
    order1 = Graph.build_contiguity(
        segments.set_index("osm_id"), rook=False
    ).assign_self_weight()
    order1.to_parquet(out_dir + f"street_graph_{region_id}_knn1.parquet")
    print("Build graph knn=1")

    order3 = order1.higher_order(k=3, lower_order=True)
    order3.to_parquet(out_dir + f"street_graph_{region_id}_knn3.parquet")
    print("Build graph knn=3")

    # Release everything large before returning.
    del segments, order1, order3
    gc.collect()

In [28]:
def process_enclosure_graph(region_id):
    """Build and store the enclosure contiguity graph of one region.

    Reads ``enclosures/enclosure_{region_id}.parquet`` from ``data_dir``, builds
    a contiguity graph (``rook=False``) with self-weights and writes it to
    ``neigh_graphs/enclosure_graph_{region_id}_knn1.parquet``.

    Parameters
    ----------
    region_id
        Region identifier interpolated into the input and output file names.

    Returns
    -------
    None
    """
    enclosure_gdf = gpd.read_parquet(
        data_dir + f"enclosures/enclosure_{region_id}.parquet"
    )

    contiguity = Graph.build_contiguity(
        enclosure_gdf, rook=False
    ).assign_self_weight()
    out_dir = data_dir + "neigh_graphs/"
    contiguity.to_parquet(out_dir + f"enclosure_graph_{region_id}_knn1.parquet")

    # Memory is released first; the progress message comes last.
    del enclosure_gdf, contiguity
    gc.collect()
    print("Build graph knn=1")

In [42]:
def process_nodes_graph(region_id):
    """Build and store the street-node neighbourhood graphs of one region.

    Reads the region's streets parquet file, converts it to a networkx graph
    with momepy, and takes the spatial weights that ``mm.nx_to_gdf`` returns
    for the nodes. These are written as ``nodes_graph_{region_id}_knn1.parquet``;
    the ``k=5`` higher-order graph (lower orders included, with self-weights)
    is written as ``..._knn5.parquet``. Both go to ``data_dir + "neigh_graphs/"``.

    Parameters
    ----------
    region_id
        Region identifier interpolated into the input and output file names.

    Returns
    -------
    None
    """
    ## street node graphs
    streets = gpd.read_parquet(data_dir + f"streets/streets_{region_id}.parquet")

    # Use the frame read above. The previous version passed a name that was not
    # defined in this function, so it either failed with a NameError or picked
    # up whatever `streets` happened to exist in the kernel.
    nx_graph = mm.gdf_to_nx(streets)
    nx_graph = mm.node_degree(nx_graph)
    # Only the weights object is needed; node and edge frames are discarded.
    _, _, w = mm.nx_to_gdf(nx_graph, spatial_weights=True)

    graph = Graph.from_W(w)
    graph.to_parquet(
        data_dir + "neigh_graphs/" + f"nodes_graph_{region_id}_knn1.parquet"
    )
    print("Build graph knn=1")

    graph5 = graph.higher_order(k=5, lower_order=True).assign_self_weight()
    graph5.to_parquet(
        data_dir + "neigh_graphs/" + f"nodes_graph_{region_id}_knn5.parquet"
    )

    # Release everything large before returning.
    del graph
    del graph5
    del streets
    del nx_graph
    gc.collect()
    print("Build graph knn=5")

In [4]:
# Driver for the graph builders, restricted to region 12199.
# NOTE(review): the unconditional `break` right after the filter makes everything
# below it unreachable, so as written this cell only sets `region_id`/`region_hull`
# and never builds a graph. Remove the `break` to actually run the pipeline.
for region_id, region_hull in region_hulls.iterrows():
    if region_id != 12199:
        continue

    break
    print(
        datetime.datetime.now(),
        "----Processing ------",
        region_id,
    )
    region_hull = region_hull["convex_hull"]

    process_tessellation_graph(region_id)
    process_buildings_graph(region_id)
    process_edges_graph(region_id)
    process_enclosure_graph(region_id)
    process_nodes_graph(region_id)

In [5]:
from libpysal.graph import read_parquet
from scipy import sparse

# Load the first-order tessellation graph of the selected region.
# `read_parquet` is imported here and the path is built from `data_dir` because
# neither `read_parquet` nor `graph_dir` is defined before this cell when the
# notebook is run top to bottom.
graph = read_parquet(
    data_dir + "neigh_graphs/" + f"tessellation_graph_{region_id}_knn1.parquet"
)

In [6]:
def mem_higher_order(graph, k):
    """Build a higher-order neighbour graph while releasing intermediates early.

    The binary adjacency of ``graph`` is raised to the powers ``2..k`` and the
    powers are summed; every stored entry of the sum becomes one
    (focal, neighbor) pair with weight 1 in the returned ``Graph``.

    Parameters
    ----------
    graph : Graph
        Input graph; must provide ``transform``, ``sparse`` and ``unique_ids``.
    k : int
        Highest power to include. For ``k < 2`` the range is empty, ``sum``
        returns the int ``0`` and the ``.tocoo`` call below fails.

    Returns
    -------
    Graph
        Constructed from the adjacency series with ``is_sorted=True``.

    Notes
    -----
    NOTE(review): power 1 is not part of the sum. Presumably the input carries
    self-weights so that lower orders (and the diagonal) show up in the higher
    powers - the notebook checks the result against
    ``higher_order(k=3, lower_order=True).assign_self_weight()``. Confirm
    before using this on a graph without self-weights.
    """
    # find neighbour levels
    binary = graph.transform("B")
    # Converted to csr_matrix so that `**` below is applied to a sparse *matrix*
    # (matrix power per SciPy's matrix semantics - verify if the class changes).
    sp = sparse.csr_matrix(binary.sparse)
    wk = sum(sp**x for x in range(2, k + 1))

    ## extract focals and neighbours
    ids = graph.unique_ids.values
    sparray = wk.tocoo(copy=False)
    # Order the entries by row position so the pairs are grouped by focal.
    sorter = sparray.row.argsort()
    # Translate integer positions back to the graph's ids.
    head = ids[sparray.row][sorter]
    tail = ids[sparray.col][sorter]

    # clear up memory
    del sorter
    del sparray
    del wk
    del sp
    # Only drops this function's reference; the caller's object is untouched.
    del graph
    # `ids` must stay alive: it is needed for the reindex below.
    # del ids
    del binary
    gc.collect()

    # Constant weight 1 for every (focal, neighbor) pair.
    adjacency = pd.Series(
        1,
        index=pd.MultiIndex.from_arrays([head, tail], names=["focal", "neighbor"]),
        name="weight",
    )

    # clear more mem
    del head
    del tail
    gc.collect()

    # Conform the "neighbor" level (level=1) to `ids`.
    # NOTE(review): this step dominated the runtime (about 12 minutes) in the
    # timed step-by-step run recorded later in this notebook.
    adjacency = adjacency.reindex(ids, level=1, copy=False)

    return Graph(adjacency, is_sorted=True)

In [7]:
# Third-order graph from the memory-conscious implementation.
new_g3 = mem_higher_order(graph, 3)

In [8]:
# Reference result from the library's own higher_order().
old_g3 = graph.higher_order(k=3, lower_order=True).assign_self_weight()

In [9]:
# Turn both adjacency indices into arrays of (focal, neighbor) pairs for comparison.
# NOTE(review): this rebinds old_g3/new_g3 from graphs to arrays, so the cell
# cannot be re-run without re-running the two cells above first.
old_g3 = np.stack(old_g3.adjacency.index)
new_g3 = np.stack(new_g3.adjacency.index)

In [10]:
# Both implementations must produce the same pairs in the same order.
assert (old_g3 == new_g3).all()

## generate chars

In [1]:
import datetime
import gc
import glob

import geopandas as gpd
import momepy as mm
import numpy as np
import pandas as pd
from libpysal.graph import Graph, read_parquet
from scipy import sparse

# Same locations as in the graph-generation part, plus the neighbourhood-graph folder.
regions_datadir = "/data/uscuni-ulce/"
data_dir = "/data/uscuni-ulce/processed_data/"
graph_dir = data_dir + "neigh_graphs/"
eubucco_files = glob.glob(regions_datadir + "eubucco_raw/*")

In [2]:
# Region table; its index is iterated as region_id in the next cell.
region_hulls = gpd.read_parquet(regions_datadir + "regions/" + "regions_hull.parquet")

In [3]:
# Select the first region (in iteration order) whose id is at least 107685 and
# leave `region_id` / `region_hull` bound to it for the cells below.
for region_id, region_hull in region_hulls.iterrows():
    if region_id < 107685:
        continue

    break

In [4]:
# First-order tessellation graph of the selected region.
graph = read_parquet(
    data_dir + "neigh_graphs/" + f"tessellation_graph_{region_id}_knn1.parquet"
)

In [5]:
# Number of unique ids in the graph.
graph.unique_ids.shape

(3021332,)

In [6]:
# The cells from here to the Graph(...) call repeat mem_higher_order() step by step for timing.
binary = graph.transform("B")

In [7]:
# Sparse CSR matrix of the binary adjacency.
sp = sparse.csr_matrix(binary.sparse)

In [8]:
# Highest matrix power to include.
k = 3

In [9]:
# Sum of the powers 2..k of the adjacency matrix.
wk = sum(sp**x for x in range(2, k + 1))

In [10]:
%%time
# COO form exposes the .row / .col arrays used below.
sparray = wk.tocoo(copy=False)

CPU times: user 977 ms, sys: 568 ms, total: 1.54 s
Wall time: 1.54 s


In [11]:
%%time
# Permutation that orders the entries by row position.
sorter = sparray.row.argsort()

CPU times: user 16.6 s, sys: 1.16 s, total: 17.7 s
Wall time: 17.7 s


In [12]:
# Ids as a plain array, used to translate row/col positions back to ids.
ids = graph.unique_ids.values

In [13]:
%%time
# Focal (head) and neighbour (tail) ids, ordered by row position.
head = ids[sparray.row][sorter]
tail = ids[sparray.col][sorter]

CPU times: user 8.86 s, sys: 3.92 s, total: 12.8 s
Wall time: 12.8 s


In [14]:
# Drop the intermediates that are no longer needed before building the adjacency.
# NOTE: this also removes `graph` from the kernel; later cells that need it must reload it.
del sorter
del sparray
del wk
del sp
del graph
del binary

gc.collect()

69

In [15]:
%%time
# One row per (focal, neighbor) pair, all with weight 1.
adjacency = pd.Series(
    1,
    index=pd.MultiIndex.from_arrays([head, tail], names=["focal", "neighbor"]),
    name="weight",
)

CPU times: user 16.8 s, sys: 6.11 s, total: 22.9 s
Wall time: 22.9 s


In [16]:
# The id arrays are no longer needed once the MultiIndex has been built from them.
del head
del tail
gc.collect()

0

In [17]:
%%time
# Conform the "neighbor" level to `ids`. By far the slowest step in the recorded timings.
# Unlike mem_higher_order(), this call does not pass copy=False.
adjacency = adjacency.reindex(ids, level=1)

CPU times: user 11min 33s, sys: 23 s, total: 11min 56s
Wall time: 11min 56s


In [19]:
# Wrap the adjacency in a Graph; is_sorted=True is passed exactly as in mem_higher_order().
g = Graph(adjacency, is_sorted=True)

In [75]:
# Build and store the k-th order tessellation graph with mem_higher_order().
k = 3
# `graph` was deleted by the clean-up cell of the step-by-step run above, so
# reload the first-order graph here instead of relying on leftover kernel state.
graph = read_parquet(
    data_dir + "neigh_graphs/" + f"tessellation_graph_{region_id}_knn1.parquet"
)
graph3 = mem_higher_order(graph, k)
graph3.to_parquet(
    data_dir + "neigh_graphs/" + f"tessellation_graph_{region_id}_knn{k}.parquet"
)
print("Built tess graph knn=", k)

In [15]:
# Load the four processed element layers of the selected region.
# NOTE(review): data_dir already ends with "/", so these paths contain "//".
buildings = gpd.read_parquet(data_dir + f"/buildings/buildings_{region_id}.parquet")
streets = gpd.read_parquet(data_dir + f"/streets/streets_{region_id}.parquet")
tessellation = gpd.read_parquet(
    data_dir + f"/tessellations/tessellation_{region_id}.parquet"
)
enclosures = gpd.read_parquet(data_dir + f"/enclosures/enclosure_{region_id}.parquet")

In [16]:
# tess[tess.index >= 0].explore()

In [17]:
# tess.explore()

In [18]:
# First-order building contiguity graph as written by process_buildings_graph().
buildings_q1 = read_parquet(
    data_dir + "neigh_graphs/" + f"building_graph_{region_id}_knn1.parquet"
)

In [19]:
# Building-level characters. Each column stores the output of the geometry
# attribute or momepy function named on its right-hand side.
# NOTE: the columns are added to `buildings` in place.
buildings["sdbAre"] = buildings.geometry.area
buildings["sdbPer"] = buildings.geometry.length
buildings["sdbCoA"] = mm.courtyard_area(buildings.geometry)

buildings["ssbCCo"] = mm.circular_compactness(buildings)
buildings["ssbCor"] = mm.corners(buildings.geometry)
buildings["ssbSqu"] = mm.squareness(buildings.geometry)

buildings["ssbERI"] = mm.equivalent_rectangular_index(buildings.geometry)
buildings["ssbElo"] = mm.elongation(buildings.geometry)

# One call yields both the "mean" and the "std" column.
cencon = mm.centroid_corner_distance(buildings)
buildings["ssbCCM"] = cencon["mean"]
buildings["ssbCCD"] = cencon["std"]

buildings["stbOri"] = mm.orientation(buildings)
buildings["mtbSWR"] = mm.shared_walls_ratio(
    mm.shared_walls(buildings), buildings.geometry.length
)

# These two additionally take the building contiguity graph.
buildings["libNCo"] = mm.courtyards(buildings, buildings_q1)
buildings["ldbPWL"] = mm.perimeter_wall(buildings, buildings_q1)

  angles = np.arccos(cosine_angle)
  angles = np.arccos(cosine_angle)
  angles = np.arccos(cosine_angle)


In [37]:
# Reload the tessellation, drop rows whose index label repeats an earlier one
# (the first occurrence is kept) and sort by index.
tessellation = gpd.read_parquet(
    data_dir + f"tessellations/tessellation_{region_id}.parquet"
)
tessellation = tessellation[~tessellation.index.duplicated()].sort_index()

In [53]:
# First- and third-order tessellation graphs as written by the graph-generation step.
queen_1 = read_parquet(
    data_dir + "neigh_graphs/" + f"tessellation_graph_{region_id}_knn1.parquet"
)
queen_3 = read_parquet(
    data_dir + "neigh_graphs/" + f"tessellation_graph_{region_id}_knn3.parquet"
)

In [39]:
# Per-cell characters that need the geometry only; columns are added in place.
tessellation["stcOri"] = mm.orientation(tessellation)
tessellation["sdcLAL"] = mm.longest_axis_length(tessellation)
tessellation["sdcAre"] = tessellation.geometry.area
tessellation["sscCCo"] = mm.circular_compactness(tessellation)
tessellation["sscERI"] = mm.equivalent_rectangular_index(tessellation.geometry)

In [40]:
# Per-cell characters that also use the first-order tessellation graph.
tessellation["mtcWNe"] = mm.neighbors(tessellation, queen_1, weighted=True)
tessellation["mdcAre"] = mm.covered_area(tessellation.geometry.area, queen_1)

In [54]:
# NOTE(review): this cell uses `tess`, which no visible cell assigns (the frame
# loaded above is `tessellation`), and the recorded run failed with a
# length-mismatch ValueError. Presumably it should operate on `tessellation`
# (the one aligned with queen_3) - confirm, and confirm that an "eID" column exists.
tess["ltcWRE"] = mm.block_counts(tess["eID"], queen_3)

ValueError: cannot set using a list-like indexer with a different length than the value

In [42]:
# Reload the enclosures of the selected region.
enclosures = gpd.read_parquet(data_dir + f"enclosures/enclosure_{region_id}.parquet")

In [43]:
# First-order enclosure contiguity graph as written by process_enclosure_graph().
blo_q1 = read_parquet(
    data_dir + "neigh_graphs/" + f"enclosure_graph_{region_id}_knn1.parquet"
)

In [44]:
# Enclosure-level characters from the geometry alone; columns are added in place.
enclosures["ldeAre"] = enclosures.area
enclosures["ldePer"] = enclosures.length
enclosures["lseCCo"] = mm.circular_compactness(enclosures)
enclosures["lseERI"] = mm.equivalent_rectangular_index(enclosures)
enclosures["lseCWA"] = mm.compactness_weighted_axis(enclosures.geometry)
enclosures["lteOri"] = mm.orientation(enclosures)

In [45]:
### Hand-rolled neighbour count (the original note says this is what mm.block_counts does).
# Query the spatial index with every enclosure to find the enclosures it intersects.
inp, res = enclosures.sindex.query(enclosures.geometry, predicate="intersects")
# Number of hits per querying enclosure.
indices, counts = np.unique(inp, return_counts=True)
# Minus one - presumably to discount each enclosure intersecting itself.
# NOTE(review): `counts` is assigned positionally, which assumes every row has at
# least one hit (so `indices` covers all rows in order); `indices` and `res` are unused.
enclosures["neighbors"] = counts - 1
# Neighbour count divided by the enclosure perimeter.
enclosures["lteWNB"] = enclosures["neighbors"] / enclosures["ldePer"]

NameError: name 'queen_3' is not defined

In [None]:
# Load the street segments of the selected region. The path argument was missing,
# so the call could not run; it now reads the same per-region streets file as the
# other cells of this notebook.
streets = gpd.read_parquet(data_dir + f"streets/streets_{region_id}.parquet")

In [32]:
%%time
# Convert the street GeoDataFrame into a networkx graph.
# Rebinds the name `graph`, which earlier cells used for a libpysal Graph.
graph = mm.gdf_to_nx(streets)

CPU times: user 27.8 s, sys: 388 ms, total: 28.2 s
Wall time: 28 s


In [33]:
%%time
# Add node degree to the graph; mm.subgraph below is told to read it via degree="degree".
graph = mm.node_degree(graph)

CPU times: user 10.8 s, sys: 180 ms, total: 11 s
Wall time: 11 s


In [34]:
%%time
# Subgraph-based node characters within radius=5. Only meshedness, the node
# proportions for degrees 0, 3 and 4, and local closeness are switched on;
# every other optional measure is explicitly disabled.
graph = mm.subgraph(
    graph,
    radius=5,
    meshedness=True,
    cds_length=False,
    mode="sum",
    degree="degree",
    length="mm_len",
    mean_node_degree=False,
    proportion={0: True, 3: True, 4: True},
    cyclomatic=False,
    edge_node_ratio=False,
    gamma=False,
    local_closeness=True,
    closeness_weight="mm_len",
    verbose=False,
)

CPU times: user 3min 46s, sys: 225 ms, total: 3min 47s
Wall time: 3min 46s


In [35]:
%%time
# Computed separately from mm.subgraph (where cds_length=False) with a smaller radius of 3.
graph = mm.cds_length(graph, radius=3, name="ldsCDL", verbose=False)

CPU times: user 2min 1s, sys: 260 ms, total: 2min 2s
Wall time: 2min 1s


In [36]:
%%time
# Result of mm.clustering stored under the attribute name "xcnSCl".
graph = mm.clustering(graph, name="xcnSCl")

CPU times: user 13.9 s, sys: 39.6 ms, total: 14 s
Wall time: 14 s


In [37]:
%%time
# Result of mm.mean_node_dist stored under the attribute name "mtdMDi".
graph = mm.mean_node_dist(graph, name="mtdMDi", verbose=False)

CPU times: user 19.9 s, sys: 344 ms, total: 20.3 s
Wall time: 20.3 s


In [38]:
%%time
# Back to GeoDataFrames: nodes, edges and the spatial weights of the nodes.
nodes, edges, sw = mm.nx_to_gdf(graph, spatial_weights=True)

 There are 600063 disconnected components.


CPU times: user 26.9 s, sys: 12.4 ms, total: 26.9 s
Wall time: 26.9 s


In [None]:
# Link tessellation cells to street edges and nodes (draft, never executed).
# NOTE(review): the visible import cells bind momepy only as `mm`, so the name
# `momepy` is undefined here; `tess` is not assigned in any visible cell either
# (the frame is called `tessellation`). Update both before running.
%time links = momepy.get_network_ratio(tess, edges)
tess[["edgeID_keys2", "edgeID_values2"]] = links
%time tess['nodeID'] = momepy.get_node_id(tess, nodes, edges, node_id='nodeID', edge_keys='edgeID_keys2', edge_values='edgeID_values2')

In [None]:
%%time
# Sum of the "sdcAre" values of the cells linked to each node via "nodeID" (draft, never executed).
# NOTE(review): relies on the names `momepy` and `tess`, neither of which is
# bound in the visible cells, and on the "nodeID" column from the previous draft cell.
nodes["sddAre"] = momepy.Reached(
    nodes, tess, "nodeID", "nodeID", mode="sum", values="sdcAre"
).series

In [10]:
# For every tessellation file, report how many cells have an area above 50_000
# (in squared CRS units).
# NOTE(review): `tesselation_files` is not defined in any visible cell.
for tf in tesselation_files:
    tesselations = gpd.read_parquet(tf)
    tarea = tesselations.area
    print(tf, (tarea > 50_000).sum())

/data/uscuni-ulce/tesselations/tesselation_1554.parquet 3
/data/uscuni-ulce/tesselations/tesselation_13224.parquet 4
/data/uscuni-ulce/tesselations/tesselation_32541.parquet 4
/data/uscuni-ulce/tesselations/tesselation_100115.parquet 81
/data/uscuni-ulce/tesselations/tesselation_120665.parquet 12
/data/uscuni-ulce/tesselations/tesselation_99170.parquet 13
/data/uscuni-ulce/tesselations/tesselation_37812.parquet 9
/data/uscuni-ulce/tesselations/tesselation_104389.parquet 23
/data/uscuni-ulce/tesselations/tesselation_8191.parquet 3
/data/uscuni-ulce/tesselations/tesselation_108009.parquet 6
/data/uscuni-ulce/tesselations/tesselation_42925.parquet 6
/data/uscuni-ulce/tesselations/tesselation_8754.parquet 210
/data/uscuni-ulce/tesselations/tesselation_35468.parquet 3
/data/uscuni-ulce/tesselations/tesselation_113651.parquet 42
/data/uscuni-ulce/tesselations/tesselation_46214.parquet 31
/data/uscuni-ulce/tesselations/tesselation_56119.parquet 10
/data/uscuni-ulce/tesselations/tesselation_52

In [11]:
# tesselations.reset_index().explore()

In [7]:
# Cut a 3_000-row sample (positions 500_000 to 503_000) out of region 4's buildings.
# NOTE: this overwrites the `region_id` and `buildings` used by the cells above.
region_id = 4
buildings = gpd.read_parquet(data_dir + f"/buildings/buildings_{region_id}.parquet")
s = 500_000
e = s + 3_000
# reset_index() turns the previous index into a regular column.
problem_buildings = buildings.iloc[s:e].reset_index()

In [9]:
# Save the sample for debugging. The path is relative to the working directory,
# unlike the absolute data_dir used everywhere else.
problem_buildings.to_parquet("data/buffer_problem_buildings.parquet")