In [1]:
import momepy as mm
import numpy as np
import pandas as pd
from libpysal.graph import Graph
import geopandas as gpd
import pytest
import glob
import shapely
import gc
from shapely import coverage_simplify
import datetime
from libpysal.graph import read_parquet
from utils import lazy_higher_order, partial_apply
import re
import numba
import matplotlib.pyplot as plt
import seaborn as sns
from utils import used_keys, char_units, standardize_features


# Filesystem locations used throughout the notebook. Every derived path is
# built from the two base directories so they cannot drift apart.
regions_datadir = '/data/uscuni-ulce/'
data_dir = f'{regions_datadir}processed_data/'
eubucco_files = glob.glob(f'{regions_datadir}eubucco_raw/*')
graph_dir = f'{data_dir}neigh_graphs/'
chars_dir = f'{data_dir}chars/'

In [2]:
# Region table; its index holds the region ids used by the cells below.
region_hulls = gpd.read_parquet(f'{regions_datadir}regions/regions_hull.parquet')

In [3]:
def check_available(elements=None, hulls=None, directory=None):
    """Print, per element type, the region ids that have no parquet file.

    For every element a sub-directory ``<directory>/<element>/`` is scanned for
    ``*.parquet`` files. The first run of digits in each *file name* is taken
    as the region id and compared with the index of ``hulls``.

    Parameters
    ----------
    elements : iterable of str, optional
        Element sub-directories to check. Defaults to buildings, enclosures,
        tessellations, nodes and streets.
    hulls : object with ``.index.values``, optional
        Table whose index lists the expected region ids. Defaults to the
        notebook-level ``region_hulls``.
    directory : str, optional
        Directory containing one sub-directory per element. Defaults to the
        notebook-level ``chars_dir``.

    Returns
    -------
    None
        The report is printed, one line per element.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    if elements is None:
        elements = ['buildings', 'enclosures', 'tessellations', 'nodes', 'streets']
    if hulls is None:
        hulls = region_hulls
    if directory is None:
        directory = chars_dir

    for el in elements:
        el_ids = []
        for f in glob.glob(os.path.join(directory, el, '*.parquet')):
            # Parse only the file name: digits in a parent directory name must
            # not be mistaken for the region id.
            digits = re.findall(r'\d+', os.path.basename(f))
            if digits:  # files without any number cannot belong to a region
                el_ids.append(int(digits[0]))
        missing = np.setdiff1d(hulls.index.values, el_ids)
        print(f'Missing {el} for regions {missing}')
# Report, for each element type, which region ids have no parquet file yet.
check_available()

Missing buildings for regions []
Missing enclosures for regions []
Missing tessellations for regions []
Missing nodes for regions []
Missing streets for regions []


In [4]:
# 12199 - hills, small test
# 69300 - prague medium
# 226 - germany somewhere, largest cluster
region_id = 69300

# Look the region up by label instead of scanning with `iterrows`: an unknown
# id now raises KeyError rather than silently leaving the last region of the
# table selected. Assumes region ids are unique in the index.
region_hull = region_hulls.loc[region_id]
region_id

69300

In [6]:
@numba.njit(parallel=True)
def numba_limit_range(rows, cols, partial_vals, output_vals):
    """Per-group 10th / 50th / 90th percentiles of every column of ``partial_vals``.

    ``rows`` and ``cols`` are parallel arrays of (group, member) pairs. All
    pairs sharing one value of ``rows`` form a group; the members of the group
    are the rows ``cols[i]`` of ``partial_vals``.

    Parameters
    ----------
    rows : 1-D integer array
        Group label of every pair. Must be sorted ascending so that each group
        is one contiguous run; the label is used as the output row.
    cols : 1-D integer array
        Same length as ``rows``; row positions into ``partial_vals``.
    partial_vals : 2-D float array
        Values to summarise, one column per variable.
    output_vals : int
        Number of result slots reserved per variable; must be at least 3.
        Slots 0, 1, 2 receive the 10th, 50th and 90th percentile (NaN-aware);
        any further slots stay NaN.

    Returns
    -------
    2-D float64 array of shape
    ``(rows[-1] + 1, partial_vals.shape[1] * output_vals)``. Groups with no
    pairs, and variables that are all-NaN within a group, are NaN.
    """
    if output_vals < 3:
        raise ValueError("output_vals must be at least 3")

    nrows = rows.shape[0]
    ncols = partial_vals.shape[1]

    # Nothing to summarise: avoid indexing rows[-1] on an empty array.
    if nrows == 0:
        return np.empty((0, ncols * output_vals))

    ngroups = int(rows[-1]) + 1
    # Pre-fill with NaN so that absent groups and all-NaN columns need no
    # explicit writes (the previous explicit write of a fourth slot spilled
    # into the neighbouring column, or past the end of the array).
    result = np.full((ngroups, ncols * output_vals), np.nan)

    istart = 0
    while istart < nrows:
        # The output row is the group's own label, so a label that never
        # occurs in `rows` cannot shift the results of later groups.
        g = int(rows[istart])

        # Find the end of the contiguous run belonging to this group.
        iend = istart + 1
        while iend < nrows and rows[iend] == rows[istart]:
            iend += 1

        members = cols[istart:iend]

        # Columns are independent, so they are processed in parallel; each
        # iteration writes only to its own `output_vals` slots.
        for c in numba.prange(ncols):
            col_vals = partial_vals[members, c]

            if np.isnan(col_vals).all():
                continue  # slots already hold NaN

            lower, med, higher = np.nanpercentile(col_vals, (10, 50, 90))
            res_index = output_vals * c
            result[g, res_index] = lower
            result[g, res_index + 1] = med
            result[g, res_index + 2] = higher

        # Go to the next group.
        istart = iend

    return result

In [79]:
def parallel_higher_order_context(df, graph, k, n_splits, output_vals):
    """Summarise every column of ``df`` over each observation's order-``k`` neighbourhood.

    The sparse matrix of ``graph.transform("B")`` is processed in ``n_splits``
    row chunks. For each chunk the chunk's rows of the matrix are repeatedly
    extended by their product with the full matrix (``k - 1`` times), and the
    non-zero pattern of the result defines each row's neighbourhood. The
    values of ``df`` over that neighbourhood are summarised by
    ``numba_limit_range``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to summarise. Rows are addressed positionally, so they
        are assumed to be in the same order as the rows of the graph's sparse
        matrix -- TODO confirm against the caller.
    graph : graph object
        Must provide ``transform("B").sparse`` and ``unique_ids``.
    k : int
        Number of expansion steps; ``k = 1`` uses the matrix as is.
    n_splits : int
        Number of row chunks; bounds the size of the intermediate products.
    output_vals : int
        Number of result columns per input column, forwarded to
        ``numba_limit_range``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``graph.unique_ids`` with ``df.shape[1] * output_vals``
        integer-labelled columns. Rows for which nothing was computed are NaN.
    """
    A = graph.transform("B").sparse
    ids = graph.unique_ids.values
    rows = np.arange(A.shape[0])
    values = df.values

    # NaN-filled rather than uninitialised, so rows that are never written
    # (empty neighbourhoods) do not expose arbitrary memory contents.
    final_result = pd.DataFrame(
        np.full((values.shape[0], values.shape[1] * output_vals), np.nan),
        index=ids,
    )

    for source in np.array_split(rows, n_splits):
        # np.array_split yields empty chunks when n_splits exceeds the row count.
        if source.size == 0:
            continue

        Q = A[source, :].copy()
        for _ in range(1, k):
            next_step = Q @ A
            Q += next_step

        sparray = Q.tocoo(copy=False)
        if sparray.nnz == 0:
            continue

        # One permutation, applied to BOTH coordinate arrays, keeps every
        # (row, col) pair intact whatever order the COO entries arrive in.
        sorter = sparray.row.argsort(kind="stable")

        # `local_cols` maps each global column index to its position within
        # `unique_tail`, i.e. to the matching row of `partial_vals`.
        unique_tail, local_cols = np.unique(sparray.col, return_inverse=True)
        partial_vals = values[unique_tail, :]

        rows_to_pass = sparray.row[sorter]
        columns_to_pass = local_cols[sorter]

        partial_res = numba_limit_range(rows_to_pass, columns_to_pass, partial_vals, output_vals)

        # The kernel returns rows up to the largest row label present; any
        # trailing rows of the chunk without entries keep their NaN fill.
        final_result.iloc[source[: partial_res.shape[0]], :] = partial_res

    return final_result

In [68]:
# Scratch cell: rebuild by hand the inputs that `parallel_higher_order_context`
# derives internally, for step-by-step debugging.
# NOTE: depends on `graph` and `tessellation`, which are only loaded in cells
# further down this notebook -- on a fresh kernel those must be run first.
A = graph.transform("B").sparse
ids = graph.unique_ids.values
rows = np.arange(A.shape[0])
values = tessellation.drop(columns='geometry').values

In [69]:
# Debug with a single focal row: position 0 of the sparse matrix `A`.
source=[0]

In [70]:
# Same expansion loop as in `parallel_higher_order_context`: start from the
# selected rows of A and add the product with A to Q, k - 1 times.
# NOTE: `k` is assigned in a cell further down this notebook.
Q = A[source, :].copy()
for _ in range(1, k):
    next_step = Q @ A
    Q += next_step

In [71]:
# Sanity check: the non-zero columns of Q, offset by the first id, must match
# the index of `higher[ids[0]]`.
# NOTE(review): adding ids[0] presumably maps positions to ids and only works
# if the ids are consecutive -- confirm.
# NOTE: `higher` is created in a cell further down this notebook.
assert np.allclose(sorted(Q.tocoo().col + ids[0]), higher[ids[0]].index.values)
sparray = Q.tocoo(copy=False)

In [72]:
# Permutation that orders the COO entries by row.
sorter = sparray.row.argsort()
# Sorted, de-duplicated column indices that occur in Q.
unique_tail = np.unique(sparray.col)
# Rows of `values` for exactly those columns.
partial_vals = values[unique_tail, :]

In [74]:
np.allclose(partial_vals[:, 0],

array([2.000e+00, 4.042e+03, 1.072e+03, 1.072e+03, 1.072e+03, 1.072e+03,
       1.072e+03, 1.072e+03, 1.072e+03, 1.072e+03])

In [41]:
# Neighbourhood order, passed as `k` to `parallel_higher_order_context` and to
# `graph.higher_order` in the cells below.
k=2

In [42]:
# Load the tessellation parquet (`chars_<region_id>.parquet`) of the selected region.
tessellation = gpd.read_parquet(chars_dir +  f'tessellations/chars_{region_id}.parquet')

In [43]:
# Load the stored tessellation graph of the selected region (file suffix `_knn1`).
graph = read_parquet(graph_dir + f'tessellation_graph_{region_id}_knn1.parquet')

In [84]:
%%time
# Three summary values per non-geometry column (named lower / median / higher
# in the next cell), computed over the order-k neighbourhood in 5 row chunks.
context = parallel_higher_order_context(tessellation.drop(columns='geometry'), graph, k=k, n_splits=5, output_vals=3)

CPU times: user 1min 26s, sys: 10 s, total: 1min 36s
Wall time: 10.4 s


In [87]:
# Label the result: three consecutive columns per source column, in the slot
# order produced by the computation (lower, median, higher).
context.columns = [
    f'{c}{suffix}'
    for c in tessellation.drop(columns='geometry').columns
    for suffix in ('_lower', '_median', '_higher')
]

In [None]:
# Reference result: higher-order graph of order k, requested with lower orders
# and the diagonal included; used below to validate `context`.
higher = graph.higher_order(k=k, lower_order=True, diagonal=True)

In [90]:
from pandas.testing import assert_series_equal

In [92]:
# Validation: the median of `sdcAre` computed through the reference
# higher-order graph must equal the `sdcAre_median` column of `context`
# (series names are ignored).
assert_series_equal(
    higher.describe(tessellation['sdcAre'], statistics=['median'])['median'],
    context['sdcAre_median'],
    check_names=False
)