In [1]:
import glob
import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
import shapely
from numba import njit
from core.cluster_validation import generate_neigbhourhood_groups, colored_crosstab
from core.utils import used_keys

from palettable.colorbrewer.qualitative import Set3_12
from sklearn.metrics import davies_bouldin_score

In [2]:
# Identifier of the region processed in this notebook; it is interpolated into
# every input and output file name below.
region_id = 69300

# Absolute input directories for tessellations, characters and neighbourhood graphs.
tessellations_dir = '/data/uscuni-ulce/processed_data/tessellations/'
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"

# Alternative configuration reading everything from the relative Freiburg data
# folder; uncomment these lines (and comment out the ones above) to use it.
# region_id = 'freiburg'
# buildings_dir = streets_dir = enclosures_dir = tessellations_dir = graph_dir = '../data/freiburg/'
# chars_dir = '../data/freiburg/chars/'
# cluster_dir = '/data/uscuni-ulce/processed_data/clusters/'

In [3]:
# Load the three inputs for the region: the primary characters table, the stored
# tessellation graph and the tessellation geometries.
data = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')
graph1 = read_parquet(graph_dir + f"tessellation_graph_{region_id}_knn1.parquet")
tessellation = gpd.read_parquet(
        tessellations_dir + f"tessellation_{region_id}.parquet"
)

In [6]:
# Number of graph steps that define a neighbourhood, and the distance kernel
# name handed to the weighted-percentile routine.
k=3
kernel='inverse'



# Number of chunks the focal rows are split into (consumed by np.array_split).
n_splits=10
# Work on copies so the loaded inputs stay untouched.
df = data.copy()
graph = graph1.copy()
# Coordinates of one representative point per tessellation geometry, as a plain
# array (presumably shape (n, 2) - TODO confirm); later indexed positionally.
centroids = shapely.get_coordinates(tessellation.representative_point())


In [7]:
@njit
def _interpolate(weights, group):
    """Weighted 50th percentile of ``group`` by linear interpolation.

    Parameters
    ----------
    weights : 1-D array
        One weight per entry of ``group``.
    group : 1-D array
        Values to summarise; NaN entries (and their weights) are ignored.

    Returns
    -------
    1-D array with a single element: the interpolated percentile, or NaN when
    ``group`` holds no non-NaN value.
    """
    quantiles = (50,)
    missing = np.isnan(group)
    if missing.all():
        # nothing to interpolate from
        return np.array([float(np.nan) for _ in quantiles])

    present = ~missing
    order = np.argsort(group[present])
    ys = group[present][order]
    sorted_weights = weights[present][order]

    # Each sorted value sits at the midpoint of its own weight interval,
    # expressed as a fraction of the total weight.
    xs = (np.cumsum(sorted_weights) - 0.5 * sorted_weights) / sorted_weights.sum()

    return np.interp([p / 100 for p in quantiles], xs, ys)

In [8]:
@numba.njit(parallel=True)
def partial_weighted_percentile(rows, cols, partial_vals, centroids, kernel):
    """Distance-weighted median of every column over each focal's neighbours.

    ``rows`` are the re-mapped focals and ``cols`` the re-mapped neighbours; both
    index positionally into ``partial_vals`` and ``centroids``. Entries that
    belong to the same focal must be stored contiguously in ``rows`` because
    groups are found by scanning for runs of equal values.

    Parameters
    ----------
    rows, cols : 1-D integer arrays
        One entry per focal-neighbour pair.
    partial_vals : 2-D array
        Values to summarise, one column per variable.
    centroids : 2-D array
        Coordinates used to measure the focal-neighbour distance.
    kernel : str
        ``"gaussian"`` (the historical spelling ``"gausian"`` is still
        accepted), ``"inverse"``, or anything else for the default branch.
        NOTE(review): the default branch uses the negated distance; since
        ``_interpolate`` divides by the sum of the weights the sign cancels,
        so it behaves like weighting by distance itself - confirm intent.

    Returns
    -------
    2-D array with one row per focal group (in order of appearance in
    ``rows``) and one column per column of ``partial_vals``.
    """
    output_vals = 1
    ngroups = len(np.unique(rows))
    nrows = rows.shape[0]
    result = np.empty((ngroups, partial_vals.shape[1] * output_vals))

    istart = 0
    for g in range(ngroups):
        # extend the window until the focal id changes
        iend = istart + 1
        while iend < nrows and rows[iend - 1] == rows[iend]:
            iend += 1

        group_cols = cols[istart:iend]
        neighbours = centroids[group_cols, :]
        focals = centroids[rows[istart:iend], :]
        distances = np.sqrt(((neighbours - focals) ** 2).sum(axis=1))

        # zero-distance pairs (e.g. the focal itself) are left out of the statistic
        not_zero = distances != 0

        if kernel == "gausian" or kernel == "gaussian":
            u = distances / np.max(distances)
            weights = np.exp(-((u / 2) ** 2)) / (np.sqrt(2) * np.pi)
        elif kernel == "inverse":
            weights = 1 / distances
        else:
            # default - reverse weights
            weights = 0 - distances

        # identical for every column, so computed once per group
        group_weights = weights[not_zero]

        for c in numba.prange(partial_vals.shape[1]):
            col_vals = partial_vals[group_cols, c]
            # _interpolate returns NaN itself when nothing but NaN (or nothing
            # at all) is left, so no separate all-NaN branch is needed here
            res = _interpolate(group_weights, col_vals[not_zero])
            result[g, output_vals * c] = res[0]

        # go to next group
        istart = iend
    return result




In [9]:
%%time

def spatially_weighted_partial_lag(df, graph, centroids, kernel, k, n_splits):
    """Distance-weighted median of every column of ``df`` within ``k`` graph steps.

    The focal rows are processed in ``n_splits`` chunks. For each chunk the set
    of positions reachable within ``k`` steps is derived from the binary
    adjacency matrix, and only the values and centroids of those positions are
    handed to ``partial_weighted_percentile``.

    Parameters
    ----------
    df : pandas.DataFrame
        Values to summarise. Rows are used positionally, so they are assumed
        to be in the same order as ``graph.unique_ids`` - TODO confirm at the
        call site.
    graph : graph object providing ``transform("B").sparse`` and ``unique_ids``
    centroids : 2-D array
        Coordinates, indexed positionally like the rows of ``df``.
    kernel : str
        Kernel name forwarded to ``partial_weighted_percentile``.
    k : int
        Number of graph steps defining the neighbourhood.
    n_splits : int
        Number of chunks passed to ``np.array_split``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``graph.unique_ids`` with one ``"<column>_median"`` column
        per column of ``df``. Focals without any stored neighbourhood entry
        are NaN.
    """
    A = graph.transform("B").sparse
    ids = graph.unique_ids.values
    values = df.values

    # NaN-initialised so that a focal which never receives a result is reported
    # as missing instead of exposing uninitialised memory.
    final_result = pd.DataFrame(
        np.full((values.shape[0], values.shape[1]), np.nan),
        index=ids,
        columns=[f"{c}_median" for c in df.columns],
    )

    for source in np.array_split(np.arange(A.shape[0]), n_splits):
        # Accumulate the sparsity pattern of everything reachable in <= k steps;
        # only the pattern is used afterwards, not the stored counts.
        Q = A[source, :].copy()
        for _ in range(1, k):
            next_step = Q @ A
            Q += next_step

        sparray = Q.tocoo(copy=False)
        if sparray.nnz == 0:
            # no focal in this chunk has any neighbourhood entry
            continue

        focal_ids = source[sparray.row]

        # Compact index space covering both ends of every pair, so a focal that
        # is not itself among the reachable columns can still be looked up.
        unique_tail = np.unique(np.concatenate((sparray.col, focal_ids)))
        columns_to_pass = np.searchsorted(unique_tail, sparray.col)
        rows_to_pass = np.searchsorted(unique_tail, focal_ids)

        partial_res = partial_weighted_percentile(
            rows_to_pass,
            columns_to_pass,
            values[unique_tail, :],
            centroids[unique_tail, :],
            kernel,
        )

        # partial_res has one row per focal that owns at least one entry, in
        # order of appearance (ascending row when the entries are row-sorted,
        # which is assumed here as it was before).
        present = source[np.unique(sparray.row)]
        final_result.iloc[present, :] = partial_res

    return final_result

CPU times: user 2 μs, sys: 1 μs, total: 3 μs
Wall time: 3.81 μs


In [10]:
# Distance-weighted median of every character over each cell's k-step neighbourhood.
final_result = spatially_weighted_partial_lag(df, graph, centroids, kernel, k, n_splits)

In [11]:
# Inspect the result: one "<name>_median" column per input column.
final_result

Unnamed: 0,sdbAre_median,sdbPer_median,sdbCoA_median,ssbCCo_median,ssbCor_median,ssbSqu_median,ssbERI_median,ssbElo_median,ssbCCM_median,ssbCCD_median,...,ldkAre_median,ldkPer_median,lskCCo_median,lskERI_median,lskCWA_median,ltkOri_median,ltkWNB_median,likWBB_median,sdsAre_median,likWCe_median
-1902,210.171557,61.377371,0.0,0.542375,6.000000,3.301156,0.986182,0.741245,9.667424,1.290645,...,1.179657e+04,463.762380,0.474843,0.989225,70.369453,5.448302,0.017250,0.375164,5995.435221,0.001391
-1901,164.573089,59.106657,0.0,0.529245,4.000000,2.134267,0.993159,0.655557,9.396817,0.289101,...,2.483909e+05,3011.720634,0.346688,0.706614,797.598902,6.006415,0.005977,0.095117,6032.119058,0.001047
-1900,215.000772,62.155443,0.0,0.537543,6.000000,3.948550,0.986132,0.699259,9.687717,1.355530,...,1.179657e+04,463.762380,0.390027,0.937259,70.369453,6.004363,0.017250,0.317540,5706.189484,0.001344
-1899,192.914002,61.698306,0.0,0.530621,5.242015,4.734015,0.989273,0.637309,9.624832,1.708446,...,1.125985e+06,8631.655793,0.341887,0.515516,2112.136118,12.396072,0.003244,0.095117,3619.874149,0.000424
-1898,2262.152510,213.759129,0.0,0.530939,6.427015,0.154458,0.998176,0.562405,37.107840,0.784127,...,1.055700e+05,1459.004444,0.143255,0.720965,482.972349,40.735333,0.010598,0.131894,24293.642728,0.000404
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299059,121.391606,45.941254,0.0,0.498676,4.000000,1.173120,0.995716,0.588973,8.659503,0.200933,...,2.806445e+04,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299060,104.184452,42.869792,0.0,0.485859,4.000000,0.673118,0.998227,0.543112,7.433812,0.173459,...,2.806445e+04,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299061,90.153414,41.649142,0.0,0.502518,4.000000,0.275308,0.998702,0.506576,7.279844,0.025071,...,2.806445e+04,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299062,89.468522,41.430052,0.0,0.512002,4.000000,0.641806,0.998650,0.574248,6.764385,0.101571,...,2.806445e+04,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392


In [12]:
# Show the raw input characters for comparison with the medians.
data

Unnamed: 0,sdbAre,sdbPer,sdbCoA,ssbCCo,ssbCor,ssbSqu,ssbERI,ssbElo,ssbCCM,ssbCCD,...,ldkAre,ldkPer,lskCCo,lskERI,lskCWA,ltkOri,ltkWNB,likWBB,sdsAre,likWCe
-1902,,,,,,,,,,,...,12331.631344,517.944890,0.323345,0.955507,118.499624,9.199255,0.015446,,22656.466773,0.000081
-1901,,,,,,,,,,,...,435.260502,274.068246,0.030510,0.730176,159.106057,9.709908,0.021892,,3619.874149,0.002297
-1900,,,,,,,,,,,...,1084.360076,212.010389,0.162748,0.738948,81.720096,4.396667,0.037734,,2422.192375,0.000922
-1899,,,,,,,,,,,...,138.230998,50.222715,0.475208,0.958110,7.628495,12.022472,0.079645,,1547.125909,0.007234
-1898,,,,,,,,,,,...,8037.808827,471.004464,0.233662,0.919708,145.143579,42.092543,0.008492,,24293.642728,0.000124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299059,75.225865,36.503153,0.0,0.523964,4.0,0.308174,0.998871,0.529423,6.757689,0.012755,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299060,99.143049,44.524838,0.0,0.424405,4.0,0.197566,0.998745,0.384297,8.621600,0.013901,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299061,208.004116,66.440793,0.0,0.476146,6.0,9.118904,0.885149,0.674641,9.596795,2.791236,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299062,75.241771,38.770296,0.0,0.404768,4.0,4.757553,1.003082,0.377719,7.484418,0.274223,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392


In [13]:
# Persist the result; the file name encodes the region and k but not the kernel used.
final_result.to_parquet(f'/data/uscuni-ulce/processed_data/context_data/unprocessed_context_chars_{region_id}_lag_{k}_sw.parquet')

In [25]:
# Recompute with the kernel spelled out literally ('inverse', the value `kernel` is set to).
final_result = spatially_weighted_partial_lag(df, graph, centroids, 'inverse', k, n_splits)

In [26]:
%%time
# Reference neighbourhood for the validation below. It uses the configured `k`
# so that it always matches the order given to spatially_weighted_partial_lag.
higher = graph1.higher_order(k=k, lower_order=True)
# next steps: transform the weights to distance decay, then take the weighted percentile

CPU times: user 18.4 s, sys: 1.85 s, total: 20.2 s
Wall time: 20.2 s


In [27]:
%%time

from shapely import distance

# The point geometries get their own name: `centroids` already holds the
# coordinate array that spatially_weighted_partial_lag indexes positionally,
# and rebinding it to a GeoSeries would break any re-run of those cells.
centroid_points = tessellation.representative_point()

def _distance_decay_weights(group):
    """Return inverse-distance weights for one focal's group of neighbours.

    ``group`` is the per-focal slice handed over by ``higher.transform``; its
    index is read as (focal, neighbour) pairs. The result is ``1 / distance``
    between the focal's point and each neighbour's point, as a plain array.
    """
    focal = group.index[0][0]
    neighbours = group.index.get_level_values(1)
    distances = distance(centroid_points.loc[focal], centroid_points.loc[neighbours])
    distance_decay = 1 / distances
    return distance_decay.values

decay_graph = higher.transform(_distance_decay_weights)

CPU times: user 2min 40s, sys: 46.9 ms, total: 2min 40s
Wall time: 2min 40s


In [28]:
import momepy as mm

In [30]:
# Reference result: momepy's percentile (q=50) of a single column over the distance-decay graph.
res = mm.percentile(data['sdbAre'], decay_graph, q=[50])

In [32]:
# Raw values of the column used for the comparison.
data['sdbAre']

-1933             NaN
-1932             NaN
-1931             NaN
-1930             NaN
-1929             NaN
              ...    
 299059     75.225865
 299060     99.143049
 299061    208.004116
 299062     75.241771
 299063    116.559504
Name: sdbAre, Length: 300997, dtype: float64

In [33]:
# Reference medians computed by momepy.
res

Unnamed: 0_level_0,50
focal,Unnamed: 1_level_1
-1933,210.208007
-1932,163.100607
-1931,215.135215
-1930,179.147311
-1929,2261.766174
...,...
299059,121.391606
299060,104.184452
299061,90.153414
299062,89.468522


In [34]:
# First column of the custom implementation's output, to compare with `res`.
final_result.iloc[:, 0]

-1933       210.208007
-1932       163.100607
-1931       215.135215
-1930       179.147311
-1929      2261.766174
              ...     
 299059     121.391606
 299060     104.184452
 299061      90.153414
 299062      89.468522
 299063      85.371591
Name: sdbAre_median, Length: 300997, dtype: float64

In [36]:
# Mean of the same column over the higher-order graph, for comparison with the medians.
higher.describe(data['sdbAre'], statistics=['mean'])['mean']

focal
-1933       278.624281
-1932       291.339485
-1931       306.293042
-1930       258.910183
-1929      3957.084643
              ...     
 299059     120.616620
 299060     118.224902
 299061     107.338795
 299062     120.615029
 299063     116.483256
Name: mean, Length: 300997, dtype: float64

In [37]:
from pandas.testing import assert_series_equal

In [39]:
# Zero out the self-weights and collect the resulting isolates - presumably the
# cells whose only neighbour in graph1 is themselves.
isolates = graph1.assign_self_weight(0).isolates

In [40]:
# With isolates excluded, the momepy reference and the custom implementation
# must agree on the first column (series names are allowed to differ).
assert_series_equal(res.drop(isolates)[50], final_result.drop(isolates).iloc[:, 0], check_names=False)

In [281]:
# Element-wise mismatch mask. NaN != NaN evaluates to True, so rows where both
# results are NaN are flagged as well.
not_equal = res[50] != final_result.iloc[:, 0]

In [282]:
# Replace the boolean mask with the ids of the flagged rows. This rebinds
# `not_equal`, so the cell cannot be re-run without re-running the previous one.
not_equal = not_equal[not_equal].index.values

In [289]:
# Cardinalities in graph1 of the flagged cells.
graph1.cardinalities[not_equal]

focal
-643       1
 554       2
 785       1
 847       2
 1097      2
          ..
 294534    1
 298001    2
 298036    2
 298531    1
 299037    1
Name: cardinalities, Length: 241, dtype: int64

In [303]:
# Spot check of one flagged cell: both implementations return NaN here, so it
# is only flagged because NaN != NaN.
final_result.loc[554, 'sdbAre_median'], res.loc[554, 50],

(nan, nan)

In [304]:
# Neighbours of cell 554 in graph1 (the cell itself is listed among them).
graph1[554]

neighbor
-1039    1
 554     1
Name: weight, dtype: int64

In [298]:
# Check that every isolate is among the flagged ids.
np.isin(isolates, not_equal).all()

True

In [300]:
# Values the custom implementation produced for the isolates.
final_result.loc[isolates, 'sdbAre_median']

focal
-643      NaN
 785      NaN
 1144     NaN
 1276     NaN
 1542     NaN
           ..
 220333   NaN
 293066   NaN
 294534   NaN
 298531   NaN
 299037   NaN
Name: sdbAre_median, Length: 163, dtype: float64

In [302]:
# Flagged ids that are not isolates.
not_equal[~np.isin(not_equal, isolates)]

array([   554,    847,   1097,   1619,   2218,   3050,   3145,   3177,
         3810,   4082,   4172,   5907,   7987,   8479,   9928,  15250,
        17362,  17513,  17596,  17719,  18349,  18372,  18424,  18510,
        23828,  25453,  25456,  26876,  27376,  27760,  29130,  29421,
        29666,  29936,  30296,  33783,  33803,  39845,  40536,  41345,
        41626,  42220,  43636,  54363,  54642,  55009,  56605,  61686,
        61689,  66609,  67692,  67741,  72972,  75580,  76469,  79025,
        79026,  80612,  80824,  84456,  96796,  96841,  96934,  97452,
       104147, 104238, 104392, 105174, 115558, 126136, 138833, 138836,
       143380, 152700, 221878, 225867, 298001, 298036])