In [19]:
import glob
import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
import shapely

from core.cluster_validation import generate_neigbhourhood_groups, colored_crosstab
from core.utils import used_keys

import umap
import umap.plot
from palettable.colorbrewer.qualitative import Set3_12
from sklearn.metrics import davies_bouldin_score

In [168]:
region_id = 69300

tessellations_dir = '/data/uscuni-ulce/processed_data/tessellations/'
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"

In [169]:
data = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')
graph1 = read_parquet(graph_dir + f"tessellation_graph_{region_id}_knn1.parquet")
tessellation = gpd.read_parquet(
        tessellations_dir + f"tessellation_{region_id}.parquet"
)

In [170]:
from numba import njit

In [175]:
@njit
def _interpolate(weights, group):
    q = (50,)
    nan_tracker = np.isnan(group)
    if nan_tracker.all():
        return np.array([float(np.nan) for _ in q])
    group = group[~nan_tracker]
    sorter = np.argsort(group)
    group = group[sorter]
    weights = weights[~nan_tracker][sorter]

    xs = np.cumsum(weights) - 0.5 * weights
    xs = xs / weights.sum()
    ys = group
    interpolate = np.interp(
        [x / 100 for x in q],
        xs,
        ys,
    )
    return interpolate

In [226]:
@numba.njit(parallel=True)
def partial_weighted_percentile(rows, cols, partial_vals, centroids):
    """rows are the re-mapped focals, cols are re-mapped neighbours"""
    output_vals = 1
    ngroups = len(np.unique(rows))
    nrows = rows.shape[0]
    result = np.empty((ngroups, partial_vals.shape[1] * output_vals))

    istart = 0
    for g in range(ngroups):
        # # find focal start
        # istart = 0
        # while istart < nrows and rows[istart] != g:
        #     istart += 1

        # find neighbors
        iend = istart + 1
        while iend < nrows and rows[iend - 1] == rows[iend]:
            iend += 1

        neighbours = centroids[cols[istart:iend], :]
        focals = centroids[rows[istart:iend], :]
        weights = np.sqrt(((neighbours - focals)**2).sum(axis=1))
        not_zero = weights != 0
        weights = 1 / weights
        
        ## for every column apply iqr and percentiles
        for c in numba.prange(partial_vals.shape[1]):
            
            col_vals = partial_vals[cols[istart:iend], c]
            res_index = output_vals * c

            if np.isnan(col_vals).all():
                result[g, res_index] = np.nan
                continue

            else:
                res = _interpolate(weights[not_zero], col_vals[not_zero])
                result[g, res_index] = res[0]

        # # go to next group
        istart = iend
    return result




In [309]:
k=5
n_splits=10
df = data.copy()
graph = graph1.copy()
centroids = shapely.get_coordinates(tessellation.representative_point())

In [310]:
%%time
A = graph.transform("B").sparse
ids = graph.unique_ids.values
rows = np.arange(A.shape[0])
values = df.values

final_result = pd.DataFrame(
    np.empty((values.shape[0], values.shape[1])), index=ids
)

for source in np.array_split(rows, n_splits):
    Q = A[source, :].copy()
    for _ in range(1, k):
        next_step = Q @ A
        Q += next_step

    sparray = Q.tocoo(copy=False)

    unique_tail = np.unique(sparray.col)
    cols_dict = pd.Series(np.arange(len(unique_tail)), index=unique_tail)
    columns_to_pass = cols_dict.loc[sparray.col].values
    rows_to_pass = cols_dict.loc[source[sparray.row]].values

    partial_vals = values[unique_tail, :]
    partial_centroids = centroids[unique_tail, :]

    partial_res = partial_weighted_percentile(
            rows_to_pass, columns_to_pass, partial_vals, partial_centroids
        )

    final_result.iloc[source, :] = partial_res

CPU times: user 4min 34s, sys: 12.9 s, total: 4min 47s
Wall time: 27.9 s


In [315]:
final_result

Unnamed: 0,sdbAre_median,sdbPer_median,sdbCoA_median,ssbCCo_median,ssbCor_median,ssbSqu_median,ssbERI_median,ssbElo_median,ssbCCM_median,ssbCCD_median,...,ldkAre_median,ldkPer_median,lskCCo_median,lskERI_median,lskCWA_median,ltkOri_median,ltkWNB_median,likWBB_median,sdsAre_median,likWCe_median
-1933,195.536206,59.234079,0.0,0.551804,6.00000,2.070150,0.988501,0.707990,9.579160,0.778894,...,14306.167516,489.118302,0.420975,0.937259,70.369453,5.694610,0.016356,0.373460,6307.359375,0.001344
-1932,154.990729,55.942081,0.0,0.542341,4.00000,1.601308,0.995208,0.676957,8.892781,0.322393,...,248187.669548,2984.315010,0.358700,0.710607,776.564149,6.004363,0.005026,0.095119,9719.656368,0.001048
-1931,188.976609,58.926016,0.0,0.545716,6.00000,2.705394,0.989656,0.708116,9.442064,0.752381,...,15105.515530,527.138139,0.416424,0.911087,86.398868,6.004363,0.007588,0.303245,6933.067193,0.001139
-1930,183.820091,59.118965,0.0,0.544831,5.00000,2.794550,0.991371,0.690259,9.617539,0.972490,...,209208.664808,2984.315010,0.358700,0.710607,776.564149,9.199199,0.006533,0.095119,5995.435221,0.001048
-1929,330.796684,113.111417,0.0,0.495606,8.45527,0.247254,0.987428,0.598741,13.505192,2.868224,...,186488.021458,2633.734079,0.143255,0.714065,617.505791,39.353346,0.002831,0.164008,23915.039836,0.000086
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299059,121.391606,45.941254,0.0,0.498676,4.00000,1.173120,0.995716,0.588973,8.659503,0.200933,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299060,104.184452,42.869792,0.0,0.485859,4.00000,0.673118,0.998227,0.543112,7.433812,0.173459,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299061,90.153414,41.649142,0.0,0.502518,4.00000,0.275308,0.998702,0.506576,7.279844,0.025071,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299062,89.468522,41.430052,0.0,0.512002,4.00000,0.641806,0.998650,0.574248,6.764385,0.101571,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392


In [316]:
primary

Unnamed: 0,sdbAre,sdbPer,sdbCoA,ssbCCo,ssbCor,ssbSqu,ssbERI,ssbElo,ssbCCM,ssbCCD,...,ldkAre,ldkPer,lskCCo,lskERI,lskCWA,ltkOri,ltkWNB,likWBB,sdsAre,likWCe
-1933,,,,,,,,,,,...,12331.629405,517.944880,0.323345,0.955507,118.499644,9.199255,0.015446,,55962.996535,0.000081
-1932,,,,,,,,,,,...,435.260502,274.068246,0.030510,0.730177,159.106057,9.709908,0.021892,,55962.996535,0.002297
-1931,,,,,,,,,,,...,1084.359120,212.010299,0.162748,0.738946,81.720059,4.396669,0.037734,,2422.191399,0.000922
-1930,,,,,,,,,,,...,138.230998,50.222715,0.475208,0.958111,7.628495,12.022472,0.079645,,1547.125907,0.007234
-1929,,,,,,,,,,,...,7659.205935,457.005065,0.233216,0.919543,140.376436,41.802499,0.010941,,23915.039836,0.000131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299059,75.225865,36.503153,0.0,0.523964,4.0,0.308174,0.998871,0.529423,6.757689,0.012755,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299060,99.143049,44.524838,0.0,0.424405,4.0,0.197566,0.998745,0.384297,8.621600,0.013901,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299061,208.004116,66.440793,0.0,0.476146,6.0,9.118904,0.885149,0.674641,9.596795,2.791236,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392
299062,75.241771,38.770296,0.0,0.404768,4.0,4.757553,1.003082,0.377719,7.484418,0.274223,...,28064.447793,680.579552,0.493020,1.042630,81.789059,29.828648,0.002939,0.045659,108672.023661,0.000392


In [317]:
final_result.columns = [c + "_median" for c in primary.columns]

In [318]:
final_result.to_parquet(f'/data/uscuni-ulce/processed_data/context_data/unprocessed_context_chars_{region_id}_lag_{k}_sw.parquet')

In [131]:
%%time
higher = graph1.higher_order(k=3, lower_order=True)
# transform
# percentile

CPU times: user 18.2 s, sys: 1.47 s, total: 19.6 s
Wall time: 19.6 s


In [132]:
%%time

from shapely import distance
centroids = tessellation.representative_point()

def _distance_decay_weights(group):
    focal = group.index[0][0]
    neighbours = group.index.get_level_values(1)
    distances = distance(centroids.loc[focal], centroids.loc[neighbours])
    distance_decay = 1 / distances
    return distance_decay.values

decay_graph = higher.transform(_distance_decay_weights)

CPU times: user 2min 39s, sys: 29.4 ms, total: 2min 39s
Wall time: 2min 39s


In [133]:
import momepy as mm

In [134]:
res = mm.percentile(primary['sdbAre'], decay_graph, q=[50])

In [212]:
primary['sdbAre']

-1933             NaN
-1932             NaN
-1931             NaN
-1930             NaN
-1929             NaN
              ...    
 299059     75.225865
 299060     99.143049
 299061    208.004116
 299062     75.241771
 299063    116.559504
Name: sdbAre, Length: 300997, dtype: float64

In [213]:
res

Unnamed: 0_level_0,50
focal,Unnamed: 1_level_1
-1933,210.208007
-1932,163.100607
-1931,215.135215
-1930,179.147311
-1929,2261.766174
...,...
299059,121.391606
299060,104.184452
299061,90.153414
299062,89.468522


In [230]:
final_result.iloc[:, 0]

-1933       210.208007
-1932       163.100607
-1931       215.135215
-1930       179.147311
-1929      2261.766174
              ...     
 299059     121.391606
 299060     104.184452
 299061      90.153414
 299062      89.468522
 299063      85.371591
Name: 0, Length: 300997, dtype: float64

In [215]:
higher.describe(primary['sdbAre'], statistics=['mean'])['mean']

focal
-1933       278.624281
-1932       291.339485
-1931       306.293042
-1930       258.910183
-1929      3957.084643
              ...     
 299059     120.616620
 299060     118.224902
 299061     107.338795
 299062     120.615029
 299063     116.483256
Name: mean, Length: 300997, dtype: float64

In [231]:
from pandas.testing import assert_series_equal

In [308]:
assert_series_equal(res.drop(isolates)[50], final_result.drop(isolates).iloc[:, 0], check_names=False)

In [281]:
not_equal = res[50] != final_result.iloc[:, 0]

In [282]:
not_equal = not_equal[not_equal].index.values

In [289]:
graph1.cardinalities[not_equal]

focal
-643       1
 554       2
 785       1
 847       2
 1097      2
          ..
 294534    1
 298001    2
 298036    2
 298531    1
 299037    1
Name: cardinalities, Length: 241, dtype: int64

In [303]:
final_result.loc[554, 'sdbAre_median'], res.loc[554, 50],

(nan, nan)

In [304]:
graph1[554]

neighbor
-1039    1
 554     1
Name: weight, dtype: int64

In [296]:
isolates = graph1.assign_self_weight(0).isolates

In [298]:
np.isin(isolates, not_equal).all()

True

In [300]:
final_result.loc[isolates, 'sdbAre_median']

focal
-643      NaN
 785      NaN
 1144     NaN
 1276     NaN
 1542     NaN
           ..
 220333   NaN
 293066   NaN
 294534   NaN
 298531   NaN
 299037   NaN
Name: sdbAre_median, Length: 163, dtype: float64

In [302]:
not_equal[~np.isin(not_equal, isolates)]

array([   554,    847,   1097,   1619,   2218,   3050,   3145,   3177,
         3810,   4082,   4172,   5907,   7987,   8479,   9928,  15250,
        17362,  17513,  17596,  17719,  18349,  18372,  18424,  18510,
        23828,  25453,  25456,  26876,  27376,  27760,  29130,  29421,
        29666,  29936,  30296,  33783,  33803,  39845,  40536,  41345,
        41626,  42220,  43636,  54363,  54642,  55009,  56605,  61686,
        61689,  66609,  67692,  67741,  72972,  75580,  76469,  79025,
        79026,  80612,  80824,  84456,  96796,  96841,  96934,  97452,
       104147, 104238, 104392, 105174, 115558, 126136, 138833, 138836,
       143380, 152700, 221878, 225867, 298001, 298036])