In [1]:
import glob

import geopandas as gpd
import matplotlib.pyplot as plt
import numba
import numpy as np
import pandas as pd
from libpysal.graph import read_parquet
from sklearn.preprocessing import PowerTransformer, RobustScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier


In [2]:
from core.cluster_validation import generate_validation_groups, colored_crosstab
from core.utils import used_keys

In [3]:
from palettable.colorbrewer.qualitative import Set3_12
from sklearn.metrics import davies_bouldin_score

In [4]:
# Identifier of the region processed in this notebook; it is interpolated into
# the name of every parquet file read below.
region_id = 4182

# Tessellation and graph files are read from the same folder.
tessellations_dir = '../data/ms_buildings/'
graph_dir = tessellations_dir
# The primary characters are read from a sub-folder.
chars_dir = '../data/ms_buildings/chars/'

In [5]:
# Load the primary characters table of the region; it is the input to the
# context computation further down.
primary = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')

In [6]:
# Load the tessellation geometries of the same region.
tessellation_path = tessellations_dir + f"tessellation_{region_id}.parquet"
tessellation = gpd.read_parquet(tessellation_path)

### Generate context

In [7]:
@numba.njit(parallel=True)
def numba_limit_range(rows, cols, partial_vals, output_vals):
    """Compute the 15th, 50th and 85th percentile of every column per focal row.

    Parameters
    ----------
    rows : 1-D integer array
        Zero-based focal (group) id of every entry, sorted ascending. The
        number of output rows is ``rows[-1] + 1``, so ``rows`` must not be
        empty.
    cols : 1-D integer array
        For every entry of ``rows``, the row of ``partial_vals`` that holds the
        values of the corresponding neighbour.
    partial_vals : 2-D float array
        Values to summarise, one column per variable.
    output_vals : int
        Number of output slots reserved per input column. Exactly three values
        (lower, median, higher) are written per column, so callers are expected
        to pass 3; any additional slot is left as NaN.

    Returns
    -------
    2-D float array
        Shape ``(rows[-1] + 1, partial_vals.shape[1] * output_vals)``. For
        input column ``c`` the percentiles are stored at columns
        ``output_vals * c``, ``+ 1`` and ``+ 2``. Focal rows without any entry,
        or whose values are all NaN, get NaN.
    """
    ngroups = int(rows[-1]) + 1
    nrows = rows.shape[0]
    ncols = partial_vals.shape[1]
    # NaN-filled so that no slot is ever returned uninitialised.
    result = np.full((ngroups, ncols * output_vals), np.nan)

    istart = 0
    for g in range(ngroups):
        # Collect the run of entries that belong to focal ``g``. Matching on
        # the id itself (instead of "same id as the previous entry") keeps
        # output row ``g`` aligned with focal ``g`` even when some focal has no
        # entries at all; such a focal yields an empty run and therefore NaN.
        iend = istart
        while iend < nrows and rows[iend] == g:
            iend += 1

        # Columns are independent of each other, so they are processed in
        # parallel.
        for c in numba.prange(ncols):
            col_vals = partial_vals[cols[istart:iend], c]
            res_index = output_vals * c

            # Empty or all-NaN neighbourhoods have no percentiles.
            if np.isnan(col_vals).all():
                result[g, res_index] = np.nan
                result[g, res_index + 1] = np.nan
                result[g, res_index + 2] = np.nan
                continue

            lower, med, higher = np.nanpercentile(col_vals, (15, 50, 85))
            result[g, res_index] = lower
            result[g, res_index + 1] = med
            result[g, res_index + 2] = higher

        # The next focal starts where this one ended.
        istart = iend
    return result

In [8]:
def parallel_higher_order_context(df, graph, k, n_splits, output_vals):
    """Summarise every column of ``df`` over the graph neighbourhood of order ``k``.

    The binary adjacency matrix of ``graph`` is expanded, chunk by chunk, so
    that a focal row has a non-zero entry for every node reachable in at most
    ``k`` steps. The values of those nodes are then passed to
    ``numba_limit_range``, which returns three percentiles per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Values to summarise. Rows are addressed positionally with the integer
        positions of the sparse matrix, so ``df`` is assumed to be ordered like
        ``graph.unique_ids`` (not checked here).
    graph : libpysal.graph.Graph
        Graph whose ``transform("B").sparse`` matrix defines the neighbours.
    k : int
        Number of steps to expand; the loop below runs ``k - 1`` times.
    n_splits : int
        Number of chunks the focal rows are split into, which bounds the size
        of the intermediate sparse products.
    output_vals : int
        Number of output columns reserved per input column; forwarded to
        ``numba_limit_range``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``graph.unique_ids`` and
        ``df.shape[1] * output_vals`` columns. Rows that receive no result
        stay NaN.
    """
    A = graph.transform("B").sparse
    ids = graph.unique_ids.values
    rows = np.arange(A.shape[0])
    values = df.values

    # NaN-initialised (not np.empty) so that rows which never get a result do
    # not expose uninitialised memory.
    final_result = pd.DataFrame(
        np.full((values.shape[0], values.shape[1] * output_vals), np.nan), index=ids
    )

    for source in np.array_split(rows, n_splits):
        # np.array_split yields empty chunks when n_splits exceeds the row count.
        if source.size == 0:
            continue

        # Accumulate the reachability pattern of the chunk.
        Q = A[source, :].copy()
        for _ in range(1, k):
            next_step = Q @ A
            Q += next_step

        sparray = Q.tocoo(copy=False)
        if sparray.nnz == 0:
            # Nothing is reachable from this chunk; its rows stay NaN.
            continue

        # Order the entries by focal row. The same permutation has to be
        # applied to rows and columns, otherwise the (row, col) pairs only stay
        # matched when the COO happens to be row-sorted already.
        sorter = sparray.row.argsort(kind="stable")
        rows_to_pass = sparray.row[sorter]

        # Restrict the values to the nodes that actually occur in this chunk
        # and translate global column ids to positions in that restricted array.
        unique_tail, tail_position = np.unique(sparray.col, return_inverse=True)
        partial_vals = values[unique_tail, :]
        columns_to_pass = tail_position[sorter]

        partial_res = numba_limit_range(
            rows_to_pass, columns_to_pass, partial_vals, output_vals
        )

        # The kernel returns rows up to the last focal that has entries; any
        # trailing focal without entries keeps its NaN initial value.
        n_scored = partial_res.shape[0]
        final_result.iloc[source[:n_scored], :] = partial_res

    return final_result

In [9]:
# Load the graph over the tessellation cells of the region with libpysal's
# parquet reader (the file name carries a `knn1` suffix).
graph = read_parquet(graph_dir + f"tessellation_graph_{region_id}_knn1.parquet")

In [10]:
## Discarding disconnected components and focusing on a single component only
## doesn't make a big difference.

In [11]:
# Number of nodes in each connected component of the graph, largest first.
graph.component_labels.value_counts()

component labels
26     111187
134     11517
68       9874
95       7773
2        4436
        ...  
846         1
847         1
834         1
832         1
836         1
Name: count, Length: 858, dtype: int64

In [12]:
# Use the complete characters table and the complete graph as inputs to the
# context computation (no component is filtered out).
data = primary
data_graph = graph

In [13]:
%%time
# Summarise every character over the nodes reachable within k=5 graph steps,
# processing the focal rows in 10 chunks; output_vals=3 reserves three output
# columns (lower / median / higher percentile) per character.
context = parallel_higher_order_context(
    data, data_graph, k=5, n_splits=10, output_vals=3
)

CPU times: user 2min 39s, sys: 8.85 s, total: 2min 48s
Wall time: 18.9 s


In [14]:
# Name the three output columns of every character; the order matches the
# order in which the percentiles are written (lower, median, higher).
context_column_names = []
for c in primary.columns:
    for suffix in ("_lower", "_median", "_higher"):
        context_column_names.append(c + suffix)
context.columns = context_column_names

In [15]:
# X = context

In [16]:
# higher = graph.higher_order(k=3, lower_order=True, diagonal=True)
# r = higher.describe(primary['sdbAre'], statistics=['median'])['median']
# from pandas.testing import assert_series_equal
# assert_series_equal(context['sdbAre_median'], r, check_names=False)

# r.duplicated().sum()

### Data preprocessing

In [17]:
# Reference groups used to score each preprocessing variant below.
# NOTE(review): presumably a Series of group labels indexed by tessellation id
# (check_score uses its .index and .values) - confirm in core.cluster_validation.
tess_groups = generate_validation_groups(tessellation, include_random_sample=True)

def check_score(data, example_clusters):
    """Davies-Bouldin score of the reference groups in the feature space ``data``.

    Only the entries of ``example_clusters`` whose index label is present in
    ``data.index`` are scored. Lower values indicate better separated groups.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix.
    example_clusters : pandas.Series
        Group label per observation, indexed by the same ids as ``data``.

    Returns
    -------
    float
        Result of ``sklearn.metrics.davies_bouldin_score``.
    """
    present = example_clusters.index.isin(data.index)
    groups = example_clusters[present]

    # Translate the index labels of the groups to row positions of ``data``.
    position_of = pd.Series(np.arange(len(data)), index=data.index)
    groups_ilocs = position_of.loc[groups.index].values

    selected_rows = data.iloc[groups_ilocs]
    return davies_bouldin_score(selected_rows, groups.values)

In [18]:
## no spatial lag
# Standardise the raw characters, replace NaN by 0 and score the result.
X = pd.DataFrame(
    np.nan_to_num(StandardScaler().fit_transform(data)),
    columns=data.columns,
    index=data.index,
)
check_score(X, tess_groups)

np.float64(7.112594696785641)

In [19]:
## no spatial lag
# Same as above, but scaled with RobustScaler.
X = pd.DataFrame(
    np.nan_to_num(RobustScaler().fit_transform(data)),
    columns=data.columns,
    index=data.index,
)
check_score(X, tess_groups)

np.float64(23.302472243546216)

In [20]:
## no spatial lag
# Same as above, but transformed with PowerTransformer.
X = pd.DataFrame(
    np.nan_to_num(PowerTransformer().fit_transform(data)),
    columns=data.columns,
    index=data.index,
)
check_score(X, tess_groups)

np.float64(7.082113961203566)

In [21]:
### only medians

In [22]:
# Keep only the median column of every character, standardise, replace NaN by
# 0 and score.
median_context = context.loc[:, context.columns.str.endswith("_median")]
X = pd.DataFrame(
    np.nan_to_num(StandardScaler().fit_transform(median_context)),
    columns=median_context.columns,
    index=median_context.index,
)
check_score(X, tess_groups)

np.float64(3.459891607868656)

In [23]:
# Median columns only, scaled with RobustScaler.
median_context = context.loc[:, context.columns.str.endswith("_median")]
X = pd.DataFrame(
    np.nan_to_num(RobustScaler().fit_transform(median_context)),
    columns=median_context.columns,
    index=median_context.index,
)
check_score(X, tess_groups)

np.float64(5.116880011427646)

In [24]:
# Median columns only, transformed with PowerTransformer after shifting every
# value by 1.
# NOTE(review): the reason for the +1 shift is not evident from this notebook.
median_context = context.loc[:, context.columns.str.endswith("_median")]
X = pd.DataFrame(
    np.nan_to_num(PowerTransformer().fit_transform(median_context + 1)),
    columns=median_context.columns,
    index=median_context.index,
)
check_score(X, tess_groups)

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


np.float64(3.21157342596039)

In [25]:
## clip medians to -10, 10 standard deviations
median_context = context.loc[:, context.columns.str.endswith("_median")]
X = pd.DataFrame(
    np.nan_to_num(StandardScaler().fit_transform(median_context)),
    columns=median_context.columns,
    index=median_context.index,
).clip(-10, 10)
check_score(X, tess_groups)

np.float64(3.438079745593143)

In [26]:
# all features with lag

In [27]:
# no standardisation
# All context columns as they are, with NaN replaced by 0.
X = pd.DataFrame(
    np.nan_to_num(context),
    columns=context.columns,
    index=context.index,
)
check_score(X, tess_groups)

np.float64(5.145511219685344)

In [28]:
# All context columns, standardised.
X = pd.DataFrame(
    np.nan_to_num(StandardScaler().fit_transform(context)),
    columns=context.columns,
    index=context.index,
)
check_score(X, tess_groups)

np.float64(3.198138805724251)

In [29]:
# All context columns, scaled with RobustScaler.
X = pd.DataFrame(
    np.nan_to_num(RobustScaler().fit_transform(context)),
    columns=context.columns,
    index=context.index,
)
check_score(X, tess_groups)

np.float64(4.639601177762417)

In [30]:
# All context columns, transformed with PowerTransformer after shifting every
# value by 1.
# NOTE(review): the reason for the +1 shift is not evident from this notebook.
X = pd.DataFrame(
    np.nan_to_num(PowerTransformer().fit_transform(context + 1)),
    columns=context.columns,
    index=context.index,
)
check_score(X, tess_groups)

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims=keepdims, where=where)


np.float64(3.027264749191601)

In [31]:
## clip to -10, 10 standard deviations
# This X is the feature matrix that the filtering steps below start from.
X = pd.DataFrame(
    np.nan_to_num(StandardScaler().fit_transform(context)),
    columns=context.columns,
    index=context.index,
).clip(-10, 10)
check_score(X, tess_groups)

np.float64(3.1589526898388787)

In [32]:
# Distribution of one unscaled context column; the maximum lies far above the
# 75th percentile, i.e. the column has a long right tail.
context["sdbAre_lower"].describe()

count    236756.000000
mean         27.213086
std          14.468199
min           5.908614
25%          19.676008
50%          24.270812
75%          31.087876
max        1037.875053
Name: sdbAre_lower, dtype: float64

In [33]:
# Summary statistics of the current X (in execution order: the standardised
# context clipped to [-10, 10]).
desc = X.describe()

In [34]:
# Quartiles and maximum of every column; a maximum of exactly 10 means the
# column was clipped.
desc.loc[["25%", "50%", "75%", "max"]]

Unnamed: 0,sdbAre_lower,sdbAre_median,sdbAre_higher,sdbPer_lower,sdbPer_median,sdbPer_higher,sdbCoA_lower,sdbCoA_median,sdbCoA_higher,ssbCCo_lower,...,ltkOri_higher,ltkWNB_lower,ltkWNB_median,ltkWNB_higher,likWBB_lower,likWBB_median,likWBB_higher,sdsAre_lower,sdsAre_median,sdsAre_higher
25%,-0.520942,-0.431746,-0.373578,-0.561952,-0.410962,-0.531033,0.0,0.0,-0.004596,-0.619178,...,-0.688914,-0.645029,-0.801672,-0.742833,-0.760993,-0.807698,-0.60688,-0.149453,-0.229245,-0.353549
50%,-0.203362,-0.143242,-0.297249,-0.16946,-0.102208,-0.330175,0.0,0.0,-0.004596,0.01444,...,0.109981,-0.276748,-0.272126,-0.046437,-0.185844,0.016807,0.041999,-0.138394,-0.194665,-0.265291
75%,0.267815,0.173251,-0.017278,0.354888,0.246271,0.165545,0.0,0.0,-0.004596,0.683554,...,0.83762,0.338989,0.637532,0.607426,0.638015,0.715494,0.490802,-0.108027,-0.099264,-0.039171
max,10.0,10.0,10.0,10.0,10.0,10.0,0.0,0.0,10.0,3.548744,...,1.551723,7.6761,7.761571,10.0,6.6976,6.949751,9.038265,10.0,10.0,10.0


In [35]:
# Absolute pairwise correlations of the unscaled context columns.
corrs = context.corr().abs()
# Keep only the strict upper triangle so that every pair is looked at once and
# the diagonal is ignored; everything else becomes NaN.
strict_upper_mask = np.triu(np.ones(corrs.shape), k=1).astype(bool)
upper = corrs.where(strict_upper_mask)

In [36]:
# A column is dropped when its absolute correlation with any earlier column
# exceeds 0.95 (NaN entries of the masked matrix compare as False).
has_high_corr = (upper > 0.95).any()
corrs_to_drop = upper.columns[has_high_corr].tolist()
corrs_to_drop

['sdbPer_lower',
 'ssbCCM_lower',
 'ssbCCM_median',
 'ldbPWL_higher',
 'lskCWA_lower',
 'lskCWA_median',
 'lskCWA_higher']

In [37]:
# Remove the highly correlated columns from the scaled features and show the
# resulting shape together with the score.
X_no_corr = X.drop(columns=corrs_to_drop)
X_no_corr.shape, check_score(X_no_corr, tess_groups)

((236756, 179), np.float64(3.164550950253643))

In [38]:
# corrs_to_drop = []
# X_no_corr = X.drop(columns=corrs_to_drop)
# X_no_corr.shape, check_score(X_no_corr, tess_groups)

In [39]:
# X.describe()

In [40]:
# Columns whose scaled values span less than 0.5 between minimum and maximum
# carry almost no variation and are marked for removal.
desc = X_no_corr.describe()
value_range = desc.loc["max"] - desc.loc["min"]
to_drop = desc.columns[value_range < 0.5].values
to_drop

array(['sdbCoA_lower', 'sdbCoA_median', 'libNCo_lower'], dtype=object)

In [41]:
# Remove the near-constant columns and show the resulting shape and score.
X_different = X_no_corr.drop(columns=to_drop)
X_different.shape, check_score(X_different, tess_groups)

((236756, 176), np.float64(3.164550950253642))

In [42]:
# Keep the first occurrence of every distinct feature row.
X_no_duplicates = X_different.drop_duplicates()
X_no_duplicates.shape, check_score(X_no_duplicates, tess_groups)

((213995, 176), np.float64(3.049410216680146))

In [43]:
# tessellation[X_different.duplicated(keep=False)].explore()

In [44]:
# plotting = tessellation.loc[X_different[X_different.duplicated(keep=False)].index.values]
# plotting['sdbAre_lower'] = X_different[X_different.duplicated(keep=False)]['sdbAre_lower']
# plotting = plotting.sort_values('sdbAre_lower')
# plotting['sdbAre_lower']

In [45]:
# layer = PolygonLayer.from_geopandas(
#     gdf=plotting.reset_index()[['geometry', 'index', 'sdbAre_lower']],
#     opacity=.15
# )
# m = Map(layer, basemap_style=CartoBasemap.Positron)
# m

In [46]:
# tessellation.loc[graph[250196].index.values].reset_index().explore()

In [47]:
# tessellation.loc[higher[21149].index.values, ].explore()

In [48]:
# tessellation.loc[higher[271717].index.values, ].explore()

In [49]:
# higher = graph.higher_order(k=3, lower_order=True, diagonal=True)

In [50]:
# primary.loc[higher[271717].index.values, 'sdbAre'].median(), primary.loc[higher[271714].index.values, 'sdbAre'].median()

In [51]:
# primary.loc[higher[271717].index.values, 'sdbAre'].describe()

In [52]:
# primary.loc[higher[271714].index.values, 'sdbAre'].describe()

In [53]:
# r = dict([])
# for idx in higher[271717].index.values:
#     r[idx] = (primary.loc[higher[idx].index.values, 'sdbAre'].median())

In [54]:
# dict(sorted(r.items(), key=lambda item: item[1]))

In [55]:
# remove singletons

In [56]:
# Ids of the nodes whose cardinality in the graph is exactly 1.
# NOTE(review): nodes with cardinality 0 are not selected by this filter -
# confirm that this is intended.
singletons = graph.unique_ids[graph.cardinalities == 1]

In [57]:
# tessellation.loc[singletons].explore()

In [58]:
# Drop the rows that belong to the singleton ids and show shape and score.
X_no_singletons = X_no_duplicates[~X_no_duplicates.index.isin(singletons)]
X_no_singletons.shape, check_score(X_no_singletons, tess_groups)

((213822, 176), np.float64(3.049410216680146))

In [59]:
# remove empty space

In [60]:
# Keep only rows with a non-negative index.
# NOTE(review): per the "remove empty space" note above, negative ids
# presumably mark cells without a building - confirm against the data source.
X_buildings = X_no_singletons[X_no_singletons.index >= 0]
X_buildings.shape, check_score(X_buildings, tess_groups)

((209124, 176), np.float64(3.048218609612778))

In [61]:
# The filtered feature matrix used from here on (same object as X_buildings).
X_train = X_buildings
X_train.shape, check_score(X_train, tess_groups)

((209124, 176), np.float64(3.048218609612778))

In [62]:
# Distribution of the per-column maxima of the feature matrix.
X_train.describe().loc["max"].describe()

count    176.000000
mean       7.162977
std        3.046398
min        0.015739
25%        4.738066
50%        7.890156
75%       10.000000
max       10.000000
Name: max, dtype: float64

### assign tessellations to existing clusters

In [63]:
# Cluster labels of the sample.
# NOTE(review): `labels` is not referenced again in the visible part of the
# notebook.
labels = np.load('../data/sample_cluster_labels.npy')

In [64]:
# Geometries of the existing clusters that the tessellation cells are matched to.
clusters = gpd.read_parquet('../data/sample_clusters.parquet')

In [65]:
# Geometries of exactly those cells that survived the feature filtering, in
# the row order of X_train.
tessellation_subset = tessellation.loc[X_train.index.values]

In [66]:
# Match a representative point of every cell against the cluster geometries.
# `inp` holds positions within clusters.geometry, `res` the positions of the
# intersecting points within tessellation_subset.
cell_points = tessellation_subset.representative_point()
inp, res = cell_points.geometry.sindex.query(
    clusters.geometry, predicate='intersects'
)

In [67]:
# Number of (cluster, cell) matches found.
inp.shape

(160215,)

In [68]:
# Keep only the matched cells, once per match, with geometries and features in
# the same order.
# NOTE(review): tessellation_subset is rebound to a positional selection of
# itself, so this cell gives a different result when it is run a second time.
tessellation_subset = tessellation_subset.iloc[res]
X_train_subset = X_train.iloc[res]

In [69]:
# Target class of every matched cell: the position of the matched cluster
# geometry, stored as a string.
y = inp.astype('str')

### predictive model

In [74]:
from sklearn.model_selection import train_test_split

In [75]:
# Hold out 25% of the matched cells for evaluation (fixed seed).
# NOTE(review): this rebinds X_train, which earlier cells use as the full
# filtered feature matrix (tessellation.loc[X_train.index.values] and
# X_train.iloc[res]); re-running those cells after this one will not reproduce
# their original result.
X_train, X_test, y_train, y_test = train_test_split(X_train_subset, y, test_size=0.25, random_state=42)

In [76]:
# Random forest with a fixed seed, using all available cores and progress logging.
clf = RandomForestClassifier(random_state=0, n_jobs=-1, verbose=True)

In [77]:
%%time
# Fit on the training part of the split.
clf.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    4.2s


CPU times: user 7min 14s, sys: 2.46 s, total: 7min 17s
Wall time: 24.1 s


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   24.0s finished


In [78]:
# Mean accuracy on the held-out 25%.
clf.score(X_test, y_test)

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.4s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    1.8s finished


0.9583811853997104

In [79]:
# Predict a label for every matched cell; this includes the rows the model
# was trained on.
new_labels = clf.predict(X_train_subset)

[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    1.3s
[Parallel(n_jobs=20)]: Done 100 out of 100 | elapsed:    6.4s finished


In [81]:
from lonboard import PolygonLayer, Map
from lonboard.basemap import CartoBasemap
from lonboard.colormap import apply_categorical_cmap
from palettable.colorbrewer.qualitative import Set3_12

In [82]:
# Build the plotting frame as a new object. Assigning the column through an
# alias (`plotting = tessellation_subset`) would silently add "label" to
# tessellation_subset as well, and writes into a frame that was itself created
# by a selection.
plotting = tessellation_subset.assign(label=new_labels)

In [83]:
import glasbey


def hex_to_rgb(hexa):
    """Convert a six-digit hexadecimal colour to an ``(r, g, b)`` tuple of ints.

    Parameters
    ----------
    hexa : str
        Colour such as ``"ff0080"``; a single leading ``"#"`` is accepted and
        ignored.

    Returns
    -------
    tuple of int
        Red, green and blue components, each in the range 0-255.

    Raises
    ------
    ValueError
        If a two-character component is not valid hexadecimal.
    """
    digits = hexa[1:] if hexa.startswith("#") else hexa
    return tuple(int(digits[i : i + 2], 16) for i in (0, 2, 4))


# Set3 provides 12 colours; when there are more labels than that, the palette
# is extended with glasbey.
n_labels = plotting["label"].unique().shape[0]
if n_labels > 12:
    gb_cols = glasbey.extend_palette(Set3_12.hex_colors, palette_size=n_labels + 1)
else:
    gb_cols = Set3_12.hex_colors

# Strip the leading character of every hex string and convert to RGB tuples.
gb_cols = [hex_to_rgb(c[1:]) for c in gb_cols]

# One colour per distinct label; surplus colours are ignored (strict=False).
label_to_rgb = dict(zip(np.unique(plotting["label"]), gb_cols, strict=False))
colors = apply_categorical_cmap(plotting["label"], cmap=label_to_rgb)

In [84]:
# Polygon layer of the labelled cells, filled with the per-label colours and
# drawn semi-transparent.
layer = PolygonLayer.from_geopandas(
    gdf=plotting[["geometry", "label"]], get_fill_color=colors, opacity=0.15
)



In [86]:
# m = Map(layer, basemap_style=CartoBasemap.Positron)
# m