In [1]:
import momepy as mm
import numpy as np
import numba
import geopandas as gpd
import pandas as pd
import shapely
from sklearn.preprocessing import StandardScaler
from collections import namedtuple
from core.cluster_validation import get_linkage_matrix
from libpysal.graph import read_parquet, Graph
from core.generate_context import spatially_weighted_partial_lag
from core.generate_clusters import preprocess_clustering_data
from sklearn.cluster import AgglomerativeClustering
from core.cluster_validation import get_linkage_matrix
from scipy.cluster.hierarchy import fcluster

In [2]:
# --- Data locations ---
regions_datadir = "/data/uscuni-ulce/"
morphotopes_dir = '/data/uscuni-ulce/processed_data/morphotopes/'

# Suffix shared by the per-region morphotope parquet file names.
model_params = '_75_0_None_None_False'

# --- Hierarchical clustering settings ---
# Read by get_morphotope_linkage and embedded in the linkage-matrix file names.
linkage = 'ward'
metric = 'euclidean'

clip = None

# Character columns removed (column level 0) before clustering.
to_drop = [
    'stcSAl',
    'stbOri',
    'stcOri',
    'stbCeA',
    'ldkAre',
    'ldkPer',
    'lskCCo',
    'lskERI',
    'lskCWA',
    'ltkOri',
    'ltkWNB',
    'likWBB',
    'likWCe',
    'licBAD',
    'misBAD',
    'ssbCCM',
    'ssbCCD',
]

In [3]:
def preprocess_data(data, scalar, drop_columns=['percentile_25', 'percentile_75', 'median', 'std']):
    """Turn a raw morphotope table into a scaled, NaN-free feature matrix.

    Parameters
    ----------
    data : DataFrame with two column levels; level 0 is matched against the
        notebook-level ``to_drop`` list, level 1 against ``drop_columns``.
    scalar : object exposing ``fit_transform`` (e.g. a scikit-learn scaler).
    drop_columns : level-1 column labels to discard.

    Returns
    -------
    DataFrame indexed like the retained rows, with single-level column names.
    """
    # Discard the unwanted level-1 and level-0 columns, then flatten the header.
    features = data.drop(columns=drop_columns, level=1).drop(columns=to_drop, level=0)
    features.columns = features.columns.get_level_values(0)

    # Rows whose index label ends in '-1' are excluded.
    keep_rows = features.index.str[-2:] != '-1'
    features = features[keep_rows]

    scaled = pd.DataFrame(
        scalar.fit_transform(features),
        columns=features.columns,
        index=features.index,
    )

    # Columns with zero spread after scaling carry no information.
    constant_columns = scaled.columns[scaled.std() == 0]
    scaled = scaled.drop(constant_columns, axis=1)

    # component_data = component_data[component_data.index >= 0]
    # Replace remaining NaN/inf values with finite numbers.
    return pd.DataFrame(np.nan_to_num(scaled), columns=scaled.columns, index=scaled.index)

def read_region_morphotope_data(region_id):
    """Load one region's morphotope table and return it preprocessed for clustering.

    The file is located through the notebook-level ``morphotopes_dir`` and
    ``model_params``. Index labels are prefixed with ``"<region_id>_"``, the
    last column is discarded, and the remainder is passed to
    ``preprocess_data`` together with a fresh ``StandardScaler``.
    """
    print('processing' , region_id)
    source_path = f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq'
    raw = pd.read_parquet(source_path)
    raw.index = str(region_id) + '_' + raw.index
    without_last_column = raw.iloc[:, :-1]
    return preprocess_data(without_last_column, StandardScaler())

def get_morphotope_linkage(region_data):
    """Fit an agglomerative clustering on ``region_data`` and return its linkage matrix.

    The linkage criterion and metric come from the notebook-level ``linkage``
    and ``metric`` variables. The full tree and the merge distances are
    computed, and the fitted model is converted with ``get_linkage_matrix``.
    """
    fitted_model = AgglomerativeClustering(
        linkage=linkage,
        metric=metric,
        compute_full_tree=True,
        compute_distances=True,
    ).fit(region_data)
    return get_linkage_matrix(fitted_model)


def get_all_clusters(cutoff):
    """Cut every region's stored linkage matrix at ``cutoff`` and collect the labels.

    For each region in the hull file the preprocessed morphotope table is
    re-read to recover the row labels, the saved linkage matrix is loaded, and
    flat clusters are extracted with ``fcluster(criterion='distance')``.

    Returns
    -------
    Series indexed by morphotope id whose values are ``"<region_id>_<cluster>"``.
    """
    region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
    )

    per_region_labels = []
    for region_id in region_hulls.index:
        morphotope_ids = read_region_morphotope_data(region_id).index
        tree = np.load(f'/data/uscuni-ulce/processed_data/morphotope_linkage/{linkage}_{metric}_{region_id}.npy')
        flat_clusters = fcluster(tree, t=cutoff, criterion='distance')
        # Prefix with the region id so labels from different regions never collide.
        labels = str(region_id) + '_' + pd.Series(flat_clusters, morphotope_ids).astype(str)
        per_region_labels.append(labels)

    return pd.concat(per_region_labels)


def read_morphotopes_data(model_params):
    """Read the raw morphotope tables of all regions and stack them.

    Regions are taken from the index of the hull file. Each table's index
    labels are prefixed with ``"<region_id>_"`` before concatenation. No
    scaling or column filtering is applied.
    """
    region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
    )

    frames = []
    for region_id in region_hulls.index:
        frame = pd.read_parquet(f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq')
        frame.index = str(region_id) + '_' + frame.index.str[:]
        frames.append(frame)

    return pd.concat(frames)

def morphotopes_to_etcs(region_id, etcs=True, model_params='_100_0_None_None_False'):
    """Attach morphotope labels to a region's tessellation cells or buildings.

    Parameters
    ----------
    region_id : identifier used to build the parquet file names.
    etcs : bool, default True
        ``True`` reads the tessellation cells, ``False`` reads the buildings.
    model_params : str
        Suffix of the morphotope label file name.

    Returns
    -------
    The loaded GeoDataFrame with two added columns:

    * ``label`` - integer code of the morphotope id (its position among the
      sorted unique ids), or ``-1`` for rows absent from the label file;
    * ``morph`` - ``"<region_id>_<morphotope id>"``, or ``"<region_id>_-1"``
      for rows absent from the label file.
    """
    # Keep the boolean flag and the loaded geometries under different names.
    if etcs:
        elements = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/tessellations/tessellation_{region_id}.parquet')
    else:
        elements = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/buildings/buildings_{region_id}.parquet')

    morphotopes = pd.read_parquet(f'/data/uscuni-ulce/processed_data/morphotopes/tessellation_labels_morphotopes_{region_id}{model_params}.pq')
    # The morphotope id lives in the first column; work on it as a 1-D array so
    # the assignments below always receive one value per row.
    morphotope_ids = morphotopes.values[:, 0]

    # np.unique sorts the ids; the inverse array is each row's position in that
    # sorted list, which is the code the per-element lookup used to compute.
    _, codes = np.unique(morphotope_ids, return_inverse=True)

    elements['label'] = -1
    elements.loc[morphotopes.index, 'label'] = codes
    elements['morph'] = str(region_id) + '_' + '-1'
    elements.loc[morphotopes.index, 'morph'] = str(region_id) + '_' + morphotope_ids
    return elements

In [4]:
# Region hull table; its index supplies the region ids iterated over below.
region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
)

In [5]:
%%time

## Generate and save the linkage matrix of every region.
## These files are loaded again by get_all_clusters (with the ".npy" suffix).
for region_id, _ in region_hulls.iterrows():
    data = read_region_morphotope_data(region_id)
    linkage_matrix = get_morphotope_linkage(data)
    np.save(f'/data/uscuni-ulce/processed_data/morphotope_linkage/{linkage}_{metric}_{region_id}', linkage_matrix)

processing 4
processing 10
processing 132
processing 134
processing 286
processing 313
processing 400
processing 523
processing 765
processing 801
processing 832
processing 913
processing 960
processing 1124
processing 1154
processing 1387
processing 1478
processing 1515
processing 1605
processing 1718
processing 1736
processing 1782
processing 1970
processing 1981
processing 2096
processing 2322
processing 2350
processing 2478
processing 2514
processing 2625
processing 2728
processing 2975
processing 3039
processing 3109
processing 3150
processing 3221
processing 3250
processing 3526
processing 3610
processing 3612
processing 3701
processing 3705
processing 3752
processing 3759
processing 3981
processing 4070
processing 4214
processing 4215
processing 4235
processing 4284
processing 4356
processing 4382
processing 4723
processing 4805
processing 5096
processing 5191
processing 5246
processing 5310
processing 5408
processing 5427
processing 5662
processing 5671
processing 5766
processi

### Set up the regional cutoff used to merge morphotopes within each region

In [21]:
# Distance threshold at which each region's linkage matrix is cut (see get_all_clusters).
regional_cutoff = 10

In [22]:
# Series mapping every morphotope id to its "<region_id>_<cluster>" regional cluster label.
regional_clusters = get_all_clusters(cutoff=regional_cutoff)

processing 4
processing 10
processing 132
processing 134
processing 286
processing 313
processing 400
processing 523
processing 765
processing 801
processing 832
processing 913
processing 960
processing 1124
processing 1154
processing 1387
processing 1478
processing 1515
processing 1605
processing 1718
processing 1736
processing 1782
processing 1970
processing 1981
processing 2096
processing 2322
processing 2350
processing 2478
processing 2514
processing 2625
processing 2728
processing 2975
processing 3039
processing 3109
processing 3150
processing 3221
processing 3250
processing 3526
processing 3610
processing 3612
processing 3701
processing 3705
processing 3752
processing 3759
processing 3981
processing 4070
processing 4214
processing 4215
processing 4235
processing 4284
processing 4356
processing 4382
processing 4723
processing 4805
processing 5096
processing 5191
processing 5246
processing 5310
processing 5408
processing 5427
processing 5662
processing 5671
processing 5766
processi

In [11]:
%%time

## Stack the preprocessed (standard-scaled) morphotope data of all regions and
## average it per regional cluster. Requires `regional_clusters` to be defined.
morphotopes_data = []
for region_id, _ in region_hulls.iterrows():
    data = read_region_morphotope_data(region_id)
    morphotopes_data.append(data)

morphotopes_data = pd.concat(morphotopes_data)

# Mean feature vector per regional cluster; missing values become 0.
regional_ward_morphotopes_data = morphotopes_data.groupby(regional_clusters).agg('mean').fillna(0)

processing 4
processing 10
processing 132
processing 134
processing 286
processing 313
processing 400
processing 523
processing 765
processing 801
processing 832
processing 913
processing 960
processing 1124
processing 1154
processing 1387
processing 1478
processing 1515
processing 1605
processing 1718
processing 1736
processing 1782
processing 1970
processing 1981
processing 2096
processing 2322
processing 2350
processing 2478
processing 2514
processing 2625
processing 2728
processing 2975
processing 3039
processing 3109
processing 3150
processing 3221
processing 3250
processing 3526
processing 3610
processing 3612
processing 3701
processing 3705
processing 3752
processing 3759
processing 3981
processing 4070
processing 4214
processing 4215
processing 4235
processing 4284
processing 4356
processing 4382
processing 4723
processing 4805
processing 5096
processing 5191
processing 5246
processing 5310
processing 5408
processing 5427
processing 5662
processing 5671
processing 5766
processi

In [29]:
# Number of distinct regional cluster labels.
np.unique(regional_clusters).shape

(17299,)

In [130]:
# %%time
# Reload the unscaled morphotope tables of all regions and keep only the
# morphotopes that received a regional cluster label.
morphotopes_data = read_morphotopes_data(model_params)
morphotopes_data = morphotopes_data.loc[regional_clusters.index]
# Discard the last column, as read_region_morphotope_data does.
morphotopes_data = morphotopes_data.iloc[:, :-1]

In [131]:
# Same column selection as preprocess_data, applied to the unscaled data:
# drop the unwanted level-1 columns and the excluded characters, then flatten the header.
component_data = (
    morphotopes_data
    .drop(columns=['percentile_25', 'percentile_75', 'median', 'std'], level=1)
    .drop(columns=to_drop, level=0)
)
component_data.columns = component_data.columns.get_level_values(0)

In [132]:
# Mean of the unscaled characters per regional cluster; missing values become 0.
grouped_data = component_data.groupby(regional_clusters).agg('mean').fillna(0)


In [390]:
from sklearn.preprocessing import QuantileTransformer, RobustScaler

# Quantile-transform the per-cluster means so every character ends up on the same scale.
# random_state pins the transformer's internal subsampling so reruns give identical values.
# NOTE(review): the saved output of the following cell shows values of about +/-5.199,
# which is outside the [0, 1] range a uniform output produces - that output looks like
# it came from a different setting; re-run to confirm.
scalar = QuantileTransformer(output_distribution='uniform', random_state=0) # works with uniform
vals = scalar.fit_transform(grouped_data)

# Label the result with the index/columns of the frame that was actually transformed.
# Borrowing them from another frame silently mislabels columns if its column order differs.
regional_ward_morphotopes_data = pd.DataFrame(vals, index=grouped_data.index, columns=grouped_data.columns)

In [391]:
# Sanity check: overall minimum and maximum across all transformed columns.
stats = regional_ward_morphotopes_data.describe()
stats.loc['min', ].min(), stats.loc['max', ].max()

(np.float64(-5.199337582605575), np.float64(5.19933758270342))

In [392]:
# from sklearn.preprocessing import RobustScaler


In [393]:
# region_id = 4
# data = read_region_morphotope_data(region_id)
# linkage_matrix = get_morphotope_linkage(data)

### Final ward clustering

In [394]:

# Settings read by get_morphotope_linkage for the final clustering
# (same values as in the configuration cell).
linkage = 'ward'
metric = 'euclidean'

In [395]:
%%time
# Linkage matrix over the regional clusters (one row of features per regional cluster).
linkage_matrix = get_morphotope_linkage(regional_ward_morphotopes_data)

CPU times: user 1min 59s, sys: 5.17 s, total: 2min 4s
Wall time: 2min 3s


In [396]:
# ## final dendrogram
# import matplotlib.pyplot as plt
# from scipy.cluster.hierarchy import dendrogram
# fig,ax = plt.subplots(figsize=(20,20))
# _ = dendrogram(linkage_matrix,ax=ax)

In [397]:
# Display the transformed per-cluster feature table.
regional_ward_morphotopes_data

Unnamed: 0,sdbAre,sdbPer,sdbCoA,ssbCCo,ssbCor,ssbSqu,ssbERI,ssbElo,mtbSWR,libNCo,...,mibAre,mibLen,mibElo,mibERI,mibCCo,mibLAL,mibFR,mibSCo,micBAD,midBAD
10019_1,-1.039210,-0.590021,0.576424,-1.513002,-0.307713,1.150626,-0.721116,-1.346896,1.004946,0.702353,...,0.492817,0.921169,-0.850742,-1.451519,-1.298215,0.658298,-0.469579,-0.469579,-0.606596,-1.074782
10019_10,-0.585602,-0.189017,0.633938,-1.230931,-0.354892,1.041240,-0.577612,-1.180112,1.130808,0.800891,...,1.181217,1.627628,-0.378183,-1.975329,-1.563979,1.367697,0.234369,0.234369,-1.021952,-1.178998
10019_11,-0.124720,0.414082,0.706760,-1.899970,1.016910,0.402436,-1.938906,-1.366864,1.019479,1.432172,...,0.979125,1.450709,-0.586836,-1.813275,-1.325531,1.095375,0.073893,0.073893,-0.580243,-1.107418
10019_12,-0.802304,-0.162513,1.287906,-1.885369,0.011083,1.266927,-1.107879,-1.651859,1.127945,1.245460,...,1.015163,1.502536,-0.431689,-1.860757,-1.292032,1.117740,-0.046660,-0.046660,-1.300866,-1.439725
10019_13,-1.257060,-1.190302,0.634383,0.013883,-0.726924,-0.116545,0.687637,-0.153233,0.194360,-5.199338,...,-0.863579,-0.542551,0.660526,-0.252251,0.066792,-0.676378,-1.147317,-1.147317,-0.753734,-0.143445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99886_95,0.749366,0.880454,1.268654,-1.242909,-0.169387,0.402452,-1.291639,-0.555473,-0.357880,1.300223,...,0.743615,0.915401,-0.173204,-0.766706,-0.586139,0.600540,0.333458,0.333458,0.625703,0.579904
99886_96,0.466655,0.650882,0.058965,-1.181083,-0.021558,-0.178837,-1.232566,-0.559127,-0.186933,1.122782,...,0.365273,0.499643,-0.385037,-0.717530,-0.586607,0.371951,0.101625,0.101625,0.249232,0.325426
99886_97,0.008839,0.121941,1.348461,0.115161,0.317045,-0.054912,-0.976406,0.504062,-0.830039,1.270181,...,0.067482,0.324855,0.589319,-0.410124,0.415156,-0.068797,-0.919184,-0.919184,-0.482758,-0.099776
99886_98,1.718223,1.406828,-5.199338,-0.950406,0.814631,0.466001,-0.936251,-0.857799,-0.784737,1.489959,...,1.649600,1.315221,-0.593897,-0.013498,-0.013082,1.354083,1.233398,1.233398,1.905417,1.764158


In [416]:
# Distance threshold at which the final linkage matrix is cut.
final_cutoff = 100

In [417]:
# Cut the final tree at `final_cutoff`; the result maps each regional cluster
# label to a final cluster number.
clusters = fcluster(linkage_matrix, t=final_cutoff, criterion='distance')
final_clusters = pd.Series(clusters, regional_ward_morphotopes_data.index)
# Size of every final cluster, counted in regional clusters.
final_clusters.value_counts()

18    6326
3     4209
6     3790
10    3552
22    3521
23    3222
12    2955
21    2942
11    2909
1     2892
24    2715
17    2497
9     2442
8     2438
19    2431
14    1906
4     1617
16    1480
26    1314
20    1305
5     1260
7     1255
25    1236
2     1211
15     769
13     605
Name: count, dtype: int64

In [418]:
# regional_ward_morphotopes_data.groupby(final_clusters).mean().loc[[4,7, 8, 11]].style.background_gradient(axis=0, cmap="BuGn")

### Plotting

In [419]:
from core.cluster_validation import get_color
# One colour row per final cluster number, indexed by that number.
final_colors = pd.DataFrame(get_color(final_clusters.values), final_clusters.values).drop_duplicates()
# Label -1 (no cluster assigned) gets [255, 255, 255] - presumably RGB white; confirm against get_color.
final_colors.loc[-1] = [255,255,255]

In [428]:
# Region to plot.
# region_id = 5883

region_id = 69333

In [429]:
# etcs=False to read buildings, etcs=True for tessellation cells.
# The result carries a 'morph' column ("<region_id>_<morphotope id>") used for relabelling.
etcs = morphotopes_to_etcs(region_id, etcs=False, model_params=model_params)

In [430]:
## relabel
### regional_clusters: morphotope id -> regional cluster label
### final_clusters:    regional cluster label -> final cluster number
# Compose the two lookups once, then map all geometries in a single vectorised pass
# instead of two .loc lookups per row. Morphotope ids without a regional cluster get -1.
morphotope_to_final = regional_clusters.map(final_clusters)
etcs['final'] = etcs['morph'].map(morphotope_to_final).fillna(-1).astype(int)


In [431]:
# ## can run this to change colors on an exisitng layer
# layer.get_fill_color = get_color(etcs.final)

In [432]:
# Simplify (tolerance 1, in the units of the current CRS), reproject to EPSG:4326
# and repair invalid geometries.
etcs['geometry'] = etcs.simplify(1).to_crs(epsg=4326).make_valid()
# Keep only plain Polygon geometries.
etcs = etcs[etcs['geometry'].geom_type == 'Polygon']

In [433]:
%%time
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
# Polygon layer built from the labelled geometries; fill colours are assigned in a later cell.
layer = lonboard.SolidPolygonLayer.from_geopandas(etcs, opacity=.7)

CPU times: user 1.05 s, sys: 138 ms, total: 1.19 s
Wall time: 1.19 s


In [434]:
from sidecar import Sidecar
# Render the map inside a Sidecar output titled with the cutoff that produced the clusters.
sc = Sidecar(title=f'Final Clusters - {final_cutoff}')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.DarkMatter)
with sc:
    display(m)

In [435]:
from core.cluster_validation import get_color
# Colour every geometry by its final cluster: one colour row per entry of etcs.final.
layer.get_fill_color = final_colors.loc[etcs.final].values.astype('uint8')

In [302]:
# Number of distinct final labels present in the plotted region.
etcs.final.value_counts().shape

(16,)

In [37]:
# Inspect a single final cluster: shape of the subset of geometries assigned to it.
target_cluster = 8
etcs[etcs.final == target_cluster].shape

(1189, 9)

In [39]:
# etcs[etcs.final == target_cluster].explore(prefer_canvas=True, tiles='Cartodb positron')