In [12]:
import momepy as mm
import numpy as np
import numba
import geopandas as gpd
import pandas as pd
import shapely
from sklearn.preprocessing import StandardScaler
from collections import namedtuple
from core.cluster_validation import get_linkage_matrix
from libpysal.graph import read_parquet, Graph
from core.generate_context import spatially_weighted_partial_lag
from core.generate_clusters import preprocess_clustering_data
from sklearn.cluster import AgglomerativeClustering
from core.cluster_validation import get_linkage_matrix
from scipy.cluster.hierarchy import fcluster

In [13]:
# Root data directory; the region hull table is read from "<root>regions/".
regions_datadir = "/data/uscuni-ulce/"
# Directory holding the per-region morphotope parquet files.
morphotopes_dir = '/data/uscuni-ulce/processed_data/morphotopes/'
# Suffix embedded in the morphotope file names
# (data_morphotopes_<region_id><model_params>.pq).
model_params = '_100_0_None_None_False'
# NOTE(review): `clip` is not referenced anywhere in the visible notebook.
clip = None
# Level-0 column names removed by preprocess_data before clustering.
to_drop = [
        'stcSAl','stbOri','stcOri','stbCeA',
        'ldkAre', 'ldkPer', 'lskCCo', 'lskERI','lskCWA', 'ltkOri', 'ltkWNB', 'likWBB', 'likWCe'
]
# Linkage criterion and metric passed to AgglomerativeClustering; both are
# also part of the file names of the saved linkage matrices.
linkage = 'ward'
metric = 'euclidean'

In [14]:
def preprocess_data(data):
    """Turn a morphotope statistics table into a standardised feature matrix.

    ``data`` is expected to have two-level columns (level 0: character name,
    level 1: statistic) and a string index. The steps are:

    1. drop the level-1 statistics 'percentile_25', 'percentile_75',
       'median' and 'std', then the level-0 characters listed in the
       module-level ``to_drop``;
    2. flatten the columns to their level-0 names;
    3. discard rows whose index ends in '-1';
    4. standardise every column with ``StandardScaler``;
    5. remove columns whose standard deviation is exactly 0 after scaling;
    6. pass the values through ``np.nan_to_num`` to replace non-finite entries.

    Returns a DataFrame that keeps the surviving index and column labels.
    """
    discarded_stats = ['percentile_25', 'percentile_75', 'median', 'std']
    features = data.drop(columns=discarded_stats, level=1).drop(columns=to_drop, level=0)
    features.columns = features.columns.get_level_values(0)

    # Rows labelled "...-1" are excluded before the scaler is fitted.
    keep_rows = features.index.str[-2:] != '-1'
    features = features[keep_rows]

    scaled = pd.DataFrame(
        StandardScaler().fit_transform(features),
        columns=features.columns,
        index=features.index,
    )
    constant_columns = scaled.columns[scaled.std() == 0]
    scaled = scaled.drop(constant_columns, axis=1)

    return pd.DataFrame(
        np.nan_to_num(scaled),
        columns=scaled.columns,
        index=scaled.index,
    )

def read_region_morphotope_data(region_id):
    """Load one region's morphotope table and return it preprocessed.

    Prints a progress line, reads
    ``{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq``,
    prefixes every index label with ``"<region_id>_"``, drops the last column
    and hands the remainder to ``preprocess_data``.
    """
    print('processing', region_id)

    source_path = f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq'
    morphotope_table = pd.read_parquet(source_path)
    morphotope_table.index = str(region_id) + '_' + morphotope_table.index

    # The trailing column is not used as a clustering feature.
    without_last_column = morphotope_table.iloc[:, :-1]
    return preprocess_data(without_last_column)

def get_morphotope_linkage(region_data):
    """Fit a full agglomerative tree on ``region_data`` and return its linkage matrix.

    The estimator is configured from the module-level ``linkage`` and
    ``metric`` settings, with the full tree and merge distances computed.
    The fitted model is converted by ``get_linkage_matrix``.
    """
    estimator_options = dict(
        linkage=linkage,
        metric=metric,
        compute_full_tree=True,
        compute_distances=True,
    )
    fitted_model = AgglomerativeClustering(**estimator_options).fit(region_data)
    return get_linkage_matrix(fitted_model)


def get_all_clusters(cutoff):
    """Cut every region's saved linkage tree at ``cutoff`` and collect the labels.

    For each region id in the region hull table, the preprocessed morphotope
    index is rebuilt with ``read_region_morphotope_data``, the stored linkage
    matrix is loaded and flat clusters are extracted with
    ``fcluster(..., criterion='distance')``.

    Returns a single string Series indexed by morphotope id, whose values are
    ``"<region_id>_<flat cluster number>"``.
    """
    region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
    )

    def _labels_for_region(region_id):
        # The index must match the rows the linkage matrix was fitted on.
        morphotope_ids = read_region_morphotope_data(region_id).index
        tree = np.load(f'/data/uscuni-ulce/processed_data/morphotope_linkage/{linkage}_{metric}_{region_id}.npy')
        flat_labels = fcluster(tree, t=cutoff, criterion='distance')
        as_text = pd.Series(flat_labels, morphotope_ids).astype(str)
        return str(region_id) + '_' + as_text

    return pd.concat([_labels_for_region(region_id) for region_id in region_hulls.index])


def read_morphotopes_data(model_params):
    """Read the morphotope tables of all regions into one DataFrame.

    Each region's ``data_morphotopes_<region_id><model_params>.pq`` file is
    read as-is (no preprocessing); its index labels are prefixed with
    ``"<region_id>_"`` before the tables are concatenated.
    """
    region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
    )

    per_region_tables = []
    for region_id in region_hulls.index:
        table = pd.read_parquet(f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq')
        table.index = str(region_id) + '_' + table.index.str[:]
        per_region_tables.append(table)

    return pd.concat(per_region_tables)

def morphotopes_to_etcs(region_id, etcs=True, model_params='_100_0_None_None_False'):
    """Attach morphotope labels to a region's tessellation cells or buildings.

    Parameters
    ----------
    region_id
        Region identifier used in the file names and as label prefix.
    etcs
        If truthy the region's tessellation is read, otherwise its buildings.
    model_params
        Suffix of the morphotope label file name.

    Returns
    -------
    The GeoDataFrame that was read, with two added columns: ``label`` (an
    integer code per morphotope, -1 for rows absent from the label file) and
    ``morph`` (``"<region_id>_<morphotope>"``, ``"<region_id>_-1"`` for rows
    absent from the label file).
    """
    if etcs:
        elements = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/tessellations/tessellation_{region_id}.parquet')
    else:
        elements = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/buildings/buildings_{region_id}.parquet')

    morphotopes = pd.read_parquet(f'/data/uscuni-ulce/processed_data/morphotopes/tessellation_labels_morphotopes_{region_id}{model_params}.pq')
    morphotopes.loc[:, 'morphotope_label'] =  morphotopes.values[:, 0]

    # Map each distinct morphotope value to a consecutive integer code.
    distinct_values = np.unique(morphotopes.values)
    value_to_code = pd.Series(np.arange(distinct_values.shape[0]), distinct_values)

    elements['label'] = -1
    elements.loc[morphotopes.index, 'label'] = morphotopes.map(lambda value: value_to_code.loc[value]).values

    region_prefix = str(region_id) + '_'
    elements['morph'] = region_prefix + '-1'
    elements.loc[morphotopes.index, 'morph'] = region_prefix + morphotopes.values
    return elements

In [15]:
# Region hull table; its index supplies the region ids iterated in the cells below.
region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
)

In [16]:
%%time

## generate and save linkage matrices for each region
# np.save appends ".npy" to the given path; get_all_clusters later loads the
# files under that full name.
for region_id, _ in region_hulls.iterrows():
    data = read_region_morphotope_data(region_id)
    linkage_matrix = get_morphotope_linkage(data)
    np.save(f'/data/uscuni-ulce/processed_data/morphotope_linkage/{linkage}_{metric}_{region_id}', linkage_matrix)

processing 4
processing 10
processing 132
processing 134
processing 286
processing 313
processing 400
processing 523
processing 765
processing 801
processing 832
processing 913
processing 960
processing 1124
processing 1154
processing 1387
processing 1478
processing 1515
processing 1605
processing 1718
processing 1736
processing 1782
processing 1970
processing 1981
processing 2096
processing 2322
processing 2350
processing 2478
processing 2514
processing 2625
processing 2728
processing 2975
processing 3039
processing 3109
processing 3150
processing 3221
processing 3250
processing 3526
processing 3610
processing 3612
processing 3701
processing 3705
processing 3752
processing 3759
processing 3981
processing 4070
processing 4214
processing 4215
processing 4235
processing 4284
processing 4356
processing 4382
processing 4723
processing 4805
processing 5096
processing 5191
processing 5246
processing 5310
processing 5408
processing 5427
processing 5662
processing 5671
processing 5766
processi

### Set up the regional cutoff used to merge morphotopes within each region

In [17]:
# Distance threshold at which each region's linkage tree is cut
# (passed to fcluster through get_all_clusters).
regional_cutoff = 10

In [18]:
# Series mapping every morphotope id to its "<region_id>_<cluster>" label.
regional_clusters = get_all_clusters(cutoff=regional_cutoff)

processing 4
processing 10
processing 132
processing 134
processing 286
processing 313
processing 400
processing 523
processing 765
processing 801
processing 832
processing 913
processing 960
processing 1124
processing 1154
processing 1387
processing 1478
processing 1515
processing 1605
processing 1718
processing 1736
processing 1782
processing 1970
processing 1981
processing 2096
processing 2322
processing 2350
processing 2478
processing 2514
processing 2625
processing 2728
processing 2975
processing 3039
processing 3109
processing 3150
processing 3221
processing 3250
processing 3526
processing 3610
processing 3612
processing 3701
processing 3705
processing 3752
processing 3759
processing 3981
processing 4070
processing 4214
processing 4215
processing 4235
processing 4284
processing 4356
processing 4382
processing 4723
processing 4805
processing 5096
processing 5191
processing 5246
processing 5310
processing 5408
processing 5427
processing 5662
processing 5671
processing 5766
processi

In [19]:
%%time
# Raw (not standardised) morphotope tables of all regions, concatenated.
morphotopes_data = read_morphotopes_data(model_params)

CPU times: user 19.9 s, sys: 6.03 s, total: 25.9 s
Wall time: 11.6 s


In [20]:
# Keep only morphotopes that have a regional cluster label; ids ending in
# '-1' never get one because preprocess_data drops them.
morphotopes_data = morphotopes_data.loc[regional_clusters.index]
# Average the statistics of all morphotopes sharing a regional cluster.
regional_ward_morphotopes_data = morphotopes_data.groupby(regional_clusters).mean()
# Drop the last column, as read_region_morphotope_data does for the regional tables.
regional_ward_morphotopes_data = regional_ward_morphotopes_data.iloc[:, :-1]
regional_ward_morphotopes_data

Unnamed: 0_level_0,sdbAre,sdbAre,sdbAre,sdbAre,sdbAre,sdbPer,sdbPer,sdbPer,sdbPer,sdbPer,...,mibFR,mibFR,mibFR,mibFR,mibFR,mibSCo,mibSCo,mibSCo,mibSCo,mibSCo
Unnamed: 0_level_1,percentile_25,median,percentile_75,std,mean,percentile_25,median,percentile_75,std,mean,...,percentile_25,median,percentile_75,std,mean,percentile_25,median,percentile_75,std,mean
10019_1,41.665471,76.436068,114.555128,69.329920,89.562989,28.185691,37.749298,49.808288,21.854990,41.283216,...,2.661164,2.943937,3.328402,0.578406,2.960789,2.661164,2.943937,3.328402,0.578406,2.960789
10019_10,39.536272,70.727951,97.178991,44.660034,73.987544,27.004483,37.427027,44.696122,16.214666,37.906596,...,2.382708,2.626358,3.037813,0.576040,2.681680,2.382708,2.626358,3.037813,0.576040,2.681680
10019_11,27.982819,58.522489,98.133516,105.815268,77.201779,22.942131,32.178600,44.106748,17.561800,35.180016,...,1.916474,2.323647,2.827715,0.926689,2.406312,1.916474,2.323647,2.827715,0.926689,2.406312
10019_12,30.981144,63.237628,87.039080,148.940366,80.002312,24.363440,34.243514,44.781546,21.841073,36.871782,...,2.172962,2.655810,3.075035,1.136119,2.667299,2.172962,2.655810,3.075035,1.136119,2.667299
10019_13,32.550772,65.979051,114.580329,56.698458,77.739225,24.904164,37.455911,52.014800,19.656390,39.378638,...,2.191464,2.510311,2.903334,0.667618,2.525694,2.191464,2.510311,2.903334,0.667618,2.525694
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99886_95,22.705593,44.093908,127.004702,266.972364,134.748477,19.673543,30.492898,48.811016,37.853698,42.485245,...,1.230781,1.991971,3.209430,1.619403,2.451849,1.230781,1.991971,3.209430,1.619403,2.451849
99886_96,76.703455,130.039436,197.376553,314.455173,183.466571,39.552732,51.291909,71.041352,40.591541,62.298484,...,5.979692,5.979692,5.979692,0.358862,5.944671,5.979692,5.979692,5.979692,0.358862,5.944671
99886_97,78.443064,277.753932,1439.005059,4163.971949,1867.610948,39.536591,74.983949,194.321212,154.715489,143.031339,...,2.184342,4.618721,11.558035,10.780050,9.082772,2.184342,4.618721,11.558035,10.780050,9.082772
99886_98,50.376496,307.451302,885.957341,1622.432600,913.318189,29.622192,68.074508,146.062216,104.156035,101.757364,...,1.806908,4.963206,8.366598,6.178261,6.372601,1.806908,4.963206,8.366598,6.178261,6.372601


In [21]:
# region_id = 4
# data = read_region_morphotope_data(region_id)
# linkage_matrix = get_morphotope_linkage(data)

### Final ward clustering

In [22]:

# Same values as in the configuration cell, restated for the final clustering;
# get_morphotope_linkage reads these module-level names.
linkage = 'ward'
metric = 'euclidean'

In [23]:
# Standardise the regional cluster means and build one linkage tree across all regions.
linkage_matrix = get_morphotope_linkage(preprocess_data(regional_ward_morphotopes_data))

In [24]:
# Distance threshold at which the cross-region linkage tree is cut.
final_cutoff = 200

In [25]:
# Flat clusters from the cross-region tree, one label per regional cluster.
clusters = fcluster(linkage_matrix, t=final_cutoff, criterion='distance')
# Index by regional cluster label so morphotopes can be relabelled later.
final_clusters = pd.Series(clusters, regional_ward_morphotopes_data.index)
# Number of regional clusters that fall into each final cluster.
final_clusters.value_counts()

13    12627
11     7545
1      7465
3      6724
22     4946
12     4675
19     4492
5      4341
15     3043
14     2931
2      2424
7      2079
10     1383
17     1171
18      956
23      769
6       387
4       147
8         8
16        5
9         1
20        1
21        1
Name: count, dtype: int64

### Plotting

In [26]:
from core.cluster_validation import get_color

# One colour row per final cluster id, indexed by that id.
final_colors = pd.DataFrame(get_color(final_clusters.values), final_clusters.values)
# Deduplicate on the index (the cluster id), not on the colour values:
# dropping duplicate rows by value would remove a cluster id whenever two
# clusters share a colour, and a later final_colors.loc[<that id>] lookup
# would then fail with a KeyError.
final_colors = final_colors[~final_colors.index.duplicated(keep='first')]
# Fallback colour for elements labelled -1 (no final cluster).
final_colors.loc[-1] = [255,255,255]

In [27]:
# Region whose elements are loaded and plotted in the cells below.
region_id = 5883

In [28]:
# etcs=False to read buildings, etcs=True for tessellation cells.
# The result carries a 'morph' column ("<region_id>_<morphotope>") used for relabelling.
etcs = morphotopes_to_etcs(region_id, etcs=False)

In [29]:
## relabel
### regional_clusters = morphotopes -> regional_clusters
### final_clusters = regional_clusters -> final_clusters
# Two vectorised lookups replace a per-row Python lambda with nested .loc
# calls. Morphotope ids without a regional cluster end up as -1, as before.
# Note: a regional label missing from final_clusters now also yields -1
# instead of raising a KeyError.
etcs['final'] = (
    etcs['morph']
    .map(regional_clusters)   # morphotope id -> regional cluster label
    .map(final_clusters)      # regional cluster label -> final cluster id
    .fillna(-1)
    .astype('int64')
)

In [30]:
# Run this to change the colors of an existing layer:
# layer.get_fill_color = get_color(etcs.final)

In [31]:
# Reproject to EPSG:4326 and repair invalid geometries before plotting.
etcs = etcs.to_crs(epsg=4326)
etcs['geometry'] = etcs.geometry.make_valid()

In [32]:
# Keep only rows whose geometry type is exactly 'Polygon'.
# NOTE(review): any geometry that is (or became, after make_valid) a
# MultiPolygon or GeometryCollection is dropped here — confirm this is intended.
etcs = etcs[etcs['geometry'].geom_type == 'Polygon']

In [33]:
%%time
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
# Build the polygon layer from the filtered elements; fill colours are assigned in a later cell.
layer = lonboard.SolidPolygonLayer.from_geopandas(etcs, opacity=.7)

CPU times: user 169 ms, sys: 30.8 ms, total: 200 ms
Wall time: 199 ms


In [34]:
from sidecar import Sidecar
# Show the lonboard map inside a Sidecar panel titled 'Final Clusters'.
sc = Sidecar(title='Final Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.DarkMatter)
with sc:
    display(m)

In [35]:
# NOTE(review): get_color is imported again here but is not used in this cell.
from core.cluster_validation import get_color
# Look up one colour row per element from its final cluster id and cast to uint8.
layer.get_fill_color = final_colors.loc[etcs.final].values.astype('uint8')

In [45]:
# Final cluster to inspect; the shape shows how many plotted elements carry it.
target_cluster = 7
etcs[etcs.final == target_cluster].shape

(684, 9)

In [47]:
# etcs[etcs.final == target_cluster].explore(prefer_canvas=True, tiles='Cartodb positron')

In [None]:
## final dendrogram
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
fig,ax = plt.subplots(figsize=(20,20), dpi=200)
# NOTE(review): this draws every leaf of the cross-region tree; with many
# regional clusters it may be slow — dendrogram's truncate_mode could limit it.
# The return value is bound to `_` to suppress the cell output.
_ = dendrogram(linkage_matrix,ax=ax)