In [1]:
import momepy as mm
import numpy as np
import numba
import geopandas as gpd
import pandas as pd
import shapely
from sklearn.preprocessing import StandardScaler
from collections import namedtuple
from core.cluster_validation import get_linkage_matrix
from libpysal.graph import read_parquet, Graph
from core.generate_context import spatially_weighted_partial_lag
from core.generate_clusters import preprocess_clustering_data
from sklearn.cluster import AgglomerativeClustering
from core.cluster_validation import get_linkage_matrix
from scipy.cluster.hierarchy import fcluster

In [2]:
# Root folder of the dataset and the folder holding per-region morphotope tables.
regions_datadir = "/data/uscuni-ulce/"
morphotopes_dir = "/data/uscuni-ulce/processed_data/morphotopes/"

# Suffix appended to the region id in the morphotope parquet file names.
model_params = "_100_0_None_None_False"
clip = None

# Level-0 column names removed by `preprocess_data` before scaling.
to_drop = [
    "stcSAl", "stbOri", "stcOri", "stbCeA",
    "ldkAre", "ldkPer", "lskCCo", "lskERI", "lskCWA",
    "ltkOri", "ltkWNB", "likWBB", "likWCe",
]

# Settings handed to AgglomerativeClustering; they are also part of the
# file names of the saved linkage matrices.
linkage = "ward"
metric = "euclidean"

In [3]:
def preprocess_data(data):
    """Turn a table of morphotope statistics into a standardised feature matrix.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with two-level columns (level 0: character name, level 1:
        statistic name) and a string index.

    Returns
    -------
    pandas.DataFrame
        One column per remaining character, standardised with
        ``StandardScaler``, with zero-variance columns removed and
        non-finite values replaced by ``np.nan_to_num``.
    """
    # Discard every statistic listed here; whatever level-1 entries remain
    # (the notebook output shows 'mean') become the features.
    unused_stats = ['percentile_25', 'percentile_75', 'median', 'std']
    features = data.drop(columns=unused_stats, level=1).drop(columns=to_drop, level=0)
    features.columns = features.columns.get_level_values(0)

    # Rows whose id ends in '-1' are left out of the clustering input.
    keep_rows = features.index.str[-2:] != '-1'
    features = features[keep_rows]

    scaled = pd.DataFrame(
        StandardScaler().fit_transform(features),
        columns=features.columns,
        index=features.index,
    )

    # A column with zero standard deviation carries no information.
    constant_columns = scaled.columns[scaled.std() == 0]
    scaled = scaled.drop(constant_columns, axis=1)

    return pd.DataFrame(
        np.nan_to_num(scaled),
        columns=scaled.columns,
        index=scaled.index,
    )

def read_region_morphotope_data(region_id):
    """Load one region's morphotope table and return it preprocessed.

    The index of the loaded table is prefixed with ``'<region_id>_'`` and the
    last column is dropped before the frame is passed to ``preprocess_data``.
    The file location is built from the module-level ``morphotopes_dir`` and
    ``model_params``.
    """
    print('processing' , region_id)

    source = f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq'
    raw = pd.read_parquet(source)
    raw.index = str(region_id) + '_' + raw.index

    without_last_column = raw.iloc[:, :-1]
    return preprocess_data(without_last_column)

def get_morphotope_linkage(region_data):
    """Fit agglomerative clustering on ``region_data`` and return its linkage matrix.

    The module-level ``linkage`` and ``metric`` settings are used. The full
    tree and merge distances are computed, and the fitted model is converted
    with ``get_linkage_matrix``.
    """
    fitted = AgglomerativeClustering(
        linkage=linkage,
        metric=metric,
        compute_full_tree=True,
        compute_distances=True,
    ).fit(region_data)
    return get_linkage_matrix(fitted)


def get_all_clusters(cutoff):
    """Cut every region's saved linkage matrix at ``cutoff``.

    Parameters
    ----------
    cutoff : float
        Passed as ``t`` to ``fcluster`` with ``criterion='distance'``.

    Returns
    -------
    pandas.Series
        Indexed by the prefixed morphotope ids of all regions; each value is
        a string of the form ``'<region_id>_<cluster number>'``.
    """
    region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
    )

    def _cut_region(region_id):
        # The preprocessed table is read only for its index, which labels
        # the rows of the stored linkage matrix.
        morphotope_ids = read_region_morphotope_data(region_id).index
        tree = np.load(f'/data/uscuni-ulce/processed_data/morphotope_linkage/{linkage}_{metric}_{region_id}.npy')
        flat = fcluster(tree, t=cutoff, criterion='distance')
        labels = pd.Series(flat, morphotope_ids).astype(str)
        return str(region_id) + '_' + labels

    # The index of the hull table supplies the region ids.
    return pd.concat([_cut_region(region_id) for region_id in region_hulls.index])


def read_morphotopes_data(model_params):
    """Concatenate the unprocessed morphotope tables of all regions.

    Parameters
    ----------
    model_params : str
        Suffix that follows the region id in each parquet file name.

    Returns
    -------
    pandas.DataFrame
        All regional tables stacked, with every index entry prefixed by
        ``'<region_id>_'``.
    """
    # The index of the hull table supplies the region ids.
    region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
    )

    def _load_region(region_id):
        frame = pd.read_parquet(f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq')
        # `.str[:]` takes a full slice of every id, leaving the text unchanged.
        frame.index = str(region_id) + '_' + frame.index.str[:]
        return frame

    return pd.concat([_load_region(region_id) for region_id in region_hulls.index])

def morphotopes_to_etcs(region_id, etcs=True, model_params='_100_0_None_None_False'):
    """Attach morphotope labels to a region's geometries.

    Parameters
    ----------
    region_id : int or str
        Region identifier used in the file names.
    etcs : bool, default True
        If true, load the region's tessellation file; otherwise load its
        buildings file.
    model_params : str
        Suffix that follows the region id in the morphotope label file name.

    Returns
    -------
    geopandas.GeoDataFrame
        The loaded geometries with two added columns:

        - ``label``: integer position of the row's morphotope id among the
          sorted unique ids of the region, or ``-1`` for rows that are not in
          the label table;
        - ``morph``: ``'<region_id>_<morphotope id>'``, or ``'<region_id>_-1'``
          for rows that are not in the label table.
    """
    if etcs:
        geoms = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/tessellations/tessellation_{region_id}.parquet')
    else:
        geoms = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/buildings/buildings_{region_id}.parquet')

    morphotopes = pd.read_parquet(f'/data/uscuni-ulce/processed_data/morphotopes/tessellation_labels_morphotopes_{region_id}{model_params}.pq')
    # Only the first column holds the morphotope id. Work on it as a 1-D
    # series instead of writing a copy of it back into the loaded table.
    morphotope_ids = morphotopes.iloc[:, 0].astype(str)

    # `return_inverse` yields, for every row, the position of its id in the
    # sorted unique ids. This is the same code the per-element dictionary
    # lookup produced, computed in one vectorised pass.
    _, codes = np.unique(morphotope_ids.to_numpy(), return_inverse=True)

    prefix = str(region_id) + '_'

    geoms['label'] = -1
    geoms.loc[morphotope_ids.index, 'label'] = codes

    geoms['morph'] = prefix + '-1'
    geoms.loc[morphotope_ids.index, 'morph'] = (prefix + morphotope_ids).to_numpy()
    return geoms

In [173]:
# region_hulls = gpd.read_parquet(
#         regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
# )
# region_hulls.explore()

In [63]:
### Generate and save linkage matrices for each region

# %%time
# for region_id, _ in region_hulls.iterrows():
#     data = read_region_morphotope_data(region_id)
#     linkage_matrix = get_morphotope_linkage(data)
#     np.save(f'/data/uscuni-ulce/processed_data/morphotope_linkage/{linkage}_{metric}_{region_id}', linkage_matrix)

### Set up the regional cutoff used to merge regional morphotopes

In [60]:
# Distance threshold handed to `get_all_clusters`, which passes it as `t` to
# `fcluster(..., criterion='distance')` for each region's saved linkage matrix.
regional_cutoff = 15

In [61]:
# Series indexed by prefixed morphotope id; each value is the regional cluster
# label '<region_id>_<cluster number>' obtained at `regional_cutoff`.
regional_clusters = get_all_clusters(cutoff=regional_cutoff)

processing 4
processing 10
processing 132
processing 134
processing 286
processing 313
processing 400
processing 523
processing 765
processing 801
processing 832
processing 913
processing 960
processing 1124
processing 1154
processing 1387
processing 1478
processing 1515
processing 1605
processing 1718
processing 1736
processing 1782
processing 1970
processing 1981
processing 2096
processing 2322
processing 2350
processing 2478
processing 2514
processing 2625
processing 2728
processing 2975
processing 3039
processing 3109
processing 3150
processing 3221
processing 3250
processing 3526
processing 3610
processing 3612
processing 3701
processing 3705
processing 3752
processing 3759
processing 3981
processing 4070
processing 4214
processing 4215
processing 4235
processing 4284
processing 4356
processing 4382
processing 4723
processing 4805
processing 5096
processing 5191
processing 5246
processing 5310
processing 5408
processing 5427
processing 5662
processing 5671
processing 5766
processi

In [62]:
%%time
# Load and stack the unprocessed morphotope tables of every region.
morphotopes_data = read_morphotopes_data(model_params)

CPU times: user 19.7 s, sys: 5.98 s, total: 25.7 s
Wall time: 11.4 s


In [63]:
# Keep only the morphotopes that received a regional cluster label.
morphotopes_data = morphotopes_data.loc[regional_clusters.index]

# Average the morphotope statistics within each regional cluster, then drop
# the last column.
regional_ward_morphotopes_data = (
    morphotopes_data
    .groupby(regional_clusters)
    .mean()
    .iloc[:, :-1]
)
regional_ward_morphotopes_data

Unnamed: 0_level_0,sdbAre,sdbAre,sdbAre,sdbAre,sdbAre,sdbPer,sdbPer,sdbPer,sdbPer,sdbPer,...,mibFR,mibFR,mibFR,mibFR,mibFR,mibSCo,mibSCo,mibSCo,mibSCo,mibSCo
Unnamed: 0_level_1,percentile_25,median,percentile_75,std,mean,percentile_25,median,percentile_75,std,mean,...,percentile_25,median,percentile_75,std,mean,percentile_25,median,percentile_75,std,mean
10019_1,34.536730,114.068308,278.934558,650.286045,282.583524,25.312069,48.141677,77.322486,50.642453,60.643572,...,2.598926,3.892183,5.491581,3.721435,4.680928,2.598926,3.892183,5.491581,3.721435,4.680928
10019_10,29.591414,77.321654,147.444861,163.791489,118.394178,23.559792,37.995651,52.087817,25.324299,41.711235,...,2.469697,2.901412,3.418221,1.276734,3.065962,2.469697,2.901412,3.418221,1.276734,3.065962
10019_11,18.847849,36.570236,92.169744,49.536524,57.364671,19.071898,26.931763,41.016204,14.213961,30.502720,...,2.440881,2.841616,3.200131,0.694075,2.823337,2.440881,2.841616,3.200131,0.694075,2.823337
10019_12,28.424606,61.517916,111.470829,74.553089,80.058668,23.275742,33.979346,46.376135,19.125748,36.903047,...,2.412081,2.787745,3.168981,0.698953,2.752917,2.412081,2.787745,3.168981,0.698953,2.752917
10019_13,27.907766,61.151440,108.360001,78.276632,78.002345,22.848842,33.673006,45.144735,17.426332,35.850793,...,2.283845,2.662463,3.062158,0.792215,2.674245,2.283845,2.662463,3.062158,0.792215,2.674245
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99886_52,61.498067,123.278905,185.278598,179.430589,156.523084,32.881491,47.836831,61.761740,31.255048,52.045140,...,2.226469,2.883438,3.406777,1.110289,2.908672,2.226469,2.883438,3.406777,1.110289,2.908672
99886_6,49.327802,141.792385,391.914222,1008.970477,379.480664,29.407011,51.412164,91.171801,64.928935,70.199947,...,2.051694,3.282357,4.950029,3.070233,3.932878,2.051694,3.282357,4.950029,3.070233,3.932878
99886_7,33.455443,84.415063,307.813081,924.528262,377.563642,23.647200,39.388886,79.117487,68.012007,64.769012,...,1.616690,2.722175,5.511442,3.905005,4.162024,1.616690,2.722175,5.511442,3.905005,4.162024
99886_8,68.056427,199.920394,415.662993,757.820836,432.521022,36.107616,63.178272,95.083265,74.579926,81.734931,...,3.430934,5.187315,7.172479,3.947179,6.131818,3.430934,5.187315,7.172479,3.947179,6.131818


In [64]:
# region_id = 4
# data = read_region_morphotope_data(region_id)
# linkage_matrix = get_morphotope_linkage(data)

### Final Ward clustering

In [140]:
# Distance threshold used as `t` in `fcluster(..., criterion='distance')` when
# cutting the linkage matrix built from the regional cluster averages.
final_cutoff = 30

In [141]:
# Standardise the regional cluster averages with `preprocess_data`, then fit
# one agglomerative tree over all regional clusters.
linkage_matrix = get_morphotope_linkage(preprocess_data(regional_ward_morphotopes_data))

In [142]:
# Cut the final tree at `final_cutoff`; one flat cluster id per regional cluster.
clusters = fcluster(linkage_matrix, criterion='distance', t=final_cutoff)
final_clusters = pd.Series(
    data=clusters,
    index=regional_ward_morphotopes_data.index,
)

# Number of regional clusters that fall into each final cluster.
final_clusters.value_counts()

36     639
23     619
144    432
146    422
11     421
      ... 
155      1
132      1
94       1
90       1
44       1
Name: count, Length: 212, dtype: int64

### Plotting

In [143]:
from core.cluster_validation import get_color

# Lookup table with one colour row per final cluster id (the index).
final_colors = pd.DataFrame(get_color(final_clusters.values), final_clusters.values)

# De-duplicate on the cluster id, not on the colour values. `drop_duplicates()`
# compares row contents only, so two ids that share a colour would lose one
# entry and a later `final_colors.loc[...]` lookup would raise KeyError.
final_colors = final_colors[~final_colors.index.duplicated(keep='first')]

# Id -1 marks geometries without a final cluster; draw them in black.
final_colors.loc[-1] = [0, 0, 0]

In [174]:
# Region ids of selected cities (<region_id> - <city>):
# 107131 - krakow
# 86873 - vienna
# 69333 - prague
# 4 - rhineruhr
# 55763 - berlin
# 99886 - bratislava
# 5883 - freiburg
# 16242 - hamburg
# 38679 - munich

In [180]:
# Region to plot; 5883 is listed as freiburg in the region id notes of the previous cell.
region_id = 5883

In [181]:
# etcs=False reads the region's buildings file; etcs=True reads its
# tessellation file. Either way the result carries the `label` and `morph`
# columns added by `morphotopes_to_etcs`.
etcs = morphotopes_to_etcs(region_id, etcs=False)

In [182]:
## relabel: morphotope id -> regional cluster -> final cluster
# `regional_clusters` maps each morphotope id to its regional cluster label
# and `final_clusters` maps each regional cluster label to its final cluster id.
# Chaining two vectorised `Series.map` calls replaces a Python-level lambda
# that ran two `.loc` lookups per geometry. Ids missing from
# `regional_clusters` fall through as NaN and end up as -1.
etcs['final'] = (
    etcs['morph']
    .map(regional_clusters)
    .map(final_clusters)
    .fillna(-1)
    .astype(int)
)

In [183]:
# # can run this to change colors on an existing layer
# layer.get_fill_color = get_color(etcs.final)

In [184]:
# Reproject to EPSG:4326 before the geometries are handed to the lonboard
# layer, then repair invalid geometries.
etcs = etcs.to_crs(epsg=4326)
etcs['geometry'] = etcs.geometry.make_valid()

In [185]:
# Keep only rows whose geometry type is exactly 'Polygon'; every other
# geometry type is dropped from the plot.
etcs = etcs[etcs['geometry'].geom_type == 'Polygon']

In [186]:
%%time
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
# Build a semi-transparent polygon layer from the filtered geometries; its
# fill colour is assigned in later cells.
layer = lonboard.SolidPolygonLayer.from_geopandas(etcs, opacity=.7)

CPU times: user 181 ms, sys: 25 ms, total: 206 ms
Wall time: 206 ms


In [187]:
from sidecar import Sidecar
# Show the map inside a Sidecar titled 'Final Clusters', on a Positron basemap.
sc = Sidecar(title='Final Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

In [171]:
# Colour by final cluster: look up one colour row per value of `etcs.final`
# and cast to uint8. The -1 row added to `final_colors` is black.
layer.get_fill_color = final_colors.loc[etcs.final].values.astype('uint8')

In [188]:
# Colour by regional morphotope, using the integer code in `etcs.label`
# (-1 where the geometry has no morphotope); this replaces the fill colour
# currently set on the layer.
layer.get_fill_color = get_color(etcs.label)

In [153]:
# Final cluster id to inspect; the shape gives the number of plotted
# geometries assigned to it and the number of columns.
target_cluster = 96
etcs[etcs.final == target_cluster].shape

(2636, 9)

In [None]:
# etcs[etcs.final == target_cluster].explore(prefer_canvas=True, tiles='Cartodb positron')

In [None]:
## Final dendrogram: plot the tree stored in `linkage_matrix`.
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
fig,ax = plt.subplots(figsize=(20,20), dpi=200)
# The returned dict is assigned to `_` so that it is not echoed as cell output.
_ = dendrogram(linkage_matrix,ax=ax)