In [1]:
import momepy as mm
import numpy as np
import numba
import geopandas as gpd
import pandas as pd
import shapely
from sklearn.preprocessing import StandardScaler
from collections import namedtuple
from core.cluster_validation import get_linkage_matrix
from libpysal.graph import read_parquet, Graph
from core.generate_context import spatially_weighted_partial_lag
from core.generate_clusters import preprocess_clustering_data
from sklearn.cluster import AgglomerativeClustering
from core.cluster_validation import get_linkage_matrix
from scipy.cluster.hierarchy import fcluster


In [2]:
# --- Data locations ---------------------------------------------------------
regions_datadir = "/data/uscuni-ulce/"
morphotopes_dir = "/data/uscuni-ulce/processed_data/morphotopes/"

# Suffix that identifies the model run inside per-region morphotope file names.
model_params = "_100_0_None_None_False"
# model_params = '_75_0_None_None_False'

# Not referenced by any of the cells shown in this notebook section.
clip = None

# Top-level column names removed from the morphotope table before clustering.
to_drop = [
    "stcSAl",
    "stbOri",
    "stcOri",
    "stbCeA",
    "ldkAre",
    "ldkPer",
    "lskCCo",
    "lskERI",
    "lskCWA",
    "ltkOri",
    "ltkWNB",
    "likWBB",
    "likWCe",
]

# --- Clustering settings ----------------------------------------------------
# Passed to AgglomerativeClustering and embedded in the linkage file names.
linkage = "ward"
metric = "euclidean"

In [3]:
# Region used by the single-region mapping cells further down
# (it is the argument given to morphotopes_to_etcs).
region_id = 69333

In [29]:
def preprocess_data(data):
    """Turn a two-level morphotope statistics table into a scaled feature matrix.

    Steps, in order:

    1. Drop the 'percentile_25', 'percentile_75', 'median' and 'std' columns
       (second column level) and every characteristic listed in the module-level
       ``to_drop`` (first column level), then flatten the columns to the first
       level.
    2. Remove rows whose index string ends in '-1'
       (presumably the unassigned/noise morphotope -- TODO confirm).
    3. Standardise with ``StandardScaler`` and drop columns whose standard
       deviation is exactly 0 afterwards.
    4. Replace remaining NaN/inf values via ``np.nan_to_num``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with two column levels and a string index.

    Returns
    -------
    pandas.DataFrame
        Scaled features with single-level columns; ``data`` is not modified.
    """
    unwanted_stats = ['percentile_25', 'percentile_75', 'median', 'std']
    features = (
        data
        .drop(columns=unwanted_stats, level=1)
        .drop(columns=to_drop, level=0)
    )
    features.columns = features.columns.get_level_values(0)

    # Row filter on the last two characters of the index label.
    ends_with_minus_one = features.index.str[-2:] == '-1'
    features = features[~ends_with_minus_one]

    scaled = pd.DataFrame(
        StandardScaler().fit_transform(features),
        columns=features.columns,
        index=features.index,
    )
    # Columns with zero spread carry no information for the clustering.
    constant_columns = scaled.columns[scaled.std() == 0]
    scaled = scaled.drop(constant_columns, axis=1)

    # component_data = component_data[component_data.index >= 0]
    return pd.DataFrame(
        np.nan_to_num(scaled),
        columns=scaled.columns,
        index=scaled.index,
    )

def read_region_morphotope_data(region_id):
    """Load one region's morphotope table and return it ready for clustering.

    The parquet file is located through the module-level ``morphotopes_dir`` and
    ``model_params``. Index labels are prefixed with ``'<region_id>_'``, the last
    column is discarded, and the rest goes through ``preprocess_data``.

    Parameters
    ----------
    region_id
        Region identifier used in the file name and as index prefix.

    Returns
    -------
    pandas.DataFrame
        Output of ``preprocess_data`` for this region.
    """
    print('processing' , region_id)
    source = f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq'
    raw = pd.read_parquet(source)
    raw.index = str(region_id) + '_' + raw.index
    # The trailing column is excluded from the features.
    without_last_column = raw.iloc[:, :-1]
    return preprocess_data(without_last_column)

def get_morphotope_linkage(region_data):
    """Fit agglomerative clustering on ``region_data`` and return its linkage matrix.

    The linkage criterion and metric come from the module-level ``linkage`` and
    ``metric`` settings. The full tree and the merge distances are computed so
    that ``get_linkage_matrix`` can convert the fitted model.

    Parameters
    ----------
    region_data
        Feature matrix, one row per observation.

    Returns
    -------
    Whatever ``get_linkage_matrix`` produces for the fitted model
    (used elsewhere in this notebook as input to ``scipy`` ``fcluster``).
    """
    settings = dict(
        linkage=linkage,
        metric=metric,
        compute_full_tree=True,
        compute_distances=True,
    )
    fitted = AgglomerativeClustering(**settings).fit(region_data)
    return get_linkage_matrix(fitted)


def get_all_clusters(cutoff):
    """Cut every region's stored linkage tree and collect the labels.

    For each region id in the index of the region hull table, the region's
    preprocessed morphotope data is read (only its index is used), the saved
    linkage matrix is loaded from disk and cut with ``fcluster`` at distance
    ``cutoff``.

    Parameters
    ----------
    cutoff
        Distance threshold passed to ``fcluster`` (``criterion='distance'``).

    Returns
    -------
    pandas.Series
        Indexed by region-prefixed morphotope id; values are strings of the
        form ``'<region_id>_<cluster>'``.
    """
    hulls_path = regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
    region_hulls = gpd.read_parquet(hulls_path)

    def _cut_region(region_id):
        # Labels of one region, prefixed so they stay unique across regions.
        region_index = read_region_morphotope_data(region_id).index
        tree = np.load(f'/data/uscuni-ulce/processed_data/morphotope_linkage/{linkage}_{metric}_{region_id}.npy')
        flat_labels = fcluster(tree, t=cutoff, criterion='distance')
        as_text = pd.Series(flat_labels, region_index).astype(str)
        return str(region_id) + '_' + as_text

    per_region = [_cut_region(region_id) for region_id in region_hulls.index]
    return pd.concat(per_region)


def read_morphotopes_data(model_params):
    """Read and stack the raw morphotope tables of all regions.

    Region ids come from the index of the region hull table. Each region's
    table is read from ``morphotopes_dir`` and its index labels are prefixed
    with ``'<region_id>_'``. No scaling or column filtering is applied.

    Parameters
    ----------
    model_params : str
        File-name suffix selecting the model run.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all regional tables.
    """
    ### primary chars
    region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
    )

    def _load_region(region_id):
        # One regional table with region-prefixed index labels.
        table = pd.read_parquet(f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq')
        table.index = str(region_id) + '_' + table.index.str[:]
        return table

    tables = [_load_region(region_id) for region_id in region_hulls.index]
    return pd.concat(tables)

def morphotopes_to_etcs(region_id, etcs=True, model_params='_100_0_None_None_False'):
    """Attach morphotope assignments to a region's tessellation cells or buildings.

    Parameters
    ----------
    region_id
        Region identifier used in the file names and as prefix of ``morph``.
    etcs : bool, default True
        If truthy, the region's tessellation file is loaded; otherwise the
        region's buildings file.
    model_params : str
        File-name suffix selecting the morphotope label file.

    Returns
    -------
    geopandas.GeoDataFrame
        The loaded geometries with two added columns:

        * ``label`` -- integer position of the row's morphotope id among the
          sorted unique ids of the region, or ``-1`` for rows that are not in
          the label file;
        * ``morph`` -- ``'<region_id>_<morphotope id>'``, or
          ``'<region_id>_-1'`` for rows that are not in the label file.
    """
    if etcs:
        elements = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/tessellations/tessellation_{region_id}.parquet')
    else:
        elements = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/buildings/buildings_{region_id}.parquet')

    assignments = pd.read_parquet(f'/data/uscuni-ulce/processed_data/morphotopes/tessellation_labels_morphotopes_{region_id}{model_params}.pq')
    # Only the first column holds the morphotope ids; using it as a 1-D Series
    # keeps the assignments below one-dimensional whatever the column is called.
    morphotope_ids = assignments.iloc[:, 0]

    # `codes[i]` is the position of id i in the sorted unique ids.
    _, codes = np.unique(morphotope_ids.to_numpy(), return_inverse=True)

    elements['label'] = -1
    elements.loc[morphotope_ids.index, 'label'] = codes

    elements['morph'] = str(region_id) + '_' + '-1'
    elements.loc[morphotope_ids.index, 'morph'] = (str(region_id) + '_' + morphotope_ids).to_numpy()
    return elements

In [5]:
# Load the region hull table; its index supplies the region ids that the
# per-region loops in this notebook iterate over.
region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
)

In [6]:
### generate and save linkage matrices for each region

# %%time
# for region_id, _ in region_hulls.iterrows():
#     data = read_region_morphotope_data(region_id)
#     linkage_matrix = get_morphotope_linkage(data)
#     np.save(f'/data/uscuni-ulce/processed_data/morphotope_linkage/{linkage}_{metric}_{region_id}', linkage_matrix)

In [7]:
# Distance threshold at which each region's saved linkage tree is cut
# (forwarded to fcluster with criterion='distance' inside get_all_clusters).
cutoff = 20

In [8]:
# Cut every region's linkage tree at `cutoff`. The result is a Series indexed by
# region-prefixed morphotope id, holding '<region_id>_<cluster>' strings.
all_clusters = get_all_clusters(cutoff=cutoff)

processing 4
processing 10
processing 132
processing 134
processing 286
processing 313
processing 400
processing 523
processing 765
processing 801
processing 832
processing 913
processing 960
processing 1124
processing 1154
processing 1387
processing 1478
processing 1515
processing 1605
processing 1718
processing 1736
processing 1782
processing 1970
processing 1981
processing 2096
processing 2322
processing 2350
processing 2478
processing 2514
processing 2625
processing 2728
processing 2975
processing 3039
processing 3109
processing 3150
processing 3221
processing 3250
processing 3526
processing 3610
processing 3612
processing 3701
processing 3705
processing 3752
processing 3759
processing 3981
processing 4070
processing 4214
processing 4215
processing 4235
processing 4284
processing 4356
processing 4382
processing 4723
processing 4805
processing 5096
processing 5191
processing 5246
processing 5310
processing 5408
processing 5427
processing 5662
processing 5671
processing 5766
processi

In [19]:
%%time
# Concatenate the raw (unscaled) morphotope tables of all regions.
morphotopes_data = read_morphotopes_data(model_params)

CPU times: user 19.7 s, sys: 6.15 s, total: 25.8 s
Wall time: 11.5 s


In [20]:
# Keep only the morphotopes that received a regional cluster label.
morphotopes_data = morphotopes_data.loc[all_clusters.index]

# Average the morphotope statistics per regional cluster, then discard the
# trailing column (read_region_morphotope_data drops the last column the same way).
regional_ward_morphotopes_data = (
    morphotopes_data
    .groupby(all_clusters)
    .mean()
    .iloc[:, :-1]
)
regional_ward_morphotopes_data

Unnamed: 0_level_0,sdbAre,sdbAre,sdbAre,sdbAre,sdbAre,sdbPer,sdbPer,sdbPer,sdbPer,sdbPer,...,mibFR,mibFR,mibFR,mibFR,mibFR,mibSCo,mibSCo,mibSCo,mibSCo,mibSCo
Unnamed: 0_level_1,percentile_25,median,percentile_75,std,mean,percentile_25,median,percentile_75,std,mean,...,percentile_25,median,percentile_75,std,mean,percentile_25,median,percentile_75,std,mean
10019_1,34.536730,114.068308,278.934558,650.286045,282.583524,25.312069,48.141677,77.322486,50.642453,60.643572,...,2.598926,3.892183,5.491581,3.721435,4.680928,2.598926,3.892183,5.491581,3.721435,4.680928
10019_2,30.883804,63.774971,98.082417,118.270604,86.344882,23.746705,34.429539,43.927445,19.900002,36.490578,...,1.841299,2.354463,2.793675,0.866916,2.386156,1.841299,2.354463,2.793675,0.866916,2.386156
10019_3,50.598924,82.293761,131.969295,105.118231,109.614812,30.607161,39.419854,50.402463,22.647987,43.887531,...,3.341867,3.653477,4.095116,0.712386,3.687432,3.341867,3.653477,4.095116,0.712386,3.687432
10019_4,36.518561,67.753131,101.979863,68.609528,80.161853,25.973667,35.517353,46.121608,18.817697,38.196474,...,2.414012,2.752912,3.107820,0.688489,2.766476,2.414012,2.752912,3.107820,0.688489,2.766476
10019_5,27.481523,53.508844,85.153879,47.915400,61.898621,22.425619,31.399784,39.227095,14.344752,32.352298,...,2.028884,2.413209,2.799706,0.663114,2.414690,2.028884,2.413209,2.799706,0.663114,2.414690
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99886_5,95.596808,267.806174,554.471447,615.310948,442.940138,41.766585,73.921126,114.849461,64.424965,87.911153,...,3.183112,4.803344,6.459835,2.904836,5.124597,3.183112,4.803344,6.459835,2.904836,5.124597
99886_6,71.697345,153.279637,268.781231,201.388804,206.603629,37.332438,56.601564,85.713352,43.389526,67.518114,...,3.438859,4.044486,4.721955,1.205909,4.078275,3.438859,4.044486,4.721955,1.205909,4.078275
99886_7,91.632803,199.029520,418.219510,425.441030,331.467639,42.386874,67.355284,101.837884,54.888611,80.118555,...,4.568383,5.637538,6.847343,2.202609,5.817028,4.568383,5.637538,6.847343,2.202609,5.817028
99886_8,126.632340,229.324058,393.981760,236.712805,292.266066,47.073777,78.150717,118.100314,58.474686,91.897213,...,4.452137,4.617901,4.657044,0.491905,4.584646,4.452137,4.617901,4.657044,0.491905,4.584646


In [12]:
# Second-level clustering: preprocess (scale) the regional cluster means and
# build a single linkage tree over all of them.
linkage_matrix = get_morphotope_linkage(preprocess_data(regional_ward_morphotopes_data))

In [12]:
# region_id = 4
# data = read_region_morphotope_data(region_id)
# linkage_matrix = get_morphotope_linkage(data)

processing 4


In [65]:
# import matplotlib.pyplot as plt
# from scipy.cluster.hierarchy import dendrogram
# fig,ax = plt.subplots(figsize=(20,20), dpi=200)
# _ = dendrogram(linkage_matrix,ax=ax)

In [13]:
# Distance threshold for cutting the tree built over the regional cluster means.
# NOTE(review): this rebinds the same `cutoff` name that the earlier
# get_all_clusters cell reads (set to 20 there) -- re-running that cell after
# this one would cut the per-region trees at 2 instead.
cutoff = 2

In [14]:
# Flat cluster labels obtained by cutting the hierarchical linkage tree.
# NOTE(review): despite the variable name, no k-means is involved here.
kmeans_clusters = fcluster(linkage_matrix, t=cutoff, criterion='distance')

In [15]:
# Work on a copy so the raw fcluster output stays untouched.
clusters = kmeans_clusters.copy()

In [22]:
# Index the final labels by regional cluster id.
# NOTE(review): assumes preprocess_data kept every row of
# regional_ward_morphotopes_data (it filters index values ending in '-1');
# otherwise the lengths would not match.
final_clusters = pd.Series(clusters, regional_ward_morphotopes_data.index)

In [17]:
### all_clusters: maps each morphotope id to its regional cluster id
### final_clusters: maps each regional cluster id to its final cluster label

4_0_0               4_148
4_0_1               4_211
4_0_10              4_259
4_0_11              4_562
4_0_12              4_385
                  ...    
139096_942_0    139096_15
139096_98_0     139096_13
139096_990_0    139096_10
139096_99_0      139096_8
139096_99_1     139096_16
Length: 356332, dtype: object

In [24]:
# Load the tessellation cells of `region_id` together with their morphotope
# ids ('morph') and integer codes ('label').
etcs = morphotopes_to_etcs(region_id)

In [46]:
# Two vectorised look-ups: morphotope id -> regional cluster (all_clusters)
# -> final cluster (final_clusters). Morphotope ids that are absent from
# all_clusters come out as NaN after mapping and are assigned -1.
etcs['final'] = (
    etcs['morph']
    .map(all_clusters)
    .map(final_clusters)
    .fillna(-1)
    .astype('int64')
)

In [47]:
# from core.cluster_validation import get_color
# layer.get_fill_color = get_color(etcs.label)

In [48]:
# Reproject to EPSG:4326 before the geometries are handed to lonboard below.
etcs = etcs.to_crs(epsg=4326)

In [49]:
# Replace the geometry column with repaired (valid) geometries.
etcs['geometry'] = etcs.geometry.make_valid()

In [54]:
%%time
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
# Build a polygon layer from the region's cells; fill colours are assigned in a later cell.
layer = lonboard.SolidPolygonLayer.from_geopandas(etcs, opacity=.08)

CPU times: user 4.58 s, sys: 500 ms, total: 5.08 s
Wall time: 5.07 s


In [55]:
from sidecar import Sidecar
# Render the map inside a Sidecar container titled 'Final Clusters'.
sc = Sidecar(title='Final Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

In [56]:
from core.cluster_validation import get_color
# Fill colours are derived from the final cluster labels via get_color.
layer.get_fill_color = get_color(etcs.final)

In [102]:
# Inspect one morphotope by its integer code in etcs.label
# (note: this filters on 'label', not on the 'final' cluster column).
target_cluster = 23
etcs[etcs.label == target_cluster].shape

(1802, 3)

In [103]:

# Interactive map of the cells whose 'label' equals target_cluster.
etcs[etcs.label == target_cluster].explore(prefer_canvas=True, tiles='Cartodb positron')