In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from libpysal.graph import read_parquet
from core.generate_clusters import get_tree, get_clusters

In [2]:
chars_dir = "/data/uscuni-ulce/processed_data/chars/"
graph_dir = "/data/uscuni-ulce/processed_data/neigh_graphs/"
tessellations_dir = '/data/uscuni-ulce/processed_data/tessellations/'

In [3]:
region_id = 5883 # freiburtg
# region_id = 69333 # prague
# region_id = 139196 # prague

# region_id = 66593


region_id = 69333 # prague


In [4]:
X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')
graph = read_parquet(graph_dir + f"tessellation_graph_{region_id}.parquet")
tessellation = gpd.read_parquet(
        tessellations_dir + f"tessellation_{region_id}.parquet"
)

building_graph = graph.subgraph(graph.unique_ids[graph.unique_ids >= 0])
labels = building_graph.component_labels

In [6]:

### clustering parameters
min_cluster_size = 75

# spatial_lag = 3
# kernel='gaussian' 
# lag_type = '_median'

lag_type = None
spatial_lag = 0
kernel='None'

clip = None

to_drop = ['stcSAl','stbOri','stcOri','stbCeA', 
           'ldkAre', 'ldkPer', 'lskCCo', 'lskERI',
           'lskCWA', 'ltkOri', 'ltkWNB', 'likWBB', 'likWCe',
          'licBAD',
          'misBAD',

           'ssbCCM',
           'ssbCCD'
           
          ]


linkage='complete'
metric='euclidean'
eom_clusters = False


from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer
scalar = QuantileTransformer(subsample=None, output_distribution='uniform')
# scalar = QuantileTransformer(subsample=None, output_distribution='normal')
# scalar = PowerTransformer()

In [7]:
def preprocess_clustering_data(X_train, scalar, clip, to_drop):
    '''Data pre-processing before clustering is carried out.'''
    ## drop non-buildings
    X_train = X_train[X_train.index >= 0]

    # drop 'to_drop' columns and spatial lag
    all_drop = []
    for c in to_drop:
        all_drop += X_train.columns[X_train.columns.str.contains(c)].tolist()
    X_train = X_train.drop(all_drop, axis=1)

    # standardise data
    vals = scalar.fit_transform(X_train)
    X_train = pd.DataFrame(vals, columns=X_train.columns, index=X_train.index)
    vals = np.nan_to_num(X_train)
    X_train = pd.DataFrame(vals, columns=X_train.columns, index=X_train.index)

    # drop any columns with no variation
    stats = X_train.describe()
    X_train = X_train.drop(stats.columns[stats.loc['std'] == 0], axis=1)

    #optionally clip the data
    if clip is not None:
        X_train = X_train.clip(*clip)

    return X_train

In [8]:
def cluster_data(X_train, graph, scalar, to_drop, clip, 
                 min_cluster_size, linkage, metric, eom_clusters=True):
    '''Split the input data into connected components and carry out an agglomerative clustering for each component independently.
    Pre-process the input data, cluster and then carry out post-processing and finally combine all the seperate clusterings into one set of clusters.'''
    
    # label building input data, could work with empty tess as well
    building_graph = graph.subgraph(graph.unique_ids[graph.unique_ids >= 0])
    labels = building_graph.component_labels
    
    results = {}
    
    for label, group in labels.groupby(labels):
    
        if group.shape[0] <= min_cluster_size:
            component_clusters = np.full(group.shape[0], -1)
    
        else:
            component_buildings_data = preprocess_clustering_data(X_train.loc[group.index.values], 
                                                                  scalar=scalar, clip=clip, to_drop=to_drop)
            component_graph = building_graph.subgraph(group.index.values)
            ward_tree = get_tree(component_buildings_data, component_graph.transform('B').sparse, linkage, metric)
    
            # # sometimes ward linkage breaks the monotonic increase in the MST
            # # if that happens shift all distances by the max drop
            # # need a loop because several connections might be problematic
            problem_idxs = np.where(ward_tree[1:, 2] < ward_tree[0:-1, 2])[0]
            while problem_idxs.shape[0]:
                ward_tree[problem_idxs + 1, 2] = ward_tree[problem_idxs, 2] + .01
                problem_idxs = np.where(ward_tree[1:, 2] < ward_tree[0:-1, 2])[0]
            # check if ward tree distances are always increasing
            assert (ward_tree[1:, 2] >= ward_tree[0:-1, 2]).all()
            
            component_clusters = get_clusters(ward_tree, min_cluster_size, 
                                              component_buildings_data.shape[0], 
                                              eom_clusters=eom_clusters)
                
           # ## post process - needs changing, since it doesnt make much of a difference
           #  res = component_buildings_data.groupby(component_clusters).apply(post_process_clusters_tightening, min_cluster_size=min_cluster_size, t=15)
           #  if res.shape[0] == 1:
           #      component_clusters = pd.Series(res.values[0], res.columns)
           #  else:
           #      component_clusters = pd.Series(res.values, res.index.get_level_values(1)).loc[component_buildings_data.index].values
            
            # for c in np.unique(component_clusters):
            #     # if c == -1: continue
            #     cluster_graph = component_graph.subgraph(group.index[component_clusters == c].values)
            #     assert cluster_graph.n_components == 1
        
        results[label] = component_clusters

    ### relabel local clusters(0,1,2,0,1) to regional clusters(0_0,0_1,0_2, 0_0,0_1,) etc
    label_groups = labels.groupby(labels)
    region_cluster_labels = []
    for label, component_clusters in results.items():
        group = label_groups.get_group(label)
        component_labels = str(label) + '_' + pd.Series(component_clusters.astype(str), 
                                                        index=group.index.values)
        region_cluster_labels.append(component_labels)
    
    region_cluster_labels = pd.concat(region_cluster_labels).sort_index()
    assert (X_train[X_train.index >= 0].index == region_cluster_labels.index).all()
    
    return region_cluster_labels

In [9]:
clustering_data = X_train

In [10]:
region_cluster_labels = cluster_data(clustering_data, graph, scalar, 
                                     to_drop, clip, min_cluster_size, 
                                     linkage, metric, eom_clusters=eom_clusters)



### Plotting

In [11]:
plotting = tessellation[tessellation.index >=0]
plotting['regional_label'] = region_cluster_labels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [12]:
vals, indx = plotting['regional_label'].factorize()
vals[np.isin(vals, np.where(indx.str.split('_').str[-1] == '-1')[0])] = -1
plotting['regional_label_factor'] = vals

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [13]:

from core.cluster_validation import get_color
final_colors = pd.DataFrame(get_color(vals), vals).drop_duplicates()
final_colors.loc[-1] = [0,0,0]

In [14]:
plotting['geometry'] = plotting.simplify(1).to_crs(epsg=4326).make_valid()
plotting = plotting[plotting['geometry'].geom_type == 'Polygon']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [15]:
%%time
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
layer = lonboard.SolidPolygonLayer.from_geopandas(plotting, opacity=.07)

CPU times: user 1.08 s, sys: 147 ms, total: 1.23 s
Wall time: 1.22 s


In [16]:
from sidecar import Sidecar
sc = Sidecar(title=f'Final Clusters')
m = lonboard.Map(layer)
with sc:
    display(m)

In [17]:
from core.cluster_validation import get_color
layer.get_fill_color = final_colors.loc[plotting.regional_label_factor].values.astype('uint8')

## Save data

In [17]:
morphotopes_dir = '/data/uscuni-ulce/processed_data/morphotopes/'
from core.generate_clusters import percentile

In [18]:
region_cluster_labels.to_frame('morphotope_label').to_parquet(morphotopes_dir + f'tessellation_labels_morphotopes_{region_id}_{min_cluster_size}_{spatial_lag}_{lag_type}_{kernel}_{eom_clusters}.pq')

# generate morphotopes data
print("--------Generating morphotopes data----------")
component_data = X_train.loc[region_cluster_labels.index]
component_data = component_data.groupby(region_cluster_labels.values).agg([percentile(25), 
                                                         'median', 
                                                         percentile(75), 'std', 'mean'] )
# save sizes for clustering
component_data[('Size', 'Size')] = X_train.loc[region_cluster_labels.index].groupby(region_cluster_labels.values).size()

# store morphotopes data
component_data.to_parquet(morphotopes_dir + f'data_morphotopes_{region_id}_{min_cluster_size}_{spatial_lag}_{lag_type}_{kernel}_{eom_clusters}.pq')

--------Generating morphotopes data----------


In [19]:
regions_datadir = "/data/uscuni-ulce/"
region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
)

In [21]:
# region_hulls.explore()

In [136]:
# 66593 - nad ustie