In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from scipy.spatial import KDTree
from sklearn.preprocessing import QuantileTransformer, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from libpysal.graph import read_parquet

In [2]:
clusters_dir = '/data/uscuni-ulce/processed_data/clusters/'
chars_dir = '/data/uscuni-ulce/processed_data/chars/'
regions_datadir = "/data/uscuni-ulce/"
region_hulls = gpd.read_parquet(
        regions_datadir + "regions/" + "cadastre_regions_hull.parquet"
)
graph_dir = '/data/uscuni-ulce/processed_data/neigh_graphs/'
morphotopes_dir = '/data/uscuni-ulce/processed_data/morphotopes/'

In [3]:
region_id = 69333

In [4]:
input_model_params = '_75_0_None_None_False'

In [5]:
output_model_params = '_post_processing_v1'

# 1. Change morphotope boundaries so that adjacent buildings are always in the majority morphotope.

In [125]:
def post_process_morphotope_labels(region_id, input_model_params, output_model_params):
    
    ## read data
    bq1 = read_parquet(graph_dir + f"building_graph_{region_id}.parquet")
    region_morphotope_labels = pd.read_parquet(f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{input_model_params}.pq').morphotope_label
    region_morphotope_labels = region_morphotope_labels.loc[bq1.unique_ids]
    
    # assign mode of non-noise clusters to whole adjacent structure
    # if its only noise, assign the most common noise cluster
    def non_noise_mode(x):
        non_noise = x[~x.str.endswith('-1')]
        if non_noise.shape[0]:
            return pd.Series.mode(non_noise)[0]
        else:
            return pd.Series.mode(x)[0]
            
    component_morphotopes = region_morphotope_labels.groupby(bq1.component_labels).agg(non_noise_mode)
    
    aggregated_morphotope_labels = bq1.component_labels.map(component_morphotopes.to_dict())
    aggregated_morphotope_labels.name = 'morphotope_label'
    assert (aggregated_morphotope_labels.index == region_morphotope_labels.index).all()
    aggregated_morphotope_labels.to_frame().to_parquet(f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{output_model_params}.pq')

In [126]:
%%time
from joblib import Parallel, delayed
n_jobs = -1
new = Parallel(n_jobs=n_jobs)(
    delayed(post_process_morphotope_labels)(region_id, input_model_params, output_model_params) for region_id, _ in region_hulls.iterrows()
)

CPU times: user 2.77 s, sys: 1.55 s, total: 4.32 s
Wall time: 15min 3s


# 2. Generate new morphotope data based on the new morphotope boundaries

In [10]:
def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_

def post_process_morphotope_data(region_id, input_model_params, output_model_params):
    ## read data
    new_morphotope_labels = pd.read_parquet(f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{output_model_params}.pq').morphotope_label
    X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')
    component_data = X_train.loc[new_morphotope_labels.index]
    
    
    # get morphotope stats
    component_data = component_data.groupby(new_morphotope_labels.values).agg([percentile(25), 'median', percentile(75), 'std', 'mean'])
    
    # save sizes for clustering
    component_data[('Size', 'Size')] = X_train.loc[new_morphotope_labels.index].groupby(new_morphotope_labels.values).size()
    
    # store morphotopes data
    component_data.to_parquet(morphotopes_dir + f'data_morphotopes_{region_id}{output_model_params}.pq')

In [11]:
%%time
post_process_morphotope_data(region_id, input_model_params, output_model_params)

CPU times: user 41.3 s, sys: 540 ms, total: 41.9 s
Wall time: 41 s


In [12]:
%%time
from joblib import Parallel, delayed
n_jobs = -1
new = Parallel(n_jobs=n_jobs)(
    delayed(post_process_morphotope_data)(region_id, input_model_params, output_model_params) for region_id, _ in region_hulls.iterrows()
)



CPU times: user 2.83 s, sys: 1.16 s, total: 3.99 s
Wall time: 18min 17s


### Plotting

In [110]:

from lonboard import SolidPolygonLayer, Map
from lonboard.basemap import CartoBasemap
from lonboard.colormap import apply_categorical_cmap
from palettable.colorbrewer.qualitative import Set3_12
from core.cluster_validation import get_color

In [111]:
buildings = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/buildings/buildings_{region_id}.parquet')

In [112]:
buildings['morph'] = region_morphotope_labels
buildings['new_morph'] = aggregated_morphotope_labels
buildings['component_label'] = bq1.component_labels

In [113]:
layer = SolidPolygonLayer.from_geopandas(
    gdf=buildings[["geometry", "morph", 'new_morph', 'component_label']], opacity=0.15
)



In [114]:
m = Map(layer, basemap_style=CartoBasemap.Positron)
m

Map(basemap_style=<CartoBasemap.Positron: 'https://basemaps.cartocdn.com/gl/positron-gl-style/style.json'>, la…

In [115]:
factors, idx = buildings['new_morph'].factorize()

In [116]:
layer.get_fill_color = get_color(factors)

# Experiments

### Add spatial restrictions to morphotope deliniation

In [11]:
from libpysal.graph import Graph

output_model_params = '_post_processing_v2'

In [12]:
def relabel_morphotopes_geography(group):

    # extract morph info
    gc = group.morph.iloc[0].split('_')[0]
    morph_number = group.morph.iloc[0].split('_')[1]

    # gropu the buildings with no 100m distances between them
    focal, neighbor = group.sindex.query(group.geometry, predicate='dwithin', distance=100)
    geom_graph = Graph.from_adjacency(pd.DataFrame({'focal': group.index[focal],
                                                    'neighbor': group.index[neighbor], 'weight': 1}))
    value_counts = geom_graph.component_labels.value_counts()
    
    # return new morphotope labels - change format from 123_0 to 123_00
    comp_labels = geom_graph.component_labels.map(lambda x: f'{gc}_{morph_number}{str(x)}'  if value_counts[x] > 75 else f'{gc}_-1')
    
    return comp_labels

In [13]:
def post_process_morphotope_labels(region_id, input_model_params, output_model_params):
    
    ## read data
    bq1 = read_parquet(graph_dir + f"building_graph_{region_id}.parquet")
    region_morphotope_labels = pd.read_parquet(f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{input_model_params}.pq').morphotope_label
    region_morphotope_labels = region_morphotope_labels.loc[bq1.unique_ids]
    
    # assign mode of non-noise clusters to whole adjacent structure
    # if its only noise, assign the most common noise cluster
    def non_noise_mode(x):
        non_noise = x[~x.str.endswith('-1')]
        if non_noise.shape[0]:
            return pd.Series.mode(non_noise)[0]
        else:
            return pd.Series.mode(x)[0]
            
    component_morphotopes = region_morphotope_labels.groupby(bq1.component_labels).agg(non_noise_mode)
    
    aggregated_morphotope_labels = bq1.component_labels.map(component_morphotopes.to_dict())
    aggregated_morphotope_labels.name = 'morphotope_label'

    del bq1
    
    # read buildings geometries
    buildings = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/chars/buildings_chars_{region_id}.parquet', columns=['geometry'])
    buildings['morph'] = aggregated_morphotope_labels

    # tighten morphotopes based on geographic distances
    res = []
    groups = buildings.groupby('morph')
    for key, group in groups:
    
        # if noise leave unchanged
        if key.split('_')[-1] == '-1':
            res.append(group.morph)
            continue
        else:
            res.append(relabel_morphotopes_geography(group))
    
    aggregated_morphotope_labels = pd.concat(res).sort_index()
    aggregated_morphotope_labels.name = 'morphotope_label'

    
    assert (aggregated_morphotope_labels.index == region_morphotope_labels.index).all()
    aggregated_morphotope_labels.to_frame().to_parquet(f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{output_model_params}.pq')

In [14]:
# %%time
# from joblib import Parallel, delayed
# n_jobs = -1
# new = Parallel(n_jobs=n_jobs)(
#     delayed(post_process_morphotope_labels)(region_id, input_model_params, output_model_params) for region_id, _ in region_hulls.iterrows()
# )

Add stats from the morphotope core - the most similar 75 ETCs

In [21]:
from scipy.spatial.distance import pdist
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import linkage, fcluster

In [28]:
def calculate_core_stats(group):

    # if its noise retrun whatever
    if (group.empty) or (group.shape[0] < 75):
        core_stats = group.describe().loc[['25%', '50%', '75%', 'std', 'mean']]
        core_stats.index = ['percentile_25', 'median', 'percentile_75', 'std', 'mean']
        return core_stats.stack().swaplevel().sort_index()
    
    # find the first 75 nearest ETCs - the morphotope core
    scaled = StandardScaler().fit_transform(group.values)
    lm = linkage(pdist(np.nan_to_num(scaled)))
    cutoff = lm[np.where(lm[:, 3] >= 75)[0][0], 2]

    clusters = fcluster(lm, t=cutoff, criterion='distance')
    core_cluster = pd.Series(clusters).value_counts().index[0]
    core_etcs = group[clusters == core_cluster]

    # calculate stats
    core_stats = core_etcs.describe().loc[['25%', '50%', '75%', 'std', 'mean']]
    core_stats.index = ['percentile_25', 'median', 'percentile_75', 'std', 'mean']
    return core_stats.stack().swaplevel().sort_index(level=0)
    

def post_process_morphotope_data(region_id, output_model_params):
    ## read data
    new_morphotope_labels = pd.read_parquet(f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{output_model_params}.pq').morphotope_label
    X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')
    component_data = X_train.loc[new_morphotope_labels.index]
    
    
    # get morphotope stats
    component_data = component_data.groupby(new_morphotope_labels.values).apply(calculate_core_stats)
    component_data = component_data.unstack([1, 2])
    
    # save sizes for clustering
    component_data[('Size', 'Size')] = X_train.loc[new_morphotope_labels.index].groupby(new_morphotope_labels.values).size()
    
    # store morphotopes data
    component_data.to_parquet(morphotopes_dir + f'data_morphotopes_{region_id}_{output_model_params}.pq')

In [29]:
%%time
from joblib import Parallel, delayed
n_jobs = -1
new = Parallel(n_jobs=n_jobs)(
    delayed(post_process_morphotope_data)(region_id, output_model_params) for region_id, _ in region_hulls.iterrows()
)

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count
  updated_mean = (last_sum + new_sum) / updated_samp

MemoryError: Unable to allocate 267. GiB for an array with shape (35829101895,) and data type float64

In [27]:
# region_id = 69333
# post_process_morphotope_data(region_id, output_model_params)

In [18]:
## read data
new_morphotope_labels = pd.read_parquet(f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{output_model_params}.pq').morphotope_label
X_train = pd.read_parquet(chars_dir + f'primary_chars_{region_id}.parquet')
component_data = X_train.loc[new_morphotope_labels.index]

In [19]:
groups = component_data.groupby(new_morphotope_labels)


In [106]:
pd.read_parquet(morphotopes_dir + f'data_morphotopes_{region_id}_post_processing_v1.pq')

Unnamed: 0_level_0,sdbAre,sdbAre,sdbAre,sdbAre,sdbAre,sdbPer,sdbPer,sdbPer,sdbPer,sdbPer,...,misBAD,misBAD,misBAD,misBAD,midBAD,midBAD,midBAD,midBAD,midBAD,Size
Unnamed: 0_level_1,percentile_25,median,percentile_75,std,mean,percentile_25,median,percentile_75,std,mean,...,median,percentile_75,std,mean,percentile_25,median,percentile_75,std,mean,Size
0_-1,137.301434,137.301434,137.301434,,137.301434,49.637208,49.637208,49.637208,,49.637208,...,52.456478,52.456478,,52.456478,155.664723,155.664723,155.664723,,155.664723,1
1000_0,109.759359,145.329083,216.693998,347.022351,218.154302,45.067388,53.718509,73.310913,35.140641,64.173489,...,151.417331,,218.635071,183.540407,229.475859,229.475859,579.241962,197.279745,285.671852,86
1001_-1,43.816714,129.538040,163.491878,70.727784,111.560013,26.995417,52.926018,59.971880,20.294192,46.359191,...,65.233299,67.534362,3.533038,64.430592,64.151064,69.365213,73.102810,6.004547,67.888661,8
1002_-1,119.777291,122.679492,125.581693,8.208663,122.679492,48.152608,48.227610,48.302612,0.212138,48.227610,...,82.479491,82.479491,0.000000,82.479491,28.718283,28.718283,28.718283,0.000000,28.718283,2
1003_-1,219.429978,249.432694,279.435409,84.860494,249.432694,65.926905,71.563066,77.199228,15.941473,71.563066,...,82.479491,82.479491,0.000000,82.479491,88.026361,88.026361,88.026361,0.000000,88.026361,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
998_-1,31.203731,31.203731,31.203731,,31.203731,22.401346,22.401346,22.401346,,22.401346,...,59.721408,59.721408,,59.721408,59.721408,59.721408,59.721408,,59.721408,1
999_-1,33.352216,45.950070,55.400582,41.333709,53.902909,23.497498,27.661601,31.683057,11.509662,30.022516,...,59.721408,59.721408,0.000000,59.721408,59.721408,59.721408,59.721408,0.000000,59.721408,58
99_-1,55.810224,55.810224,55.810224,,55.810224,29.883778,29.883778,29.883778,,29.883778,...,73.615914,73.615914,,73.615914,77.460818,77.460818,77.460818,,77.460818,1
9_-1,113.026756,151.083435,234.450644,427.444239,276.386584,43.641113,58.379816,79.560283,76.395201,75.766057,...,72.955090,,356.747605,230.454381,45.366236,88.360506,592.692346,321.118785,254.324753,54


In [107]:
group = groups.get_group('849_3610')

In [157]:
def calculate_core_stats(group):

    # if its noise retrun whatever
    if (group.empty) or (group.shape[0] < 75):
        core_stats = group.describe().loc[['25%', '50%', '75%', 'std', 'mean']]
        core_stats.index = ['percentile_25', 'median', 'percentile_75', 'std', 'mean']
        return core_stats.stack().swaplevel().sort_index()
    
    # find the first 75 nearest ETCs - the morphotope core
    scaled = StandardScaler().fit_transform(group.values)
    lm = linkage(pdist(np.nan_to_num(scaled)))
    cutoff = lm[np.where(lm[:, 3] >= 75)[0][0], 2]

    clusters = fcluster(lm, t=cutoff, criterion='distance')
    core_cluster = pd.Series(clusters).value_counts().index[0]
    core_etcs = group[clusters == core_cluster]

    # calculate stats
    core_stats = core_etcs.describe().loc[['25%', '50%', '75%', 'std', 'mean']]
    core_stats.index = ['percentile_25', 'median', 'percentile_75', 'std', 'mean']
    return core_stats.stack().swaplevel()

In [158]:
res = calculate_core_stats(group)

In [160]:
%%time
res = groups.apply(calculate_core_stats)

CPU times: user 3min 24s, sys: 176 ms, total: 3min 24s
Wall time: 3min 24s


In [161]:
res = res.unstack([1, 2])

In [162]:
res[res.index.str.split('_').str[-1] != '-1']

Unnamed: 0_level_0,lcdMes,lcdMes,lcdMes,lcdMes,lcnClo,lcnClo,lcnClo,lcnClo,ldbPWL,ldbPWL,...,ssbSqu,sscCCo,sscERI,sssLin,stbCeA,stbOri,stbSAl,stcOri,stcSAl,xcnSCl
Unnamed: 0_level_1,mean,median,percentile_25,percentile_75,mean,median,percentile_25,percentile_75,mean,median,...,std,std,std,std,std,std,std,std,std,std
morphotope_label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1000_00,0.045560,0.041096,0.041096,0.047619,1.692468e-07,1.844877e-07,1.556789e-07,1.844877e-07,68.769858,63.653309,...,2.766928,0.124801,0.083728,0.026213,8.910203,11.906003,9.091221,12.335940,9.614840,0.000000
1005_00,0.082474,0.082474,0.082474,0.082474,2.923115e-07,2.923115e-07,2.923115e-07,2.923115e-07,58.625253,53.237479,...,2.549229,0.115867,0.066186,0.034214,8.777150,10.696158,6.051219,12.113448,6.379661,0.000000
1016_00,0.114720,0.112360,0.102804,0.145299,1.004474e-06,1.067629e-06,8.929103e-07,1.301400e-06,54.210444,51.505762,...,4.567328,0.113109,0.054705,0.027963,8.039746,11.421928,7.365768,12.526173,6.959146,0.000000
1016_10,0.113773,0.111111,0.107692,0.123457,1.069447e-06,1.000224e-06,8.622150e-07,1.183166e-06,47.323900,47.432468,...,3.114346,0.102199,0.057288,0.329734,7.536924,10.981749,10.362463,12.015506,11.628311,0.037860
1016_20,0.145962,0.144578,0.126984,0.161905,1.709577e-06,1.689486e-06,1.578546e-06,2.003597e-06,42.585157,44.866511,...,2.877012,0.107088,0.049552,0.070840,6.346112,7.123301,5.497911,9.823816,6.861356,0.063776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
978_00,0.091688,0.089552,0.084507,0.101266,1.310054e-06,1.204307e-06,1.094749e-06,1.662271e-06,54.115970,51.694544,...,2.046621,0.113848,0.059074,0.029332,9.027582,9.776263,8.188248,12.530653,7.867394,0.000000
991_00,0.080440,0.080000,0.068966,0.093333,5.112157e-07,5.199192e-07,3.465594e-07,6.848443e-07,52.395414,49.923359,...,3.902263,0.118226,0.054350,0.036807,10.096105,9.327904,7.507936,13.971638,12.269341,0.062089
991_10,0.104042,0.106383,0.085714,0.122449,5.633065e-07,4.846408e-07,4.238164e-07,7.234224e-07,46.918799,45.098962,...,1.172118,0.122181,0.073687,0.059951,9.076529,13.079530,7.727762,13.544423,8.597127,0.035405
997_00,0.122255,0.140351,0.096169,0.140351,2.975942e-07,2.949158e-07,2.723503e-07,3.156148e-07,51.501311,49.064966,...,4.281989,0.110449,0.059296,0.036932,10.267193,14.720760,8.690517,13.674154,7.978295,0.045741


In [51]:
buildings = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/chars/buildings_chars_{region_id}.parquet', columns=['geometry'])


In [54]:
m = buildings.loc[group.index].explore()
m = buildings.loc[core_etcs.index].explore(m=m, color='r')
m

In [64]:
## read data
bq1 = read_parquet(graph_dir + f"building_graph_{region_id}.parquet")
region_morphotope_labels = pd.read_parquet(f'{morphotopes_dir}tessellation_labels_morphotopes_{region_id}{input_model_params}.pq').morphotope_label
region_morphotope_labels = region_morphotope_labels.loc[bq1.unique_ids]

# assign mode of non-noise clusters to whole adjacent structure
# if its only noise, assign the most common noise cluster
def non_noise_mode(x):
    non_noise = x[~x.str.endswith('-1')]
    if non_noise.shape[0]:
        return pd.Series.mode(non_noise)[0]
    else:
        return pd.Series.mode(x)[0]
        
component_morphotopes = region_morphotope_labels.groupby(bq1.component_labels).agg(non_noise_mode)

aggregated_morphotope_labels = bq1.component_labels.map(component_morphotopes.to_dict())
aggregated_morphotope_labels.name = 'morphotope_label'

In [99]:
%%time

buildings = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/chars/buildings_chars_{region_id}.parquet', columns=['geometry'])
buildings['morph'] = aggregated_morphotope_labels

res = []
groups = buildings.groupby('morph')
for key, group in groups:

    # if noise leave unchanged
    if key.split('_')[-1] == '-1':
        res.append(group.morph)
        continue
    else:
        res.append(relabel_morphotopes_geography(group))

aggregated_morphotope_labels = pd.concat(res).sort_index()
aggregated_morphotope_labels.name = 'morphotope_label'

CPU times: user 22.4 s, sys: 15.6 ms, total: 22.4 s
Wall time: 22.4 s


In [103]:
aggregated_morphotope_labels

focal
0            0_-1
1             1_0
2             1_0
3             1_0
4             1_0
           ...   
446822    849_640
446823     849_-1
446824    849_487
446825    849_508
446826    849_361
Name: morphotope_label, Length: 446827, dtype: object

In [95]:
def relabel_morphotopes_geography(group):

    # extract morph info
    gc = group.morph.iloc[0].split('_')[0]
    morph_number = group.morph.iloc[0].split('_')[1]

    # gropu the buildings with no 100m distances between them
    focal, neighbor = group.sindex.query(group.geometry, predicate='dwithin', distance=100)
    geom_graph = Graph.from_adjacency(pd.DataFrame({'focal': group.index[focal],
                                                    'neighbor': group.index[neighbor], 'weight': 1}))
    value_counts = geom_graph.component_labels.value_counts()
    
    # return new morphotope labels
    comp_labels = geom_graph.component_labels.map(lambda x: f'{gc}_{morph_number}_{str(x)}'  if value_counts[x] > 75 else f'{gc}_-1')
    
    return comp_labels

In [30]:
# %%time
# res = buildings.groupby('morph').transform(relabel_morphotopes_geography)