In [9]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import fcluster
import pandas as pd
import numpy as np
import geopandas as gpd

In [10]:
# Root directory of the regional data; not referenced by the live code in the cells visible here.
regions_datadir = "/data/uscuni-ulce/"
# Directory holding the per-region morphotope parquet files read by the load cell.
morphotopes_dir = '/data/uscuni-ulce/processed_data/morphotopes/'
# Suffix embedded in the morphotope file names — presumably encodes the run parameters; TODO confirm.
model_params = '_100_0_None_None_False'
# Kernel name and lag order; in the visible cells they only appear in the commented-out lag-data load.
kernel = 'gaussian'
spatial_lag = 3

In [11]:
# Level-0 column names that the load cell removes from the morphotope feature table
# before clustering. The commented-out names are currently kept as features.
to_drop = [
        'stcSAl','stbOri','stcOri','stbCeA',
        'ldkAre', 'ldkPer', 'lskCCo', 'lskERI','lskCWA', 'ltkOri', 'ltkWNB', 'likWBB', 'likWCe',
# 'sdbAre', 'ssbCCM', 'sdcLAL'
    # 'mibCCo', 'mibLAL'
]

In [12]:
# Region whose morphotope table is loaded next; it is also used as the prefix of the morphotope ids.
region_id = 4


In [13]:
# Load the per-morphotope table of this region and prefix every id with the region id.
data = pd.read_parquet(f'{morphotopes_dir}data_morphotopes_{region_id}{model_params}.pq')
data.index = str(region_id) + '_' + data.index
# Drop the last column — presumably a non-feature column; TODO confirm what it holds.
data = data.iloc[:, :-1]

# Columns are multi-level: remove these level-1 entries, then the level-0 entries listed in `to_drop`.
component_data = data.drop(columns=['percentile_25', 'percentile_75', 'median', 'std'], level=1)
component_data = component_data.drop(columns=to_drop, level=0)
# Flatten the column index to its level-0 names.
component_data.columns =  component_data.columns.get_level_values(0)


# lag_data = pd.read_parquet(f'/data/uscuni-ulce/processed_data/morphotope_lagged_data/{region_id}_{kernel}_{spatial_lag}.pq')
# data = lag_data.join(component_data)


data = component_data.copy()


# Keep only the rows whose id does not end in '-1'.
# NOTE(review): this filters `data`, but the scaling and clustering cells below read
# `component_data`, so rows ending in '-1' are still clustered — confirm whether the
# filter was meant to be applied to `component_data` as well.
data = data[data.index.str[-2:] != '-1']

In [14]:
# component_data = data.drop(columns=['percentile_25', 'percentile_75', 'median', 'std'], level=1)
# component_data = component_data.drop(columns=to_drop, level=0)
# component_data.columns =  ['_'.join(col).strip() for col in component_data.columns.values]

In [15]:
# Standardise every feature column and wrap the result back into a labelled frame.
# Note: `component_data` is rebound in place, so the unscaled frame is no longer available afterwards.
vals = StandardScaler().fit_transform(component_data)
component_data = pd.DataFrame(vals, columns=component_data.columns, index=component_data.index)
# Columns whose standard deviation is exactly zero after scaling are constant; drop them.
component_data = component_data.drop(component_data.columns[component_data.std() == 0], axis=1)

# component_data = component_data[component_data.index >= 0]
# Replace NaN / infinite entries with finite numbers so the clusterer receives a clean matrix.
vals = np.nan_to_num(component_data)
component_data = pd.DataFrame(vals, columns=component_data.columns, index=component_data.index)


In [None]:
%%time
# Agglomerative clustering of the scaled morphotope features with ward linkage.
linkage = 'ward'
metric = 'euclidean'

from sklearn.cluster import AgglomerativeClustering
from core.cluster_validation import get_linkage_matrix
# Build the full merge tree and store the merge distances on the fitted model —
# presumably both are needed by `get_linkage_matrix`; TODO confirm against that helper.
clusterer = AgglomerativeClustering(linkage=linkage,
                                    metric=metric,
                                    compute_full_tree=True,
                                    compute_distances=True)
model = clusterer.fit(component_data.values)
# The result is passed to scipy's `dendrogram` and `fcluster` in later cells.
linkage_matrix = get_linkage_matrix(model)

In [None]:
import sys
# Raise Python's recursion limit — presumably so the dendrogram call in the next cell
# does not hit RecursionError on a deep tree; TODO confirm.
sys.setrecursionlimit(10000)

In [None]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
# Draw the complete dendrogram on a large, high-resolution canvas; the returned dict is discarded.
fig,ax = plt.subplots(figsize=(20,20), dpi=200)
_ = dendrogram(linkage_matrix,ax=ax)

In [None]:
def get_dendrogram_scatter_data(linkage_matrix):
    """Build a one-row-per-merge table for drawing the dendrogram as a scatter plot.

    The dendrogram layout is computed without plotting. Every link of that
    layout is paired with a row of ``linkage_matrix`` by the rank of its
    height: the link with the k-th smallest height receives row k.

    NOTE(review): this pairing relies on the linkage rows being ordered by
    increasing merge distance and on the distances being distinct; with tied
    heights a link may be paired with the wrong row.

    Parameters:
    linkage_matrix : numpy.ndarray
        Linkage matrix with four columns per merge.

    Returns:
    scatter_df : pandas.DataFrame
        Columns 'child', 'parent', 'dist', 'size' (the four linkage columns of
        the paired row) plus 'x' (sum of the link's two inner x-coordinates,
        i.e. twice its horizontal midpoint) and 'y' (the link's height).
    """
    layout = dendrogram(linkage_matrix, no_plot=True)
    link_x = np.array(layout['icoord'])
    link_y = np.array(layout['dcoord'])

    # inner two x-coordinates of every link, and the height of its horizontal bar
    x_vals = link_x[:, 1] + link_x[:, 2]
    y_vals = link_y[:, 1]

    # rank of every link by height == row of the linkage matrix paired with it
    height_rank = np.argsort(np.argsort(y_vals))

    scatter_df = pd.DataFrame(linkage_matrix[height_rank],
                              columns=['child', 'parent', 'dist', 'size'])
    scatter_df['x'] = x_vals
    scatter_df['y'] = y_vals

    return scatter_df

In [12]:
# One row per merge of the hierarchy, with its dendrogram coordinates.
scatter_df = get_dendrogram_scatter_data(linkage_matrix)

In [13]:
import jscatter
import ipywidgets

In [14]:
# Interactive scatter of the dendrogram merge nodes.
scatter = jscatter.Scatter(data=scatter_df, x='x', y='y', height=640, width=1280)
scatter.axes(grid=True)

# Output area placed next to the scatter; it shows the rows of the selected points.
output = ipywidgets.Output()

@output.capture(clear_output=True)
def selection_change_handler(change):
    """Display the `scatter_df` rows of the new selection (`change.new`), replacing the previous output."""
    display(scatter_df.loc[change.new].style.hide(axis='index'))
            
# Re-run the handler whenever the widget's `selection` trait changes.
scatter.widget.observe(selection_change_handler, names=["selection"])

ipywidgets.HBox([scatter.show(), output])


HBox(children=(HBox(children=(VBox(children=(Button(button_style='primary', icon='arrows', layout=Layout(width…

In [16]:
# Morphotope id to locate in the dendrogram scatter.
# NOTE(review): ids in `component_data.index` carry the `region_id` that was in effect when
# the data was loaded (4 in the cell near the top as saved). If the prefix used here does not
# match, the lookup below finds nothing and `[0][0]` raises IndexError.
target_morphotope = '69333_849_104'

# Row position of the morphotope in `component_data`; the linkage was fitted on
# `component_data.values`, so this position is compared against the ids in `scatter_df`.
target_morphotope_idx = np.where(component_data.index == target_morphotope)[0][0]
print(target_morphotope_idx)
# Scatter rows (merges) in which that position appears as one of the two merged ids.
target_scatter_node = np.where((scatter_df['child'] == target_morphotope_idx) | (scatter_df['parent'] == target_morphotope_idx))
target_scatter_node

# NOTE(review): `np.where` returns a tuple of arrays, so this passes a list wrapping that
# tuple — confirm that `zoom` accepts this form rather than a flat list of point indices.
scatter.zoom([target_scatter_node])
# scatter.selection(target_scatter_node)

1392


<jscatter.jscatter.Scatter at 0x7da46bf14890>

In [25]:
# First point of the current scatter selection — presumably empty (IndexError) unless
# points were selected by hand in the widget.
selected_node = scatter.selection()[0]
# NOTE(review): `get_children` is defined in a later cell of this notebook, so this cell
# fails on a fresh top-to-bottom run.
children_ids = get_children(scatter_df.loc[selected_node, 'child'].astype(int), scatter_df.loc[selected_node, 'parent'].astype(int))
# Translate the row positions back into morphotope ids.
children_morphotopes = component_data.iloc[children_ids].index

In [27]:
# etcs[etcs.morph.isin(children_morphotopes)].explore(prefer_canvas=True, tiles='CartoDB positron')

In [19]:
# NOTE(review): rebinding `region_id` here changes the value every later cell sees.
region_id = 69333


# Tessellation cells of the region; 'morph' starts as the placeholder "-1".
etcs = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/tessellations/tessellation_{region_id}.parquet')
etcs['morph'] = "-1"

morphotopes = pd.read_parquet(f'/data/uscuni-ulce/processed_data/morphotopes/tessellation_labels_morphotopes_{region_id}{model_params}.pq')
morphotopes.loc[:, 'morphotope_label'] =  morphotopes.values[:, 0]
# Assign from the 1-D first column rather than the whole 2-D `.values`: once the frame has
# more than one column (which the line above causes unless the file's only column is already
# called 'morphotope_label'), a 2-D array cannot be written into the single 'morph' column.
etcs.loc[morphotopes.index, 'morph'] = str(region_id) + '_' + morphotopes.values[:, 0]

In [20]:
import numpy as np

def get_original_observations(Z, node_id, n):
    """
    Retrieves all original observations that belong to a cluster node.

    The tree is walked with an explicit stack instead of recursion, so deep
    (chain-like) hierarchies cannot hit Python's recursion limit and no
    intermediate lists are concatenated. Observations are returned in the same
    left-to-right order as a depth-first walk that visits Z[i, 0] before Z[i, 1].

    Parameters:
    Z : numpy.ndarray
        Linkage matrix of shape (n-1, 4) where each row [Z[i, 0], Z[i, 1], Z[i, 2], Z[i, 3]]
        contains two merged clusters and additional metadata.
    node_id : int
        The node (cluster) ID for which to retrieve original observations.
    n : int
        The total number of original observations.

    Returns:
    observations : list
        List of original observation indices that are part of the specified node_id.
    """
    observations = []
    pending = [node_id]

    while pending:
        current = pending.pop()

        # Ids below n refer to original observations
        if current < n:
            observations.append(current)
            continue

        # Merged clusters are numbered from n upwards, one per linkage row
        cluster_idx = current - n

        # Push the right child first so that the left child is expanded first
        pending.append(int(Z[cluster_idx, 1]))
        pending.append(int(Z[cluster_idx, 0]))

    return observations





# def dendogram_idx_nodes(idx, node_id, linkage, n_children, out=[]):
    
#     if node_id < 0:
#        return
#     left, right = linkage[idx]
#     out.append((idx, node_id))
#     # recurse over the right node
#     if right >= n_children: # make sure it's not a leaf node
#         node_id -= 1
#         node_id = dendogram_idx_nodes(right - (n_children + 1), node_id,
#                                      linkage, n_children, out)
#     if left >= n_children: # make sure it's not a leaf node
#         node_id -= 1
#         node_id = dendogram_idx_nodes(left - (n_children +1), node_id,
#                                      linkage, n_children, out)
#     return node_id

In [21]:
def get_children(left, right):
    """Return the sorted, unique observation positions contained in two tree nodes.

    Both nodes are resolved against the notebook-level `linkage_matrix`, using the
    number of rows of `component_data` as the count of original observations.
    """
    n_obs = component_data.shape[0]
    members = [
        get_original_observations(linkage_matrix, node, n_obs)
        for node in (left, right)
    ]
    return np.union1d(members[0], members[1])

In [24]:
# Sanity check on three merges (rows 123, 321 and the last one): the number of
# observations collected from both children must equal the size stored in column 3.
for _row in (123, 321, -1):
    _left, _right = linkage_matrix[_row, :2].astype(int)
    assert get_children(int(_left), int(_right)).shape[0] == linkage_matrix[_row, 3]


In [None]:
# Summary statistics of the third linkage column (merge distances).
pd.Series(linkage_matrix[:, 2]).describe()

In [None]:
# Distance threshold at which the tree is cut into flat clusters.
cutoff = 20

# NOTE(review): despite the name, these labels come from cutting the agglomerative tree
# with `fcluster`, not from k-means.
kmeans_clusters = fcluster(linkage_matrix, t=cutoff, criterion='distance')


In [None]:
# ward_clusters = fcluster(linkage_matrix, t=cutoff, criterion='distance')
# ward_clusters = pd.Series(ward_clusters, index=np.arange(np.unique(kmeans_clusters).shape[0]))
# np.unique(ward_clusters)

In [None]:
# clusters = pd.Series(kmeans_clusters).map(lambda x: ward_clusters.loc[x]).values

In [None]:
# Final flat cluster labels: an unmodified copy of the tree-cut labels.
clusters = kmeans_clusters.copy()

In [None]:
region_id = 4


# Tessellation cells of the region; every cell starts with the placeholder label -1.
etcs = gpd.read_parquet(f'/data/uscuni-ulce/processed_data/tessellations/tessellation_{region_id}.parquet')
etcs['label'] = -1

# Morphotope label table (its index is used to address rows of `etcs` further down);
# the first column is prefixed with the region id, the same scheme used for `component_data.index`.
morphotopes = pd.read_parquet(f'/data/uscuni-ulce/processed_data/morphotopes/tessellation_labels_morphotopes_{region_id}{model_params}.pq')
morphotopes.loc[:, 'morphotope_label'] =  str(region_id) + '_' + morphotopes.values[:, 0]

# Cluster label of every clustered morphotope, indexed by morphotope id.
region_clusters = pd.Series(clusters, component_data.index)
region_clusters

In [None]:
# Number of distinct clusters produced by the cut.
region_clusters.nunique()

In [None]:
# Number of morphotopes per cluster.
region_clusters.value_counts()

In [None]:
%%time
## assign morphotope clusters to tess cells
# Vectorised lookup: align the per-morphotope cluster labels to each cell's morphotope id.
# Ids that are not present in `region_clusters` receive -1, as in the element-wise version
# this replaces, without running a Python-level `.loc` lookup for every cell.
etcs.loc[morphotopes.index, 'label'] = region_clusters.reindex(morphotopes.morphotope_label, fill_value=-1).values


In [None]:
# %%time
# # # # relabel morphotopes for plotting
# morph_dict = pd.Series(np.arange(np.unique(morphotopes.values).shape[0]),
#                        np.unique(morphotopes.values))
# etcs.loc[morphotopes.index, 'label'] = morphotopes.map(lambda x: morph_dict.loc[x]).values
# etcs['morph'] = '-1'
# etcs.loc[morphotopes.index, 'morph'] = morphotopes.values

In [None]:
# Number of tessellation cells per cluster label (-1 = cells without an assigned cluster).
etcs.label.value_counts()

In [74]:
from core.cluster_validation import get_color
# NOTE(review): `layer` is first assigned in a later cell (the lonboard cell), so on a fresh
# top-to-bottom run this line raises NameError. The notebook's final cell repeats the same
# statement — this cell looks like a stale, out-of-order leftover.
layer.get_fill_color = get_color(etcs.label)

In [None]:
%%time
import lonboard
# plotting = tessellation[tessellation.index.isin(X_train.index)].copy()
# Polygon layer over all tessellation cells, drawn with low opacity.
layer = lonboard.SolidPolygonLayer.from_geopandas(etcs, opacity=.08)

In [None]:
from sidecar import Sidecar
# Show the map inside a Sidecar panel titled 'Final Clusters' instead of the cell output.
sc = Sidecar(title='Final Clusters')
m = lonboard.Map(layer, basemap_style=lonboard.basemap.CartoBasemap.Positron)
with sc:
    display(m)

In [None]:
from core.cluster_validation import get_color
# Colour the polygons by cluster label — presumably `get_color` returns one colour per
# label value; TODO confirm against the helper.
layer.get_fill_color = get_color(etcs.label)