# Tutorial about analyzing grouped cluster properties

Localization properties vary within clusters. Analyzing such variations can help to characterize cluster populations. Here, we show examples for variations in convex hull properties or coordinate variances.

In [None]:
from pathlib import Path

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from colorcet import m_fire, m_gray, m_dimgray

import locan as lc

In [None]:
# Report version information through locan with the system, dependency,
# and verbose options all switched off.
lc.show_versions(
    system=False,
    dependencies=False,
    verbose=False,
)

## Synthetic data

We simulate localization data that follows a Neyman-Scott distribution in 2D:

In [None]:
# Seeded NumPy random generator; it is handed to the simulation as `seed`
# so that the notebook produces the same data on every run.
rng = np.random.default_rng(1)

In [None]:
# Simulate clustered localization data, drawing random numbers from `rng`.
# cluster_std=10 is the ground-truth standard deviation that the
# expectation values in the later cells (10**2, ground_truth_std) refer to.
locdata = lc.simulate_dstorm(
    parent_intensity=1e-5,
    region=((0, 10_000), (0, 10_000)),
    cluster_mu=10,
    cluster_std=10,
    seed=rng,
)

# Print a summary of the simulated dataset.
locdata.print_summary()

In [None]:
# Rendering window: the same (min, max) interval is used for both axes.
bin_range = ((0, 2_000),) * 2

In [None]:
# Render a 2D image of the localizations, restricted to `bin_range`.
lc.render_2d_mpl(locdata, bin_range=bin_range, bin_size=20)

## True clusters

First we look at ground truth clusters.

In [None]:
# Group the localizations by their ground-truth label, build one LocData
# selection per group, and gather all of them in a single collection.
grouped = locdata.data.groupby("cluster_label")
clust = lc.LocData.from_collection(
    [
        lc.LocData.from_selection(locdata, indices=members.index)
        for _, members in grouped
    ]
)

Filter out clusters with fewer than 3 localizations since no convex hull can be computed for such clusters.

In [None]:
# Keep only the clusters that contain more than 2 localizations.
clust_selection = lc.select_by_condition(clust, condition="2 < localization_count")
# Collect the references of the selected clusters before calling reduce(),
# because they are re-attached by hand afterwards.
references_ = [clust.references[i] for i in clust_selection.indices]
# NOTE(review): reduce() presumably turns the selection into a standalone
# dataset and does not keep the per-cluster references — confirm in the locan docs.
clust_selection.reduce()
clust_selection.references = references_
# From here on `clust` refers to the filtered collection.
clust = clust_selection

In [None]:
# Total number of localizations over all clusters in the collection.
n_clustered_loc = np.sum(
    [member.properties["localization_count"] for member in clust.references]
)
# NOTE(review): the last value is clustered localizations divided by
# len(locdata), i.e. a share of all localizations; confirm that the
# "cluster to noise" wording of the label is intended.
print(
    f"Number of clusters: {clust.properties['localization_count']}",
    f"Number of clustered localizations: {n_clustered_loc}",
    f"Ratio cluster to noise localizations: {n_clustered_loc / len(locdata):.3}",
    sep="\n",
)

In [None]:
# Overlay the cluster centroids (red markers) on the rendered localizations.
fig, ax = plt.subplots()
# NOTE(review): no `ax` is passed to the renderer, so it presumably draws on
# pyplot's current axes, i.e. the one created just above — confirm.
lc.render_2d_mpl(
    locdata,
    bin_size=20,
    bin_range=bin_range,
    cmap=m_gray.reversed(),
)
clust.data.plot.scatter(
    x="position_x",
    y="position_y",
    ax=ax,
    color="Red",
    s=10,
    label="cluster centroids",
    xlim=bin_range[0],
    ylim=bin_range[1],
)
plt.show()

In [None]:
# Preview the first rows of the cluster collection's data table.
clust.data.head()

In [None]:
# Display the properties stored on the cluster collection.
clust.properties

## Investigate the convex hull areas

Localization clusters can be analyzed with respect to their convex hull region properties as a function of localization_count as outlined in Ebert et al. (https://doi.org/10.1093/bioinformatics/btac700).

In [None]:
# Compare the convex hull property of each cluster against the expectation
# for an expected variance of 10**2 (the simulation uses cluster_std=10).
che = (
    lc.ConvexHullExpectation(
        convex_hull_property="region_measure_ch",
        expected_variance=10**2,
    )
    .compute(locdata=clust)
)
che.results

In [None]:
# Two panels side by side: `plot` on the left axes, `hist` on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
che.plot(ax=axes[0])
# The trailing semicolon keeps the notebook from echoing the return value.
che.hist(ax=axes[1]);

## Investigate the position variances

Localization coordinates in localization clusters come with a certain variance. The variance is related to the localization precision or other localization properties but also varies with localization_count if determined as biased sample variance (i.e. without Bessel's correction).

In [None]:
# Biased variance (biased=True) of position_x per cluster, compared against
# an expected variance of 10**2.
pve_biased = (
    lc.PositionVarianceExpectation(
        loc_property="position_x",
        expectation=10**2,
        biased=True,
    )
    .compute(locdata=clust)
)
pve_biased.results

In [None]:
# Two panels side by side for the biased-variance analysis.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
pve_biased.plot(ax=axes[0])
# The trailing semicolon keeps the notebook from echoing the return value.
pve_biased.hist(ax=axes[1]);

A similar analysis can be performed with unbiased variances in which Bessel's correction is applied.

In [None]:
# Same analysis with biased=False, i.e. with Bessel's correction applied.
pve = (
    lc.PositionVarianceExpectation(
        loc_property="position_x",
        expectation=10**2,
        biased=False,
    )
    .compute(locdata=clust)
)
pve.results

In [None]:
# Two panels side by side for the unbiased-variance analysis; the histogram
# is requested with log=True.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
pve.plot(ax=axes[0])
# The trailing semicolon keeps the notebook from echoing the return value.
pve.hist(ax=axes[1], log=True);

## Investigate any grouped property

A similar analysis can be carried out with any LocData property. For instance, let's check the coordinate uncertainties of cluster centroids. The uncertainty in one dimension should follow the square root of the biased position variance for clusters with variable number of localizations.

It is important to consider the differences between variance and standard deviation. Position uncertainties are usually given as standard deviation with units equal to position units. Converting the ground truth for the coordinate standard deviation in each cluster (std as used in the simulations above) requires a Bessel correction on the squared std (i.e. the variance). In addition, the coordinate uncertainties of cluster centroids should scale with the inverse square root of the number of localizations per cluster.

In [None]:
# Expected centroid uncertainty, indexed by the number of localizations
# per cluster (1 to 999).
ground_truth_std = 10
ground_truth_variance = ground_truth_std**2
n_locs = np.arange(1, 1000)
# Biased sample variance: the ground-truth variance scaled by (1 - 1/n).
biased_variance = (1 - 1 / n_locs) * ground_truth_variance
biased_uncertainty = np.sqrt(biased_variance)
# Centroid uncertainty: the per-coordinate value divided by sqrt(n).
expected_uncertainty = biased_uncertainty / np.sqrt(n_locs)
expectation = pd.Series(expected_uncertainty, index=n_locs)
# The trailing semicolon suppresses the notebook's echo of the series.
expectation;

In [None]:
# Property to analyze, and the property it is grouped by.
loc_property, other_loc_property = "uncertainty_x", "localization_count"

In [None]:
# Compare `loc_property`, grouped by `other_loc_property`, against the
# precomputed expectation series.
gpe = (
    lc.GroupedPropertyExpectation(
        loc_property=loc_property,
        other_loc_property=other_loc_property,
        expectation=expectation,
    )
    .compute(locdata=clust)
)
gpe.results

In [None]:
# Two panels side by side for the grouped-property analysis.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
gpe.plot(ax=axes[0])
# The trailing semicolon keeps the notebook from echoing the return value.
gpe.hist(ax=axes[1]);

## Cluster localizations by dbscan

When clustering data by dbscan, slight deviations appear between expectation and computed properties.

In [None]:
# Cluster the localizations with DBSCAN. The call yields the noise and the
# cluster collection, in that order; `clust` is rebound to the new collection.
noise, clust = lc.cluster_dbscan(
    locdata,
    min_samples=3,
    eps=20,
)

In [None]:
# Total number of localizations over all DBSCAN clusters.
n_clustered_loc = np.sum(
    [member.properties["localization_count"] for member in clust.references]
)
# NOTE(review): the last value is clustered / (clustered + noise), i.e. a
# share of all localizations; confirm that the "cluster to noise" wording
# of the label is intended.
print(
    f"Number of clusters: {clust.properties['localization_count']}",
    f"Number of noise localizations: {noise.properties['localization_count']}",
    f"Number of clustered localizations: {n_clustered_loc}",
    f"Ratio cluster to noise localizations: {n_clustered_loc / (n_clustered_loc + noise.properties['localization_count']):.3}",
    sep="\n",
)

In [None]:
# Overlay the DBSCAN cluster centroids (red markers) on the rendered localizations.
fig, ax = plt.subplots()
# NOTE(review): no `ax` is passed to the renderer, so it presumably draws on
# pyplot's current axes, i.e. the one created just above — confirm.
lc.render_2d_mpl(
    locdata,
    bin_size=20,
    bin_range=bin_range,
    cmap=m_gray.reversed(),
)
clust.data.plot.scatter(
    x="position_x",
    y="position_y",
    ax=ax,
    color="Red",
    s=10,
    label="cluster centroids",
    xlim=bin_range[0],
    ylim=bin_range[1],
)
plt.show()

## Investigate the position variances

In [None]:
# Biased variance (biased=True) of position_x for the DBSCAN clusters,
# compared against an expected variance of 10**2.
pve_biased = (
    lc.PositionVarianceExpectation(
        loc_property="position_x",
        expectation=10**2,
        biased=True,
    )
    .compute(locdata=clust)
)
pve_biased.results

In [None]:
# Two panels side by side for the biased-variance analysis of the DBSCAN clusters.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
pve_biased.plot(ax=axes[0])
# The trailing semicolon keeps the notebook from echoing the return value.
pve_biased.hist(ax=axes[1]);

In [None]:
# Variance of position_x with biased=False for the DBSCAN clusters,
# compared against an expected variance of 10**2.
pve = (
    lc.PositionVarianceExpectation(
        loc_property="position_x",
        expectation=10**2,
        biased=False,
    )
    .compute(locdata=clust)
)
pve.results

In [None]:
# Two panels side by side for the unbiased-variance analysis of the DBSCAN
# clusters; the histogram is requested with log=True.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
pve.plot(ax=axes[0])
# The trailing semicolon keeps the notebook from echoing the return value.
pve.hist(ax=axes[1], log=True);

## Investigate the convex hull areas

In [None]:
# Convex hull property of each DBSCAN cluster, compared against the
# expectation for an expected variance of 10**2.
che = (
    lc.ConvexHullExpectation(
        convex_hull_property="region_measure_ch",
        expected_variance=10**2,
    )
    .compute(locdata=clust)
)
che.results

In [None]:
# Two panels side by side for the convex hull analysis of the DBSCAN clusters.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
che.plot(ax=axes[0])
# The trailing semicolon keeps the notebook from echoing the return value.
che.hist(ax=axes[1]);

## Investigate the uncertainties for cluster centroids 

In [None]:
# Expected centroid uncertainty, indexed by the number of localizations
# per cluster (1 to 999); same construction as for the ground-truth clusters.
ground_truth_std = 10
ground_truth_variance = ground_truth_std**2
n_locs = np.arange(1, 1000)
# Biased sample variance: the ground-truth variance scaled by (1 - 1/n).
biased_variance = (1 - 1 / n_locs) * ground_truth_variance
biased_uncertainty = np.sqrt(biased_variance)
# Centroid uncertainty: the per-coordinate value divided by sqrt(n).
expected_uncertainty = biased_uncertainty / np.sqrt(n_locs)
expectation = pd.Series(expected_uncertainty, index=n_locs)
# The trailing semicolon suppresses the notebook's echo of the series.
expectation;

In [None]:
# Property to analyze, and the property it is grouped by.
loc_property, other_loc_property = "uncertainty_x", "localization_count"

In [None]:
# Compare `loc_property`, grouped by `other_loc_property`, against the
# precomputed expectation series for the DBSCAN clusters.
gpe = (
    lc.GroupedPropertyExpectation(
        loc_property=loc_property,
        other_loc_property=other_loc_property,
        expectation=expectation,
    )
    .compute(locdata=clust)
)
gpe.results

In [None]:
# Two panels side by side for the grouped-property analysis of the DBSCAN clusters.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
gpe.plot(ax=axes[0])
# The trailing semicolon keeps the notebook from echoing the return value.
gpe.hist(ax=axes[1]);