# Extract and analyse summary features

The clustered and projected data can be used to extract different features:

- size per cluster
- mean intensity per cluster
- TODO add more

The extracted features are saved as an AnnData (`adata`) object.

In [1]:
# Project-local feature extraction and plotting helpers, plus os / anndata
# for building file paths and combining the per-directory results.
from miann.tl import Experiment, FeatureExtractor
import os
from miann.pl import plot_mean_intensity, zscore, get_intensity_change, plot_intensity_change, plot_mean_size, plot_size_change
import anndata as ad
from miann.utils import init_logging
# NOTE(review): presumably configures logging so that the INFO messages emitted
# by miann show up in the cell outputs -- confirm in miann.utils.
init_logging()

## Extract features
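Features for all data directories can be extracted from the command line with `python extract_features.py test/CondVAE_pert-CC clustering_0.5 intensity`. The cells below run the same extraction interactively for a single data directory.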


In [2]:
# Open the experiment folder; it has to contain a config.json.
exp = Experiment.from_dir('test/CondVAE_pert-CC')

# Only the first data directory is handled in this notebook.
# To extract features for every data directory, run the script instead.
data_dir = exp.data_params['data_dirs'][0]

# Extractor for the 'clustering_0.5' clustering found under aggregated/sub-0.005.
extr = FeatureExtractor(
    exp,
    data_dir=data_dir,
    cluster_name='clustering_0.5',
    cluster_dir='aggregated/sub-0.005',
)

AssertionError: no config.json in test/CondVAE_pert-CC

In [4]:
# Compute the per-cluster mean intensity and size for the data directory selected above.
# NOTE(review): presumably the result is also written to disk as an adata file -- confirm in FeatureExtractor.
extr.extract_intensity_size()

INFO:FeatureExtractor:Calculating clustering_0.5 (col: clustering_0.5) mean and size for 184A1_unperturbed/I09
INFO:MPPData:Created new: MPPData for NascentRNA (12132995 mpps with shape (1, 1, 35) from 886 objects). Data keys: ['x', 'y', 'obj_ids', 'labels', 'mpp'].
INFO:MPPData:Loaded data from 184A1_unperturbed/I09.
INFO:MPPData:Setting mpp to empty array
INFO:MPPData:Created new: MPPData for NascentRNA (7218204 mpps with shape (1, 1, 34) from 557 objects). Data keys: ['x', 'y', 'clustering_0.5', 'obj_ids', 'latent', 'mpp'].
INFO:MPPData:Before subsetting: 886 objects
INFO:MPPData:Subsetting to 557 objects
INFO:MPPData:Updated data to keys ['x', 'y', 'obj_ids', 'labels', 'mpp', 'clustering_0.5', 'latent']
INFO:MPPData:Loaded data from 184A1_unperturbed/I09, with base data from 184A1_unperturbed/I09
INFO:MPPData:Restricted channels to 34 channels
INFO:MPPData:Subtracting channel-specific background value defined in column mean_background
INFO:MPPData:Rescaling MPP intensities per chan

processing 0
processing 1
processing 2
processing 3
processing 4
processing 5
processing 6


  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'plate_name' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'well_name' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'plate_name_cell' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'well_name_cell' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_cycle' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'description' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'dimensions' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'cell_type' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storing 'perturbation' as categorical
  c.reorder_categories(natsorted(c.categories), inplace=True)
... storin

processing 7
processing 8


## Explore extracted features

In [7]:
# Restore one FeatureExtractor per data directory from its saved feature file.
# NOTE(review): these files are read from aggregated/full_data, not from the
# aggregated/sub-0.005 cluster_dir used for the single-directory extractor --
# presumably they are produced by running extract_features.py; confirm.
extrs = [
    FeatureExtractor.from_adata(
        os.path.join(exp.full_path, 'aggregated/full_data', cur_dir, 'features_seed1.h5ad')
    )
    for cur_dir in exp.data_params['data_dirs']
]

# get combined adata for dotplots
adatas = [cur_extr.get_intensity_adata() for cur_extr in extrs]
adata = ad.concat(adatas, index_unique='-')

# z-score using the unperturbed observations as the reference group
# NOTE(review): presumably adds the 'zscored' layer used for plotting -- confirm in miann.pl.
zscore(adata, limit_to_groups={'perturbation_duration':'unperturbed'})

In [5]:
# Mean intensity per cluster for the unperturbed group, drawn from the
# 'zscored' layer with a symmetric colour range and a dendrogram.
plot_mean_intensity(
    adata,
    groupby='cluster',
    limit_to_groups={'perturbation_duration':'unperturbed'},
    dendrogram=True,
    layer='zscored',
    cmap='bwr',
    vmin=-10,
    vmax=10,
)

# Cluster sizes: clusters as rows, perturbation_duration as columns.
plot_mean_size(
    adata,
    groupby_row='cluster',
    groupby_col='perturbation_duration',
    normby_row='all',
    vmax=0.3,
)

In [None]:
# Arguments shared by both intensity-change computations:
# compare every perturbation_duration group against 'unperturbed', per cluster.
change_kwargs = dict(
    groupby='cluster',
    reference_group='perturbation_duration',
    reference='unperturbed',
    color='logfoldchange',
    size='pval',
)
# Arguments shared by both plots.
plot_kwargs = dict(adjust_height=True, figsize=(15,5), vmin=-2, vmax=2)

# Intensity change without extra normalisation, shown with a dendrogram.
res = get_intensity_change(adata, **change_kwargs)
plot_intensity_change(**res, **plot_kwargs, dendrogram=True)

# Same comparison with norm_by_group='all'.
res = get_intensity_change(adata, **change_kwargs, norm_by_group='all')
plot_intensity_change(**res, **plot_kwargs)