# Cluster analysis (batch/pipeline)

11.8.2021 (8.9.2022) Sören Doose

## Initial setup

In [None]:
import sys
from pathlib import Path
import re
import pickle
import logging

%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from tqdm.notebook import tqdm
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

import locan as lc

In [None]:
lc.show_versions(dependencies=False)

In [None]:
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

## File list

The dataset should have been corrected for drift and chromatic aberration.

Set the directory in which a set of localization files is found.

In [None]:
directory = Path(r'.') / '../data/Titration'
assert directory.exists()

In [None]:
files = list(directory.glob('**/*.yaml'))
print(f'Number of files: {len(files)}')
for i, file in enumerate(files):
    print(i,":", file);

## Set up the pipeline

In [None]:
def computation(self, file):
    """
    Analysis procedure on a LocData object specified by file.

    The roi-file is loaded, localizations are filtered by frame and intensity,
    clustered with DBSCAN, and per-cluster properties and convex hulls are
    computed. Results are stored as attributes on `self`:
    `file_indicator`, `noise`, `clust`, `clust_selection`,
    `lp_localization_count`, `lp_region_measure_ch`, `n_locs_in_cluster`
    and `n_locs_in_cluster_relative`.
    
    Parameters
    ----------
    self : Pipeline
        Pipeline object that collects results of the analysis procedure.
    file : str or pathlib.Path
        File path for roi-file (yaml).
        
    Returns
    -------
    Pipeline or None
        The Pipeline object specified by `self`, or None if the loaded
        locdata holds fewer than two elements.
    """    
    # Prologue
    # Keep the complete path as identifier (use .stem for the bare file name instead).
    self.file_indicator = Path(file) #.stem
    
    # Load locdata
    logger.debug(f'Loading')
    roi = lc.Roi.from_yaml(path=file)
    # Redirect the reference to a file with the original name but located
    # in the same directory as the roi-file (path taken relative to '.').
    roi.reference.file_path = str(Path(file)
                              .relative_to(Path('.'))
                              .with_name(Path(roi.reference.file_path).name)
                             )
    locdata = roi.locdata()
    
    # Prerequisites
    # Note: this check runs before the selection below, i.e. on the unfiltered data.
    if not len(locdata)>1:
        logger.info(f'Not enough data in file: {file}')
        return None
    
    # Select
    # Keep localizations from frames 1..14999 with intensity above 8000.
    condition = '0 < frame < 15_000 and ' \
                '8000 < intensity'
    locdata = lc.select_by_condition(locdata, condition=condition)
    
    # compute cluster
    logger.debug('Processing Cluster')
    self.noise, self.clust = lc.cluster_dbscan(locdata, eps=20, min_samples=3)
    
    # add properties
    # Aggregate per-cluster values from the localizations of each cluster.
    for reference in self.clust.references:
        reference.properties['intensity'] = reference.data.intensity.sum()
        reference.properties['intensity_mean'] = reference.data.intensity.mean()
        reference.properties['local_background_mean'] = reference.data.local_background.mean()
        reference.properties['frame_mean'] = reference.data.frame.mean()
        reference.properties['frame_std'] = reference.data.frame.std()
        
    # Transfer the per-cluster properties into the cluster dataframe:
    # columns that already exist are updated in place, the remaining ones are appended.
    new_df = pd.DataFrame([reference.properties for reference in self.clust.references])
    new_df.index = self.clust.data.index
    self.clust.dataframe.update(new_df)
    new_columns = [column for column in new_df.columns if column in self.clust.dataframe.columns]
    new_df.drop(columns=new_columns, inplace=True, errors='ignore')
    self.clust.dataframe = pd.concat([self.clust.dataframe, new_df], axis=1)
    
    # compute convex hull
    logger.debug('Processing convex hulls')
    self.clust.update_convex_hulls_in_references()
    # Replace missing values by 0 - presumably so that clusters without a valid
    # hull measure are removed by the `0 < region_measure_ch` condition below (TODO confirm).
    self.clust.data.fillna(0, inplace=True)
    
    # select cluster
    logger.debug('Processing Cluster Selection')
    if self.clust.data.empty:
        self.clust_selection = self.clust
    else:
        condition = '0 < localization_count and ' \
                    '0 < region_measure_ch'
        self.clust_selection = lc.select_by_condition(self.clust, condition=condition)
        # Collect the selected references before reduce() and re-attach them afterwards
        # (presumably reduce() does not keep them - verify against locan).
        references_ = [self.clust.references[i] for i in self.clust_selection.indices]
        self.clust_selection.reduce()
        self.clust_selection.references = references_

    logger.debug('Processing Properties')
    if self.clust_selection.data.empty:
        # LocalizationProperty
        self.lp_localization_count = None
        self.lp_region_measure_ch = None

        # Scalar properties
        self.n_locs_in_cluster = np.nan
        self.n_locs_in_cluster_relative = np.nan
    
    else:
        # LocalizationProperty
        self.lp_localization_count = lc.LocalizationProperty(loc_property='localization_count').compute(self.clust_selection)
        self.lp_region_measure_ch = lc.LocalizationProperty(loc_property='region_measure_ch').compute(self.clust_selection)

        # Scalar properties
        self.n_locs_in_cluster = self.clust_selection.data['localization_count'].sum()
        # Relative to the number of localizations remaining after the selection above.
        self.n_locs_in_cluster_relative = self.n_locs_in_cluster / len(locdata)

    logger.debug(f'Computation completed for: {file}')
            
    return self

## Run pipeline

In [None]:
logger.setLevel(logging.INFO)  # alternative logging.WARNING

## Multiprocessing with ray

In [None]:
import ray

ray.init()
# ray.init(num_cpus = 4)

In [None]:
%%time
@ray.remote
def worker(i, file):
    """
    Run the analysis pipeline for a single file as a ray remote task.

    Parameters
    ----------
    i : int
        Running index of the file; only used in the log message.
    file : str or pathlib.Path
        File path that is handed to `lc.Pipeline` together with `computation`.

    Returns
    -------
    Pipeline or None
        The return value of `lc.Pipeline(...).compute()`, or None if any
        exception was raised during the computation.
    """
    # Set up logging again inside the task - presumably because it runs in a
    # separate ray worker process without the notebook's logging configuration.
    logging.basicConfig(level=logging.INFO)
    logger.info(f'Processing {i} : {file}')
    try:
        return lc.Pipeline(computation=computation, file=file).compute()
    except Exception as e:
        # Deliberate best-effort: a failing file must not abort the whole batch.
        # exc_info=True keeps the traceback in the log so the failure can be diagnosed.
        logger.warning(f'Error in {file}: \n {e}', exc_info=True)
        return None

futures = [worker.remote(i, file) for i, file in enumerate(files)]
pipes = ray.get(futures)
print(f'Number of pipes: {len(pipes)}')

### Pipeline attributes

In [None]:
[attr for attr in dir(pipes[0]) if not attr.startswith('__') and not attr.endswith('__')]

## Save pickled pipes

## Load pickled pipes

## Data presentation

### Remove None from pipes

In [None]:
print(f'Number of pipes: {len(pipes)}')
# Drop only failed/skipped computations (None); test identity rather than truthiness
# so that a valid result is never discarded by accident.
pipes = [pipe for pipe in pipes if pipe is not None]
print(f'Number of pipes that are not None: {len(pipes)}')

### Extract from pipes

In [None]:
def collect_scalars(pipelines):
    """
    Collect scalar properties from Pipeline objects and assemble them in a pandas.DataFrame.

    For each pipeline one row is created that holds the file indicator, properties of
    the selected cluster collection and the mean over all selected clusters for a set
    of per-cluster columns. If a pipeline has no selected clusters, all of these mean
    values are set to NaN so that every row carries the same keys.
    
    Parameters
    ----------
    pipelines : list(Pipeline)
        Pipeline objects.
        
    Returns
    -------
    pandas.DataFrame
        One row per pipeline.
    """
    # Name of the resulting scalar -> column in `clust_selection.data` that is averaged.
    cluster_means = {
        'localization_count_mean': 'localization_count',
        'region_measure_ch_mean': 'region_measure_ch',
        'localization_density_in_cluster_mean': 'localization_density_ch',
        'intensity_sum_mean': 'intensity',
        'intensity_mean_mean': 'intensity_mean',
        'local_background_mean_mean': 'local_background_mean',
    }

    dictionaries = []
    for pipe in tqdm(pipelines, desc='Processed pipelines:'):
        selection = pipe.clust_selection
        new_dict = {
            'files': pipe.file_indicator,
            'cluster_count': selection.properties['localization_count'],
            'cluster_density': selection.properties['localization_density'],
            'n_locs_in_cluster_relative': pipe.n_locs_in_cluster_relative,
        }

        if selection.properties['localization_count'] == 0:
            # No clusters: there is nothing to average.
            new_dict.update(dict.fromkeys(cluster_means, np.nan))
        else:
            data = selection.data
            new_dict.update(
                {name: data[column].mean() for name, column in cluster_means.items()}
            )

        dictionaries.append(new_dict)

    return pd.DataFrame(dictionaries)

In [None]:
scalars_df = collect_scalars(pipes)

In [None]:
scalars_df.head()

## Compare for files

Reduce the file name to group identifier - here buffer:

In [None]:
choices = ['*5ng*',
           '*0-005*']

choices_name =  ['dense',
                 'sparse']
   
conditions = [[f.match(name) for f in scalars_df['files']] for name in choices]
scalars_df['sample'] = np.select(conditions, choices_name, default=None)

In [None]:
grouped = scalars_df.groupby('sample')

In [None]:
assert all([group in choices_name for group in list(grouped.groups)])
list(grouped.groups)

### Descriptive statistics

#### Mean values:

In [None]:
grouped.size()

In [None]:
# Restrict to numeric columns: 'files' holds path objects that cannot be averaged.
grouped.mean(numeric_only=True)

In [None]:
# Restrict to numeric columns: 'files' holds path objects that have no median.
grouped.median(numeric_only=True)

In [None]:
# Restrict to numeric columns: 'files' holds path objects that have no standard error.
grouped.sem(numeric_only=True)

### Statistical plots

In [None]:
scalars_df.columns

In [None]:
blacklist = [
    'files',
    'sample'
]
columns_of_interest = [column for column in scalars_df.columns if column not in blacklist]
columns_of_interest

In [None]:
n_elements = len(columns_of_interest)
n_cols = 1
n_rows = -(-n_elements // n_cols)  # ceiling division

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=columns_of_interest, vertical_spacing=0.01)

# Repeat the palette so that there is one color per group.
colors = px.colors.qualitative.T10 * 5
assert len(colors) >= len(grouped)

# One subplot per column of interest, one box per experimental group.
for column, indices in zip(columns_of_interest, lc.iterate_2d_array(n_elements, n_cols)):
    for (key, value), color in zip(grouped[column], colors):
        fig.add_trace(
            go.Box(x=value, name=key, boxpoints='all', marker_size=5, width=0.5, marker_color =color),
            row=indices[0]+1, col=indices[1]+1
            )

# Layout settings apply to the whole figure, so set them once after all traces are added.
fig.update_layout(boxmode='group', showlegend=False)
fig.update_layout(height=12_000, width=1000, title_text="Averaged properties for each experimental group")
go.FigureWidget(fig).show()

## Grouped number of localizations per cluster

In [None]:
def get_concatenated_dataframe(pipes, grouped, property):
    """
    Concatenate a cluster property over all pipelines of each group.

    Parameters
    ----------
    pipes : list(Pipeline)
        Pipeline objects. They are addressed by the index labels of the grouped
        dataframe, i.e. label `i` must refer to `pipes[i]`.
    grouped : pandas.core.groupby.DataFrameGroupBy
        Groups of pipelines as provided by `DataFrame.groupby`.
    property : str
        Column in `pipe.clust_selection.data` to collect.

    Returns
    -------
    pandas.DataFrame
        One column per group holding the concatenated values of all pipelines
        in that group. Pipelines without clusters are skipped. Columns of groups
        with fewer values than the longest group are padded with NaN.
    """
    results_dict = {}
    for name, group in grouped:
        parts = [
            pipes[i].clust_selection.data[property]
            for i in group.index
            if pipes[i].clust_selection.properties['localization_count'] > 0
        ]
        if parts:
            # Concatenate once per group; Series.append is not available in pandas >= 2.0.
            results_dict[name] = pd.concat(parts, ignore_index=True)
        else:
            results_dict[name] = pd.Series(dtype=np.float64)
    return pd.DataFrame(results_dict)

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)
df.hist(bins=100, log=True, figsize=(20, 10), density=True);
plt.tight_layout()

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)
plt.hist(df, bins=40, density=True, log=True);

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)
plt.hist(df, bins=np.linspace(0, 100, 20), density=True, log=True);

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)
df.hist(bins=np.linspace(0,100,100), log=True, figsize=(20, 10), density=True);
plt.tight_layout()

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)
df.hist(bins=np.linspace(0,50,50), log=False, figsize=(20, 10), density=True);
plt.tight_layout()

### Ratio of clusters above a certain size

In [None]:
threshold = 100
# Columns of `df` are padded with NaN up to the length of the longest group;
# drop the padding so that only actual clusters enter the denominator.
dense = df.dense.dropna()
large_spot_ratio = len(dense[dense > threshold]) / len(dense)
round(large_spot_ratio, ndigits=5)

### Localizations in clusters above a certain size

In [None]:
threshold = 1100

large_spot_localization_ratio =df.dense[df.dense > threshold].sum() / df.dense.sum()
round(large_spot_localization_ratio, ndigits=5)

### Cumulative locs

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)
plt.hist(df, bins=1000, density=True, log=False, cumulative=True, weights=df, histtype='step', linewidth=2);
ax = plt.gca()
ax.set_xlabel(r'localizations per cluster')
ax.set_ylabel(r"cdf weighted by Localizations per cluster");

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)

# Derive logarithmically spaced bin edges from the range of the dense sample.
# NaN padding must be removed first: np.histogram cannot autodetect a range from NaN.
hist, bins = np.histogram(df.dense.dropna(), bins=100)
logbins = np.logspace(np.log10(bins[0]), np.log10(bins[-1]), len(bins))

plt.hist(df, bins=logbins, density=True, log=False, cumulative=True, weights=df, histtype='step', linewidth=2);
plt.xscale('log')

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)
plt.hist(df, bins=np.linspace(0, 100, 100), density=True, log=False, cumulative=True, weights=df, histtype='step', linewidth=2);

## Publication Figures

#### plot localization_count

In [None]:
df_simulation = pd.read_csv("simulated_dense_myc_clust.csv")
df_simulation

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)

with plt.style.context('./publication_histogram.mplstyle'):
    fig, ax = plt.subplots()
    ax.hist([df['sparse'], df.dense, df_simulation.localization_count], bins=np.linspace(0, 100, 10), density=True, log=True);

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)

with plt.style.context('./publication_histogram.mplstyle'):
    fig, ax = plt.subplots()
    ax.hist([df['sparse'], df.dense, df_simulation.localization_count], bins=20, density=True, log=True);
    ax = plt.gca()
    ax.set_xlabel(r'localizations per cluster')
    ax.set_ylabel(r"pdf")

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)

# Derive logarithmically spaced bin edges from the range of the dense sample.
# NaN padding must be removed first: np.histogram cannot autodetect a range from NaN.
hist, bins = np.histogram(df.dense.dropna(), bins=20)
logbins = np.logspace(np.log10(bins[0]), np.log10(bins[-1]), len(bins))

with plt.style.context('./publication_histogram.mplstyle'):
    fig, ax = plt.subplots()
    ax.hist([df['sparse'], df.dense, df_simulation.localization_count], bins=logbins, density=True, log=True);
    ax = plt.gca()
    plt.xscale('log')
    ax.set_xlabel(r'localizations per cluster')
    ax.set_ylabel(r"pdf")

In [None]:
property = 'localization_count'
df = get_concatenated_dataframe(pipes, grouped, property)

with plt.style.context('./publication_histogram.mplstyle'):
    fig, ax = plt.subplots(figsize=(7.2, 5.8))
    plt.hist([df['sparse'], df.dense, df_simulation.localization_count], bins=1000, density=True, log=False, cumulative=True, weights=[df['sparse'], df.dense, df_simulation.localization_count], histtype='step', linewidth=4);
    ax = plt.gca()
    ax.set_xlim(-200, 4200)
    ax.set_xlabel(r'localizations per cluster')
    ax.set_ylabel(r"weighted cdf")