# Integrated stAge pipeline

#### stAge steps:
1. Load H5AD dataset and set parameters
2. Optimal Resolution Search (optional) or custom resolution
3. Apply stAge at optimal resolution
4. Display/Save results

Requirements:
- Scaled and YuGene EN pkl files
- Mus_musculus.gene_info
- st_utils.py & st_resol.py

Instructions:
- Provide a directory containing one or more H5AD files (one file per sample)
- Make sure gene names (`var_names`) are gene symbols
- Make sure the main gene expression matrix contains raw counts

Notes:
- Spot-level predictions are only computed when spatial plots are enabled (`spatial_plot=True`)
- Metaspot-level predictions are only computed when box plots are enabled (`box_plot=True`)

In [3]:
def stAge(rawdata_dir='',
          control_file_pattern='',
          ORS=True,
          alt_res=1,
          clocks_dir='',
          spatial_plot=True,
          box_plot=True,
          group_patterns=None,
          save_at_spot=False,
          save_at_metaspot=False,
          save_dir=''):
    """Run the stAge pipeline on every H5AD file found in a directory.

    Steps:
      1. Load all ``*.h5ad`` files in ``rawdata_dir``.
      2. Pick a resolution: either via ``optimal_resolution_search`` (ORS)
         or the user-supplied ``alt_res``.
      3. Run ``full_nonoverlap_mp_pipeline`` at that resolution, once per
         requested output (spot level for spatial plots, metaspot level for
         box plots).
      4. Display the plots and optionally write the predictions to disk.

    ``optimal_resolution_search``, ``full_nonoverlap_mp_pipeline`` and
    ``plot_clock_distributions`` are looked up in the global namespace at
    call time (presumably supplied by ``from st_utils import *`` /
    ``from st_resol import *`` -- confirm).

    Args:
        rawdata_dir: Directory scanned for files whose name ends in ``.h5ad``.
        control_file_pattern: Forwarded unchanged as ``control_file_pattern``
            to the resolution search and the prediction pipeline.
        ORS: If truthy, run the optimal resolution search and use the
            resolution it reports for the ``'orig'`` clock; otherwise use
            ``alt_res``.
        alt_res: Resolution used when ``ORS`` is falsy.
        clocks_dir: Forwarded as ``clock_folder`` to the prediction pipeline.
        spatial_plot: If truthy, compute spot-level predictions
            (``lower_res=False``) and draw one spatial panel per sample.
        box_plot: If truthy, compute metaspot-level predictions
            (``lower_res=True``) and call ``plot_clock_distributions``.
        group_patterns: Forwarded to ``plot_clock_distributions``; ``None``
            is treated as an empty list.
        save_at_spot: If truthy, write each spot-level AnnData to
            ``save_dir/<input file name>``.
        save_at_metaspot: If truthy, write each metaspot-level AnnData to
            ``save_dir/<input file name>``.
        save_dir: Output directory; also forwarded to the pipeline as
            ``save_dir``. Created on demand before writing.

    Returns:
        None. Results are shown as figures and/or written to ``save_dir``.

    Raises:
        FileNotFoundError: If ``rawdata_dir`` contains no ``.h5ad`` file.
        ValueError: If a plot is requested but no sample has at least
            20 observations.
    """
    import os
    import warnings

    import matplotlib.pyplot as plt
    import scanpy as sc

    # Suppress specific warning types emitted by the downstream libraries.
    warnings.filterwarnings("ignore", category=FutureWarning)
    warnings.filterwarnings("ignore", category=RuntimeWarning)

    # Avoid sharing one mutable default list across calls.
    group_patterns = [] if group_patterns is None else group_patterns

    # 1. Load H5AD dataset and set parameters
    ipynb_dir = os.getcwd()
    # endswith() avoids picking up e.g. "x.h5ad.bak"; sorting makes the
    # processing and panel order deterministic (os.listdir order is arbitrary).
    h5ad_files = sorted(
        name for name in os.listdir(rawdata_dir) if name.endswith('.h5ad')
    )
    if not h5ad_files:
        raise FileNotFoundError(f"No .h5ad files found in {rawdata_dir!r}")
    assembled_adatas = {
        name: sc.read(os.path.join(rawdata_dir, name)) for name in h5ad_files
    }

    # 2. Optimal Resolution Search (or custom resolution)
    if ORS:
        resol_df = optimal_resolution_search(
            assembled_adatas,
            ipynb_dir=ipynb_dir,
            pred_pipeline=full_nonoverlap_mp_pipeline,
            control_file_pattern=control_file_pattern,
            cohen_weight=0.6,   # weight for Cohen's d in composite score
            tstat_weight=0.4,   # weight for t-statistic
            tolerance=0.1,      # tie-breaking tolerance for score 0.1 = 10%
        )
        optimal_resolutions = {
            row['Clock']: row['Resolution'] for _, row in resol_df.iterrows()
        }
        # Optimal resolution for the original tAge; use 'tms' or 'tmsh' for other.
        orig_resol = optimal_resolutions['orig']
    else:
        orig_resol = alt_res

    # 3. MAIN: Apply stAge pipeline with the resolution
    # Keep only well-sized slices.
    cleaned = {
        name: adata for name, adata in assembled_adatas.items()
        if adata.n_obs >= 20
    }
    if (spatial_plot or box_plot) and not cleaned:
        raise ValueError(
            "No sample with at least 20 observations; nothing to predict."
        )

    def _predict(lower_res):
        # Single place for the pipeline call so both branches stay consistent
        # (same clock folder, same save_dir).
        return full_nonoverlap_mp_pipeline(
            cleaned,
            res=orig_resol,
            lower_res=lower_res,
            control_file_pattern=control_file_pattern,
            mp_coverage_threshold=1_000,
            save_plot=False,
            save_result=False,
            # e.g. "EN differential models 4.6" (mouse) or
            # "EN differential models 5.4" (human), per the original notes.
            clock_folder=clocks_dir,
            save_dir=str(save_dir),
        )

    def _write_predictions(preds):
        # Keys are the input file names (they already end in ".h5ad").
        # NOTE(review): spot- and metaspot-level results share the same file
        # names, so enabling both save flags lets the metaspot output
        # overwrite the spot output -- confirm whether that is intended.
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
        for file_name, adata in preds.items():
            adata.write_h5ad(os.path.join(save_dir, file_name))

    if spatial_plot:
        # 3.1. Spot-level predictions for SPATIAL PLOTTING
        preds_per_file = _predict(lower_res=False)
        if save_at_spot:
            _write_predictions(preds_per_file)

        # 4.1. SPATIAL plotting
        clocks = ['tAge_SM']

        # Consistent colour scale across all samples and clocks,
        # symmetric around 0.
        vmax = max(
            adata.obs[clock].max()
            for adata in preds_per_file.values() for clock in clocks
        )
        vmin = min(
            adata.obs[clock].min()
            for adata in preds_per_file.values() for clock in clocks
        )
        vfinal = max(abs(vmax), abs(vmin))

        # One row per clock, one column per sample. squeeze=False always
        # returns a 2-D axes array, so no special case for a single sample.
        n = len(preds_per_file)
        fig, axes = plt.subplots(
            nrows=len(clocks),
            ncols=n,
            figsize=(n * 24, 12 * len(clocks)),
            dpi=150,
            squeeze=False,
        )

        for i, (tis, adata_pred) in enumerate(preds_per_file.items()):
            # Built outside the f-string: reusing the same quote character
            # inside an f-string is a SyntaxError before Python 3.12.
            sample_name = tis.replace('.h5ad', '')
            for j, clock in enumerate(clocks):
                sc.pl.spatial(
                    adata_pred,
                    color=clock,
                    spot_size=10,
                    cmap='coolwarm',
                    vmax=vfinal,
                    vmin=-vfinal,
                    ax=axes[j][i],
                    show=False,
                    title=f'{sample_name} | {clock}',
                )
        plt.tight_layout()
        plt.show()

    if box_plot:
        # 3.2. MetaSpot-level predictions for BOX PLOTTING
        preds_per_file = _predict(lower_res=True)
        if save_at_metaspot:
            _write_predictions(preds_per_file)

        # 4.2. BOX plotting
        plot_clock_distributions(
            preds_per_file,
            group_patterns,
            norm_cols=['tAge_SM'],
            test='Mann-Whitney',
        )

In [None]:
# Example usage: run the full stAge pipeline on one directory of H5AD samples.
import os
from st_utils import *
from st_resol import *

# Keyword arguments collected in one place so a run is easy to tweak.
# NOTE(review): the logged sample files are named "*_Mid_*" while the group
# pattern below is 'Middle' -- confirm how plot_clock_distributions matches
# patterns against file names.
example_run = dict(
    rawdata_dir='/home/vvicente/spatial_aging/vvicente/stomics_datasets/notion2/as_h5ad/GSE212903_brain3g',
    control_file_pattern='Young',
    ORS=True,
    alt_res=1,
    clocks_dir='/home/vvicente/spatial_aging/EN differential models 4.6',
    spatial_plot=True,
    box_plot=True,
    group_patterns=['Young', 'Middle', 'Old'],
    save_dir='/home/vvicente/spatial_aging/vvicente/results',
)

stAge(**example_run)


 Running orig with resolution = 0.25
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 5 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 5 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 5 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 4 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 6 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 5 samples left and 32285 genes left.
After filtering there are 30 samples left and 15580 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 30/30 [00:00<00:00, 5458.02it/s]



 Running orig with resolution = 0.5
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 8 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 8 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 8 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 9 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 8 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 9 samples left and 32285 genes left.
After filtering there are 50 samples left and 14784 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 50/50 [00:00<00:00, 5336.40it/s]



 Running orig with resolution = 1
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 14 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 14 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 14 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 17 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 14 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 16 samples left and 32285 genes left.
After filtering there are 89 samples left and 13826 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 89/89 [00:00<00:00, 5156.91it/s]



 Running orig with resolution = 1.5
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 19 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 21 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 19 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 20 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 19 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 19 samples left and 32285 genes left.
After filtering there are 117 samples left and 13312 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 117/117 [00:00<00:00, 6100.01it/s]



 Running orig with resolution = 2
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 23 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 26 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 21 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 26 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 26 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 23 samples left and 32285 genes left.
After filtering there are 145 samples left and 12876 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 145/145 [00:00<00:00, 6088.32it/s]



 Running orig with resolution = 4
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 38 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 34 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 37 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 38 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 38 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 38 samples left and 32285 genes left.
After filtering there are 223 samples left and 12094 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 223/223 [00:00<00:00, 6659.62it/s]



 Running orig with resolution = 8
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 63 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 63 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 60 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 63 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 68 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 63 samples left and 32285 genes left.
After filtering there are 380 samples left and 11030 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 380/380 [00:00<00:00, 3678.98it/s]



 Running orig with resolution = 16
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 107 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 108 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 107 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 104 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 114 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 109 samples left and 32285 genes left.
After filtering there are 649 samples left and 9707 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 649/649 [00:00<00:00, 6833.27it/s]



 Best resolution for orig: 1.0 (score = 0.993)

 Running tms with resolution = 0.25
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 5 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 5 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 5 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 4 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 6 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 5 samples left and 32285 genes left.
After filtering there are 30 samples left and 15580 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 30/30 [00:00<00:00, 9399.35it/s]



 Running tms with resolution = 0.5
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 8 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 8 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 8 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 9 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 8 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 9 samples left and 32285 genes left.
After filtering there are 50 samples left and 14784 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 50/50 [00:00<00:00, 2764.25it/s]



 Running tms with resolution = 1
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 14 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 14 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 14 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 17 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 14 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 16 samples left and 32285 genes left.
After filtering there are 89 samples left and 13826 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 89/89 [00:00<00:00, 4030.94it/s]



 Running tms with resolution = 1.5
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 19 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 21 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 19 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
Before filtering there are 20 samples left and 32285 genes left.
Analyzing sample GSM6560899_Old_R01_S1.h5ad
Before filtering there are 19 samples left and 32285 genes left.
Analyzing sample GSM6560900_Young_R02_S1.h5ad
Before filtering there are 19 samples left and 32285 genes left.
After filtering there are 117 samples left and 13312 genes left.


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Processing samples: 100%|██████████| 117/117 [00:00<00:00, 6121.01it/s]



 Running tms with resolution = 2
Analyzing sample GSM6560901_Mid_R02_S2.h5ad
Before filtering there are 23 samples left and 32285 genes left.
Analyzing sample GSM6560897_Young_R01_S1.h5ad
Before filtering there are 26 samples left and 32285 genes left.
Analyzing sample GSM6560898_Mid_R01_S1.h5ad
Before filtering there are 21 samples left and 32285 genes left.
Analyzing sample GSM6560902_Old_R02_S2.h5ad
