#### Run scanpro as a nice python implementation of propeller, save the results for plotting in the next script `03_produce_cell_type_proportion_plots.ipynb`

In [2]:
from scanpro import scanpro
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from collections import Counter

In [3]:
results_dir = "scanpro_results_dir/"
os.makedirs(results_dir, exist_ok=True)

In [4]:
%%time
adata = sc.read_h5ad("../07_final_RNA_without_scvi.h5ad")
adata

CPU times: user 13.6 s, sys: 1min 7s, total: 1min 21s
Wall time: 1min 21s


AnnData object with n_obs × n_vars = 2305964 × 16115
    obs: 'age', 'donor_id', 'sex', 'region', 'cell_type', 'disease', 'consistent_cell_type', 'study', 'technology', 'cell_or_nuclei', 'barcode', 'sample_id', 'age_status', 'tech_plus_study', 'disease_binary', 'decade', 'age_group', '_scvi_batch', '_scvi_labels', 'leiden_scVI', 'scvi_cell_type', 'redo_leiden_0.5', 'UMAP1', 'UMAP2', 'v2_scvi_cell_type', 'final_cell_type'
    obsm: 'X_scVI', 'X_umap', '_scvi_extra_categorical_covs'
    layers: 'counts'

### Determine number of nuclei per donor, remove those without more than 1K nuclei

In [5]:
counts_per_nuclei = adata.obs.groupby('donor_id').count()[['age']].rename(columns = {'age': 'count'}).reset_index()

In [18]:
print(f"Number of total donors: {len(counts_per_nuclei.donor_id.unique())}")

Number of total donors: 299


In [6]:
nuclei_threshold = 1000
donors_to_keep = counts_per_nuclei[counts_per_nuclei['count'] > nuclei_threshold].donor_id

In [19]:
filtered_adata = adata[adata.obs.donor_id.isin(donors_to_keep)]
num_donors = len(filtered_adata.obs.donor_id.unique())
print(f"Number of total donors after filtering: {num_donors}")

Number of total donors after filtering: 292


In [11]:
filtered_adata.obs.columns

Index(['age', 'donor_id', 'sex', 'region', 'cell_type', 'disease',
       'consistent_cell_type', 'study', 'technology', 'cell_or_nuclei',
       'barcode', 'sample_id', 'age_status', 'tech_plus_study',
       'disease_binary', 'decade', 'age_group', '_scvi_batch', '_scvi_labels',
       'leiden_scVI', 'scvi_cell_type', 'redo_leiden_0.5', 'UMAP1', 'UMAP2',
       'v2_scvi_cell_type', 'final_cell_type'],
      dtype='object')

In [12]:
filtered_adata.obs.disease_binary.unique()

['N', 'Y']
Categories (2, object): ['N', 'Y']

In [13]:
filtered_adata.obs.age_group.unique()

['fetal', 'old', 'middle', 'young']
Categories (4, object): ['fetal', 'middle', 'old', 'young']

### Perform disease and age related analysis 

#### To do this, filter to just the postnatal data

In [20]:
non_fetal_adata = filtered_adata[filtered_adata.obs.age_group != "fetal", :]
num_donors = len(non_fetal_adata.obs.donor_id.unique())
print(f"Number of non-fetal donors: {num_donors}")

Number of non-fetal donors: 279


#### Perform analysis focused on age_group

In [21]:
out = scanpro(non_fetal_adata, clusters_col='final_cell_type', 
              conds_col='age_group', samples_col='donor_id', 
              covariates=['disease_binary', 'sex', 'tech_plus_study'], transform="arcsin")

non_fetal_age_group_results_df = out.results
non_fetal_age_group_results_df

[INFO] There are more than 2 conditions. ANOVA will be performed...
[INFO] Done!


Unnamed: 0_level_0,baseline_props,mean_props_old,mean_props_middle,mean_props_young,f_statistics,p_values,adjusted_p_values
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adipocyte,0.005411,0.006875,0.005385,0.004197,0.662804,0.516245,0.559266
Cardiomyocyte,0.292212,0.249498,0.286405,0.302098,1.527865,0.218878,0.355677
Endocardial,0.015946,0.017433,0.017308,0.022682,0.429799,0.651087,0.651087
Endothelial,0.139881,0.133678,0.131475,0.162958,1.858238,0.157946,0.324117
Epicardial,0.003912,0.011209,0.002427,0.000951,2.504586,0.083616,0.266599
Fibroblast,0.236539,0.256185,0.230033,0.221655,2.968938,0.053046,0.266599
LEC,0.005776,0.007132,0.007608,0.003926,1.061833,0.347268,0.455997
Lymphoid,0.029115,0.036557,0.029105,0.025979,2.494865,0.084418,0.266599
Mast,0.003981,0.003053,0.004419,0.003231,1.757085,0.174525,0.324117
Myeloid,0.10699,0.120511,0.111215,0.102704,2.329377,0.09932,0.266599


In [22]:
non_fetal_age_group_results_df.to_csv(results_dir + "age_group_scanpro_results.csv")

In [23]:
non_fetal_age_group_results_df
non_fetal_age_group_results_df[non_fetal_age_group_results_df['adjusted_p_values'] < 0.05]

Unnamed: 0_level_0,baseline_props,mean_props_old,mean_props_middle,mean_props_young,f_statistics,p_values,adjusted_p_values
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


#### Perform analysis focused on disease

In [24]:
out = scanpro(non_fetal_adata, clusters_col = 'final_cell_type', 
              conds_col='disease_binary', samples_col='donor_id', 
              covariates=['age_group', 'sex', 'tech_plus_study'], transform="arcsin")

disease_status_results_df = out.results

[INFO] There are 2 conditions. T-Test will be performed...
[INFO] Done!


In [25]:
disease_status_results_df

Unnamed: 0_level_0,baseline_props,mean_props_Y,mean_props_N,prop_ratio,t_statistics,p_values,adjusted_p_values
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adipocyte,0.005411,0.004297,0.006796,0.632315,0.029597,0.976411,0.976411
Cardiomyocyte,0.292212,0.249562,0.300341,0.830928,-5.310456,2.311943e-07,1.502763e-06
Endocardial,0.015946,0.013399,0.02273,0.589477,-1.602456,0.1102426,0.1577002
Endothelial,0.139881,0.18105,0.102721,1.762537,3.959885,9.634199e-05,0.0002504892
Epicardial,0.003912,0.007221,0.003331,2.167691,3.01055,0.002858576,0.004645186
Fibroblast,0.236539,0.225313,0.247326,0.910996,1.554279,0.1213078,0.1577002
LEC,0.005776,0.008714,0.004981,1.749345,4.877336,1.853793e-06,8.033104e-06
Lymphoid,0.029115,0.035356,0.027314,1.294402,3.504948,0.0005358041,0.001160909
Mast,0.003981,0.002081,0.005092,0.408754,-4.423857,1.415397e-05,4.60004e-05
Myeloid,0.10699,0.099089,0.12413,0.798271,1.224,0.2220362,0.2624064


In [26]:
disease_status_results_df[disease_status_results_df['adjusted_p_values'] < 0.05]

Unnamed: 0_level_0,baseline_props,mean_props_Y,mean_props_N,prop_ratio,t_statistics,p_values,adjusted_p_values
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cardiomyocyte,0.292212,0.249562,0.300341,0.830928,-5.310456,2.311943e-07,1.502763e-06
Endothelial,0.139881,0.18105,0.102721,1.762537,3.959885,9.634199e-05,0.0002504892
Epicardial,0.003912,0.007221,0.003331,2.167691,3.01055,0.002858576,0.004645186
LEC,0.005776,0.008714,0.004981,1.749345,4.877336,1.853793e-06,8.033104e-06
Lymphoid,0.029115,0.035356,0.027314,1.294402,3.504948,0.0005358041,0.001160909
Mast,0.003981,0.002081,0.005092,0.408754,-4.423857,1.415397e-05,4.60004e-05
Pericyte,0.124728,0.131442,0.125312,1.048921,-3.368283,0.0008683818,0.001612709
vSMC,0.025918,0.032807,0.020251,1.620058,6.237148,1.746174e-09,2.270026e-08


In [27]:
disease_status_results_df.to_csv(results_dir + "disease_scanpro_results.csv")

### Perform analysis based on sex

In [28]:
out = scanpro(non_fetal_adata, clusters_col='final_cell_type', 
              conds_col='sex', samples_col='donor_id', 
              covariates=['age_group', 'disease_binary', 'tech_plus_study'], transform="arcsin")
sex_results_df = out.results
sex_results_df

[INFO] There are 2 conditions. T-Test will be performed...
[INFO] Done!


Unnamed: 0_level_0,baseline_props,mean_props_male,mean_props_female,prop_ratio,t_statistics,p_values,adjusted_p_values
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adipocyte,0.005411,0.00559,0.005755,0.9713,-1.009978,0.313424,0.549344
Cardiomyocyte,0.292212,0.255364,0.315252,0.810033,-2.135835,0.033607,0.145629
Endocardial,0.015946,0.015958,0.022838,0.698773,-1.383531,0.167663,0.435923
Endothelial,0.139881,0.141848,0.133033,1.066261,-0.187942,0.851065,0.921987
Epicardial,0.003912,0.006567,0.002558,2.567672,-0.283689,0.77687,0.918119
Fibroblast,0.236539,0.251984,0.211218,1.193003,3.339102,0.000961,0.01249
LEC,0.005776,0.005338,0.009082,0.587841,-2.490385,0.013372,0.086916
Lymphoid,0.029115,0.029977,0.032813,0.913562,-1.2127,0.226321,0.490363
Mast,0.003981,0.003493,0.004095,0.853122,-0.369597,0.711977,0.918119
Myeloid,0.10699,0.115031,0.10843,1.060881,0.740653,0.459558,0.663806


In [29]:
sex_results_df[sex_results_df['adjusted_p_values'] < 0.05]

Unnamed: 0_level_0,baseline_props,mean_props_male,mean_props_female,prop_ratio,t_statistics,p_values,adjusted_p_values
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Fibroblast,0.236539,0.251984,0.211218,1.193003,3.339102,0.000961,0.01249


In [30]:
sex_results_df.to_csv(results_dir + "sex_scanpro_results.csv")

### Fetal vs. young

In [31]:
fetal_young_adata =  ( filtered_adata[(filtered_adata.obs.age_group.isin(["fetal", "young"])) & 
                      (filtered_adata.obs.disease_binary == "N"), :] )
fetal_young_adata
num_donors = len(fetal_young_adata.obs.donor_id.unique())

print(f"Number of fetal + young non-diseased donors: {num_donors}")

Number of fetal + young non-diseased donors: 42


In [38]:
out = scanpro(fetal_young_adata, clusters_col='final_cell_type', 
              conds_col='age_group', samples_col='donor_id', 
              covariates=['sex', 'disease_binary', 'tech_plus_study'], transform="arcsin")
fetal_results_df = out.results

[INFO] There are 2 conditions. T-Test will be performed...
[INFO] Done!


In [39]:
fetal_results_df[fetal_results_df['adjusted_p_values'] < 0.05]

Unnamed: 0_level_0,baseline_props,mean_props_fetal,mean_props_young,prop_ratio,t_statistics,p_values,adjusted_p_values
clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Adipocyte,0.004763,3.3e-05,0.006825,0.004869,-2.612991,0.01309829,0.024325
Cardiomyocyte,0.313055,0.379948,0.282319,1.34581,2.480335,0.01803466,0.029306
Endocardial,0.044097,0.073538,0.028796,2.553783,2.901315,0.006358206,0.013776
Epicardial,0.010494,0.019459,0.00088,22.103201,4.399723,9.536322e-05,0.000579
LEC,0.0056,0.008553,0.003605,2.372401,3.881913,0.0004341168,0.001411
Mast,0.004385,0.001154,0.005597,0.206168,-3.034535,0.004499907,0.0117
Myeloid,0.104578,0.029864,0.128352,0.232671,-6.077779,5.885822e-07,8e-06
Pericyte,0.091981,0.049726,0.111475,0.446076,-4.285766,0.0001336553,0.000579


In [33]:
fetal_results_df.to_csv(results_dir + "fetal_scanpro_results.csv")

### Produce plots of proportions using tidy version of data

In [34]:
adata_metadata = adata.obs
cell_type_key = "final_cell_type"
donor_level_metadata = adata_metadata[["sex", "age", "age_group",
                                     "donor_id", "disease_binary", 
                                       "tech_plus_study"]].drop_duplicates().reset_index(drop = True)

cell_type_counts_per_donor = pd.crosstab(adata_metadata['donor_id'], adata_metadata[cell_type_key])
norm_filtered_cell_counts_df = 100 * cell_type_counts_per_donor.div(cell_type_counts_per_donor.sum(axis = 1), axis = 0)

In [35]:
prop_long_df = norm_filtered_cell_counts_df.reset_index().melt(id_vars = 'donor_id', 
                                                               var_name = 'cell_type', 
                                                               value_name = 'proportion').reset_index(drop = True)
prop_long_df.head()

# merge with metadata so that we have the covariates for regression
prop_long_df = prop_long_df.merge(donor_level_metadata, on = "donor_id", how = "inner")

In [36]:
prop_long_df

Unnamed: 0,donor_id,cell_type,proportion,sex,age,age_group,disease_binary,tech_plus_study
0,Chaffin 2022:P1290,Adipocyte,0.000000,male,65.0,old,Y,3prime-v3_Chaffin 2022
1,Chaffin 2022:P1290,Cardiomyocyte,38.080942,male,65.0,old,Y,3prime-v3_Chaffin 2022
2,Chaffin 2022:P1290,Endocardial,0.036027,male,65.0,old,Y,3prime-v3_Chaffin 2022
3,Chaffin 2022:P1290,Endothelial,18.806293,male,65.0,old,Y,3prime-v3_Chaffin 2022
4,Chaffin 2022:P1290,Epicardial,0.000000,male,65.0,old,Y,3prime-v3_Chaffin 2022
...,...,...,...,...,...,...,...,...
3882,Simonson 2023:P1801,Mast,0.081054,male,42.0,middle,N,3prime-v3_Simonson 2023
3883,Simonson 2023:P1801,Myeloid,7.213779,male,42.0,middle,N,3prime-v3_Simonson 2023
3884,Simonson 2023:P1801,Neuronal,1.215805,male,42.0,middle,N,3prime-v3_Simonson 2023
3885,Simonson 2023:P1801,Pericyte,11.286727,male,42.0,middle,N,3prime-v3_Simonson 2023
