In [1]:
import warnings

# Silence every warning for the rest of the session. The filter is installed
# before the remaining imports execute, so warnings raised while those modules
# are being imported are suppressed as well.
warnings.filterwarnings("ignore")
import pandas as pd
import pertpy as pt

# Legacy scCODA package, used below as the "old implementation" that the
# pertpy results are compared against.
from sccoda.util import comp_ana as mod
# NOTE(review): `dat` is not referenced in the cells visible here - confirm it is still needed.
from sccoda.util import cell_composition_data as dat

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


## Load data

In [2]:
# Fetch the cell-level dataset returned by pertpy's haber_2017_regions loader.
haber_cells = pt.dt.haber_2017_regions()

# Build the pertpy scCODA model and let it derive a sample-level modality from
# the cell-level input: samples are keyed by "batch", cell types by
# "cell_label", and "condition" is carried along as a covariate.
sccoda_model = pt.tl.Sccoda()
load_options = {
    "type": "cell_level",
    "generate_sample_level": True,
    "cell_type_identifier": "cell_label",
    "sample_identifier": "batch",
    "covariate_obs": ["condition"],
}
sccoda_data = sccoda_model.load(haber_cells, **load_options)

# Keep only the Control and Salmonella samples and store that subset as its
# own modality, "coda_salm".
coda_all = sccoda_data["coda"]
keep_control_and_salm = coda_all.obs["condition"].isin(["Control", "Salmonella"])
sccoda_data.mod["coda_salm"] = coda_all[keep_control_and_salm].copy()

# Handle on the subset for the legacy scCODA API used in later cells.
data_salm = sccoda_data.mod["coda_salm"]

## Basic model

### Old implementation

In [14]:
# Legacy scCODA model on the Control/Salmonella subset: effect of "condition",
# with "Goblet" as the reference cell type.
model_old = mod.CompositionalAnalysis(
    data_salm,
    reference_cell_type="Goblet",
    formula="condition",
)

# Draw posterior samples with HMC.
results_old = model_old.sample_hmc()

# Report once before any FDR adjustment, list the credible effects, then set
# the expected FDR to 0.4 and report again.
results_old.summary()
credible_old = results_old.credible_effects()
print(credible_old)
results_old.set_fdr(est_fdr=0.4)
results_old.summary()

100%|██████████| 20000/20000 [00:26<00:00, 766.46it/s]


MCMC sampling finished. (33.233 sec)
Acceptance rate: 60.8%
Compositional Analysis summary:

Data: 6 samples, 8 cell types
Reference index: 3
Formula: condition

Intercepts:
                       Final Parameter  Expected Sample
Cell Type                                              
Endocrine                        1.115        34.514313
Enterocyte                       2.332       116.556264
Enterocyte.Progenitor            2.522       140.945615
Goblet                           1.751        65.194435
Stem                             2.703       168.911364
TA                               2.114        93.725857
TA.Early                         2.861       197.823280
Tuft                             0.426        17.328874


Effects:
                                               Final Parameter  \
Covariate               Cell Type                                
condition[T.Salmonella] Endocrine                     0.000000   
                        Enterocyte                    1.3

### New implementation

In [4]:
# Attach the model specification to the Control/Salmonella modality:
# covariate formula "condition" with "Goblet" as the reference cell type.
sccoda_data = sccoda_model.prepare(
    sccoda_data,
    modality_key="coda_salm",
    formula="condition",
    reference_cell_type="Goblet",
)

# Run MCMC (NUTS)
sccoda_model.run_nuts(sccoda_data, modality_key="coda_salm")

# Show results before any FDR adjustment
sccoda_model.summary(sccoda_data, modality_key="coda_salm")
# A bare expression in the middle of a cell is evaluated and then dropped, so
# print the credible effects explicitly, mirroring the old-implementation cell.
print(sccoda_model.credible_effects(sccoda_data, modality_key="coda_salm"))
# Set the expected FDR to 0.4 and show the results again
sccoda_model.set_fdr(sccoda_data, modality_key="coda_salm", est_fdr=0.4)
sccoda_model.summary(sccoda_data, modality_key="coda_salm")

sample: 100%|██████████| 11000/11000 [00:13<00:00, 819.51it/s, 127 steps of size 3.69e-02. acc. prob=0.70]


### Compare

In [23]:
# Effect tables of the legacy scCODA run and of the pertpy run on "coda_salm".
result_df_old = results_old.effect_df
result_df_new = sccoda_model.get_effect_df(sccoda_data["coda_salm"])

# The old columns are inserted positionally (.values) under the new index,
# because the recorded outputs show the two implementations labelling the
# covariate differently. Positional pairing is only meaningful if both tables
# list the cell types in the same order, so check that before combining.
cell_types_old = list(result_df_old.index.get_level_values("Cell Type"))
cell_types_new = list(result_df_new.index.get_level_values("Cell Type"))
assert cell_types_old == cell_types_new, (
    "old and new effect tables list cell types in a different order; "
    "a positional comparison would pair up the wrong rows"
)

# Side-by-side view of the final parameters and log2-fold changes.
result_df_combined = pd.DataFrame(
    {
        "Final Parameter (old)": result_df_old["Final Parameter"].values,
        "Final Parameter (new)": result_df_new["Final Parameter"],
        "log2-fold change (old)": result_df_old["log2-fold change"].values,
        "log2-fold change (new)": result_df_new["log2-fold change"],
    },
    index=result_df_new.index,
)
result_df_combined

Unnamed: 0_level_0,Unnamed: 1_level_0,Final Parameter (old),Final Parameter (new),log2-fold change (old),log2-fold change (new)
Covariate,Cell Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
conditionT.Salmonella,Endocrine,0.283754,0.284457,-0.087481,-0.090331
conditionT.Salmonella,Enterocyte,1.346851,1.352542,1.446244,1.450591
conditionT.Salmonella,Enterocyte.Progenitor,0.0,0.0,-0.496852,-0.500715
conditionT.Salmonella,Goblet,0.0,0.0,-0.496852,-0.500715
conditionT.Salmonella,Stem,0.0,0.0,-0.496852,-0.500715
conditionT.Salmonella,TA,0.0,0.0,-0.496852,-0.500715
conditionT.Salmonella,TA.Early,0.0,0.0,-0.496852,-0.500715
conditionT.Salmonella,Tuft,0.019015,0.011722,-0.469419,-0.483803


## Multigroup

### Old implementation

In [24]:
# Legacy scCODA on the unfiltered sample-level modality "coda", with
# "Endocrine" as the reference cell type.
data_all = sccoda_data.mod["coda"]
model_all = mod.CompositionalAnalysis(
    data_all,
    reference_cell_type="Endocrine",
    formula="condition",
)

# Sample with HMC and report.
all_results = model_all.sample_hmc()
all_results.summary()

100%|██████████| 20000/20000 [00:32<00:00, 620.88it/s]


MCMC sampling finished. (41.133 sec)
Acceptance rate: 55.7%
Compositional Analysis summary:

Data: 10 samples, 8 cell types
Reference index: 0
Formula: condition

Intercepts:
                       Final Parameter  Expected Sample
Cell Type                                              
Endocrine                        1.004        47.032938
Enterocyte                       1.925       118.137378
Enterocyte.Progenitor            2.352       181.063567
Goblet                           1.475        75.327718
Stem                             2.428       195.360813
TA                               1.883       113.278362
TA.Early                         2.549       220.489080
Tuft                             0.665        33.510143


Effects:
                                                Final Parameter  \
Covariate                Cell Type                                
condition[T.Hpoly.Day3]  Endocrine                     0.000000   
                         Enterocyte                  

### New implementation

In [25]:
# Model all three disease conditions in one run on the unfiltered "coda"
# modality, with "Endocrine" as the reference cell type.
multigroup_spec = {
    "modality_key": "coda",
    "formula": "condition",
    "reference_cell_type": "Endocrine",
}
sccoda_data = sccoda_model.prepare(sccoda_data, **multigroup_spec)

# Sample with NUTS and report.
sccoda_model.run_nuts(sccoda_data, modality_key="coda")
sccoda_model.summary(sccoda_data, modality_key="coda")

sample: 100%|██████████| 11000/11000 [00:34<00:00, 323.31it/s, 255 steps of size 2.07e-02. acc. prob=0.74]


### Compare

In [26]:
# Effect tables of the legacy scCODA run and of the pertpy run on "coda".
result_df_old = all_results.effect_df
result_df_new = sccoda_model.get_effect_df(sccoda_data["coda"])

# The old columns are inserted positionally (.values) under the new index,
# because the recorded outputs show the two implementations labelling the
# covariates differently. Positional pairing is only meaningful if both tables
# list the cell types in the same order, so check that before combining.
cell_types_old = list(result_df_old.index.get_level_values("Cell Type"))
cell_types_new = list(result_df_new.index.get_level_values("Cell Type"))
assert cell_types_old == cell_types_new, (
    "old and new effect tables list cell types in a different order; "
    "a positional comparison would pair up the wrong rows"
)

# Side-by-side view of the final parameters and log2-fold changes.
result_df_combined = pd.DataFrame(
    {
        "Final Parameter (old)": result_df_old["Final Parameter"].values,
        "Final Parameter (new)": result_df_new["Final Parameter"],
        "log2-fold change (old)": result_df_old["log2-fold change"].values,
        "log2-fold change (new)": result_df_new["log2-fold change"],
    },
    index=result_df_new.index,
)
result_df_combined

Unnamed: 0_level_0,Unnamed: 1_level_0,Final Parameter (old),Final Parameter (new),log2-fold change (old),log2-fold change (new)
Covariate,Cell Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
conditionT.Hpoly.Day3,Endocrine,0.0,0.0,0.0,0.0
conditionT.Hpoly.Day3,Enterocyte,0.0,0.0,0.0,0.0
conditionT.Hpoly.Day3,Enterocyte.Progenitor,0.0,0.0,0.0,0.0
conditionT.Hpoly.Day3,Goblet,0.0,0.0,0.0,0.0
conditionT.Hpoly.Day3,Stem,0.0,0.0,0.0,0.0
conditionT.Hpoly.Day3,TA,0.0,0.0,0.0,0.0
conditionT.Hpoly.Day3,TA.Early,0.0,0.0,0.0,0.0
conditionT.Hpoly.Day3,Tuft,0.0,0.0,0.0,0.0
conditionT.Hpoly.Day10,Endocrine,0.0,0.0,0.0,0.0
conditionT.Hpoly.Day10,Enterocyte,0.0,0.0,0.0,0.0


## Different base label

### Old implementation

In [27]:
# Legacy scCODA with "Salmonella" as the treatment-coding baseline of
# `condition`, so the reported effect is for Control relative to Salmonella.
switched_formula = "C(condition, Treatment('Salmonella'))"
model_salm_switch_cond = mod.CompositionalAnalysis(
    data_salm,
    reference_cell_type="Goblet",
    formula=switched_formula,
)

# Sample with HMC and report.
switch_results = model_salm_switch_cond.sample_hmc()
switch_results.summary()

100%|██████████| 20000/20000 [00:26<00:00, 751.89it/s]


MCMC sampling finished. (33.783 sec)
Acceptance rate: 67.2%
Compositional Analysis summary:

Data: 6 samples, 8 cell types
Reference index: 3
Formula: C(condition, Treatment('Salmonella'))

Intercepts:
                       Final Parameter  Expected Sample
Cell Type                                              
Endocrine                        1.224        27.904021
Enterocyte                       3.681       325.632946
Enterocyte.Progenitor            2.547       104.770350
Goblet                           1.751        47.265036
Stem                             2.607       111.248987
TA                               2.031        62.537778
TA.Early                         2.858       142.989445
Tuft                             0.433        12.651437


Effects:
                                                                        Final Parameter  \
Covariate                                        Cell Type                                
C(condition, Treatment('Salmonella'))[T.Contr

### New implementation

In [28]:
# pertpy scCODA on "coda_salm" with "Salmonella" as the treatment-coding
# baseline of `condition`; "Goblet" stays the reference cell type.
switch_spec = {
    "modality_key": "coda_salm",
    "formula": "C(condition, Treatment('Salmonella'))",
    "reference_cell_type": "Goblet",
}
sccoda_data = sccoda_model.prepare(sccoda_data, **switch_spec)

# Sample with NUTS and report.
sccoda_model.run_nuts(sccoda_data, modality_key="coda_salm")
sccoda_model.summary(sccoda_data, modality_key="coda_salm")

sample: 100%|██████████| 11000/11000 [00:26<00:00, 419.77it/s, 127 steps of size 1.86e-02. acc. prob=0.88]


### Compare

In [29]:
# Effect tables of the legacy scCODA run and of the pertpy run on "coda_salm"
# (Salmonella as baseline).
result_df_old = switch_results.effect_df
result_df_new = sccoda_model.get_effect_df(sccoda_data["coda_salm"])

# The old columns are inserted positionally (.values) under the new index,
# because the recorded outputs show the two implementations labelling the
# covariate differently. Positional pairing is only meaningful if both tables
# list the cell types in the same order, so check that before combining.
cell_types_old = list(result_df_old.index.get_level_values("Cell Type"))
cell_types_new = list(result_df_new.index.get_level_values("Cell Type"))
assert cell_types_old == cell_types_new, (
    "old and new effect tables list cell types in a different order; "
    "a positional comparison would pair up the wrong rows"
)

# Side-by-side view of the final parameters and log2-fold changes.
result_df_combined = pd.DataFrame(
    {
        "Final Parameter (old)": result_df_old["Final Parameter"].values,
        "Final Parameter (new)": result_df_new["Final Parameter"],
        "log2-fold change (old)": result_df_old["log2-fold change"].values,
        "log2-fold change (new)": result_df_new["log2-fold change"],
    },
    index=result_df_new.index,
)
result_df_combined

Unnamed: 0_level_0,Unnamed: 1_level_0,Final Parameter (old),Final Parameter (new),log2-fold change (old),log2-fold change (new)
Covariate,Cell Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
"C(condition, Treatment('Salmonella'))T.Control",Endocrine,0.0,0.0,0.489783,0.486292
"C(condition, Treatment('Salmonella'))T.Control",Enterocyte,-1.340024,-1.333453,-1.443464,-1.437474
"C(condition, Treatment('Salmonella'))T.Control",Enterocyte.Progenitor,0.0,0.0,0.489783,0.486292
"C(condition, Treatment('Salmonella'))T.Control",Goblet,0.0,0.0,0.489783,0.486292
"C(condition, Treatment('Salmonella'))T.Control",Stem,0.0,0.0,0.489783,0.486292
"C(condition, Treatment('Salmonella'))T.Control",TA,0.0,0.0,0.489783,0.486292
"C(condition, Treatment('Salmonella'))T.Control",TA.Early,0.0,0.0,0.489783,0.486292
"C(condition, Treatment('Salmonella'))T.Control",Tuft,0.0,0.0,0.489783,0.486292


## Different reference cell type

### Old implementation

In [43]:
# Legacy scCODA on the Control/Salmonella subset, this time with "Enterocyte"
# as the reference cell type.
model_salm_ref = mod.CompositionalAnalysis(
    data_salm,
    reference_cell_type="Enterocyte",
    formula="condition",
)

# Sample with HMC, set the expected FDR to 0.4, then report.
reference_results = model_salm_ref.sample_hmc()
reference_results.set_fdr(est_fdr=0.4)
reference_results.summary()

100%|██████████| 20000/20000 [00:26<00:00, 764.21it/s]


MCMC sampling finished. (33.535 sec)
Acceptance rate: 50.5%
Compositional Analysis summary:

Data: 6 samples, 8 cell types
Reference index: 1
Formula: condition

Intercepts:
                       Final Parameter  Expected Sample
Cell Type                                              
Endocrine                        0.573        34.851550
Enterocyte                       2.096       159.827901
Enterocyte.Progenitor            1.878       128.521682
Goblet                           1.105        59.328964
Stem                             2.109       161.919228
TA                               1.490        87.190695
TA.Early                         2.238       184.213907
Tuft                            -0.026        19.146073


Effects:
                                               Final Parameter  \
Covariate               Cell Type                                
condition[T.Salmonella] Endocrine                     0.000000   
                        Enterocyte                    0.0

### New implementation

In [44]:
# pertpy scCODA on "coda_salm", this time with "Enterocyte" as the reference
# cell type.
reference_spec = {
    "modality_key": "coda_salm",
    "formula": "condition",
    "reference_cell_type": "Enterocyte",
}
sccoda_data = sccoda_model.prepare(sccoda_data, **reference_spec)

# Sample with NUTS, set the expected FDR to 0.4, then report.
sccoda_model.run_nuts(sccoda_data, modality_key="coda_salm")
sccoda_model.set_fdr(sccoda_data, modality_key="coda_salm", est_fdr=0.4)
sccoda_model.summary(sccoda_data, modality_key="coda_salm")

sample: 100%|██████████| 11000/11000 [00:08<00:00, 1264.33it/s, 63 steps of size 6.56e-02. acc. prob=0.74]


### Compare

In [45]:
# Effect tables of the legacy scCODA run and of the pertpy run on "coda_salm"
# (Enterocyte as reference cell type).
result_df_old = reference_results.effect_df
result_df_new = sccoda_model.get_effect_df(sccoda_data["coda_salm"])

# The old columns are inserted positionally (.values) under the new index,
# because the recorded outputs show the two implementations labelling the
# covariate differently. Positional pairing is only meaningful if both tables
# list the cell types in the same order, so check that before combining.
cell_types_old = list(result_df_old.index.get_level_values("Cell Type"))
cell_types_new = list(result_df_new.index.get_level_values("Cell Type"))
assert cell_types_old == cell_types_new, (
    "old and new effect tables list cell types in a different order; "
    "a positional comparison would pair up the wrong rows"
)

# Side-by-side view of the final parameters and log2-fold changes.
result_df_combined = pd.DataFrame(
    {
        "Final Parameter (old)": result_df_old["Final Parameter"].values,
        "Final Parameter (new)": result_df_new["Final Parameter"],
        "log2-fold change (old)": result_df_old["log2-fold change"].values,
        "log2-fold change (new)": result_df_new["log2-fold change"],
    },
    index=result_df_new.index,
)
result_df_combined

Unnamed: 0_level_0,Unnamed: 1_level_0,Final Parameter (old),Final Parameter (new),log2-fold change (old),log2-fold change (new)
Covariate,Cell Type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
conditionT.Salmonella,Endocrine,0.0,0.0,0.263545,0.243037
conditionT.Salmonella,Enterocyte,0.0,0.0,0.263545,0.243037
conditionT.Salmonella,Enterocyte.Progenitor,0.0,0.0,0.263545,0.243037
conditionT.Salmonella,Goblet,0.0,0.0,0.263545,0.243037
conditionT.Salmonella,Stem,-0.473469,-0.441236,-0.419527,-0.393532
conditionT.Salmonella,TA,-0.379547,-0.34407,-0.284025,-0.253351
conditionT.Salmonella,TA.Early,-0.322684,-0.293891,-0.201989,-0.180959
conditionT.Salmonella,Tuft,0.0,0.0,0.263545,0.243037
