This notebook contains the code for the meta-analysis of healthy lung data for ACE2, TMPRSS2, and CTSL. It contains the simple model without interaction terms, which was run on the cell-level data (not pseudo-bulk, no holdout analysis).

In [1]:
import scanpy as sc
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
from matplotlib import rcParams
from matplotlib import colors
from matplotlib import patches
import seaborn as sns
import batchglm
import diffxpy.api as de
import patsy as pat
from statsmodels.stats.multitest import multipletests
import logging, warnings
import statsmodels.api as sm

  from pandas.core.index import RangeIndex


In [2]:
# Figure size, scanpy verbosity and package versions
rcParams['figure.figsize'] = (8, 8)  # rescale figures
sc.settings.verbosity = 3
sc.logging.print_versions()
de.__version__

# Logger levels: silence tensorflow below ERROR, keep batchglm/diffxpy at INFO
for _logger_name, _logger_level in (
    ("tensorflow", logging.ERROR),
    ("batchglm", logging.INFO),
    ("diffxpy", logging.INFO),
):
    logging.getLogger(_logger_name).setLevel(_logger_level)

# pandas display limits
for _option, _value in {'display.max_rows': 500, 'display.max_columns': 35}.items():
    pd.set_option(_option, _value)
warnings.filterwarnings("ignore", category=DeprecationWarning, module="tensorflow")

scanpy==1.4.5.1 anndata==0.7.1 umap==0.3.10 numpy==1.18.1 scipy==1.4.1 pandas==1.0.1 scikit-learn==0.23.1 statsmodels==0.11.1 python-igraph==0.8.0 louvain==0.6.1


'v0.7.3'

In [3]:
# User inputs
# Project directory on the shared storage
folder = '/storage/groups/ml01/workspace/malte.luecken/2020_cov19_study'

# Path of the AnnData file that is read below; it sits directly inside `folder`
adata_diffxpy = f'{folder}/COVID19_lung_atlas_revision_v3.h5ad'

# Output location and base name (not used in the cells shown in this section)
output_folder = 'diffxpy_out/'

de_output_base = 'COVID19_lung_atlas_revision_v3_lung_cov19_poissonglm_smoking_nUMIoffset_noInts'

# Read the data

In [4]:
# Load the AnnData object from the path configured in `adata_diffxpy`
adata = sc.read(adata_diffxpy)

In [5]:
adata

AnnData object with n_obs × n_vars = 1320896 × 3 
    obs: 'age', 'anatomical_region', 'donor', 'last_author/PI', 'lung_vs_nasal', 'notes', 'original_celltype_ann', 'sample', 'sex', 'smoking', 'total_counts', 'smoked_boolean', 'last_author_sample_name', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'ann_highest_res', 'ann_new'

In [6]:
# Make sure age is numeric so it can enter the model as a continuous covariate
adata.obs['age'] = adata.obs['age'].astype(float)

In [7]:
adata.obs.dtypes

age                         float64
anatomical_region          category
donor                      category
last_author/PI             category
lung_vs_nasal              category
notes                      category
original_celltype_ann      category
sample                     category
sex                        category
smoking                    category
total_counts                float64
smoked_boolean             category
last_author_sample_name    category
ann_level_1                category
ann_level_2                category
ann_level_3                category
ann_level_4                category
ann_level_5                category
ann_highest_res               int64
ann_new                        bool
dtype: object

In [8]:
# Use the 'last_author/PI' annotation as the dataset identifier
adata.obs['dataset'] = adata.obs['last_author/PI']

In [9]:
adata.obs.dataset.value_counts()

Regev/Rajagopal            322998
Meyer_b                    117535
Kaminski                    95303
Spence                      78401
Barbry/Leroy                76981
Krasnow/Quake               60993
Meyer                       57020
Rawlins                     53409
Regev                       43527
Misharin/Budinger           41266
Eils/Conrad/Kreuter         39778
Seibold                     36248
Whitsett/Xu_10X             34185
Koenigshoff                 33119
Misharin                    28329
Xavier/Regev                25552
Spira/Campbell              24455
Lafyatis/Rojas              24220
Kropski/Banovich_vand       23285
Schultze                    22641
Schiller                    20776
Nawijn                      18197
Teichmann                   12971
Kropski/Banovich_dnar        8359
Shalek                       7603
Linnarsson                   4640
Whitsett/Xu_dropSeq          3267
Mazzilli/Campbell/Beane      2207
Schultze/Falk                1965
Beane         

# Filter the data

Keep only datasets with:
- more than 1 donor
- non-fetal
- lung

In [10]:
# Remove fetal datasets
dats_to_remove = {'Rawlins', 'Spence', 'Linnarsson'}

In [11]:
# One row per donor, holding the first sex, age and dataset recorded for that donor
donor_columns = ['sex', 'age', 'dataset']
dat = adata.obs.groupby(['donor']).agg({column: 'first' for column in donor_columns})

# Single donor filter: datasets that contribute exactly one donor are excluded as well
don_tab = dat['dataset'].value_counts()
has_single_donor = don_tab == 1
dats_to_remove.update(don_tab.index[has_single_donor])

In [12]:
# Freeze the exclusion set into a list. Sorting makes the displayed order the
# same on every run; iterating a set of strings has no stable order across
# interpreter sessions.
dats_to_remove = sorted(dats_to_remove)
dats_to_remove

['Schultze/Falk', 'Linnarsson', 'Spence', 'Misharin', 'Rawlins']

In [13]:
# Drop every cell that belongs to one of the excluded datasets
in_removed_dataset = adata.obs['dataset'].isin(dats_to_remove)
adata = adata[~in_removed_dataset].copy()

In [14]:
adata.obs.lung_vs_nasal.value_counts()

lung     1096604
nasal      57548
Name: lung_vs_nasal, dtype: int64

In [15]:
# Filter for only lung data
is_lung = adata.obs['lung_vs_nasal'].isin(['lung'])
adata = adata[is_lung, :].copy()

In [16]:
adata

AnnData object with n_obs × n_vars = 1096604 × 3 
    obs: 'age', 'anatomical_region', 'donor', 'last_author/PI', 'lung_vs_nasal', 'notes', 'original_celltype_ann', 'sample', 'sex', 'smoking', 'total_counts', 'smoked_boolean', 'last_author_sample_name', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'ann_highest_res', 'ann_new', 'dataset'

In [17]:
adata.obs['sample'].nunique()
adata.obs['donor'].nunique()
adata.obs['dataset'].nunique()

309

185

24

# Binarize smoking status

In [18]:
adata.obs.smoking.value_counts()

never              575345
current            173020
nan                111184
former              53487
non-smoker          42367
never-smoker        33119
active              29396
smoked              28197
current/former      23573
heavy               15663
current/former?      5162
light                3316
Former               1385
Current               610
Never Smoker          426
Current Smoker        354
Name: smoking, dtype: int64

In [19]:
# Use the existing 'smoked_boolean' annotation as the binary smoking covariate,
# rather than re-deriving it from the free-text 'smoking' labels.
# NOTE(review): cells without smoking information appear to carry the string
# 'nan' in this column (they are removed with isin(['nan']) below) — confirm.
adata.obs['smoking_status'] = adata.obs.smoked_boolean

In [20]:
adata.obs.smoking_status.value_counts()

False    651257
True     334163
nan      111184
Name: smoking_status, dtype: int64

## Filter out data w/o smoking status

In [21]:
pd.crosstab(adata.obs.dataset, adata.obs.smoking)

smoking,Current,Current Smoker,Former,Never Smoker,active,current,current/former,current/former?,former,heavy,light,nan,never,never-smoker,non-smoker,smoked
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Barbry/Leroy,0,0,0,0,0,0,0,0,16423,0,0,0,0,0,42367,0
Beane,0,0,0,0,0,305,0,0,581,0,0,0,0,0,0,0
Beane/Campbell,0,354,0,426,0,0,0,0,0,0,0,0,0,0,0,0
Eils/Conrad/Kreuter,0,0,0,0,0,11185,0,0,0,0,0,0,28593,0,0,0
Kaminski,0,0,0,0,0,0,23573,0,0,0,0,3850,67880,0,0,0
Koenigshoff,0,0,0,0,0,0,0,0,0,0,0,0,0,33119,0,0
Krasnow/Quake,0,0,0,0,0,0,0,0,7524,0,0,0,53469,0,0,0
Kropski/Banovich_dnar,0,0,0,0,0,0,0,0,0,0,0,0,2647,0,0,5712
Kropski/Banovich_vand,0,0,0,0,0,0,0,0,0,0,0,0,800,0,0,22485
Lafyatis/Rojas,0,0,0,0,0,0,0,0,3389,0,0,0,20831,0,0,0


In [22]:
# Keep only cells whose smoking status is not the 'nan' label
lacks_smoking_info = adata.obs['smoking_status'].isin(['nan'])
adata = adata[~lacks_smoking_info, :].copy()
adata

AnnData object with n_obs × n_vars = 985420 × 3 
    obs: 'age', 'anatomical_region', 'donor', 'last_author/PI', 'lung_vs_nasal', 'notes', 'original_celltype_ann', 'sample', 'sex', 'smoking', 'total_counts', 'smoked_boolean', 'last_author_sample_name', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'ann_highest_res', 'ann_new', 'dataset', 'smoking_status'

In [23]:
adata.obs.dataset.value_counts()
adata.obs['sample'].nunique()
adata.obs['donor'].nunique()

Regev/Rajagopal            322998
Meyer_b                    117535
Kaminski                    91453
Krasnow/Quake               60993
Barbry/Leroy                58790
Meyer                       57020
Misharin/Budinger           41266
Eils/Conrad/Kreuter         39778
Seibold                     36248
Whitsett/Xu_10X             34185
Koenigshoff                 33119
Lafyatis/Rojas              24220
Kropski/Banovich_vand       23285
Teichmann                   12971
Nawijn                      11110
Kropski/Banovich_dnar        8359
Schiller                     5162
Whitsett/Xu_dropSeq          3267
Mazzilli/Campbell/Beane      1995
Beane                         886
Beane/Campbell                780
Name: dataset, dtype: int64

286

164

# Check the data

In [24]:
np.mean(adata.X.astype(int) != adata.X)

0.0

In [25]:
# Check if any non-integer data in a particular dataset
for dat in adata.obs.dataset.unique():
    # Subset once and reuse the matrix for both sides of the comparison
    counts = adata[adata.obs.dataset.isin([dat]),:].X
    # Fraction of entries that change when cast to int (0 means all integers)
    val = np.mean(counts.astype(int) != counts)
    if val != 0:
        print(f'dataset= {dat}; value= {val}')
        # Print explicitly: a bare expression inside a loop is not displayed
        # under IPython's default `last_expr` interactivity setting.
        print(counts[:20,:20].A)

All counts are integers

In [26]:
adata.obs.age.value_counts()
adata.obs.sex.value_counts()

57.00    81833
66.00    69371
42.00    66596
59.00    59955
18.00    59849
46.00    56291
64.00    46130
35.00    45869
0.25     29908
67.50    28201
20.00    25362
51.00    24766
30.00    24095
29.00    19248
0.00     19065
32.00    19009
3.00     18387
42.50    16906
41.00    14943
58.00    14086
57.50    13840
23.00    13657
68.00    11852
65.00    11404
56.00    11024
75.00    10941
45.00    10939
55.00    10878
47.00    10278
49.00    10198
21.00     9430
27.00     8693
72.50     7983
22.00     7980
63.00     7600
31.00     7085
33.00     6906
44.00     6681
52.50     6326
38.00     5629
62.00     4679
24.00     4433
62.50     4073
26.00     4066
61.00     3922
67.00     3916
50.00     3831
32.50     3772
80.00     3261
10.00     2694
40.00     2647
17.00     2552
79.00     2550
54.00     2507
76.00     2447
36.00     2218
48.00     1153
25.00      823
70.00      168
69.00      130
87.00      112
43.00       79
34.00       76
78.00       65
74.00       52
Name: age, dtype: int64

female    550340
male      435080
Name: sex, dtype: int64

# Fit models and perform DE

In [27]:
# Level-2 cell identities to test: more than 1000 cells, and label not starting with '1'
# NOTE(review): labels starting with '1' are presumably annotations carried
# over from level 1 — confirm.
cluster_key = 'ann_level_2'
clust_tbl = adata.obs[cluster_key].value_counts()
clusters = clust_tbl.index[clust_tbl > 1000]
starts_with_1 = [ct.startswith('1') for ct in clusters]
ct_to_rm = clusters[starts_with_1]
clusters = [ct for ct, flagged in zip(clusters, starts_with_1) if not flagged]
clusters

['Myeloid',
 'Airway epithelium',
 'Alveolar epithelium',
 'Lymphoid',
 'Fibroblast lineage',
 'Blood vessels',
 'Submucosal Gland',
 'Smooth Muscle',
 'Lymphatics',
 'Mesothelium']

Calculate DE genes per cluster.

In [28]:
adata

AnnData object with n_obs × n_vars = 985420 × 3 
    obs: 'age', 'anatomical_region', 'donor', 'last_author/PI', 'lung_vs_nasal', 'notes', 'original_celltype_ann', 'sample', 'sex', 'smoking', 'total_counts', 'smoked_boolean', 'last_author_sample_name', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'ann_highest_res', 'ann_new', 'dataset', 'smoking_status'

In [29]:
# Divide total counts by their mean over all remaining cells; the log of this
# column is used as the offset of the Poisson GLMs
mean_total_counts = adata.obs['total_counts'].mean()
adata.obs['total_counts_scaled'] = adata.obs['total_counts'] / mean_total_counts

In [30]:
# Model without interaction terms: intercept, sex, age, smoking status and dataset
model_terms = ["1", "sex", "age", "smoking_status", "dataset"]
formula = " + ".join(model_terms)

# Coefficients that are Wald-tested in the GLM loops
tested_coef = ["sex[T.male]", "age", "smoking_status[T.True]"]

# Build the design matrix on the full data to inspect the resulting column names
dmat = de.utils.design_matrix(
    data=adata,
    formula=f"~{formula}",
    as_numeric=["age"],
    return_type="patsy",
)
dmat[1]

['Intercept',
 'sex[T.male]',
 'smoking_status[T.True]',
 'dataset[T.Beane]',
 'dataset[T.Beane/Campbell]',
 'dataset[T.Eils/Conrad/Kreuter]',
 'dataset[T.Kaminski]',
 'dataset[T.Koenigshoff]',
 'dataset[T.Krasnow/Quake]',
 'dataset[T.Kropski/Banovich_dnar]',
 'dataset[T.Kropski/Banovich_vand]',
 'dataset[T.Lafyatis/Rojas]',
 'dataset[T.Mazzilli/Campbell/Beane]',
 'dataset[T.Meyer]',
 'dataset[T.Meyer_b]',
 'dataset[T.Misharin/Budinger]',
 'dataset[T.Nawijn]',
 'dataset[T.Regev/Rajagopal]',
 'dataset[T.Schiller]',
 'dataset[T.Seibold]',
 'dataset[T.Teichmann]',
 'dataset[T.Whitsett/Xu_10X]',
 'dataset[T.Whitsett/Xu_dropSeq]',
 'age']

## Poisson GLM

In [31]:
# Poisson GLM loop
# For each cell identity in `clusters`: fit one Poisson GLM per gene on the
# cell-level counts (offset = log of the scaled total counts) and Wald-test
# every coefficient listed in `tested_coef`.
de_results_lvl2_glm = dict()

# Test over clusters
for clust in clusters:
    adata_tmp = adata[adata.obs[cluster_key] == clust,:].copy()

    print(f'In cluster {clust}:')
    print(adata_tmp.obs['smoking_status'].value_counts())
    print(adata_tmp.obs['sex'].value_counts())

    # Filter out genes to reduce multiple testing burden
    sc.pp.filter_genes(adata_tmp, min_cells=10)
    if adata_tmp.n_vars == 0:
        print('No genes expressed in more than 10 cells!')
        continue
    # nunique() counts only the levels that are actually observed, so this
    # guard also works when the categorical still lists unused categories
    # (value_counts() would report those with a count of 0).
    if adata_tmp.obs['smoking_status'].nunique() < 2:
        print(f'{clust} only has 1 type of smoker/nonsmoker sample.')
        continue

    print(f'Testing {adata_tmp.n_vars} genes...')
    print(f'Testing in {adata_tmp.n_obs} cells...')
    print("")

    # List to store results
    de_results_list = []

    # Set up design matrix; dmat[0] is the matrix, dmat[1] its column names
    dmat = de.utils.design_matrix(
        data=adata_tmp,
        formula="~" + formula,
        as_numeric=["age"],
        return_type="patsy"
    )

    # Test if model is full rank
    if np.linalg.matrix_rank(np.asarray(dmat[0])) < np.min(dmat[0].shape):
        print(f'Cannot test {clust} as design matrix is not full rank.')
        continue

    # Quantities that are identical for every gene of this cluster
    size_offset = np.log(adata_tmp.obs['total_counts_scaled'].values)
    coef_masks = {
        coef: [col_name == coef for col_name in dmat[1]]
        for coef in tested_coef
    }

    for i, gene in enumerate(adata_tmp.var_names):
        # Specify model
        pois_model = sm.GLM(
            endog=adata_tmp.X[:, i].todense(),
            exog=dmat[0],
            offset=size_offset,
            family=sm.families.Poisson()
        )

        # Fit the model
        pois_results = pois_model.fit()

        # Test over coefs
        for coef in tested_coef:
            coef_mask = coef_masks[coef]
            # The constraint is expressed with the model's own exog names,
            # matched by position to the design-matrix column names
            wald_res = pois_results.wald_test(
                [xname for xname, is_target in zip(pois_model.exog_names, coef_mask) if is_target]
            )

            # Output the results nicely
            de_results_temp = pd.DataFrame({
                "gene": gene,
                "cell_identity": clust,
                "covariate": coef,
                "coef": pois_results.params[coef_mask],
                "coef_sd": pois_results.bse[coef_mask],
                "pval": wald_res.pvalue
            }, index= [clust+"_"+gene+"_"+coef])

            de_results_list.append(de_results_temp)

    # Benjamini-Hochberg correction over all gene/covariate tests of this cluster
    de_results = pd.concat(de_results_list)
    de_results['adj_pvals'] = multipletests(de_results['pval'].tolist(), method='fdr_bh')[1]

    # Store the results
    de_results_lvl2_glm[clust] = de_results

# Join the dataframes (pd.concat raises on an empty list, so guard the case
# in which every cluster was skipped):
if de_results_lvl2_glm:
    full_res_lvl2_glm = pd.concat(list(de_results_lvl2_glm.values()), ignore_index=True)
else:
    print('No cluster could be tested; the result table is empty.')
    full_res_lvl2_glm = pd.DataFrame(
        columns=["gene", "cell_identity", "covariate", "coef", "coef_sd", "pval", "adj_pvals"]
    )

In cluster Myeloid:
False    137441
True     109516
Name: smoking_status, dtype: int64
male      130070
female    116887
Name: sex, dtype: int64
Testing 3 genes...
Testing in 246957 cells...

In cluster Airway epithelium:
False    157085
True      61702
Name: smoking_status, dtype: int64
female    125772
male       93015
Name: sex, dtype: int64
Testing 3 genes...
Testing in 218787 cells...

In cluster Alveolar epithelium:
False    124700
True      60785
Name: smoking_status, dtype: int64
female    123154
male       62331
Name: sex, dtype: int64
Testing 3 genes...
Testing in 185485 cells...

In cluster Lymphoid:
False    84562
True     48215
Name: smoking_status, dtype: int64
female    82702
male      50075
Name: sex, dtype: int64
Testing 3 genes...
Testing in 132777 cells...

In cluster Fibroblast lineage:
False    40012
True     13154
Name: smoking_status, dtype: int64
female    31351
male      21815
Name: sex, dtype: int64
Testing 3 genes...
Testing in 53166 cells...

In cluster Bloo

In [32]:
# Also test how the model behaves without adding the smoking covariate
formula_nosmoke = " + ".join(["1", "sex", "age", "dataset"])
tested_coef_nosmoke = ["sex[T.male]", "age"]


In [33]:
# Poisson GLM loop
# Same per-cluster, per-gene Poisson GLM as for the full model, but fitted with
# `formula_nosmoke` and testing only the coefficients in `tested_coef_nosmoke`.
de_results_lvl2_glm_nosmoke = dict()

# Test over clusters
for clust in clusters:
    adata_tmp = adata[adata.obs[cluster_key] == clust,:].copy()

    print(f'In cluster {clust}:')
    print(adata_tmp.obs['smoking_status'].value_counts())
    print(adata_tmp.obs['sex'].value_counts())

    # Filter out genes to reduce multiple testing burden
    sc.pp.filter_genes(adata_tmp, min_cells=10)
    if adata_tmp.n_vars == 0:
        print('No genes expressed in more than 10 cells!')
        continue
    # Clusters with a single smoking level are skipped here as well, although
    # smoking is not part of this model. nunique() counts only observed levels,
    # so the guard does not depend on unused categories having been dropped.
    if adata_tmp.obs['smoking_status'].nunique() < 2:
        print(f'{clust} only has 1 type of smoker/nonsmoker sample.')
        continue

    print(f'Testing {adata_tmp.n_vars} genes...')
    print("")

    # List to store results
    de_results_list = []

    # Set up design matrix; dmat[0] is the matrix, dmat[1] its column names
    dmat = de.utils.design_matrix(
        data=adata_tmp,
        formula="~" + formula_nosmoke,
        as_numeric=["age"],
        return_type="patsy"
    )

    # Test if model is full rank
    if np.linalg.matrix_rank(np.asarray(dmat[0])) < np.min(dmat[0].shape):
        print(f'Cannot test {clust} as design matrix is not full rank.')
        continue

    # Quantities that are identical for every gene of this cluster
    size_offset = np.log(adata_tmp.obs['total_counts_scaled'].values)
    coef_masks = {
        coef: [col_name == coef for col_name in dmat[1]]
        for coef in tested_coef_nosmoke
    }

    for i, gene in enumerate(adata_tmp.var_names):
        # Specify model
        pois_model = sm.GLM(
            endog=adata_tmp.X[:, i].todense(),
            offset=size_offset,
            exog=dmat[0],
            family=sm.families.Poisson()
        )

        # Fit the model
        pois_results = pois_model.fit()

        # Test over coefs
        for coef in tested_coef_nosmoke:
            coef_mask = coef_masks[coef]
            # The constraint is expressed with the model's own exog names,
            # matched by position to the design-matrix column names
            wald_res = pois_results.wald_test(
                [xname for xname, is_target in zip(pois_model.exog_names, coef_mask) if is_target]
            )

            # Output the results nicely
            de_results_temp = pd.DataFrame({
                "gene": gene,
                "cell_identity": clust,
                "covariate": coef,
                "coef": pois_results.params[coef_mask],
                "coef_sd": pois_results.bse[coef_mask],
                "pval": wald_res.pvalue
            }, index= [clust+"_"+gene+"_"+coef])

            de_results_list.append(de_results_temp)

    # Benjamini-Hochberg correction over all gene/covariate tests of this cluster
    de_results = pd.concat(de_results_list)
    de_results['adj_pvals'] = multipletests(de_results['pval'].tolist(), method='fdr_bh')[1]

    # Store the results
    de_results_lvl2_glm_nosmoke[clust] = de_results

# Join the dataframes (pd.concat raises on an empty list, so guard the case
# in which every cluster was skipped):
if de_results_lvl2_glm_nosmoke:
    full_res_lvl2_glm_nosmoke = pd.concat(list(de_results_lvl2_glm_nosmoke.values()), ignore_index=True)
else:
    print('No cluster could be tested; the result table is empty.')
    full_res_lvl2_glm_nosmoke = pd.DataFrame(
        columns=["gene", "cell_identity", "covariate", "coef", "coef_sd", "pval", "adj_pvals"]
    )

In cluster Myeloid:
False    137441
True     109516
Name: smoking_status, dtype: int64
male      130070
female    116887
Name: sex, dtype: int64
Testing 3 genes...

In cluster Airway epithelium:
False    157085
True      61702
Name: smoking_status, dtype: int64
female    125772
male       93015
Name: sex, dtype: int64
Testing 3 genes...

In cluster Alveolar epithelium:
False    124700
True      60785
Name: smoking_status, dtype: int64
female    123154
male       62331
Name: sex, dtype: int64
Testing 3 genes...

In cluster Lymphoid:
False    84562
True     48215
Name: smoking_status, dtype: int64
female    82702
male      50075
Name: sex, dtype: int64
Testing 3 genes...

In cluster Fibroblast lineage:
False    40012
True     13154
Name: smoking_status, dtype: int64
female    31351
male      21815
Name: sex, dtype: int64
Testing 3 genes...

In cluster Blood vessels:
False    21455
True     21064
Name: smoking_status, dtype: int64
male      31078
female    11441
Name: sex, dtype: int64
Te

## Inspect some results

In [34]:
de_results_lvl2_glm.keys()

dict_keys(['Myeloid', 'Airway epithelium', 'Alveolar epithelium', 'Lymphoid', 'Fibroblast lineage', 'Blood vessels', 'Submucosal Gland', 'Smooth Muscle', 'Lymphatics', 'Mesothelium'])

In [35]:
full_res_lvl2_glm

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
0,ACE2,Myeloid,sex[T.male],0.466549,0.198569,0.01879595,0.02819392
1,ACE2,Myeloid,age,0.001611,0.005975,0.7873725,0.7873725
2,ACE2,Myeloid,smoking_status[T.True],-0.395565,0.233139,0.08975567,0.1154001
3,TMPRSS2,Myeloid,sex[T.male],0.527096,0.042296,1.200709e-35,5.403191e-35
4,TMPRSS2,Myeloid,age,-0.001784,0.001485,0.2294559,0.2581378
5,TMPRSS2,Myeloid,smoking_status[T.True],-0.274375,0.048421,1.457873e-08,3.280215e-08
6,CTSL,Myeloid,sex[T.male],-0.09786,0.002349,0.0,0.0
7,CTSL,Myeloid,age,-0.000837,7.6e-05,5.92875e-28,1.778625e-27
8,CTSL,Myeloid,smoking_status[T.True],0.00836,0.003255,0.01020989,0.0183778
9,ACE2,Airway epithelium,sex[T.male],0.081978,0.036025,0.0228709,0.0228709


In [36]:
full_res_lvl2_glm.loc[full_res_lvl2_glm['gene'] == 'ACE2',]

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
0,ACE2,Myeloid,sex[T.male],0.466549,0.198569,0.01879595,0.02819392
1,ACE2,Myeloid,age,0.001611,0.005975,0.7873725,0.7873725
2,ACE2,Myeloid,smoking_status[T.True],-0.395565,0.233139,0.08975567,0.1154001
9,ACE2,Airway epithelium,sex[T.male],0.081978,0.036025,0.0228709,0.0228709
10,ACE2,Airway epithelium,age,0.003145,0.001069,0.003263893,0.00367188
11,ACE2,Airway epithelium,smoking_status[T.True],0.12153,0.038061,0.00140777,0.00180999
18,ACE2,Alveolar epithelium,sex[T.male],0.790606,0.052292,1.2145199999999999e-51,3.643559e-51
19,ACE2,Alveolar epithelium,age,0.019934,0.001778,3.564372e-29,5.346558e-29
20,ACE2,Alveolar epithelium,smoking_status[T.True],-0.353888,0.063195,2.144332e-08,2.412374e-08
27,ACE2,Lymphoid,sex[T.male],0.269889,0.355072,0.4471967,0.5030962


In [37]:
full_res_lvl2_glm.loc[(full_res_lvl2_glm['gene'] == 'ACE2') & (full_res_lvl2_glm['adj_pvals'] < 0.05),]

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
0,ACE2,Myeloid,sex[T.male],0.466549,0.198569,0.01879595,0.02819392
9,ACE2,Airway epithelium,sex[T.male],0.081978,0.036025,0.0228709,0.0228709
10,ACE2,Airway epithelium,age,0.003145,0.001069,0.003263893,0.00367188
11,ACE2,Airway epithelium,smoking_status[T.True],0.12153,0.038061,0.00140777,0.00180999
18,ACE2,Alveolar epithelium,sex[T.male],0.790606,0.052292,1.2145199999999999e-51,3.643559e-51
19,ACE2,Alveolar epithelium,age,0.019934,0.001778,3.564372e-29,5.346558e-29
20,ACE2,Alveolar epithelium,smoking_status[T.True],-0.353888,0.063195,2.144332e-08,2.412374e-08
28,ACE2,Lymphoid,age,-0.031751,0.012438,0.01068449,0.01602673
56,ACE2,Submucosal Gland,smoking_status[T.True],0.476008,0.080926,4.053372e-09,1.216012e-08


In [38]:
full_res_lvl2_glm.loc[(full_res_lvl2_glm['gene'] == 'TMPRSS2') & (full_res_lvl2_glm['adj_pvals'] < 0.05),]

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
3,TMPRSS2,Myeloid,sex[T.male],0.527096,0.042296,1.200709e-35,5.403191e-35
5,TMPRSS2,Myeloid,smoking_status[T.True],-0.274375,0.048421,1.457873e-08,3.280215e-08
12,TMPRSS2,Airway epithelium,sex[T.male],0.180836,0.011531,1.997748e-55,8.989864000000001e-55
13,TMPRSS2,Airway epithelium,age,0.001193,0.000329,0.0002918464,0.0004377695
14,TMPRSS2,Airway epithelium,smoking_status[T.True],-0.220317,0.01362,7.421204e-59,6.679083e-58
21,TMPRSS2,Alveolar epithelium,sex[T.male],0.073413,0.007158,1.107144e-24,1.423471e-24
22,TMPRSS2,Alveolar epithelium,age,0.020185,0.000249,0.0,0.0
23,TMPRSS2,Alveolar epithelium,smoking_status[T.True],0.145885,0.008382,7.552207000000001e-68,3.3984930000000004e-67
30,TMPRSS2,Lymphoid,sex[T.male],0.306598,0.074111,3.518912e-05,7.917551e-05
31,TMPRSS2,Lymphoid,age,-0.006719,0.002547,0.008349621,0.01502932


In [39]:
full_res_lvl2_glm_nosmoke.loc[full_res_lvl2_glm_nosmoke['gene'] == 'ACE2',]

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
0,ACE2,Myeloid,sex[T.male],0.407683,0.197664,0.03915962,0.05873944
1,ACE2,Myeloid,age,0.000733,0.005974,0.9023011,0.9023011
6,ACE2,Airway epithelium,sex[T.male],0.067291,0.035658,0.05914717,0.0709766
7,ACE2,Airway epithelium,age,0.003467,0.001059,0.001061032,0.001591549
12,ACE2,Alveolar epithelium,sex[T.male],0.813865,0.053072,4.454637e-53,8.909273e-53
13,ACE2,Alveolar epithelium,age,0.014438,0.001502,6.874868000000001e-22,6.874868000000001e-22
18,ACE2,Lymphoid,sex[T.male],0.247716,0.343651,0.4710108,0.4710108
19,ACE2,Lymphoid,age,-0.032553,0.012006,0.006697087,0.01004563
24,ACE2,Fibroblast lineage,sex[T.male],-0.008491,0.167437,0.9595539,0.9595539
25,ACE2,Fibroblast lineage,age,0.002133,0.005753,0.7107977,0.8529573


# Level 3 annotation

In [40]:
# Level-3 cell identities to test: more than 1000 cells, and label not starting with '1' or '2'
# NOTE(review): labels starting with '1'/'2' are presumably annotations carried
# over from coarser levels — confirm.
cluster_key = 'ann_level_3'
clust_tbl = adata.obs[cluster_key].value_counts()
clusters = clust_tbl.index[clust_tbl > 1000]
starts_with_1_or_2 = [ct.startswith(('1', '2')) for ct in clusters]
ct_to_rm = clusters[starts_with_1_or_2]
clusters = [ct for ct, flagged in zip(clusters, starts_with_1_or_2) if not flagged]
clusters

['Macrophages',
 'Basal',
 'AT2',
 'T cell lineage',
 'Monocytes',
 'Multiciliated lineage',
 'Submucosal Secretory',
 'AT1',
 'Innate lymphoid cells',
 'Secretory',
 'Capillary',
 'Mast cells',
 'B cell lineage',
 'Fibroblasts',
 'Dendritic cells',
 'Venous',
 'Lymphatic EC',
 'Arterial',
 'Rare',
 'Airway smooth muscle']

In [41]:
# Restrict to cells whose level-3 label is one of the retained clusters
# (no .copy(), so this stays a view of `adata`)
in_tested_cluster = adata.obs['ann_level_3'].isin(clusters)
adata_sub = adata[in_tested_cluster, :]

adata_sub
adata_sub.obs['donor'].nunique()
adata_sub.obs['sample'].nunique()

View of AnnData object with n_obs × n_vars = 795195 × 3 
    obs: 'age', 'anatomical_region', 'donor', 'last_author/PI', 'lung_vs_nasal', 'notes', 'original_celltype_ann', 'sample', 'sex', 'smoking', 'total_counts', 'smoked_boolean', 'last_author_sample_name', 'ann_level_1', 'ann_level_2', 'ann_level_3', 'ann_level_4', 'ann_level_5', 'ann_highest_res', 'ann_new', 'dataset', 'smoking_status', 'total_counts_scaled'

164

286

## Poisson GLM

In [42]:
# Poisson GLM loop
# For each cell identity in `clusters`: fit one Poisson GLM per gene on the
# cell-level counts of `adata_sub` (offset = log of the scaled total counts)
# and Wald-test every coefficient listed in `tested_coef`.
de_results_lvl3_glm = dict()

# Test over clusters
for clust in clusters:
    adata_tmp = adata_sub[adata_sub.obs[cluster_key] == clust,:].copy()

    print(f'In cluster {clust}:')
    print(adata_tmp.obs['smoking_status'].value_counts())
    print(adata_tmp.obs['sex'].value_counts())

    # Filter out genes to reduce multiple testing burden
    sc.pp.filter_genes(adata_tmp, min_cells=10)
    if adata_tmp.n_vars == 0:
        print('No genes expressed in more than 10 cells!')
        continue
    # nunique() counts only the levels that are actually observed, so this
    # guard also works when the categorical still lists unused categories
    # (value_counts() would report those with a count of 0).
    if adata_tmp.obs['smoking_status'].nunique() < 2:
        print(f'{clust} only has 1 type of smoker/nonsmoker sample.')
        continue

    print(f'Testing {adata_tmp.n_vars} genes...')
    print(f'Testing in {adata_tmp.n_obs} cells...')
    print("")

    # List to store results
    de_results_list = []

    # Set up design matrix; dmat[0] is the matrix, dmat[1] its column names
    dmat = de.utils.design_matrix(
        data=adata_tmp,
        formula="~" + formula,
        as_numeric=["age"],
        return_type="patsy"
    )

    # Test if model is full rank
    if np.linalg.matrix_rank(np.asarray(dmat[0])) < np.min(dmat[0].shape):
        print(f'Cannot test {clust} as design matrix is not full rank.')
        continue

    # Quantities that are identical for every gene of this cluster
    size_offset = np.log(adata_tmp.obs['total_counts_scaled'].values)
    coef_masks = {
        coef: [col_name == coef for col_name in dmat[1]]
        for coef in tested_coef
    }

    for i, gene in enumerate(adata_tmp.var_names):
        # Specify model
        pois_model = sm.GLM(
            endog=adata_tmp.X[:, i].todense(),
            exog=dmat[0],
            offset=size_offset,
            family=sm.families.Poisson()
        )

        # Fit the model
        pois_results = pois_model.fit()

        # Test over coefs
        for coef in tested_coef:
            coef_mask = coef_masks[coef]
            # The constraint is expressed with the model's own exog names,
            # matched by position to the design-matrix column names
            wald_res = pois_results.wald_test(
                [xname for xname, is_target in zip(pois_model.exog_names, coef_mask) if is_target]
            )

            # Output the results nicely
            de_results_temp = pd.DataFrame({
                "gene": gene,
                "cell_identity": clust,
                "covariate": coef,
                "coef": pois_results.params[coef_mask],
                "coef_sd": pois_results.bse[coef_mask],
                "pval": wald_res.pvalue
            }, index= [clust+"_"+gene+"_"+coef])

            de_results_list.append(de_results_temp)

    # Benjamini-Hochberg correction over all gene/covariate tests of this cluster
    de_results = pd.concat(de_results_list)
    de_results['adj_pvals'] = multipletests(de_results['pval'].tolist(), method='fdr_bh')[1]

    # Store the results
    de_results_lvl3_glm[clust] = de_results

# Join the dataframes (pd.concat raises on an empty list, so guard the case
# in which every cluster was skipped):
if de_results_lvl3_glm:
    full_res_lvl3_glm = pd.concat(list(de_results_lvl3_glm.values()), ignore_index=True)
else:
    print('No cluster could be tested; the result table is empty.')
    full_res_lvl3_glm = pd.DataFrame(
        columns=["gene", "cell_identity", "covariate", "coef", "coef_sd", "pval", "adj_pvals"]
    )

In cluster Macrophages:
False    84537
True     72427
Name: smoking_status, dtype: int64
male      94376
female    62588
Name: sex, dtype: int64
Testing 3 genes...
Testing in 156964 cells...

In cluster Basal:
False    113299
True      42578
Name: smoking_status, dtype: int64
female    91571
male      64306
Name: sex, dtype: int64
Testing 3 genes...
Testing in 155877 cells...

In cluster AT2:
False    100593
True      54919
Name: smoking_status, dtype: int64
female    103012
male       52500
Name: sex, dtype: int64
Testing 3 genes...
Testing in 155512 cells...

In cluster T cell lineage:
True     26540
False    25599
Name: smoking_status, dtype: int64
female    27637
male      24502
Name: sex, dtype: int64
Testing 3 genes...
Testing in 52139 cells...

In cluster Monocytes:
True     23487
False    19216
Name: smoking_status, dtype: int64
female    21616
male      21087
Name: sex, dtype: int64
Testing 3 genes...
Testing in 42703 cells...

In cluster Multiciliated lineage:
False    26913


In [43]:
# Poisson GLM loop
de_results_lvl3_glm_nosmoke = dict()

# Test over clusters
for clust in clusters:
    adata_tmp = adata_sub[adata_sub.obs[cluster_key] == clust,:].copy()

    print(f'In cluster {clust}:')
    print(adata_tmp.obs['smoking_status'].value_counts())
    print(adata_tmp.obs['sex'].value_counts())

    # Filter out genes to reduce multiple testing burden
    sc.pp.filter_genes(adata_tmp, min_cells=10)
    if adata_tmp.n_vars == 0:
        print('No genes expressed in more than 10 cells!')
        continue
    if len(adata_tmp.obs.smoking_status.value_counts())==1:
        print(f'{clust} only has 1 type of smoker/nonsmoker sample.')
        continue
        
    print(f'Testing {adata_tmp.n_vars} genes...')
    print("")

    # List to store results
    de_results_list = []        

    # Set up design matrix
    dmat = de.utils.design_matrix(
        data=adata_tmp, #[idx_train],
        formula="~" + formula_nosmoke,
        as_numeric=["age"],
        return_type="patsy"
    )
    
    # Test if model is full rank
    if np.linalg.matrix_rank(np.asarray(dmat[0])) < np.min(dmat[0].shape):
        print(f'Cannot test {clust} as design matrix is not full rank.')
        continue
    
    for i, gene in enumerate(adata_tmp.var_names):
        # Specify model
        pois_model = sm.GLM(
            endog=adata_tmp.X[:, i].todense(), #[idx_train, :], 
            exog=dmat[0], 
            offset=np.log(adata_tmp.obs['total_counts_scaled'].values),
            family=sm.families.Poisson()
        )

        # Fit the model
        pois_results = pois_model.fit()


        # Test over coefs
        for coef in tested_coef_nosmoke:
            de_results_temp = pois_results.wald_test(
                [x for i, x in enumerate(pois_model.exog_names) if dmat[1][i] in [coef]]
            )

            # Output the results nicely
            de_results_temp = pd.DataFrame({
                "gene": gene,
                "cell_identity": clust,
                "covariate": coef,
                "coef": pois_results.params[[y == coef for y in dmat[1]]],
                "coef_sd": pois_results.bse[[y == coef for y in dmat[1]]],                 
                "pval": de_results_temp.pvalue
            }, index= [clust+"_"+gene+"_"+coef])

            de_results_list.append(de_results_temp)

    # Stack all test rows collected for this cluster and add adjusted p-values
    # (method='fdr_bh'; element [1] of the multipletests return value holds the
    # corrected p-values). The correction is applied across the gene x covariate
    # tests of this one cluster only, not across clusters.
    de_results = pd.concat(de_results_list)
    de_results['adj_pvals'] = multipletests(de_results['pval'].tolist(), method='fdr_bh')[1]
    
    # Store the results
    # keyed by the cluster name of the current loop pass
    de_results_lvl3_glm_nosmoke[clust] = de_results
    
# Concatenate the per-cluster result tables (in dict insertion order) into a
# single frame with a fresh 0..n-1 index.
full_res_lvl3_glm_nosmoke = pd.concat(
    list(de_results_lvl3_glm_nosmoke.values()),
    ignore_index=True,
)

In cluster Macrophages:
False    84537
True     72427
Name: smoking_status, dtype: int64
male      94376
female    62588
Name: sex, dtype: int64
Testing 3 genes...

In cluster Basal:
False    113299
True      42578
Name: smoking_status, dtype: int64
female    91571
male      64306
Name: sex, dtype: int64
Testing 3 genes...

In cluster AT2:
False    100593
True      54919
Name: smoking_status, dtype: int64
female    103012
male       52500
Name: sex, dtype: int64
Testing 3 genes...

In cluster T cell lineage:
True     26540
False    25599
Name: smoking_status, dtype: int64
female    27637
male      24502
Name: sex, dtype: int64
Testing 3 genes...

In cluster Monocytes:
True     23487
False    19216
Name: smoking_status, dtype: int64
female    21616
male      21087
Name: sex, dtype: int64
Testing 3 genes...

In cluster Multiciliated lineage:
False    26913
True     10617
Name: smoking_status, dtype: int64
female    21902
male      15628
Name: sex, dtype: int64
Testing 3 genes...

In clus

## Inspect some results

In [44]:
# Show the keys of the per-cluster results dict, i.e. the cell identities for
# which results were stored (presumably filled per cluster in the same way as
# the no-smoking dict -- confirm against the cell that builds it).
de_results_lvl3_glm.keys()

dict_keys(['Macrophages', 'Basal', 'AT2', 'T cell lineage', 'Monocytes', 'Multiciliated lineage', 'Submucosal Secretory', 'AT1', 'Innate lymphoid cells', 'Secretory', 'Capillary', 'Mast cells', 'B cell lineage', 'Fibroblasts', 'Dendritic cells', 'Venous', 'Lymphatic EC', 'Arterial', 'Rare', 'Airway smooth muscle'])

In [45]:
# Display the combined results table: one row per gene, cell identity and
# covariate, with coefficient, standard error, raw and adjusted p-value.
full_res_lvl3_glm

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
0,ACE2,Macrophages,sex[T.male],0.079347,0.268355,0.7674755,0.7674755
1,ACE2,Macrophages,age,0.006745,0.006974,0.3334057,0.4286644
2,ACE2,Macrophages,smoking_status[T.True],-0.183397,0.328642,0.5768144,0.6489162
3,TMPRSS2,Macrophages,sex[T.male],-0.331306,0.072662,5.125795e-06,1.153304e-05
4,TMPRSS2,Macrophages,age,-0.003322,0.002135,0.1196886,0.2100394
5,TMPRSS2,Macrophages,smoking_status[T.True],0.134248,0.090973,0.1400263,0.2100394
6,CTSL,Macrophages,sex[T.male],-0.167304,0.00261,0.0,0.0
7,CTSL,Macrophages,age,0.002582,8.4e-05,2.6728460000000002e-207,1.2027810000000001e-206
8,CTSL,Macrophages,smoking_status[T.True],0.100365,0.003562,1.222302e-174,3.666907e-174
9,ACE2,Basal,sex[T.male],0.050698,0.047738,0.2882331,0.2882331


In [46]:
# All ACE2 rows of the results table.
full_res_lvl3_glm.loc[full_res_lvl3_glm['gene'].eq('ACE2')]

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
0,ACE2,Macrophages,sex[T.male],0.079347,0.268355,0.7674755,0.7674755
1,ACE2,Macrophages,age,0.006745,0.006974,0.3334057,0.4286644
2,ACE2,Macrophages,smoking_status[T.True],-0.183397,0.328642,0.5768144,0.6489162
9,ACE2,Basal,sex[T.male],0.050698,0.047738,0.2882331,0.2882331
10,ACE2,Basal,age,0.003719,0.001304,0.004346742,0.004890085
11,ACE2,Basal,smoking_status[T.True],0.191025,0.048163,7.301195e-05,9.387251e-05
18,ACE2,AT2,sex[T.male],0.827769,0.054502,4.251463e-52,1.371084e-51
19,ACE2,AT2,age,0.019122,0.00189,4.552959e-24,8.195327e-24
20,ACE2,AT2,smoking_status[T.True],-0.406943,0.065738,6.000591e-10,6.750665e-10
27,ACE2,T cell lineage,sex[T.male],0.614737,0.590384,0.2977596,0.4428981


In [47]:
# ACE2 rows whose adjusted p-value is below 0.05.
full_res_lvl3_glm.loc[
    full_res_lvl3_glm['gene'].eq('ACE2')
    & full_res_lvl3_glm['adj_pvals'].lt(0.05)
]

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
10,ACE2,Basal,age,0.003719,0.001304,0.004346742,0.004890085
11,ACE2,Basal,smoking_status[T.True],0.191025,0.048163,7.301195e-05,9.387251e-05
18,ACE2,AT2,sex[T.male],0.827769,0.054502,4.251463e-52,1.371084e-51
19,ACE2,AT2,age,0.019122,0.00189,4.552959e-24,8.195327e-24
20,ACE2,AT2,smoking_status[T.True],-0.406943,0.065738,6.000591e-10,6.750665e-10
45,ACE2,Multiciliated lineage,sex[T.male],0.247672,0.080584,0.002115948,0.004760882
46,ACE2,Multiciliated lineage,age,0.00683,0.002665,0.01038334,0.01869002
56,ACE2,Submucosal Secretory,smoking_status[T.True],0.476008,0.080926,4.053372e-09,1.216012e-08
63,ACE2,AT1,sex[T.male],0.483887,0.20551,0.01854418,0.02781628
78,ACE2,Secretory,sex[T.male],0.248878,0.081754,0.002332912,0.005249052


In [48]:
# All TMPRSS2 rows of the results table.
full_res_lvl3_glm.loc[full_res_lvl3_glm['gene'].eq('TMPRSS2')]

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
3,TMPRSS2,Macrophages,sex[T.male],-0.331306,0.072662,5.125795e-06,1.153304e-05
4,TMPRSS2,Macrophages,age,-0.003322,0.002135,0.1196886,0.2100394
5,TMPRSS2,Macrophages,smoking_status[T.True],0.134248,0.090973,0.1400263,0.2100394
12,TMPRSS2,Basal,sex[T.male],0.294338,0.019865,1.137475e-49,2.559319e-49
13,TMPRSS2,Basal,age,0.005119,0.000486,5.92203e-26,1.065965e-25
14,TMPRSS2,Basal,smoking_status[T.True],-0.360639,0.022005,2.2845730000000002e-60,1.028058e-59
21,TMPRSS2,AT2,sex[T.male],-0.004434,0.00764,0.561673,0.561673
22,TMPRSS2,AT2,age,0.021023,0.000271,0.0,0.0
23,TMPRSS2,AT2,smoking_status[T.True],0.136431,0.008986,4.5702800000000005e-52,1.371084e-51
30,TMPRSS2,T cell lineage,sex[T.male],0.389809,0.089583,1.352804e-05,3.043809e-05


In [49]:
# TMPRSS2 rows whose adjusted p-value is below 0.05.
full_res_lvl3_glm.loc[
    full_res_lvl3_glm['gene'].eq('TMPRSS2')
    & full_res_lvl3_glm['adj_pvals'].lt(0.05)
]

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
3,TMPRSS2,Macrophages,sex[T.male],-0.331306,0.072662,5.125795e-06,1.153304e-05
12,TMPRSS2,Basal,sex[T.male],0.294338,0.019865,1.137475e-49,2.559319e-49
13,TMPRSS2,Basal,age,0.005119,0.000486,5.92203e-26,1.065965e-25
14,TMPRSS2,Basal,smoking_status[T.True],-0.360639,0.022005,2.2845730000000002e-60,1.028058e-59
22,TMPRSS2,AT2,age,0.021023,0.000271,0.0,0.0
23,TMPRSS2,AT2,smoking_status[T.True],0.136431,0.008986,4.5702800000000005e-52,1.371084e-51
30,TMPRSS2,T cell lineage,sex[T.male],0.389809,0.089583,1.352804e-05,3.043809e-05
39,TMPRSS2,Monocytes,sex[T.male],1.204337,0.071956,7.018987e-63,6.317089000000001e-62
40,TMPRSS2,Monocytes,age,0.00758,0.002981,0.01099027,0.01978249
48,TMPRSS2,Multiciliated lineage,sex[T.male],0.179693,0.018936,2.325644e-21,2.0930789999999998e-20


In [50]:
# All CTSL rows of the results table.
full_res_lvl3_glm.loc[full_res_lvl3_glm['gene'].eq('CTSL')]

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
6,CTSL,Macrophages,sex[T.male],-0.167304,0.00261,0.0,0.0
7,CTSL,Macrophages,age,0.002582,8.4e-05,2.6728460000000002e-207,1.2027810000000001e-206
8,CTSL,Macrophages,smoking_status[T.True],0.100365,0.003562,1.222302e-174,3.666907e-174
15,CTSL,Basal,sex[T.male],-0.167472,0.010586,2.23589e-56,6.707671e-56
16,CTSL,Basal,age,0.005348,0.000297,1.908543e-72,1.717688e-71
17,CTSL,Basal,smoking_status[T.True],0.077169,0.01072,6.077707e-13,9.116561e-13
24,CTSL,AT2,sex[T.male],0.104171,0.013615,1.993684e-14,2.990526e-14
25,CTSL,AT2,age,-0.005396,0.000459,7.452178000000001e-32,1.67674e-31
26,CTSL,AT2,smoking_status[T.True],0.113451,0.017113,3.363359e-11,4.324319e-11
33,CTSL,T cell lineage,sex[T.male],0.733059,0.04211,7.157937e-68,3.221072e-67


In [51]:
# No smoking model: show the ACE2 rows, then the TMPRSS2 rows.
# display() is called explicitly so that both tables are rendered even when the
# kernel only echoes the last expression of a cell (the Jupyter default);
# otherwise the ACE2 table depends on a non-default interactivity setting.
display(full_res_lvl3_glm_nosmoke.loc[full_res_lvl3_glm_nosmoke['gene'] == 'ACE2',])
display(full_res_lvl3_glm_nosmoke.loc[full_res_lvl3_glm_nosmoke['gene'] == 'TMPRSS2',])

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
0,ACE2,Macrophages,sex[T.male],0.00851,0.239116,0.9716084,0.9716084
1,ACE2,Macrophages,age,0.005999,0.006856,0.3815489,0.4578586
6,ACE2,Basal,sex[T.male],0.027442,0.047106,0.5601964,0.5601964
7,ACE2,Basal,age,0.004157,0.001286,0.001229571,0.001475486
12,ACE2,AT2,sex[T.male],0.86586,0.055427,5.1899300000000006e-55,1.556979e-54
13,ACE2,AT2,age,0.012488,0.00158,2.680154e-15,4.02023e-15
18,ACE2,T cell lineage,sex[T.male],0.386654,0.541254,0.475,0.5053884
19,ACE2,T cell lineage,age,-0.013028,0.01956,0.5053884,0.5053884
24,ACE2,Monocytes,sex[T.male],0.83381,0.339512,0.01405302,0.02107952
25,ACE2,Monocytes,age,-0.023647,0.015885,0.1365767,0.1365767


Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
2,TMPRSS2,Macrophages,sex[T.male],-0.286965,0.065951,1.354162e-05,2.708324e-05
3,TMPRSS2,Macrophages,age,-0.002497,0.002048,0.2226501,0.3339752
8,TMPRSS2,Basal,sex[T.male],0.31849,0.020021,5.569658999999999e-57,1.113932e-56
9,TMPRSS2,Basal,age,0.004407,0.000491,2.8959719999999996e-19,4.343959e-19
14,TMPRSS2,AT2,sex[T.male],0.002241,0.007591,0.7677888,0.7677888
15,TMPRSS2,AT2,age,0.022913,0.000239,0.0,0.0
20,TMPRSS2,T cell lineage,sex[T.male],0.397094,0.089262,8.642011e-06,1.728402e-05
21,TMPRSS2,T cell lineage,age,-0.004524,0.003528,0.1996597,0.2994895
26,TMPRSS2,Monocytes,sex[T.male],1.130317,0.057874,6.020817999999999e-85,3.612491e-84
27,TMPRSS2,Monocytes,age,0.005294,0.002659,0.04649438,0.05579326


In [52]:
# No smoking model: ACE2 rows whose adjusted p-value is below 0.05.
full_res_lvl3_glm_nosmoke.loc[
    full_res_lvl3_glm_nosmoke['gene'].eq('ACE2')
    & full_res_lvl3_glm_nosmoke['adj_pvals'].lt(0.05)
]

Unnamed: 0,gene,cell_identity,covariate,coef,coef_sd,pval,adj_pvals
7,ACE2,Basal,age,0.004157,0.001286,0.001229571,0.001475486
12,ACE2,AT2,sex[T.male],0.86586,0.055427,5.1899300000000006e-55,1.556979e-54
13,ACE2,AT2,age,0.012488,0.00158,2.680154e-15,4.02023e-15
24,ACE2,Monocytes,sex[T.male],0.83381,0.339512,0.01405302,0.02107952
30,ACE2,Multiciliated lineage,sex[T.male],0.236656,0.080213,0.00317434,0.005959311
31,ACE2,Multiciliated lineage,age,0.007576,0.00263,0.003972874,0.005959311
37,ACE2,Submucosal Secretory,age,0.005288,0.002329,0.02319545,0.02783454
42,ACE2,AT1,sex[T.male],0.476816,0.2042,0.01954074,0.02320976
43,ACE2,AT1,age,0.013678,0.006026,0.02320976,0.02320976
52,ACE2,Secretory,sex[T.male],0.25075,0.080287,0.001789195,0.002683792


# Store results

In [55]:
# Write the lvl2 results table to <folder>/<output_folder><de_output_base>_lvl2_full.csv
full_res_lvl2_glm.to_csv(f'{folder}/{output_folder}{de_output_base}_lvl2_full.csv')

In [56]:
# Write the lvl3 results table to <folder>/<output_folder><de_output_base>_lvl3_full.csv
full_res_lvl3_glm.to_csv(f'{folder}/{output_folder}{de_output_base}_lvl3_full.csv')