In [1]:
# Scratch assignment. NOTE(review): `x` is not referenced in the visible part of the notebook.
x = 1

In [None]:
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
import decoupler as dc
import scanpy as sc
import pandas as pd
import numpy as np
import random
import sc_toolbox
import pertpy 

import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging

from rpy2.robjects import pandas2ri
from rpy2.robjects import r

# Set scanpy's verbosity to 0.
sc.settings.verbosity = 0
# Only let rpy2 forward R console messages logged at ERROR level or above.
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

# Switch on the pandas and AnnData converters for objects passed between Python and R.
pandas2ri.activate()
anndata2ri.activate()

# Register the %%R cell magic used by the R cells in this notebook.
%load_ext rpy2.ipython

In [None]:
%%R
# edgeR supplies the functions used for the pseudobulk DE analysis (DGEList, glmQLFit, ...).
library(edgeR)
# NOTE(review): no MAST function is called in the visible part of this notebook.
library(MAST)

In [None]:
macrophages = sc.read('../../../pipeline/data/analysis/hlca_healthy_ipf_embed_balanced/macrophages.h5ad')
macrophages

In [None]:
ipf_macrophages = macrophages[macrophages.obs['lung_condition_coarse'] == 'IPF'].copy()
ipf_macrophages

In [None]:
# Position of the cell sitting 90% of the way through the attention-sorted cells.
threshold_idx = int(ipf_macrophages.n_obs * 0.9)
# Attention score at that position; cells at or above it form the top ~10%.
attn_sorted = sorted(ipf_macrophages.obs['cell_attn'])
threshold_value = attn_sorted[threshold_idx]
threshold_value

In [None]:
ipf_macrophages.obs['group'] = 'all'
ipf_macrophages.obs.loc[ipf_macrophages.obs['cell_attn'] >= threshold_value, 'group'] = 'top'

In [None]:
ipf_macrophages.obs['group'].value_counts()

In [None]:
ipf_macrophages.obs['sample_group'] = ipf_macrophages.obs['sample'].astype('str') + '_' + ipf_macrophages.obs['group']
ipf_macrophages.obs['sample_group'] = ipf_macrophages.obs['sample_group'].astype('category')

In [None]:
ipf_macrophages

In [None]:
# Collect every sample/group combination that contributes fewer than
# MIN_CELLS_PER_SAMPLE_GROUP cells; those combinations are filtered out afterwards.
# A single value_counts() replaces building one AnnData view per combination.
MIN_CELLS_PER_SAMPLE_GROUP = 10
sample_group_sizes = ipf_macrophages.obs['sample_group'].value_counts()
# `> 0` skips categories that are declared but contain no cells, which a loop over the
# observed values would never visit either.
is_too_small = (sample_group_sizes > 0) & (sample_group_sizes < MIN_CELLS_PER_SAMPLE_GROUP)
# Sorted list of labels, matching the order np.unique would have produced.
samples_to_exclude = sorted(sample_group_sizes.index[is_too_small])

In [None]:
samples_to_exclude

In [None]:
ipf_macrophages = ipf_macrophages[~ipf_macrophages.obs['sample_group'].isin(samples_to_exclude)].copy()
ipf_macrophages

In [None]:
# Load the HLCA AnnData from which the expression matrix for these cells is taken.
hlca = sc.read('../../../../snakemake/hlca/5fold/hlca_hyperparam/data/pp/hlca_hvg_pearson.h5ad')
hlca

In [None]:
# Eyeball the stored values of hlca.X to see what kind of data it holds.
hlca.X.data

In [None]:
# Take the rows of hlca.X that belong to the retained IPF macrophages (matched by obs_names).
# NOTE(review): only cells are aligned here; this assumes hlca and ipf_macrophages contain the
# same genes in the same order — confirm, because X is overwritten with this matrix below.
ipf_macrophages.obsm['counts'] = hlca[ipf_macrophages.obs_names].X
ipf_macrophages

In [None]:
# Replace X with the matrix taken from hlca so that the pseudobulk step aggregates it.
ipf_macrophages.X = ipf_macrophages.obsm['counts'].copy()
ipf_macrophages.X.data

In [None]:
len(np.unique(ipf_macrophages.obs['sample_group']))

In [None]:
adata_ = dc.get_pseudobulk(ipf_macrophages, 'sample', 'group', mode='sum', min_cells=-1, min_counts=-1)
adata_                            

In [None]:
adata_.layers['counts'] = adata_.X.copy()

In [None]:
sc.pp.normalize_total(adata_, target_sum=1e4)
sc.pp.log1p(adata_)
sc.pp.pca(adata_)

In [None]:
adata_.obs["lib_size"] = np.sum(adata_.layers["counts"], axis=1)
adata_.obs["log_lib_size"] = np.log(adata_.obs["lib_size"])

In [None]:
sc.pl.pca(adata_, color=['study', 'lib_size', 'log_lib_size', 'group', "3'_or_5'"], ncols=1, size=300, frameon=False)

In [None]:
%%R
# Fit an edgeR quasi-likelihood GLM to pseudobulk counts.
#
# Args:
#   adata_: object with an "X" assay holding the count matrix and `group` and
#           `sample` columns in its colData.
# Returns:
#   A list with "fit" (the glmQLFit result), "design" (the model matrix) and
#   "y" (the filtered, normalised DGEList with estimated dispersions).
fit_model <- function(adata_){
    group <- colData(adata_)$group
    replicate <- colData(adata_)$sample
    # create an edgeR object with counts and grouping factor
    y <- DGEList(assay(adata_, "X"), group = group)
    # filter out genes with low counts
    print("Dimensions before subsetting:")
    print(dim(y))
    print("")
    keep <- filterByExpr(y)
    y <- y[keep, , keep.lib.sizes=FALSE]
    print("Dimensions after subsetting:")
    print(dim(y))
    print("")
    # normalize
    y <- calcNormFactors(y)
    # design matrix: one coefficient per group (no intercept) plus one term per sample,
    # so groups are compared within sample.
    # NOTE(review): `study` is deliberately not part of the design; presumably every sample
    # belongs to a single study, so the sample term already absorbs study effects — confirm.
    design <- model.matrix(~ 0 + group + replicate)
    # estimate dispersion
    y <- estimateDisp(y, design = design)
    # fit the model
    fit <- glmQLFit(y, design)
    return(list("fit"=fit, "design"=design, "y"=y))
}

In [None]:
adata_.X = adata_.layers['counts'].copy()
adata_.X

In [None]:
macro = sc.AnnData(adata_.X, obs=adata_.obs[['group', 'sample_group', 'study', 'sample']])
macro.var_names = adata_.var_names
macro.obs['sample'] = macro.obs['sample'].str.replace('-', '_')
macro

In [None]:
macro = macro.copy()
macro

In [None]:
%%time
%%R -i macro
# Pass `macro` into R and fit the edgeR model with fit_model.
outs <-fit_model(macro)

In [None]:
%%R
# Unpack the fitted GLM and the DGEList for the cells below.
fit <- outs$fit
y <- outs$y

In [None]:
%%R
# MDS plot of the pseudobulk profiles: group "all" in red, any other group in blue.
plotMDS(y, col=ifelse(y$samples$group == "all", "red", "blue"))

In [None]:
%%R
# Biological coefficient of variation plot for the estimated dispersions.
plotBCV(y)

In [None]:
%%R
# List the design-matrix columns; the contrast below is written in terms of these names.
colnames(y$design)

In [None]:
%%R -o tt
# Test "top" against "all" (positive logFC = higher in "top") and return the table to Python as `tt`.
myContrast <- makeContrasts("grouptop - groupall", levels = y$design)
qlf <- glmQLFTest(fit, contrast=myContrast)
# get all of the DE genes and calculate Benjamini-Hochberg adjusted FDR
tt <- topTags(qlf, n = Inf)
tt <- tt$table

In [None]:
tt.shape

In [None]:
tt[:5]

In [None]:
%%R
# Smear plot of the test result, highlighting the genes with FDR < 0.01.
plotSmear(qlf, de.tags = rownames(tt)[which(tt$FDR<0.01)])

In [None]:
tt_sign = tt[tt['FDR'] < 0.01]
tt_sign

In [None]:
up_genes = list(tt_sign[tt_sign['logFC'] > 1.5].index)
len(up_genes)

In [None]:
# Reference macrophage signature used for the overlap checks below:
# gene symbol -> Ensembl gene ID. Only the Ensembl IDs are compared against the DE results.
macrophages_genes = {
    'SPP1': 'ENSG00000118785',
    'LIPA': 'ENSG00000107798',
    'LPL': 'ENSG00000175445',
    'FDX1': 'ENSG00000137714',
    'SPARC': 'ENSG00000113140',
    'MATK': 'ENSG00000007264',
    'GPC4': 'ENSG00000076716',
    'PALLD': 'ENSG00000129116',
    'MMP7': 'ENSG00000137673',
    'MMP9': 'ENSG00000100985',
    'CHIT1': 'ENSG00000133063',
    # Symbol corrected from 'CSTK': ENSG00000143387 is cathepsin K (CTSK).
    'CTSK': 'ENSG00000143387',
    'CHI3L1': 'ENSG00000133048',
    'CSF1': 'ENSG00000184371',
    'FCMR': 'ENSG00000162894',
    'TIMP3': 'ENSG00000100234',
    'COL22A1': 'ENSG00000169436',
    'SIGLEC15': 'ENSG00000197046',
    'CCL2': 'ENSG00000108691',
}

In [None]:
len(up_genes)

In [None]:
len(set(up_genes).intersection(set(macrophages_genes.values())))

In [None]:
set(up_genes) - set(macrophages_genes.values())

{
     'ENSG00000026751', SLAMF7
     'ENSG00000102962', CCL22
     'ENSG00000110092', CCND1
     'ENSG00000122224', LY9
     'ENSG00000125735', TNFSF14
     'ENSG00000130513', GDF15
     'ENSG00000138080', EMILIN1
     'ENSG00000142173', COL6A2
     'ENSG00000143320', CRABP2
     'ENSG00000148773', MKI67
     'ENSG00000151789', ZNF385D
     'ENSG00000159674', SPON2
     'ENSG00000164949', GEM
     'ENSG00000167779', IGFBP2
     'ENSG00000177469', CAVIN1
     'ENSG00000262406', MMP12
}

In [None]:
# Write the full top-vs-all DE table (every tested gene, not only the significant ones) to CSV.
tt.to_csv('macrophages_top.csv')

In [None]:
# Ensembl gene ID -> symbol for genes annotated with literature notes on a possible link to
# fibrosis; presumably the up-regulated genes missing from `macrophages_genes` — confirm.
# Each comment group refers to the entry directly above it.
missing_up_genes = {
    'ENSG00000069482': 'GAL',
    # https://erj.ersjournals.com/content/erj/early/2018/08/09/13993003.00564-2018.full.pdf,
    # flavonoid compound that attenuates inflammatory damage and prevents EMT (at least with epithelial cells) in BLM-induced PF mice
    # <- macrophages are not epithelial though
    # https://www.mdpi.com/1420-3049/27/5/1481
    # not sure
    # NOTE(review): a flavonoid is a small molecule, not a gene product — check that these
    # references really concern the GAL gene.
    'ENSG00000102962': 'CCL22',
    # https://pubmed.ncbi.nlm.nih.gov/19715610/ also not sure
    'ENSG00000143320': 'CRABP2',
    # yes in other cell types https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9141193/
}
# Backward-compatible alias for the original, misspelled name.
missig_up_genes = missing_up_genes

In [None]:
down_genes = list(tt_sign[tt_sign['logFC'] < 0].index)
down_genes

In [None]:
len(set(down_genes).intersection(set(macrophages_genes.values())))

In [None]:
ipf_macrophages

In [None]:
# UMAP panels: CCR4 expression (ENSG00000183813) next to the attention score, the
# profibrotic score, the Leiden clusters and the current group labels.
umap_colors = ['ENSG00000183813', 'cell_attn', 'profibrotic_score', 'leiden', 'group']
sc.pl.umap(ipf_macrophages, color=umap_colors, ncols=1, frameon=False)

In [None]:
import matplotlib_venn

In [None]:
# Venn diagram of the reference signature against the up-regulated DE genes.
# The region sizes are derived from the two gene sets (previously hard-coded as (10, 16, 9)),
# so the figure stays in sync with the DE results.
profibrotic_signature = set(macrophages_genes.values())
upregulated_de_genes = set(up_genes)
matplotlib_venn.venn2(
    subsets=[profibrotic_signature, upregulated_de_genes],
    set_labels=('Profibrotic signature', 'Upregulated DE genes (high cell attention vs rest)'),
)

# cluster 12 vs rest of high attention cells

In [None]:
ipf_macrophages

In [None]:
ipf_macrophages.obs['group'] = 'rest'
ipf_macrophages.obs.loc[ipf_macrophages.obs['leiden'] == '12', 'group'] = 'leiden12'

In [None]:
ipf_macrophages.obs['group'].value_counts()

In [None]:
ipf_macrophages.obs['sample_group'] = ipf_macrophages.obs['sample'].astype('str') + '_' + ipf_macrophages.obs['group']
ipf_macrophages.obs['sample_group'] = ipf_macrophages.obs['sample_group'].astype('category')

In [None]:
ipf_macrophages

In [None]:
# Collect every sample/group combination that contributes fewer than
# MIN_CELLS_PER_SAMPLE_GROUP cells; those combinations are filtered out afterwards.
# A single value_counts() replaces building one AnnData view per combination.
MIN_CELLS_PER_SAMPLE_GROUP = 10
sample_group_sizes = ipf_macrophages.obs['sample_group'].value_counts()
# `> 0` skips categories that are declared but contain no cells, which a loop over the
# observed values would never visit either.
is_too_small = (sample_group_sizes > 0) & (sample_group_sizes < MIN_CELLS_PER_SAMPLE_GROUP)
# Sorted list of labels, matching the order np.unique would have produced.
samples_to_exclude = sorted(sample_group_sizes.index[is_too_small])

In [None]:
samples_to_exclude

In [None]:
ipf_macrophages = ipf_macrophages[~ipf_macrophages.obs['sample_group'].isin(samples_to_exclude)].copy()
ipf_macrophages

In [None]:
ipf_macrophages.X.data

In [None]:
ipf_macrophages.X = ipf_macrophages.obsm['counts'].copy()
ipf_macrophages.X.data

In [None]:
len(np.unique(ipf_macrophages.obs['sample_group']))

In [None]:
adata_ = dc.get_pseudobulk(ipf_macrophages, 'sample', 'group', mode='sum', min_cells=-1, min_counts=-1)
adata_                            

In [None]:
adata_.layers['counts'] = adata_.X.copy()

In [None]:
macro = sc.AnnData(adata_.X, obs=adata_.obs[['group', 'sample_group', 'study', 'sample']])
macro.var_names = adata_.var_names
macro.obs['sample'] = macro.obs['sample'].str.replace('-', '_')
macro

In [None]:
macro = macro.copy()
macro

In [None]:
%%time
%%R -i macro
# Pass `macro` into R and fit the edgeR model with fit_model.
outs <-fit_model(macro)

In [None]:
%%R
# Unpack the fitted GLM and the DGEList for the cells below.
fit <- outs$fit
y <- outs$y

In [None]:
%%R
# MDS plot of the pseudobulk profiles: group "rest" in red, any other group in blue.
plotMDS(y, col=ifelse(y$samples$group == "rest", "red", "blue"))

In [None]:
%%R
# Biological coefficient of variation plot for the estimated dispersions.
plotBCV(y)

In [None]:
%%R
# List the design-matrix columns; the contrast below is written in terms of these names.
colnames(y$design)

In [None]:
%%R -o tt
# Test "leiden12" against "rest" (positive logFC = higher in "leiden12") and return the table to Python as `tt`.
myContrast <- makeContrasts("groupleiden12 - grouprest", levels = y$design)
qlf <- glmQLFTest(fit, contrast=myContrast)
# get all of the DE genes and calculate Benjamini-Hochberg adjusted FDR
tt <- topTags(qlf, n = Inf)
tt <- tt$table

In [None]:
tt.shape

In [None]:
tt[:5]

In [None]:
%%R
# Smear plot of the test result, highlighting the genes with FDR < 0.05.
plotSmear(qlf, de.tags = rownames(tt)[which(tt$FDR<0.05)])

In [None]:
tt_sign = tt[tt['FDR'] < 0.05]
tt_sign

In [None]:
tt_sign.sort_values(by='logFC')[-10:]

In [None]:
# Ensembl ID -> symbol for two top hits; the author's reading is that these cells are
# probably proliferating macrophages.
up_genes = dict(
    ENSG00000131747='TOP2A',
    ENSG00000148773='MKI67',
)

In [None]:
# Normalise each cell to 10,000 total counts and log1p-transform X before plotting.
# NOTE(review): both calls modify `ipf_macrophages` in place, so re-running this cell
# without first resetting X transforms the data a second time.
sc.pp.normalize_total(ipf_macrophages, target_sum=1e4)
sc.pp.log1p(ipf_macrophages)

In [None]:
ipf_macrophages

In [None]:
# SPP1 https://pubmed.ncbi.nlm.nih.gov/31221805/
# UMAP coloured by Leiden cluster, lung condition and SPP1 expression (ENSG00000118785).
sc.pl.umap(ipf_macrophages, color=['leiden', 'lung_condition_coarse', 'ENSG00000118785'], ncols=1, frameon=False)

In [None]:
# Write the full leiden12-vs-rest DE table (every tested gene) to CSV.
tt.to_csv('macrophages_leiden12.csv')