# Pseudobulk eQTL Analysis

In [1]:
import gc
from pathlib import Path
import warnings

import anndata as ad
import scanpy as sc
import dask.array as da
import numpy as np
import pandas as pd
from statsmodels.stats.multitest import fdrcorrection
from tqdm.auto import tqdm

import cellink as cl
from cellink._core import DAnn, GAnn
from cellink.tl.gwas import GWAS
from cellink.utils import column_normalize, gaussianize

In [2]:
# Data directory, located three levels above the cellink package's __init__ file.
package_dir = Path(cl.__file__).parent
DATA = package_dir.parent.parent / "data"
GENODATA = DATA / "OneK1K_imputation_post_qc_r2_08"

# Input files: single-cell expression, chr22 genotypes, and genotype principal components.
adata_path = DATA / "OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz"
gdata_path = GENODATA / "filter_zarr_r08/chr22.dose.filtered.R2_0.8.vcz"
gpc_path = GENODATA / "pcdir/wgs.dose.filtered.R2_0.8.filtered.pruned.eigenvec"

In [3]:
# Analysis parameters; everything a reader might want to tune lives in this cell.
n_gpcs = 20  # number of genotype PC columns kept in dd.G.obsm["gPCs"]
n_epcs = 15  # number of components requested from the expression PCA
batch_e_pcs_n_top_genes = 2000  # highly variable genes flagged before the expression PCA
chrom = 22  # chromosome to which SNPs and genes are restricted before the scan
cis_window = 500_000  # padding subtracted from gene start / added to gene end when selecting cis-SNPs
cell_type = "CD8 NC"  # value of `celltype_key` selecting the cells to analyse
pb_gex_key = f"PB_{cell_type}"  # pseudobulk expression in dd.G.obsm[key_added]
original_donor_col = "individual"  # adata.obs column copied to DAnn.donor
# Despite the name this is a fraction in [0, 1]: it is compared against the share of
# donors whose pseudobulk expression is > 0, so 0.1 means 10% of donors.
min_percent_donors_expressed = 0.1
celltype_key = "cell_label"  # adata.obs column holding the cell-type labels
do_debug = False  # if True, read the debug expression file and silence RuntimeWarnings

In [4]:
# Debug runs read a separate debug expression file instead of the full cohort.
if do_debug:
    adata_path = DATA / "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"

adata = ad.read_h5ad(adata_path)
gdata = cl.io.read_sgkit_zarr(gdata_path)

# Append gene coordinates to adata.var, renamed to the column names defined by GAnn.
gene_ann = pd.read_csv(DATA / "gene_annotation.csv").set_index("ensembl_gene_id")
coord_renames = {
    "start_position": GAnn.start,
    "end_position": GAnn.end,
    "chromosome_name": GAnn.chrom,
}
matched_ann = gene_ann.loc[adata.var.index]  # one annotation row per gene, in adata.var order
adata.var = pd.concat([adata.var, matched_ann], axis=1).rename(columns=coord_renames)

# Expose the cohort's donor column under the DAnn.donor column name.
adata.obs[DAnn.donor] = adata.obs[original_donor_col]
adata

AnnData object with n_obs × n_vars = 1272489 × 32738
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age', 'donor_id'
    var: 'GeneSymbol', 'features', 'start', 'end', 'chrom', 'strand', 'description', 'wikigene_name', 'wikigene_id'

In [5]:
# Combine genotypes (G) and single-cell expression (C) into one DonorData container.
dd = cl.DonorData(G=gdata, C=adata).copy()  # copy to avoid view warnings
dd



In [6]:
# Normalise and log-transform the single-cell expression in dd.C (scanpy defaults).
sc.pp.normalize_total(dd.C)
sc.pp.log1p(dd.C)
# NOTE(review): normalize_total is applied a second time, after log1p — confirm this is intentional.
sc.pp.normalize_total(dd.C)

# Donor-level mean expression; this cell runs before the cell-type subset, so all cells contribute.
mdata = sc.get.aggregate(dd.C, by=DAnn.donor, func="mean")
mdata.X = mdata.layers.pop("mean")

# Expression PCs of the donor-level means, computed after flagging highly variable genes.
sc.pp.highly_variable_genes(mdata, n_top_genes=batch_e_pcs_n_top_genes)
sc.tl.pca(mdata, n_comps=n_epcs)

# Store the PCs per donor, with rows reordered to match dd.G.obs_names.
dd.G.obsm["ePCs"] = mdata[dd.G.obs_names].obsm["X_pca"]

In [7]:
# Restrict the cell axis (third axis of dd) to the selected cell type.
is_target_cell_type = dd.C.obs[celltype_key] == cell_type
dd = dd[..., is_target_cell_type, :].copy()
dd



In [8]:
# Run a garbage-collection pass now that `dd` has been rebound to the cell-type subset.
gc.collect()

493

In [9]:
# Pseudobulk: aggregate expression per donor and store it under dd.G.obsm[pb_gex_key].
dd.aggregate(key_added=pb_gex_key, sync_var=True, verbose=True)
# Add per-donor sex and age (first value found per donor) to the donor-level obs table.
dd.aggregate(obs=["sex", "age"], func="first", add_to_obs=True)
dd

[2025-02-10 19:00:16,399] INFO:cellink._core.donordata: Aggregated X to PB_CD8 NC
[2025-02-10 19:00:16,400] INFO:cellink._core.donordata: Observation found for 981 donors.




In [10]:
# Sanity check: the pseudobulk matrix should be (donors, genes).
dd.G.obsm[pb_gex_key].shape

(981, 32738)

In [11]:
# Read the whitespace-separated eigenvec file: column 1 becomes the index, column 0 is discarded.
eigenvec = pd.read_csv(gpc_path, sep=r"\s+", header=None, index_col=1)
gpcs = eigenvec.drop(columns=0)

# Align rows to the donors in dd.G and keep the leading n_gpcs columns.
donor_gpcs = gpcs.loc[dd.G.obs_names]
dd.G.obsm["gPCs"] = donor_gpcs.iloc[:, :n_gpcs]

In [12]:
pb_expr = dd.G.obsm[pb_gex_key]
print(f"{pb_gex_key} shape:", pb_expr.shape)
print("dd.shape:", dd.shape)

# Keep genes whose pseudobulk expression is non-zero in at least the required fraction of donors.
frac_donors_expressed = (pb_expr > 0).mean(axis=0)
keep_genes = (frac_donors_expressed >= min_percent_donors_expressed).values
dd = dd[..., keep_genes]
print("after filtering")
print(f"{pb_gex_key} shape:", dd.G.obsm[pb_gex_key].shape)
print("dd.shape:", dd.shape)

PB_CD8 NC shape: (981, 32738)
dd.shape: (981, 143083, 133482, 32738)
after filtering
PB_CD8 NC shape: (981, 14119)
dd.shape: (981, 143083, 133482, 14119)


In [13]:
# Design matrix with one row per donor and columns:
# intercept | sex | age | genotype PCs | expression PCs
donor_obs = dd.G.obs
covariate_blocks = [
    np.ones((dd.shape[0], 1)),
    donor_obs[["sex"]].values - 1,  # presumably shifts a 1/2 coding to 0/1 — TODO confirm
    donor_obs[["age"]].values,
    dd.G.obsm["gPCs"].values,
    dd.G.obsm["ePCs"],
]
F = np.concatenate(covariate_blocks, axis=1)

# Normalise every column from age onwards; the intercept and sex columns are left as they are.
F[:, 2:] = column_normalize(F[:, 2:])

In [14]:
# alternative to dd[:, dd.G.var.chrom == str(chrom), :, dd.C.var.chrom == str(chrom)]
chrom_label = str(chrom)
snps_on_chrom = dd.G.var.chrom == chrom_label
genes_on_chrom = dd.C.var.chrom == chrom_label
dd = dd.sel(G_var=snps_on_chrom, C_var=genes_on_chrom).copy()
dd



In [15]:
# Cis-eQTL scan: for every gene, test all SNPs within `cis_window` of the gene against its
# gaussianized pseudobulk expression and record the SNP with the smallest p-value.
results = []

# Load the genotype matrix into memory if it is dask-backed, so the per-gene slices
# below operate on a concrete array.
if isinstance(dd.G.X, da.Array | ad._core.views.DaskArrayView):
    if dd.G.is_view:
        dd._G = dd._G.copy()  # TODO: discuss with SWEs
    dd.G.X = dd.G.X.compute()

if do_debug:
    warnings.filterwarnings("ignore", category=RuntimeWarning)

# Seeded generator: the small jitter added to the phenotype (to break ties before
# gaussianization) is then identical on every Restart & Run All.
rng = np.random.default_rng(0)

# Loop-invariant lookups, hoisted out of the per-gene loop.
n_donors = dd.shape[0]
snp_pos = dd.G.var.pos
pb_expr = dd.G.obsm[pb_gex_key]

for gene, row in tqdm(dd.C.var.iterrows(), total=dd.shape[3]):
    jitter = 1e-5 * rng.standard_normal((n_donors, 1))
    Y = gaussianize(pb_expr[[gene]].values + jitter)

    # Cis window around the gene body, clipped at position 0 (bounds are exclusive).
    start = max(0, row.start - cis_window)
    end = row.end + cis_window
    _G = dd.G[:, (snp_pos > start) & (snp_pos < end)]
    # Drop SNPs that do not vary across the donors in this data set.
    _G = _G[:, (_G.X.std(0) != 0)]
    G = _G.X

    # No variable SNP in the window: there is nothing to test, and argmin over an
    # empty p-value array would raise, so the gene is left out of the results.
    if G.shape[1] == 0:
        continue

    gwas = GWAS(Y, F)
    gwas.process(G)

    # Index of the most significant SNP; the same flat index is used for every statistic.
    pv = gwas.getPv()
    snp_idx = pv.argmin()

    results.append(
        {
            "snp": _G.var.index[snp_idx],
            "egene": gene,
            "n_cis_snps": G.shape[1],
            "pv": pv.ravel()[snp_idx].item(),
            "beta": gwas.getBetaSNP().ravel()[snp_idx].item(),
            "betaste": gwas.getBetaSNPste().ravel()[snp_idx].item(),
            "lrt": gwas.getLRT().ravel()[snp_idx].item(),
        }
    )

# One row per tested gene: its top cis-SNP and the association statistics of that SNP.
rdf = pd.DataFrame(results)
rdf

  0%|          | 0/360 [00:00<?, ?it/s]

Unnamed: 0,snp,egene,n_cis_snps,pv,beta,betaste,lrt
0,22_17476696_T_C,ENSG00000100181,1688,2.071223e-03,-0.349170,0.113373,9.485322
1,22_17739317_G_A,ENSG00000237438,2939,4.633734e-06,-0.937499,0.204662,20.983014
2,22_17732851_A_G,ENSG00000177663,3188,5.967313e-04,0.312597,0.091054,11.786146
3,22_17986900_G_A,ENSG00000069998,3378,3.983074e-04,1.301109,0.367420,12.540117
4,22_17206294_C_A,ENSG00000185837,3376,1.363376e-03,0.155577,0.048583,10.254690
...,...,...,...,...,...,...,...
355,22_51100218_C_T,ENSG00000205560,3415,1.832598e-03,-0.384827,0.123496,9.710082
356,22_51017082_T_A,ENSG00000100288,3337,2.352505e-04,-0.170751,0.046427,13.526244
357,22_51023924_C_T,ENSG00000205559,3317,9.775330e-05,0.355925,0.091354,15.179609
358,22_51064416_T_C,ENSG00000100299,3068,6.481592e-13,-0.489005,0.068012,51.695355


In [16]:
# Gene-wise Bonferroni: scale each gene's top-SNP p-value by its number of cis-SNPs, capped to [0, 1].
bonferroni_pv = (rdf["pv"] * rdf["n_cis_snps"]).clip(lower=0, upper=1)
rdf["pv_adj"] = bonferroni_pv

# FDR correction across genes on the gene-level adjusted p-values.
rejected, qvalues = fdrcorrection(rdf["pv_adj"])
rdf["qv"] = qvalues

# Number of genes with a q-value below 0.05.
(rdf.qv < 0.05).sum()

46