In [1]:
## reinstalling the package
! pwd
! pip uninstall -y cellink
! pip install -q ../../.

/home/lollo/Work/hackathon/code/sc-genetics/docs/notebooks
Found existing installation: cellink 0.0.1
Uninstalling cellink-0.0.1:
  Successfully uninstalled cellink-0.0.1


In [2]:
## importing libraries
import logging
import warnings

import anndata as ad
import scanpy as sc
import numpy as np
import pandas as pd

from anndata.utils import asarray
from pathlib import Path

from cellink.io import read_sgkit_zarr
from cellink import DonorData

warnings.filterwarnings("ignore")

logger = logging.getLogger(__name__)

In [3]:
DEBUG = True

In [4]:
## paths
DATA = Path("/home/lollo/Work/hackathon/data/Yazar_OneK1K")

vcf_file_path = DATA / "OneK1K_imputation_post_qc_r2_08/filter_vcf_r08/chr22.dose.filtered.R2_0.8.vcf.gz"

zarr_path = vcf_file_path.parent.parent / "filter_zarr_r08"
zarr_path.mkdir(exist_ok=True)

icf_file_path = zarr_path / vcf_file_path.with_suffix(".icf").name
zarr_file_path = (zarr_path / vcf_file_path.stem).with_suffix(".vcz")

if DEBUG:
    scdata_path = DATA / "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"
else:
    scdata_path = DATA / "OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz"

gene_annotation_path = DATA / "gene_annotation.csv"

In [5]:
## reading single cell data
scdata = ad.read_h5ad(scdata_path)

In [6]:
## reading annotation data
annotation_df = pd.read_csv(gene_annotation_path).loc[
    :, ["ensembl_gene_id", "start_position", "end_position", "chromosome_name"]
]
annotation_df = annotation_df.loc[annotation_df.ensembl_gene_id.isin(scdata.var_names)]

In [7]:
## merging the scdata.var df with the annotations
merged_df = pd.merge(scdata.var, annotation_df, left_index=True, right_on="ensembl_gene_id")
merged_df = merged_df.rename(
    columns={"ensembl_gene_id": "Geneid", "start_position": "start", "end_position": "end", "chromosome_name": "chrom"}
)
merged_df.index = merged_df["Geneid"]
scdata.var = merged_df

In [8]:
## reading genetic data
gdata = read_sgkit_zarr(zarr_file_path)

In [9]:
## initializing donor data
data = DonorData(adata=scdata, gdata=gdata, donor_key_in_sc_adata="individual")
data

[2024-11-02 13:00:10,953] INFO:cellink._core.donordata: Keeping 20/1034 donors
[2024-11-02 13:00:10,954] INFO:cellink._core.donordata: Dropping 1014/1034 donors from genetic data
[2024-11-02 13:00:10,954] INFO:cellink._core.donordata: Dropping 0/20 donors from single-cell data




In [36]:
from sklearn.preprocessing import StandardScaler
from dataclasses import dataclass


@dataclass
class EQTLData:
    """"""

    _donor_data: DonorData
    # _data_root: str
    # _gen_pc_path: str
    _n_sc_comps: int = 500
    # _n_genetic_pcs: int = 500
    _individual_key_in_scdata: str = "individual"
    _sex_key_in_scdata_obs: str = "sex"
    _age_key_in_scdata_obs: str = "age"
    _pseudobulk_aggregation_type: str = "mean"
    _n_top_genes: int = 5000
    _min_individuals_threshold: int = 10
    _donor_key_in_sc_adata: str = "individual"

    @staticmethod
    def _column_normalize(X):
        """"""
        assert X.ndim == 2
        return (X - X.mean(0)) / (X.std(0) * np.sqrt(X.shape[1]))

    def _filter_cells_by_type(self, scdata, cell_type):
        """"""
        scdata_cell = scdata[scdata.obs.cell_label == cell_type]
        return scdata_cell

    def _filter_genes_by_chromosome(self, scdata_cell, target_chromosome):
        """"""
        scdata_cell = scdata_cell[:, scdata_cell.var["chrom"] == target_chromosome]
        return scdata_cell

    def _map_col_scdata_obs_to_pbdata(self, pbdata, column):
        """"""
        ## mapping over the individuals
        individuals = pbdata.obs[self._donor_key_in_sc_adata]
        reference_data = self.scdata.obs[[self._donor_key_in_sc_adata, column]]
        reference_data = reference_data.groupby(self._donor_key_in_sc_adata).agg(["unique"])

        ## function for making sure the values are unique
        def retrieve_unique_value_safe(row):
            assert len(row) == 1
            return row[0]

        ## retrieving the unique values for each donor
        reference_data[column] = reference_data[column].map(retrieve_unique_value_safe)
        ## merging the data and updating column names
        pbdata.obs = pd.merge(pbdata.obs, reference_data[column], left_on=self._donor_key_in_sc_adata, right_index=True)
        pbdata.obs[column] = pbdata.obs["unique"]
        pbdata.obs = pbdata.obs.drop(columns=["unique"], axis=1)
        return pbdata

    def _pseudobulk_scdata(self, scdata_cell):
        """"""
        ## aggregating the data
        pbdata = sc.get.aggregate(
            scdata_cell,
            self._individual_key_in_scdata,
            self._pseudobulk_aggregation_type,
        )
        ## storing data lost in the aggregation
        pbdata.X = pbdata.layers["mean"]
        pbdata = self._map_col_scdata_obs_to_pbdata(pbdata, self._sex_key_in_scdata_obs)
        pbdata = self._map_col_scdata_obs_to_pbdata(pbdata, self._age_key_in_scdata_obs)
        return pbdata

    def _register_fixed_effects(self, pbdata):
        """"""
        ## compute expression PCs
        sc.pp.highly_variable_genes(pbdata, n_top_genes=self._n_top_genes)
        sc.tl.pca(pbdata, use_highly_variable=True, n_comps=self._n_sc_comps)
        pbdata.obsm["E_dpc"] = self._column_normalize(pbdata.obsm["X_pca"])
        ## load genetic PCs
        # gen_pcs = pd.read_csv(self._data_root / self._gen_pc_path, sep=" ", header=None, index_col=1).drop(columns=[0])
        # gen_pcs = gen_pcs.loc[self._donor_data.adata.obs.index, :].iloc[:, :self._n_genetic_pcs].values
        ## load patient covariates
        sex_one_hot = np.eye(2)[(pbdata.obs[self._sex_key_in_scdata_obs].values - 1)]
        age_standardized = StandardScaler().fit_transform(pbdata.obs[self._age_key_in_scdata_obs].values.reshape(-1, 1))
        covariates = np.concatenate((sex_one_hot, age_standardized), axis=1)
        ## store fixed effects in pb_adata
        pbdata.obsm["F"] = np.concatenate(
            (covariates, pbdata.obsm["E_dpc"]),
            axis=1,
            # (covariates, gen_pcs, pbdata.obsm["E_pb"][:, :n_expr_pcs]), axis=1
        )
        pbdata.obsm["F"][:, 2:] = self._column_normalize(pbdata.obsm["F"][:, 2:])
        return pbdata

    def _get_pbdata(self, cell_type, target_chromosome):
        """"""
        ## filtering cells and genes
        scdata_cell = self._filter_cells_by_type(self.scdata, cell_type)
        scdata_cell = self._filter_genes_by_chromosome(scdata_cell, target_chromosome)
        ## pseudobulk aggregation
        pbdata = self._pseudobulk_scdata(scdata_cell)
        ## filter out genes least expressed genes
        sc.pp.filter_genes(pbdata, min_cells=self._min_individuals_threshold)
        ## registering fixed effects
        pbdata = self._register_fixed_effects(pbdata)
        return pbdata

    def get_pb_data(self, cell_type, target_chromosome):
        """"""
        pbdata = self._get_pbdata(cell_type, target_chromosome)
        return DonorData(adata=pbdata, gdata=self.gdata, donor_key_in_sc_adata=self._donor_key_in_sc_adata)

    @property
    def scdata(self):
        """"""
        return self._donor_data.adata

    @property
    def gdata(self):
        """"""
        return self._donor_data.gdata

    @property
    def donor_data(self):
        """"""
        return self._donor_data

In [41]:
@dataclass
class EQTLPipeline:
    """"""

    _eqtl_data: EQTLData

    def _run_gwas(self, cell_type: str, target_chromosome: str, target_gene: str, cis_window: int):
        ## getting current data
        pb_data = self._eqtl_data.get_pb_data(cell_type, target_chromosome)
        ## retrieving the pseudo-bulked data
        Y = pb_data.adata[:, [target_gene]].layers["mean"]
        Y = asarray(Y)
        Y = _my_quantile_transform(Y)
        ## retrieving start and end position for each gene
        start = pb_data.adata.var.loc[target_gene].start
        end = pb_data.adata.var.loc[target_gene].end
        chrom = pb_data.adata.var.loc[target_gene].chrom
        ## retrieving the variants within the cis window
        subgadata = pb_data.gdata[
            :,
            (pb_data.gdata.var.chrom == chrom)
            & (pb_data.gdata.var.pos >= start - cis_window)
            & (pb_data.gdata.var.pos <= end + cis_window),
        ]
        G = subgadata.X.compute()
        gwas = GWAS(Y, F=pb_data.adata.obsm["F"])
        gwas.process(G)
        no_tested_variantes = G.shape[1]
        return gwas, no_tested_variantes

In [44]:
eqtl_data = EQTLData(data, _n_sc_comps=15)
# pb_data = eqtl_data.get_pb_data(cell_type, target_chromosome)
cell_type = "CD4 ET"
target_chromosome = "22"
target_gene = eqtl_data.scdata.var_names[0]
cis_window = 1_000_000

eqtl = EQTLPipeline(eqtl_data)
eqtl._run_gwas(cell_type, target_chromosome, target_gene, cis_window)

[2024-11-02 13:16:33,594] INFO:cellink._core.donordata: Keeping 20/20 donors
[2024-11-02 13:16:33,594] INFO:cellink._core.donordata: Dropping 0/20 donors from genetic data
[2024-11-02 13:16:33,594] INFO:cellink._core.donordata: Dropping 0/20 donors from single-cell data


KeyError: "Values ['ENSG00000243485'], from ['ENSG00000243485'], are not valid obs/ var names or indices."

In [38]:
from sklearn.preprocessing import quantile_transform
from cellink.tl._gwas import GWAS


def _my_quantile_transform(x, seed=1):
    """
    Gaussian quantile transform for values in a pandas Series.    :param x: Input pandas Series.
    :type x: pd.Series
    :param seed: Random seed.
    :type seed: int
    :return: Transformed Series.
    :rtype: pd.Series    .. note::
        “nan” values are kept
    """
    np.random.seed(seed)
    x_transform = x.copy()
    if isinstance(x_transform, pd.Series):
        x_transform = x_transform.to_numpy()
    is_nan = np.isnan(x_transform)
    n_quantiles = np.sum(~is_nan)
    x_transform[~is_nan] = quantile_transform(
        x_transform[~is_nan].reshape([-1, 1]),
        n_quantiles=n_quantiles,
        subsample=n_quantiles,
        output_distribution="normal",
        copy=True,
    )[:, 0]
    # x_transform = pd.Series(x_transform, index = x.index)
    return x_transform


def _run_gwas(pb_data: DonorData, target_gene: str, cis_window: int):
    ## retrieving the pseudo-bulked data
    Y = pb_data.adata[:, [target_gene]].layers["mean"]
    Y = asarray(Y)
    Y = _my_quantile_transform(Y)
    ## retrieving start and end position for each gene
    start = pb_data.adata.var.loc[target_gene].start
    end = pb_data.adata.var.loc[target_gene].end
    chrom = pb_data.adata.var.loc[target_gene].chrom
    ## retrieving the variants within the cis window
    subgadata = pb_data.gdata[
        :,
        (pb_data.gdata.var.chrom == chrom)
        & (pb_data.gdata.var.pos >= start - cis_window)
        & (pb_data.gdata.var.pos <= end + cis_window),
    ]
    G = subgadata.X.compute()
    gwas = GWAS(Y, F=pb_data.adata.obsm["F"])
    gwas.process(G)
    no_tested_variantes = G.shape[1]
    return gwas, no_tested_variantes

In [45]:
CIS_WINDOW = 1_000_000
target_gene = pb_data.adata.var_names[0]
gwas, no_tested_variantes = _run_gwas(pb_data, target_gene, CIS_WINDOW)