In [None]:
## reinstalling the package
! pwd
! pip uninstall -y cellink
! pip install -q ../../.

In [1]:
## importing libraries
import logging
import warnings
import anndata as ad
import numpy as np
import pandas as pd
from pathlib import Path

from cellink.io import read_plink
from cellink import DonorData
from cellink.tl import EQTLData, EQTLPipeline

# Install a blanket "ignore" filter: no warning of any category is shown afterwards.
# NOTE(review): this also hides warnings raised by dependencies — narrow the
# filter (category/module) if those should stay visible.
warnings.filterwarnings("ignore")

# Logger named after the running module.
logger = logging.getLogger(__name__)

In [2]:
# Run configuration: the values a reader is expected to tune.

# When True, the path setup picks the "debug_"-prefixed expression matrix.
DEBUG = False

# Cell types the eQTL loop iterates over.
target_cell_types = [
    "CD4 ET",
    "CD4 NC",
    "CD8 S100B",
    "CD8 ET",
    "B IN",
    "CD8 NC",
    "B Mem",
    "Plasma",
]

# Chromosome labels "1" through "22", kept as strings.
target_chroms = [str(chrom_number) for chrom_number in range(1, 23)]

# Window size handed to the pipeline's `run` call
# (presumably in base pairs — TODO confirm against cellink).
cis_window = 10**6

In [3]:
## paths
# NOTE(review): DATA and dump_dir are absolute, machine-specific locations;
# they must be adjusted before running on another machine.
DATA = Path("/home/lollo/Work/hackathon/data/Yazar_OneK1K")

# DEBUG selects the "debug_"-prefixed matrix; otherwise the `.h5ad.gz` cohort matrix is used.
scdata_path = DATA / (
    "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"
    if DEBUG
    else "OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz"
)

# Directory handed to the eQTL pipeline as `dump_dir` (kept as a plain string).
dump_dir = "/home/lollo/Work/hackathon/out/dumps"
gene_annotation_path = DATA / "gene_annotation.csv"
# Folder holding the per-chromosome PLINK files.
plink_path = DATA / "OneK1K_imputation_post_qc_r2_08" / "plink"

In [4]:
def load_chrom_gdata(chrom: str):
    """Read the PLINK genotype data of a single chromosome.

    The file prefix is built under ``DATA``, printed so the notebook output
    shows which file is being read, and then handed to ``read_plink``.

    Args:
        chrom: Chromosome label inserted into the file name (e.g. ``"1"``).

    Returns:
        Whatever ``read_plink`` returns for the built prefix.
    """
    chrom_prefix = DATA / f"OneK1K_imputation_post_qc_r2_08/plink/chr{chrom}.dose.filtered.R2_0.8"
    print(chrom_prefix)
    return read_plink(chrom_prefix)

In [5]:
# Load the single-cell data from the configured .h5ad path.
# NOTE(review): the annotation cell further down reads the same file again.
scdata = ad.read_h5ad(scdata_path)

In [8]:
# Print every distinct value of the `cell_label` column of `scdata.obs`, one per line.
for cell_label in scdata.obs["cell_label"].unique():
    print(cell_label)

CD4 ET
NK
CD4 NC
CD8 S100B
CD8 ET
B IN
CD8 NC
B Mem
Erythrocytes
NK R
Mono NC
Mono C
Platelets
DC
Plasma
CD4 SOX4


In [None]:
## reading single cell data
# Re-read from disk so this cell always starts from the un-annotated `var` table.
scdata = ad.read_h5ad(scdata_path)

## reading annotation data
# Keep only the gene id and coordinate columns, then only genes present in scdata.
annotation_df = pd.read_csv(gene_annotation_path)[
    ["ensembl_gene_id", "start_position", "end_position", "chromosome_name"]
]
annotation_df = annotation_df[annotation_df["ensembl_gene_id"].isin(scdata.var_names)]

## merging the scdata.var df with the annotations
# The join matches the index of `scdata.var` against `ensembl_gene_id`, renames
# the annotation columns, and indexes the result by `Geneid` while keeping that
# column in the frame.
merged_df = (
    scdata.var
    .merge(annotation_df, left_index=True, right_on="ensembl_gene_id")
    .rename(
        columns={
            "ensembl_gene_id": "Geneid",
            "start_position": "start",
            "end_position": "end",
            "chromosome_name": "chrom",
        }
    )
    .set_index("Geneid", drop=False)
)
scdata.var = merged_df

In [None]:
## setting counter of iterations
current_iteration = 1
total_iterations = len(target_chroms) * len(target_cell_types)
## iterating over the target chromosomes
for target_chrom in target_chroms:
    ## the genotype, donor and eQTL objects are built once per chromosome
    ## and reused for every cell type in the inner loop
    ## retrieving genetics data for current chromosome
    gdata = load_chrom_gdata(target_chrom)
    ## initializing donor data
    data = DonorData(adata=scdata, gdata=gdata, donor_key_in_scdata="individual")
    ## initializing eqtl data
    eqtl_data = EQTLData(data, n_sc_comps=15)
    ## running eqtl pipeline and reporting results on all variants
    eqtl = EQTLPipeline(eqtl_data, dump_results=True, file_prefix="eqtl_all", dump_dir=dump_dir)
    ## iterating target cell_types
    for target_cell_type in target_cell_types:
        ## displaying progress message (no newline, so the completion message continues the same line)
        print(
            f"Iteration {current_iteration}/{total_iterations}. Currently processing cell type {target_cell_type} with chromosome {target_chrom}",
            end="...",
        )
        results_df_all = eqtl.run(target_cell_type, target_chrom, cis_window)
        current_iteration += 1
        ## report the shape of the table just returned; the previous code formatted
        ## an undefined name (`results_df`) and raised NameError on the first iteration
        print(f"Finished to run the pipeline, dumping a table of shape {results_df_all.shape} to {dump_dir}")