In [None]:
## reinstalling the package
! pwd
! pip uninstall -y cellink
! pip install -q ../../.

In [None]:
## importing libraries
import logging
import os
import warnings
from pathlib import Path

import anndata as ad
import pandas as pd

from cellink import DonorData
from cellink.io import read_sgkit_zarr
from cellink.tl import EQTLData, EQTLPipeline

# Logger named after the executing module, for messages emitted by this notebook.
logger = logging.getLogger(__name__)

# Install a blanket "ignore" filter: from here on warnings from any library are
# hidden for the rest of the session. Comment this out when debugging.
warnings.filterwarnings("ignore")

In [None]:
## analysis configuration: edit here, then re-run the cells below
# DEBUG=True makes the paths cell pick the "debug_"-prefixed expression matrix.
DEBUG = True
# Cell type and chromosome passed to EQTLPipeline.run in the last cell.
target_cell_type, target_chrom = "CD4 ET", "22"
# Not referenced by the cells visible here; presumably the cis window size in
# base pairs -- TODO confirm where it is meant to be used.
cis_window = 10**6

In [None]:
## paths
# Root directory of the OneK1K data. Set the CELLINK_DATA_DIR environment
# variable to point at your own copy; the fallback is the original author's
# local directory, so behaviour is unchanged when the variable is not set.
DATA = Path(os.environ.get("CELLINK_DATA_DIR", "/home/lollo/Work/hackathon/data/Yazar_OneK1K"))

# Per-chromosome dosage VCF. The chromosome comes from `target_chrom` (set in
# the configuration cell) so the genotype file always matches the chromosome
# that is later passed to the pipeline.
vcf_file_path = (
    DATA
    / "OneK1K_imputation_post_qc_r2_08"
    / "filter_vcf_r08"
    / f"chr{target_chrom}.dose.filtered.R2_0.8.vcf.gz"
)

# Converted genotype stores live in a sibling of the VCF directory.
# mkdir is called without parents=True on purpose: if the data root is wrong
# this fails here instead of silently creating an empty directory tree.
zarr_path = vcf_file_path.parent.parent / "filter_zarr_r08"
zarr_path.mkdir(exist_ok=True)

# "<name>.vcf.gz" -> "<name>.vcf.icf" (only the trailing ".gz" is replaced).
# Not used by the cells visible in this notebook.
icf_file_path = zarr_path / vcf_file_path.with_suffix(".icf").name
# "<name>.vcf.gz" -> "<name>.vcz" (".gz" dropped by .stem, ".vcf" replaced).
zarr_file_path = (zarr_path / vcf_file_path.stem).with_suffix(".vcz")

# Expression matrix: a "debug_"-prefixed file when DEBUG is on, else the full one.
# NOTE(review): the full file ends in ".h5ad.gz" and is handed to ad.read_h5ad
# as-is -- confirm it is directly readable and not a gzip archive.
if DEBUG:
    scdata_path = DATA / "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"
else:
    scdata_path = DATA / "OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz"

# CSV with gene coordinates, read in the annotation cell.
gene_annotation_path = DATA / "gene_annotation.csv"

In [None]:
## reading single cell data
# Load the expression AnnData from the file selected in the paths cell
# (the "debug_" file when DEBUG is True, otherwise the full matrix).
scdata = ad.read_h5ad(scdata_path)

In [None]:
## reading annotation data
# Read the gene annotation CSV and keep only the gene id, coordinates and chromosome.
annotation_df = pd.read_csv(gene_annotation_path)[
    ["ensembl_gene_id", "start_position", "end_position", "chromosome_name"]
]
# Keep only the rows whose gene id is one of the genes in the expression matrix.
annotation_df = annotation_df[annotation_df["ensembl_gene_id"].isin(scdata.var_names)]

In [None]:
## merging the scdata.var df with the annotations
# Annotation column -> name under which it is stored in scdata.var.
column_renames = {
    "ensembl_gene_id": "Geneid",
    "start_position": "start",
    "end_position": "end",
    "chromosome_name": "chrom",
}

# Start from scdata.var without the columns this cell adds, so that running the
# cell a second time does not create duplicate "Geneid"/"start"/... columns.
var_base = scdata.var.drop(columns=list(column_renames.values()), errors="ignore")

# Inner join of the gene table (keyed by its index) with the annotation rows
# (keyed by ensembl_gene_id); the join keeps the order of the left keys.
merged_df = pd.merge(var_base, annotation_df, left_index=True, right_on="ensembl_gene_id")
merged_df = merged_df.rename(columns=column_renames)
merged_df.index = merged_df["Geneid"]

# The new table replaces scdata.var wholesale, so it must list exactly the same
# genes in exactly the same order as the expression matrix. Genes without an
# annotation row (dropped by the inner join) or with several rows (duplicated
# by it) would break that; fail with a clear message instead of mislabelling genes.
if merged_df.index.tolist() != scdata.var_names.tolist():
    n_missing = len(scdata.var_names.difference(merged_df.index))
    n_duplicated = int(merged_df.index.duplicated().sum())
    raise ValueError(
        "gene annotation does not match scdata.var_names one-to-one: "
        f"{n_missing} gene(s) without annotation, {n_duplicated} duplicated annotation row(s)"
    )

scdata.var = merged_df

In [None]:
## reading genetic data
# Load the genotypes from the .vcz store at zarr_file_path with cellink's
# read_sgkit_zarr reader. The store has to exist already: none of the cells
# visible here create it from the VCF.
gdata = read_sgkit_zarr(zarr_file_path)

In [None]:
## initializing donor data
# Bundle the expression data (scdata) and the genotype data (gdata) into one
# DonorData object. "individual" is presumably the scdata.obs column holding
# the donor id -- TODO confirm against the DonorData documentation.
data = DonorData(adata=scdata, gdata=gdata, donor_key_in_sc_adata="individual")

In [None]:
## initializing eqtl data
# Wrap the DonorData object for the eQTL pipeline. The two 15s are presumably
# the numbers of expression components and genotype principal components to
# use -- TODO confirm their exact meaning in the EQTLData documentation.
eqtl_data = EQTLData(data, n_sc_comps=15, n_genetic_pcs=15)

In [None]:
## running eqtl pipeline and reporting results on all variants
# Directory passed to the pipeline as dump_dir (used together with
# dump_results=True). Set the CELLINK_DUMP_DIR environment variable to redirect
# it; the fallback is the original author's local directory, so behaviour is
# unchanged when the variable is not set.
dump_dir = os.environ.get("CELLINK_DUMP_DIR", "/home/lollo/Work/hackathon/out/dumps")

eqtl = EQTLPipeline(eqtl_data, dump_results=True, file_prefix="eqtl_all", dump_dir=dump_dir)

# Run for the cell type and chromosome chosen in the configuration cell.
results_df_all = eqtl.run(target_cell_type, target_chrom)