In [1]:
## reinstalling the package
! pwd
! pip uninstall -y cellink
! pip install -q ../../.

/home/lollo/Work/hackathon/code/sc-genetics/docs/notebooks
Found existing installation: cellink 0.0.1
Uninstalling cellink-0.0.1:
  Successfully uninstalled cellink-0.0.1


In [2]:
## importing libraries
import logging
import warnings

import anndata as ad
import scanpy as sc
import numpy as np
import anndata as an
import pandas as pd

from tqdm.notebook import tqdm
from anndata.utils import asarray
from pathlib import Path
from statsmodels.stats.multitest import fdrcorrection

from cellink.io import read_sgkit_zarr
from cellink import DonorData

warnings.filterwarnings('ignore')

logger = logging.getLogger(__name__)

DEBUG = True

In [3]:
## paths
DATA = Path("/home/lollo/Work/hackathon/data/Yazar_OneK1K")

vcf_file_path = DATA / "OneK1K_imputation_post_qc_r2_08/filter_vcf_r08/chr22.dose.filtered.R2_0.8.vcf.gz"

zarr_path = vcf_file_path.parent.parent / "filter_zarr_r08"
zarr_path.mkdir(exist_ok=True)

icf_file_path = zarr_path / vcf_file_path.with_suffix(".icf").name
zarr_file_path = (zarr_path / vcf_file_path.stem).with_suffix(".vcz")

if DEBUG:
    scdata_path = DATA / "debug_OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad"
else:
    scdata_path = DATA / "OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz"

gene_annotation_path = DATA / "gene_annotation.csv"

In [4]:
## reading single cell data
scdata = ad.read_h5ad(scdata_path)

In [5]:
## reading annotation data
annotation_df = pd.read_csv(gene_annotation_path).loc[:, ["ensembl_gene_id", "start_position", "end_position", "chromosome_name"]]
annotation_df = annotation_df.loc[annotation_df.ensembl_gene_id.isin(scdata.var_names)] 

In [6]:
## merging the scdata.var df with the annotations
merged_df = pd.merge(scdata.var, annotation_df, left_index=True, right_on="ensembl_gene_id")
merged_df = merged_df.rename(columns = {"ensembl_gene_id": "Geneid", "start_position": "start", "end_position": "end", "chromosome_name": "chrom"})
merged_df.index = merged_df["Geneid"]
scdata.var = merged_df

In [7]:
## reading genetic data
gdata = read_sgkit_zarr(zarr_file_path)

In [8]:
## initializing donor data
data = DonorData(adata=scdata, gdata=gdata, donor_key_in_sc_adata="individual")
data

[2024-11-02 18:42:50,516] INFO:cellink._core.donordata: Keeping 20/1034 donors
[2024-11-02 18:42:50,516] INFO:cellink._core.donordata: Dropping 1014/1034 donors from genetic data
[2024-11-02 18:42:50,517] INFO:cellink._core.donordata: Dropping 0/20 donors from single-cell data




In [9]:
from cellink.tl import EQTLData, EQTLPipeline
eqtl_data = EQTLData(data, _n_sc_comps=15)
# pb_data = eqtl_data.get_pb_data(cell_type, target_chromosome)
target_cell_type = "CD4 ET"
target_chrom = "22"
cis_window = 1_000_000

pb_data = eqtl_data.get_pb_data(target_cell_type, target_chrom)
target_gene = pb_data.adata.var_names[0]

eqtl = EQTLPipeline(eqtl_data, _dump_results=True, _dump_dir="/home/lollo/Work/hackathon/out/dumps")
results_df_all = eqtl.run(target_cell_type, target_chrom, cis_window)
(len(eqtl.target_cell_types), len(eqtl.target_chroms))

[2024-11-02 18:42:50,571] INFO:cellink._core.donordata: Keeping 20/20 donors
[2024-11-02 18:42:50,571] INFO:cellink._core.donordata: Dropping 0/20 donors from genetic data
[2024-11-02 18:42:50,572] INFO:cellink._core.donordata: Dropping 0/20 donors from single-cell data
[2024-11-02 18:42:50,604] INFO:cellink._core.donordata: Keeping 20/20 donors
[2024-11-02 18:42:50,604] INFO:cellink._core.donordata: Dropping 0/20 donors from genetic data
[2024-11-02 18:42:50,605] INFO:cellink._core.donordata: Dropping 0/20 donors from single-cell data


(16, 45)

In [10]:
from cellink.tl import EQTLData, EQTLPipeline
eqtl_data = EQTLData(data, _n_sc_comps=15)
# pb_data = eqtl_data.get_pb_data(cell_type, target_chromosome)
target_cell_type = "CD4 ET"
target_chrom = "22"
cis_window = 1_000_000

pb_data = eqtl_data.get_pb_data(target_cell_type, target_chrom)
target_gene = pb_data.adata.var_names[0]

eqtl = EQTLPipeline(eqtl_data, _mode="best")
results_df_best = eqtl.run(target_cell_type, target_chrom, cis_window)
(len(eqtl.target_cell_types), len(eqtl.target_chroms))

[2024-11-02 18:43:57,774] INFO:cellink._core.donordata: Keeping 20/20 donors
[2024-11-02 18:43:57,774] INFO:cellink._core.donordata: Dropping 0/20 donors from genetic data
[2024-11-02 18:43:57,775] INFO:cellink._core.donordata: Dropping 0/20 donors from single-cell data
[2024-11-02 18:43:57,810] INFO:cellink._core.donordata: Keeping 20/20 donors
[2024-11-02 18:43:57,811] INFO:cellink._core.donordata: Dropping 0/20 donors from genetic data
[2024-11-02 18:43:57,811] INFO:cellink._core.donordata: Dropping 0/20 donors from single-cell data


(16, 45)

In [11]:
results_df_best

Unnamed: 0,cell_type,chrom,gene,no_tested_variants,min_pv,min_pv_variant,min_pv_variant_beta,min_pv_variant_beta_ste,min_pv_variant_lrt
0,CD4 ET,22,ENSG00000177663,5697,3.020206e-07,22_17677104_G_A,-2.268750,0.442927,26.236745
1,CD4 ET,22,ENSG00000069998,5889,2.495242e-07,22_17600148_A_G,-4.039844,0.783210,26.605563
2,CD4 ET,22,ENSG00000093072,5898,3.203543e-10,22_17678455_C_T,12.243714,1.946972,39.546413
3,CD4 ET,22,ENSG00000131100,6755,4.061946e-08,22_18100656_A_T,6.847204,1.247637,30.119650
4,CD4 ET,22,ENSG00000099968,7182,3.296931e-08,22_18293072_G_A,5.776428,1.045528,30.524404
...,...,...,...,...,...,...,...,...,...
231,CD4 ET,22,ENSG00000177989,5817,1.289361e-06,22_17951460_A_G,3.198093,0.660574,23.439027
232,CD4 ET,22,ENSG00000130487,5735,7.491574e-08,22_17943600_G_C,-2.463927,0.458067,28.933252
233,CD4 ET,22,ENSG00000100288,5586,1.939023e-08,22_17673438_T_C,-11.666548,2.076876,31.554682
234,CD4 ET,22,ENSG00000100299,5350,2.470096e-06,22_17407061_T_A,-3.675625,0.780291,22.189589


In [12]:
s
target_chrom = "22"
cis_window = 1_000_000

pb_data = eqtl_data.get_pb_data(target_cell_type, target_chrom)
target_gene = pb_data.adata.var_names[0]

eqtl = EQTLPipeline(eqtl_data)
gwas, no_tested_variants = eqtl._run_gwas(pb_data, target_gene, target_chrom, cis_window)

pvs = np.squeeze(gwas.getPv())
betasnp = np.squeeze(gwas.getBetaSNP())
betasnp_ste = np.squeeze(gwas.getBetaSNPste())
lrts = np.squeeze(gwas.getLRT())
pvs[np.isnan(pvs)] = 1
betasnp[np.isnan(betasnp)] = 0
betasnp_ste[np.isnan(betasnp_ste)] = 0
lrts[np.isnan(lrts)] = 0

print(f"{pvs.mean()=}, {betasnp.mean()=}, {betasnp_ste.mean()=}, {lrts.mean()=}")
print(f"{pvs.std()=}, {betasnp.std()=}, {betasnp_ste.std()=}, {lrts.std()=}")

NameError: name 's' is not defined