In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
from pathlib import Path
import os
import pandas as pd
import scanpy as sc
import cellink as cl
from cellink.tl._burden_testing import *
from scipy.stats import beta
import pickle
import numpy as np

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Mai's test notebook

In [3]:
# Input locations for the chr22-only test run.
# All paths are absolute and cluster-specific; adjust them when running elsewhere.
base_data_dir = Path("/s/project/sys_gen_students/2024_2025/project04_rare_variant_sc/")
# Single-cell expression data restricted to chr22 genes. This is the same file
# that the "Save Onek1k chr 22 ..." section further down writes out.
scdata_path = base_data_dir / "input_data/chr22_OneK1K_cohort_gene_expression_matrix_14_celltypes_w_gene_locations.h5ad.gz"
# Directory holding the filtered genotype stores.
# NOTE(review): unlike the other inputs this is a separate absolute path and is
# not derived from base_data_dir -- confirm both point at the same project data.
gdata_dir = "/data/ceph/hdd/project/node_09/sys_gen_students/2024_2025/project04_rare_variant_sc/input_data/filter_vcf_r08/"
# DNA_LM influence-score tables, one per model (upstream / downstream).
DNA_LM_upstream = base_data_dir/ "input_data/annotations/onek1k_inf_scores_upstream_model.tsv"
DNA_LM_downstream = base_data_dir/ "input_data/annotations/onek1k_inf_scores_downstream_model.tsv"
# VEP annotation table for the chr22 variants.
# NOTE(review): the file name reads "onek1k1" while sibling files use "onek1k" --
# confirm this matches the file on disk.
vep_scores = base_data_dir/ "input_data/annotations/onek1k1_chr22_variants_annotated_vep.txt"

In [4]:
# Genotype store for chr22 inside the filtered-genotype directory.
zarr_file = os.path.join(gdata_dir, "chr22.dose.filtered.R2_0.8.vcz")

# Genotype principal components: space-separated, no header row.
# Column 1 supplies the row labels; columns 0 and 1 are then dropped so that
# only the component columns (2 onwards) remain.
eigenvec = pd.read_csv(
    base_data_dir / "input_data/pcdir/wgs.dose.filtered.R2_0.8.filtered.pruned.eigenvec",
    sep=' ',
    header=None,
)
eigenvec = eigenvec.set_index(1, drop=False).iloc[:, 2:]

# Expression (chr22 subset) and genotype data.
scdata = sc.read_h5ad(scdata_path)
gdata = cl.io.read_sgkit_zarr(zarr_file)

In [5]:
gdata # patients x variants

AnnData object with n_obs × n_vars = 1034 × 143083
    var: 'chrom', 'pos', 'a0', 'a1', 'AF', 'ER2', 'maf', 'R2', 'contig', 'id', 'id_mask', 'quality'
    varm: 'filter'

In [6]:
gdata.var_names

Index(['22_16849573_A_G', '22_16849971_A_T', '22_16850437_G_A',
       '22_16851225_C_T', '22_16851356_C_T', '22_16851640_C_T',
       '22_16851673_A_G', '22_16851899_C_G', '22_16852312_G_A',
       '22_16852652_G_A',
       ...
       '22_51197602_T_A', '22_51198569_G_C', '22_51198868_T_C',
       '22_51198906_G_A', '22_51198998_C_A', '22_51202748_A_G',
       '22_51208568_G_T', '22_51211031_A_G', '22_51213613_C_T',
       '22_51216564_T_C'],
      dtype='object', name='snp_id', length=143083)

In [7]:
gdata.X

Unnamed: 0,Array,Chunk
Bytes,1.10 GiB,76.29 MiB
Shape,"(1034, 143083)","(1000, 10000)"
Dask graph,30 chunks in 5 graph layers,30 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray
"Array Chunk Bytes 1.10 GiB 76.29 MiB Shape (1034, 143083) (1000, 10000) Dask graph 30 chunks in 5 graph layers Data type int64 numpy.ndarray",143083  1034,

Unnamed: 0,Array,Chunk
Bytes,1.10 GiB,76.29 MiB
Shape,"(1034, 143083)","(1000, 10000)"
Dask graph,30 chunks in 5 graph layers,30 chunks in 5 graph layers
Data type,int64 numpy.ndarray,int64 numpy.ndarray


In [4]:
cl.tl.add_vep_annos_to_gdata(vep_scores, gdata,
                             cols_to_explode=["Consequence"],
                             cols_to_dummy=["Consequence"])

[2025-01-19 18:27:20,439] INFO:cellink.tl._annotate_snps_genotype_data: renaming id column #Uploaded_variation into snp_id
[2025-01-19 18:27:20,606] INFO:cellink.tl._annotate_snps_genotype_data: Subsetting annotations to variants that are in gdata
[2025-01-19 18:27:20,606] INFO:cellink.tl._annotate_snps_genotype_data: 0 with missing annotation
[2025-01-19 18:27:21,194] INFO:cellink.tl._annotate_snps_genotype_data: Index(['snp_id', 'Location', 'Allele', 'Gene', 'Feature', 'Feature_type',
       'Consequence', 'cDNA_position', 'CDS_position', 'Protein_position',
       'Amino_acids', 'Codons', 'Existing_variation', 'IMPACT', 'DISTANCE',
       'STRAND', 'FLAGS', 'BIOTYPE', 'CANONICAL', 'ENSP', 'SIFT', 'PolyPhen',
       'gnomADe_AF', 'gnomADe_AFR_AF', 'gnomADe_AMR_AF', 'gnomADe_ASJ_AF',
       'gnomADe_EAS_AF', 'gnomADe_FIN_AF', 'gnomADe_NFE_AF', 'gnomADe_OTH_AF',
       'gnomADe_SAS_AF', 'CLIN_SIG', 'SOMATIC', 'PHENO', 'CADD_PHRED',
       'CADD_RAW', 'TSSDistance'],
      dtype='object

AnnData object with n_obs × n_vars = 1034 × 143083
    var: 'chrom', 'pos', 'a0', 'a1', 'AF', 'ER2', 'maf', 'R2', 'contig', 'id', 'id_mask', 'quality'
    varm: 'annotations_0', 'annotations_1', 'annotations_2', 'annotations_3', 'annotations_4', 'annotations_5', 'annotations_6', 'annotations_7'

In [5]:
def add_snp_id(DNA_LM):
    """Index a DNA_LM score table by SNP id, modifying the table in place.

    The id is built as ``<Chromosome>_<pos>_<ref>_<alt>`` from the columns of
    the same names, and every occurrence of the substring ``chr`` is removed
    from it (so ``chr22_100_A_G`` becomes ``22_100_A_G``).

    Parameters:
        DNA_LM (pd.DataFrame): table with columns 'Chromosome', 'pos', 'ref'
            and 'alt'. It is mutated: the new ids become its index.

    Returns:
        pd.DataFrame: the same object that was passed in, now indexed by 'snp_id'.
    """
    sep = "_"
    position_text = DNA_LM['pos'].astype(str)
    raw_ids = DNA_LM['Chromosome'] + sep + position_text + sep + DNA_LM['ref'] + sep + DNA_LM['alt']

    # Strip the "chr" prefix before the ids are promoted to the index.
    DNA_LM['snp_id'] = raw_ids.str.replace('chr', '')
    DNA_LM.set_index('snp_id', inplace=True)
    return DNA_LM

def reverse_and_update_snp_ids(gdata_df, dna_df):
    """Align the SNP ids of ``dna_df`` with those used by ``gdata_df``.

    Each id in ``dna_df.index`` is kept if it also occurs in ``gdata_df.index``.
    Otherwise the id is split into ``chrom_pos_ref_alt`` and the allele-swapped
    form ``chrom_pos_alt_ref`` is tried. If that is not known either, a message
    is printed and the original id is kept.

    Parameters:
        gdata_df (pd.DataFrame): reference table whose index holds the known SNP ids.
        dna_df (pd.DataFrame): table whose index is rewritten (in place).

    Returns:
        pd.DataFrame: ``dna_df`` with the updated index.
    """
    known_ids = gdata_df.index

    def _resolve(snp_id):
        # Already in the reference orientation.
        if snp_id in known_ids:
            return snp_id

        chrom, pos, ref, alt = snp_id.split("_")
        swapped_id = f"{chrom}_{pos}_{alt}_{ref}"  # ref and alt exchanged
        if swapped_id in known_ids:
            return swapped_id

        print(f"Error, unknown snp_id {snp_id}")
        return snp_id

    dna_df.index = [_resolve(snp_id) for snp_id in dna_df.index]
    print("\nUpdated DNA_LM index:")
    print(dna_df.index)
    return dna_df

def add_maf_annotation(gdata):
    """Add a MAF-based variant weight to ``gdata.varm["annotations_0"]``.

    The weight is the Beta(1, 25) probability density evaluated at each
    variant's minor allele frequency (``gdata.var["maf"]``) and is stored in
    the column ``"MAF_beta_1.25"``.

    Parameters:
        gdata: genotype container exposing ``var`` (with a 'maf' column) and
            ``varm["annotations_0"]``; the annotation table is modified in place.

    Returns:
        The same ``gdata`` object.
    """
    minor_allele_freqs = gdata.var["maf"]
    maf_weights = beta.pdf(minor_allele_freqs, 1, 25)

    annotation_table = gdata.varm["annotations_0"]
    annotation_table["MAF_beta_1.25"] = maf_weights
    return gdata


def add_DNA_LM(gdata, file, chromosome, colname, annotation_varm="annotations_0"):
    """Attach DNA_LM influence scores to a genotype annotation table.

    The tab-separated score file is read, indexed by SNP id (``add_snp_id``),
    restricted to ``chr<chromosome>`` and its ids are aligned with the ids of
    the annotation table (``reverse_and_update_snp_ids``). The
    'influence_score' column is then stored as ``colname`` in
    ``gdata.varm[annotation_varm]``; variants without a score get NaN.

    Parameters:
        gdata: genotype container exposing ``varm[annotation_varm]`` as a
            DataFrame indexed by SNP id; that table is modified in place.
        file (str | Path): tab-separated file with columns 'Chromosome',
            'pos', 'ref', 'alt' and 'influence_score'.
        chromosome (str | int): chromosome without the "chr" prefix, e.g. "22".
        colname (str): name of the column that receives the scores.
        annotation_varm (str): key of the annotation table in ``gdata.varm``.

    Returns:
        The same ``gdata`` object.

    Note:
        The original version ended with a ``rename`` of 'influence_score' to
        ``colname`` on the annotation table. The scores are already written
        under ``colname``, so that call could only ever rename an unrelated,
        pre-existing 'influence_score' column and thereby create a duplicate
        column name. It has been removed.
    """
    DNA_LM = add_snp_id(pd.read_csv(file, sep='\t'))

    # Work on an explicit copy of the chromosome subset: the id alignment
    # below rewrites the index of the frame it is given.
    DNA_LM_chrom = DNA_LM[DNA_LM["Chromosome"] == f"chr{chromosome}"].copy()

    annotations = gdata.varm[annotation_varm]
    DNA_LM_chrom = reverse_and_update_snp_ids(annotations, DNA_LM_chrom)

    # NOTE(review): reindex raises if two score rows end up with the same id
    # after allele swapping -- confirm the score files cannot contain both
    # orientations of one variant.
    annotations[colname] = DNA_LM_chrom["influence_score"].reindex(annotations.index)
    return gdata

In [6]:
# Add the MAF-based weight column to gdata.
print("add maf annotation to gdata ")
gdata = add_maf_annotation(gdata)

# Add the DNA_LM scores of both models (upstream first, then downstream) to gdata.
print("add DNA_LM annotation to gdata ")
for dna_lm_file, dna_lm_col in ((DNA_LM_upstream, 'DNA_LM_up'), (DNA_LM_downstream, 'DNA_LM_down')):
    gdata = add_DNA_LM(gdata, file=dna_lm_file, chromosome="22", colname=dna_lm_col)

add maf annotation to gdata 
add DNA_LM annotation to gdata 

Updated DNA_LM index:
Index(['22_17069391_G_A', '22_17093288_C_T', '22_17102725_A_G',
       '22_17115621_T_A', '22_17117130_G_A', '22_17128890_A_G',
       '22_17202942_T_G', '22_17227050_G_T', '22_17274178_A_T',
       '22_17276558_C_T',
       ...
       '22_51171783_C_T', '22_51172439_G_A', '22_51172880_G_A',
       '22_51175039_A_C', '22_51175382_A_G', '22_51175798_C_T',
       '22_51177291_C_T', '22_51180520_C_T', '22_51180534_A_G',
       '22_51185359_G_A'],
      dtype='object', length=46826)

Updated DNA_LM index:
Index(['22_17069391_G_A', '22_17093288_C_T', '22_17102725_A_G',
       '22_17115621_T_A', '22_17117130_G_A', '22_17128890_A_G',
       '22_17202942_T_G', '22_17227050_G_T', '22_17274178_A_T',
       '22_17276558_C_T',
       ...
       '22_51171783_C_T', '22_51172439_G_A', '22_51172880_G_A',
       '22_51175039_A_C', '22_51175382_A_G', '22_51175798_C_T',
       '22_51177291_C_T', '22_51180520_C_T', '22_511

## Check results

In [27]:
test_res = pd.read_parquet(base_data_dir/"output/burdens/chr22_all_burdens_test_500_genes.parquet")
test_res

Unnamed: 0_level_0,CADD_PHRED,DNA_LM_up,DNA_LM_down,MAF_beta_1.25,DNA_LM_mixed,Geneid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1_1,9350.801,13.224558,22.916096,43612.917035,13.224558,ENSG00000233866
2_2,10775.218,16.520484,28.387657,52127.930292,16.520484,ENSG00000233866
3_3,8327.426,12.240323,21.604692,39069.213569,12.240323,ENSG00000233866
4_4,9439.675,18.346526,32.699742,44983.557785,18.346526,ENSG00000233866
6_6,9054.896,13.340017,23.019223,42525.702648,13.340017,ENSG00000233866
...,...,...,...,...,...,...
1077_1078,9205.759,15.563891,27.192362,42441.879194,24.147251,ENSG00000100138
1078_1079,9408.366,14.626046,25.363997,44120.015877,21.432226,ENSG00000100138
1079_1080,9847.835,13.436251,23.860703,43963.871811,21.113776,ENSG00000100138
1080_1081,8382.267,13.858525,24.546299,40791.120729,21.272972,ENSG00000100138


In [13]:
eigenvec

Unnamed: 0_level_0,2,3,4,5,6,7,8,9,10,11,...,22,23,24,25,26,27,28,29,30,31
1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1_1,-0.011735,-0.035175,-0.067388,-0.021872,-0.167117,-0.046335,0.008574,-0.099560,-0.010074,0.030196,...,0.013934,-0.058533,-0.049001,0.008710,0.012238,0.068429,0.011627,0.018399,-0.030633,-0.044223
2_2,-0.003923,0.002343,-0.002582,-0.028752,-0.002341,0.009987,0.021566,-0.026934,-0.068490,-0.034232,...,-0.028807,-0.016083,0.005872,0.070105,-0.032999,-0.026845,0.007225,-0.003704,0.010660,-0.025092
3_3,0.000041,-0.010188,-0.037682,-0.042667,-0.029898,-0.030592,0.032894,-0.048264,-0.026907,-0.031605,...,0.034926,0.005457,0.008154,0.000213,-0.041143,0.006733,0.014559,0.010264,-0.025657,0.004309
4_4,-0.017866,0.043117,-0.010928,-0.000770,0.029455,0.050779,0.031355,-0.034816,-0.009246,-0.012595,...,-0.000663,0.003068,-0.025001,-0.010100,-0.029951,0.044300,0.010624,-0.002545,0.012431,-0.005424
6_6,-0.025886,0.006914,0.053624,-0.031212,-0.019373,0.031912,0.056274,0.010998,0.006968,-0.036443,...,-0.097219,0.069390,0.042397,-0.006429,0.030670,-0.026842,-0.010416,-0.038563,0.025269,-0.029435
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1096_845_2,-0.007796,0.001538,0.016139,0.006417,0.040931,0.025988,-0.022721,-0.000441,-0.005409,0.066184,...,0.022199,0.026127,-0.020137,-0.023061,-0.045081,-0.034237,0.042045,0.023599,0.049356,0.014115
1100_914_2,0.013225,-0.039726,0.006890,-0.031841,0.001264,-0.021764,0.004850,-0.019347,-0.037923,0.047453,...,-0.001384,-0.001463,-0.008513,0.011473,0.018160,-0.018620,0.040127,-0.005344,0.092501,0.045532
1102_932_2,0.012192,-0.003082,-0.004636,-0.064791,0.038963,0.024241,-0.024458,0.022030,-0.025482,0.004505,...,-0.010103,-0.013915,-0.000205,-0.032490,0.010844,0.032930,-0.045313,0.023759,0.007988,-0.011164
1103_926_2,0.006197,-0.014417,0.037533,-0.011927,0.038414,-0.011060,0.022829,0.003065,-0.013780,0.018747,...,-0.032224,-0.016843,0.010262,-0.025906,-0.002050,0.006234,0.055450,0.020416,0.021010,-0.041268


## Save the OneK1K chr22 subset (with gene locations) for testing

In [8]:
scdata_full_path = base_data_dir / "input_data/OneK1K_cohort_gene_expression_matrix_14_celltypes_w_gene_locations.h5ad.gz"
scdata = sc.read_h5ad(scdata_full_path)
scdata

AnnData object with n_obs × n_vars = 1272489 × 32738
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age'
    var: 'GeneSymbol', 'features', 'chromosome', 'start', 'end'

In [5]:
scdata_22 = scdata[:, scdata.var["chromosome"] == "22"]
scdata_22

View of AnnData object with n_obs × n_vars = 1272489 × 655
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age'
    var: 'GeneSymbol', 'features', 'chromosome', 'start', 'end'

In [6]:
scdata_22.var

Unnamed: 0_level_0,GeneSymbol,features,chromosome,start,end
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000233866,LA16c-4G1.3,LA16c-4G1.3,22,15915800.0,15914721.0
ENSG00000225255,LA16c-83F12.6,LA16c-83F12.6,22,15741297.0,15779680.0
ENSG00000198062,POTEH,POTEH,22,15690026.0,15721631.0
ENSG00000236666,POTEH-AS1,POTEH-AS1,22,15703403.0,15699361.0
ENSG00000230471,LA16c-2F2.8,LA16c-2F2.8,22,15625205.0,15557577.0
...,...,...,...,...,...
ENSG00000251322,SHANK3,SHANK3,22,50674408.0,50733212.0
ENSG00000225929,AC000036.4,AC000036.4,22,50738173.0,50735813.0
ENSG00000100312,ACR,ACR,22,50738196.0,50745339.0
ENSG00000254499,AC002056.5,AC002056.5,22,50743520.0,50740593.0


In [7]:
scdata_22.write(base_data_dir / "input_data/chr22_OneK1K_cohort_gene_expression_matrix_14_celltypes_w_gene_locations.h5ad.gz", compression="gzip")

In [14]:
scdata

AnnData object with n_obs × n_vars = 1272489 × 655
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age'
    var: 'GeneSymbol', 'features', 'chromosome', 'start', 'end'

In [15]:
scdata.var

Unnamed: 0_level_0,GeneSymbol,features,chromosome,start,end
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000233866,LA16c-4G1.3,LA16c-4G1.3,22,15915800.0,15914721.0
ENSG00000225255,LA16c-83F12.6,LA16c-83F12.6,22,15741297.0,15779680.0
ENSG00000198062,POTEH,POTEH,22,15690026.0,15721631.0
ENSG00000236666,POTEH-AS1,POTEH-AS1,22,15703403.0,15699361.0
ENSG00000230471,LA16c-2F2.8,LA16c-2F2.8,22,15625205.0,15557577.0
...,...,...,...,...,...
ENSG00000251322,SHANK3,SHANK3,22,50674408.0,50733212.0
ENSG00000225929,AC000036.4,AC000036.4,22,50738173.0,50735813.0
ENSG00000100312,ACR,ACR,22,50738196.0,50745339.0
ENSG00000254499,AC002056.5,AC002056.5,22,50743520.0,50740593.0


In [16]:
# Use "chrom" as the chromosome column name in scdata.var.
# rename() ignores labels that are not present, so no existence check is needed.
scdata.var.rename(columns={"chromosome": "chrom"}, inplace=True)

In [17]:
scdata

AnnData object with n_obs × n_vars = 1272489 × 655
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age'
    var: 'GeneSymbol', 'features', 'chrom', 'start', 'end'

In [22]:
scdata_cell = scdata[scdata.obs.cell_label == "CD8 ET"]
scdata_cell

View of AnnData object with n_obs × n_vars = 205077 × 655
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age'
    var: 'GeneSymbol', 'features', 'chrom', 'start', 'end'

In [23]:
scdata_cell = scdata_cell[:, scdata_cell.var["chrom"] == "22"]
scdata_cell

View of AnnData object with n_obs × n_vars = 205077 × 655
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'pool', 'individual', 'percent.mt', 'latent', 'nCount_SCT', 'nFeature_SCT', 'cell_type', 'cell_label', 'sex', 'age'
    var: 'GeneSymbol', 'features', 'chrom', 'start', 'end'

## Test distance calculations

In [7]:
# Preprocess scdata.
# NOTE(review): neither result is assigned, so both calls presumably modify
# `scdata` in place -- re-running this cell without reloading the data would
# then normalise / log-transform a second time. Confirm before re-running.
sc.pp.normalize_total(scdata, target_sum=1e4)  # Normalize total counts per cell
sc.pp.log1p(scdata)  # Apply log-transform

In [9]:
gdata.varm["annotations_0"]

Unnamed: 0_level_0,Consequence_start_lost,PHENO,gnomADe_AF,Feature_type,CLIN_SIG,gnomADe_OTH_AF,Consequence_intergenic_variant,Location,gnomADe_AFR_AF,Allele,...,Consequence_splice_acceptor_variant,Consequence_splice_donor_5th_base_variant,Consequence_splice_donor_region_variant,Consequence_splice_donor_variant,Consequence_splice_polypyrimidine_tract_variant,Consequence_splice_region_variant,Consequence_stop_gained,Consequence_stop_retained_variant,Consequence_synonymous_variant,Consequence_upstream_gene_variant
snp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
22_16849573_A_G,0,-,,-,-,,1,22:16849573,,G,...,0,0,0,0,0,0,0,0,0,0
22_16849971_A_T,0,-,,-,-,,1,22:16849971,,T,...,0,0,0,0,0,0,0,0,0,0
22_16850437_G_A,0,-,,-,-,,1,22:16850437,,A,...,0,0,0,0,0,0,0,0,0,0
22_16851225_C_T,0,-,,-,-,,1,22:16851225,,T,...,0,0,0,0,0,0,0,0,0,0
22_16851356_C_T,0,-,,-,-,,1,22:16851356,,T,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22_51202748_A_G,0,-,,Transcript,-,,0,22:51202748,,G,...,0,0,0,0,0,0,0,0,0,0
22_51208568_G_T,0,-,,Transcript,-,,0,22:51208568,,T,...,0,0,0,0,0,0,0,0,0,0
22_51211031_A_G,0,-,,Transcript,-,,0,22:51211031,,G,...,0,0,0,0,0,0,0,0,0,0
22_51213613_C_T,0,-,,Transcript,-,,0,22:51213613,,T,...,0,0,0,0,0,0,0,0,0,0


In [10]:
gdata_anno0 = gdata.varm["annotations_0"]

In [8]:
def _get_burden(gd_gene, weight_col):
    this_weights = np.array(gd_gene.varm['annotations_0'][weight_col])
    g_weigthed = gd_gene.X * this_weights
    this_burdens = np.nansum(g_weigthed, axis = 1) #TODO implement alternative weighting functions
    return this_burdens

def _calc_tss_distance_per_gene(variants_df,
                                gene_start,
                                gene_end,
                                col_name_distance,
                                col_name_saige=""):
    """
    Calculate absolute TSS Distance.

    Parameters:
        variants_df (pd.DataFrame): DataFrame with SNP ID as index and contains variant position in column "Position"
        gene_start (int): Start of Gene aka TS
        col_name_distance (str): name of TSS column
        col_name_saige (str, optional): determines whether saige formula should be calculated as well.

    Returns:
        distances_df (pd.DataFrame): Dataframe with tss distance per variant
    """
    distances = {"snp_id": [], col_name_distance: []}
    saige_distances = []
    #print(variants_df.index)
    
    if gene_start > gene_end:
        gene_start, gene_end = gene_end, gene_start  # Reverse strand handling

    for i, row in variants_df.iterrows():
        if gene_start <= row["Position"] <= gene_end:
            distances["snp_id"].append(i)
            distances[col_name_distance].append(0)  # Variant is within the gene

        # Calculate absolute distances to start and end positions
        distance_to_start = abs(row["Position"] - gene_start)  # upstream
        distance_to_end = abs(row["Position"] - gene_end)  # downstream

        # Get the minimum distance = correct distance
        distance = min(distance_to_start, distance_to_end)
        distances["snp_id"].append(i)
        distances[col_name_distance].append(distance)

        # calculate saige if parameter is set
        if col_name_saige != "":
            distance_saige = np.exp(-1e-5 * distance)
            saige_distances.append(distance_saige)

    # add saige to final dataframe if parameter is set
    if col_name_saige != "":
        distances[col_name_saige] = saige_distances

    # return dataframe with tss distance per variant
    distance_df = pd.DataFrame(distances)
    distance_df.set_index("snp_id", inplace=True)
    #print(distance_df)
    return distance_df


def _find_snps_near_gene(gdata,
                         gene_chrom,
                         gene_start,
                         gene_end,
                         bp_range=10000):
    """
    Finds SNPs within a specified range of a gene's location.

    Parameters:
        gdata (pd.DataFrame): DataFrame with a 'Location' column in the format "chromosome:position".
        gene_chrom, gene_start, gene_end (int)
        bp_range (int): Range in base pairs to search upstream and downstream.

    Returns:
        pd.DataFrame: With data of SNPs within the specified range.
    """
    # Parse the gene location
    #import ipdb; ipdb.set_trace()

    # Extract chromosome and position from the SNPs
    gene_chrom = str(gene_chrom)
    gdata_df = gdata.copy()
    gdata_df[['Chromosome', 'Position']] = gdata_df['Location'].str.split(':', expand=True)
    gdata_df['Position'] = gdata_df['Position'].astype(int)

    # Filter for SNPs within the range
    # snps_in_range = gdata_df[
    #     (gdata_df['Chromosome'] == gene_chrom) &
    #     (gdata_df['Position'] >= gene_start - bp_range) &
    #     (gdata_df['Position'] <= gene_end + bp_range)
    # ]

    if gene_start < gene_end:
        # forward strand
        snps_upstream = gdata_df[
            (gdata_df['Chromosome'] == gene_chrom) &
            (gdata_df['Position'] >= gene_start - bp_range)
        ]
        snps_downstream = gdata_df[
            (gdata_df['Chromosome'] == gene_chrom) &
            (gdata_df['Position'] <= gene_start + bp_range)
        ]
    else:
        # reverse strand
        snps_upstream = gdata_df[
            (gdata_df['Chromosome'] == gene_chrom) &
            (gdata_df['Position'] >= gene_start + bp_range)
        ]
        snps_downstream = gdata_df[
            (gdata_df['Chromosome'] == gene_chrom) &
            (gdata_df['Position'] <= gene_start - bp_range)
        ]

    # return snps_in_range.index
    return snps_upstream, snps_downstream


def _compute_burdens_for_gene(this_gd,
                              this_gene,
                              gene_chrom,
                              gene_start,
                              gene_end,
                              weight_cols,
                              annotation_varm="annotations_0",
                              window_size=100000,
                              DNA_LM_up="",
                              DNA_LM_down="",
                              DNA_LM_mixed="DNA_LM_mixed",
                              GENE_TSS_DISTANCE="",
                              GENE_TSS_DISTANCE_SAIGE=""):
    """
    Compute burdenscores for a given gene and given annotations

    Parameters:
        this_gd (pd.DataFrame): ddata.gdata.
        this_gene (str): Ensemble ID.
        gene_chrom, gene_start, gene_end (int)
        weight_cols (list): colnames of variant annotations to compute burden scores for.
        annotation_varm (str): key for pd.DataFrame (gdata.varm[key])
        window_size (int)
        DNA_LM_up (str): colname for DNA_LM upstream model
                    if empty, mixed model is not computed
        DNA_LM_down (str): colname for DNA_LM downstream model
                    if empty, mixed model is not computed
        DNA_LM_mixed (str): name of mixed model column
        GENE_TSS_DISTANCE (str): name for tss distance column
        GENE_TSS_DISTANCE_SAIGE (str): name for tss distance saige column

    Returns:
        pd.DataFrame containing burden scores for this_gene across the weight_cols
    """
    # Return a dataframe with None entries for weight cols, if gene location is nan (this means it could not be found in ensembl)
    if np.isnan(gene_chrom):
        print(f"Failed to retrieve location for gene {this_gene}. No Burden scores computed.")
        # Create a DataFrame with None for all the weight columns
        empty_burdens = pd.DataFrame(
            None,
            index=this_gd.obs.index,  # Assuming these are the sample indices
            columns=weight_cols
        )
        # Add the Geneid column
        empty_burdens["Geneid"] = this_gene
        return empty_burdens

    # Filter the variants using the SNP location and gene location
    this_vars_up_df, this_vars_down_df = _find_snps_near_gene(this_gd.varm[annotation_varm], gene_chrom, gene_start, gene_end, bp_range=window_size)

    # get snps IDs
    this_vars_down = this_vars_down_df.index
    this_vars_up = this_vars_up_df.index

    gd_gene = this_gd[:, this_vars_up.append(this_vars_down).unique()].copy()

    # if mixed model is computed, add column for DNA_LM mixed model
    if DNA_LM_up != "":
        # Add the "DNA_LM_mixed" column to the annotations_0 DataFrame
        gd_gene.varm["annotations_0"][DNA_LM_mixed] = np.nan  # Initialize the column with NaN

        # Assign values to the "DNA_LM_mixed" column based on the conditions
        gd_gene.varm["annotations_0"].loc[this_vars_up, DNA_LM_mixed] = gd_gene.varm["annotations_0"].loc[this_vars_up, DNA_LM_up]
        gd_gene.varm["annotations_0"].loc[this_vars_down, DNA_LM_mixed] = gd_gene.varm["annotations_0"].loc[this_vars_down, DNA_LM_down]

    if GENE_TSS_DISTANCE_SAIGE != "": # calc GENE_TSS_DISTANCE and GENE_TSS_DISTANCE_SAIGE
        gd_gene.varm["annotations_0"][GENE_TSS_DISTANCE] = np.nan # Initialize the column with NaN
        gd_gene.varm["annotations_0"][GENE_TSS_DISTANCE_SAIGE] = np.nan  # Initialize the column with NaN
        # calculate GENE_TSS_DISTANCE and GENE_TSS_DISTANCE_SAIGE independent of up or downstream
        all_variants = pd.concat([this_vars_up_df, this_vars_down_df], axis=0)
        distances = _calc_tss_distance_per_gene(all_variants, gene_start, gene_end, GENE_TSS_DISTANCE, GENE_TSS_DISTANCE_SAIGE)
        #print(distances)
        #print(distances.index)
        # add GENE_TSS_DISTANCE and GENE_TSS_DISTANCE_SAIGE to annotation 0
        gd_gene.varm["annotations_0"].loc[distances.index, GENE_TSS_DISTANCE] = distances[GENE_TSS_DISTANCE]
        gd_gene.varm["annotations_0"].loc[distances.index, GENE_TSS_DISTANCE_SAIGE] = distances[GENE_TSS_DISTANCE_SAIGE]
    elif GENE_TSS_DISTANCE != "":  # calc only GENE_TSS_DISTANCE
        gd_gene.varm["annotations_0"][GENE_TSS_DISTANCE] = np.nan  # Initialize the column with NaN
        # calculate tss distance independent of up or downstream
        all_variants = pd.concat([this_vars_up_df, this_vars_down_df])
        tss_distances = _calc_tss_distance_per_gene(all_variants, gene_start, gene_end, GENE_TSS_DISTANCE)
        # add to gd_gene
        gd_gene.varm["annotations_0"].loc[tss_distances.index, GENE_TSS_DISTANCE] = tss_distances[GENE_TSS_DISTANCE]

    all_burdens_this_gene = []
    for weight_col in weight_cols:
        this_burden = _get_burden(gd_gene, weight_col)
        all_burdens_this_gene.append(this_burden)

    all_burdens_this_gene = np.stack(all_burdens_this_gene, axis=1)
    all_burdens_this_gene = pd.DataFrame(all_burdens_this_gene, index=gd_gene.obs.index, columns=weight_cols)
    all_burdens_this_gene["Geneid"] = this_gene

    return all_burdens_this_gene


In [59]:
this_gd = gdata[:, gdata.var["maf"] < 0.05]

In [68]:
_compute_burdens_for_gene(this_gd, "ENSG00000233866", 22, 15915800.0, 15914721.0, ["GENE_TSS_DISTANCE","GENE_TSS_DISTANCE_SAIGE"], "annotations_0", 10000,GENE_TSS_DISTANCE="GENE_TSS_DISTANCE", GENE_TSS_DISTANCE_SAIGE="GENE_TSS_DISTANCE_SAIGE")

Unnamed: 0_level_0,GENE_TSS_DISTANCE,GENE_TSS_DISTANCE_SAIGE,Geneid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1_1,5.974337e+10,0.000005,ENSG00000233866
2_2,6.255511e+10,0.000030,ENSG00000233866
3_3,5.302204e+10,0.000003,ENSG00000233866
4_4,6.431704e+10,0.000004,ENSG00000233866
6_6,5.468808e+10,0.000004,ENSG00000233866
...,...,...,...
1096_845_2,5.767224e+10,0.000003,ENSG00000233866
1100_914_2,5.312065e+10,0.000004,ENSG00000233866
1102_932_2,6.263424e+10,0.000003,ENSG00000233866
1103_926_2,6.648737e+10,0.000004,ENSG00000233866


In [70]:
_compute_burdens_for_gene(this_gd, "ENSG00000233866", 22, 15915800.0, 15914721.0, ["GENE_TSS_DISTANCE"], "annotations_0", 10000,GENE_TSS_DISTANCE="GENE_TSS_DISTANCE")

Unnamed: 0_level_0,GENE_TSS_DISTANCE,Geneid
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1_1,5.974337e+10,ENSG00000233866
2_2,6.255511e+10,ENSG00000233866
3_3,5.302204e+10,ENSG00000233866
4_4,6.431704e+10,ENSG00000233866
6_6,5.468808e+10,ENSG00000233866
...,...,...
1096_845_2,5.767224e+10,ENSG00000233866
1100_914_2,5.312065e+10,ENSG00000233866
1102_932_2,6.263424e+10,ENSG00000233866
1103_926_2,6.648737e+10,ENSG00000233866


In [71]:
_compute_burdens_for_gene(this_gd, "ENSG00000233866", 22, 15915800.0, 15914721.0, ["GENE_TSS_DISTANCE_SAIGE"], "annotations_0", 10000,GENE_TSS_DISTANCE_SAIGE="GENE_TSS_DISTANCE_SAIGE")

Unnamed: 0_level_0,GENE_TSS_DISTANCE_SAIGE,Geneid
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1_1,0.000005,ENSG00000233866
2_2,0.000030,ENSG00000233866
3_3,0.000003,ENSG00000233866
4_4,0.000004,ENSG00000233866
6_6,0.000004,ENSG00000233866
...,...,...
1096_845_2,0.000003,ENSG00000233866
1100_914_2,0.000004,ENSG00000233866
1102_932_2,0.000003,ENSG00000233866
1103_926_2,0.000004,ENSG00000233866


In [21]:
def compute_burdens(ddata,
                    max_af=0.05,
                    weight_cols=["DISTANCE", "CADD_PHRED"],
                    annotations_varm="annotations_0",
                    window_size=100000,
                    DNA_LM_up="",
                    DNA_LM_down="",
                    DNA_LM_mixed="DNA_LM_mixed",
                    GENE_TSS_DISTANCE="",
                    GENE_TSS_DISTANCE_SAIGE=""):
    """Compute gene burdens per gene and sample using different variant annotations.

    The genotype data is restricted to variants with ``maf < max_af`` and the
    per-gene work is delegated to ``_compute_burdens_for_gene``; the per-gene
    results are concatenated into one frame.

    NOTE(review): the gene loop currently iterates over ``this_ad.var.index[0:1]``,
    i.e. only the FIRST gene is processed. This looks like a leftover debugging
    restriction; widen the slice to run on every gene.

    Parameters
    ----------
    ddata : object with ``.gdata`` and ``.adata`` attributes
        Donor-level container (e.g. the ``cl.DonorData`` built in this notebook).
        ``ddata.gdata.var`` must contain a ``"maf"`` column. ``ddata.adata.var`` is
        indexed by gene id; its ``"chromosome"``, ``"start"`` and ``"end"`` columns
        are used when all three are present, otherwise they are filled in via
        ``_get_gene_location``. Both objects are copied, so ``ddata`` is not modified.
    max_af : float, optional
        maximum variant minor allele frequency (strict ``<``), by default 0.05
    weight_cols : list, optional
        variant annotations used for weighting (columns of
        ``gdata.varm[annotations_varm]``), by default ["DISTANCE", "CADD_PHRED"].
        The list is copied internally and never modified in place.
    annotations_varm : str, optional
        key for gdata.varm dataframe (eg: gdata.varm["annotations_0"])
    window_size : int, optional
        range around gene TSS, in which variants are regarded for gene burden
        scores (forwarded to ``_compute_burdens_for_gene``)
    DNA_LM_up : str, optional
        colname of DNA_LM score upstream model; must be set together with
        ``DNA_LM_down`` to request the DNA_LM mixed model
    DNA_LM_down : str, optional
        colname of DNA_LM score downstream model; must be set together with
        ``DNA_LM_up`` to request the DNA_LM mixed model
    DNA_LM_mixed : str, optional
        colname of DNA_LM score mixed model
    GENE_TSS_DISTANCE : str, optional
        colname for TSS distances and also flag to compute its burden
    GENE_TSS_DISTANCE_SAIGE : str, optional
        colname for TSS distances using saige formula and also flag to compute its
        burden. When set without ``GENE_TSS_DISTANCE``, the name
        ``"GENE_TSS_DISTANCE"`` is forwarded to the helper but is NOT added to the
        weight columns.

    Returns
    -------
    pandas.DataFrame
        concatenation of the per-gene frames returned by
        ``_compute_burdens_for_gene``

    Raises
    ------
    ValueError
        if exactly one of ``DNA_LM_up`` / ``DNA_LM_down`` is set
    """
    # Exactly one of the two DNA_LM column names being set is an error.
    if (DNA_LM_up == "") != (DNA_LM_down == ""):
        raise ValueError("If you want to compute the burden scores using the DNA_LM mixed model, you must set both DNA_LM_up and DNA_LM_down, else leave both empty.")

    # Work on a private copy: the columns appended below must not leak into the
    # caller's list nor into the shared default argument (which would otherwise
    # keep growing from one call to the next).
    weight_cols = list(weight_cols)

    this_gd = ddata.gdata.copy()
    this_ad = ddata.adata.copy()
    this_gd = this_gd[:, this_gd.var["maf"] < max_af]
    all_burdens = []

    if not all(col in this_ad.var.columns for col in ["chromosome", "start", "end"]):
        # compute all the gene locations
        this_ad.var[['chromosome', 'start', 'end']] = this_ad.var.index.to_series().apply(
            lambda x: pd.Series(_get_gene_location(x))
        )

    # add mixed model to the weight cols for which burden score is computed;
    # the up/down columns are only added inside this branch
    if DNA_LM_up != "" and DNA_LM_down != "" and DNA_LM_mixed not in weight_cols:
        weight_cols.append(DNA_LM_mixed)
        if DNA_LM_up not in weight_cols:
            weight_cols.append(DNA_LM_up)
        if DNA_LM_down not in weight_cols:
            weight_cols.append(DNA_LM_down)

    # add tss distance to the weight cols to initialize burden score computations
    if GENE_TSS_DISTANCE != "" and GENE_TSS_DISTANCE not in weight_cols:
        weight_cols.append(GENE_TSS_DISTANCE)

    # add tss distance saige to the weight cols to initialize burden score computations
    if GENE_TSS_DISTANCE_SAIGE != "":
        if GENE_TSS_DISTANCE_SAIGE not in weight_cols:
            weight_cols.append(GENE_TSS_DISTANCE_SAIGE)
        if GENE_TSS_DISTANCE == "":  # if saige is set then tss distance has to be calculated too
            GENE_TSS_DISTANCE = "GENE_TSS_DISTANCE"

    # NOTE(review): `[0:1]` limits the run to the first gene only (see docstring).
    for gene in tqdm(this_ad.var.index[0:1]):
        gene_chrom = int(this_ad.var.loc[gene, "chromosome"])
        gene_start = int(this_ad.var.loc[gene, "start"])
        gene_end = int(this_ad.var.loc[gene, "end"])

        this_b = _compute_burdens_for_gene(this_gd, gene, gene_chrom, gene_start, gene_end, weight_cols, annotations_varm, window_size, DNA_LM_up, DNA_LM_down, DNA_LM_mixed, GENE_TSS_DISTANCE, GENE_TSS_DISTANCE_SAIGE)
        all_burdens.append(this_b)

    all_burdens = pd.concat(all_burdens)

    return all_burdens

In [10]:
data = cl.DonorData(adata=scdata, gdata=gdata, donor_key_in_sc_adata="individual")

[2025-01-19 18:33:15,519] INFO:cellink._core.donordata: Keeping 981/1034 donors
[2025-01-19 18:33:15,520] INFO:cellink._core.donordata: Dropping 53/1034 donors from genetic data
[2025-01-19 18:33:15,520] INFO:cellink._core.donordata: Dropping 0/981 donors from single-cell data


In [20]:
compute_burdens(data, max_af=0.05, weight_cols=["CADD_PHRED", "DNA_LM_up", "DNA_LM_down", "MAF_beta_1.25"], window_size=100000, DNA_LM_up="DNA_LM_up", DNA_LM_down="DNA_LM_down",GENE_TSS_DISTANCE="GENE_TSS_DISTANCE", GENE_TSS_DISTANCE_SAIGE="GENE_TSS_DISTANCE_SAIGE")

ENSG00000233866


  0%|          | 0/1 [00:00<?, ?it/s]

ENSG00000233866



00%|██████████| 1/1 [00:05<00:00,  5.22s/it]

Unnamed: 0_level_0,CADD_PHRED,DNA_LM_up,DNA_LM_down,MAF_beta_1.25,DNA_LM_mixed,GENE_TSS_DISTANCE,GENE_TSS_DISTANCE_SAIGE,Geneid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1_1,9350.801,13.224558,22.916096,43612.917035,13.224558,5.974337e+10,0.000005,ENSG00000233866
2_2,10775.218,16.520484,28.387657,52127.930292,16.520484,6.255511e+10,0.000030,ENSG00000233866
3_3,8327.426,12.240323,21.604692,39069.213569,12.240323,5.302204e+10,0.000003,ENSG00000233866
4_4,9439.675,18.346526,32.699742,44983.557785,18.346526,6.431704e+10,0.000004,ENSG00000233866
6_6,9054.896,13.340017,23.019223,42525.702648,13.340017,5.468808e+10,0.000004,ENSG00000233866
...,...,...,...,...,...,...,...,...
1077_1078,9227.155,15.636985,27.382034,42677.295555,15.636985,5.507287e+10,0.000006,ENSG00000233866
1078_1079,9411.798,14.626046,25.363997,44142.005875,14.626046,6.204754e+10,0.000096,ENSG00000233866
1079_1080,9862.022,13.470969,23.919521,44060.525919,13.470969,6.135349e+10,0.000005,ENSG00000233866
1080_1081,8392.390,13.858525,24.546299,40866.836483,13.858525,5.485095e+10,0.000003,ENSG00000233866


In [22]:
compute_burdens(data, max_af=0.05, weight_cols=["CADD_PHRED", "DNA_LM_up", "DNA_LM_down", "MAF_beta_1.25"], window_size=100000, DNA_LM_up="DNA_LM_up", DNA_LM_down="DNA_LM_down",GENE_TSS_DISTANCE="GENE_TSS_DISTANCE")


00%|██████████| 1/1 [00:04<00:00,  4.84s/it]

Unnamed: 0_level_0,CADD_PHRED,DNA_LM_up,DNA_LM_down,MAF_beta_1.25,DNA_LM_mixed,GENE_TSS_DISTANCE,Geneid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1_1,9350.801,13.224558,22.916096,43612.917035,13.224558,5.974337e+10,ENSG00000233866
2_2,10775.218,16.520484,28.387657,52127.930292,16.520484,6.255511e+10,ENSG00000233866
3_3,8327.426,12.240323,21.604692,39069.213569,12.240323,5.302204e+10,ENSG00000233866
4_4,9439.675,18.346526,32.699742,44983.557785,18.346526,6.431704e+10,ENSG00000233866
6_6,9054.896,13.340017,23.019223,42525.702648,13.340017,5.468808e+10,ENSG00000233866
...,...,...,...,...,...,...,...
1077_1078,9227.155,15.636985,27.382034,42677.295555,15.636985,5.507287e+10,ENSG00000233866
1078_1079,9411.798,14.626046,25.363997,44142.005875,14.626046,6.204754e+10,ENSG00000233866
1079_1080,9862.022,13.470969,23.919521,44060.525919,13.470969,6.135349e+10,ENSG00000233866
1080_1081,8392.390,13.858525,24.546299,40866.836483,13.858525,5.485095e+10,ENSG00000233866


In [23]:
compute_burdens(data, max_af=0.05, weight_cols=["CADD_PHRED", "DNA_LM_up", "DNA_LM_down", "MAF_beta_1.25"], window_size=100000, DNA_LM_up="DNA_LM_up", DNA_LM_down="DNA_LM_down", GENE_TSS_DISTANCE_SAIGE="GENE_TSS_DISTANCE_SAIGE")


00%|██████████| 1/1 [00:04<00:00,  4.95s/it]

Unnamed: 0_level_0,CADD_PHRED,DNA_LM_up,DNA_LM_down,MAF_beta_1.25,DNA_LM_mixed,GENE_TSS_DISTANCE_SAIGE,Geneid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1_1,9350.801,13.224558,22.916096,43612.917035,13.224558,0.000005,ENSG00000233866
2_2,10775.218,16.520484,28.387657,52127.930292,16.520484,0.000030,ENSG00000233866
3_3,8327.426,12.240323,21.604692,39069.213569,12.240323,0.000003,ENSG00000233866
4_4,9439.675,18.346526,32.699742,44983.557785,18.346526,0.000004,ENSG00000233866
6_6,9054.896,13.340017,23.019223,42525.702648,13.340017,0.000004,ENSG00000233866
...,...,...,...,...,...,...,...
1077_1078,9227.155,15.636985,27.382034,42677.295555,15.636985,0.000006,ENSG00000233866
1078_1079,9411.798,14.626046,25.363997,44142.005875,14.626046,0.000096,ENSG00000233866
1079_1080,9862.022,13.470969,23.919521,44060.525919,13.470969,0.000005,ENSG00000233866
1080_1081,8392.390,13.858525,24.546299,40866.836483,13.858525,0.000003,ENSG00000233866


In [24]:
test = pd.read_csv(base_data_dir/"output/example_output/burden_CellTypeCD8-NC_Chrom22.csv")
test

Unnamed: 0,burden_gene,target_gene,burden_type,pvalue,beta,cell_type,chrom
0,ENSG00000100181,ENSG00000100181,DISTANCE,0.541026,0.000362,CD8 NC,22
1,ENSG00000100181,ENSG00000100181,CADD_PHRED,0.285429,0.000635,CD8 NC,22
2,ENSG00000100181,ENSG00000100181,DNA_LM_influence_score,0.341302,0.000566,CD8 NC,22
3,ENSG00000100181,ENSG00000100181,MAF_beta_1.25,0.365920,0.000537,CD8 NC,22
4,ENSG00000237438,ENSG00000237438,DISTANCE,0.321480,0.000314,CD8 NC,22
...,...,...,...,...,...,...,...
1867,ENSG00000100299,ENSG00000100299,MAF_beta_1.25,0.958578,-0.000063,CD8 NC,22
1868,ENSG00000079974,ENSG00000079974,DISTANCE,1.000000,,CD8 NC,22
1869,ENSG00000079974,ENSG00000079974,CADD_PHRED,1.000000,,CD8 NC,22
1870,ENSG00000079974,ENSG00000079974,DNA_LM_influence_score,1.000000,,CD8 NC,22


In [25]:
all_res = pd.read_pickle(base_data_dir/"output/all_results_DNA_LM_and_MAF.pkl")

In [26]:
all_res

Unnamed: 0,burden_gene,target_gene,burden_type,pvalue,beta,cell_type,chrom,pvalue_corrected,significant
0,ENSG00000100181,ENSG00000100181,DISTANCE,0.766616,0.000125,CD8 ET,22,19376.982274,False
1,ENSG00000100181,ENSG00000100181,CADD_PHRED,0.840065,-0.000085,CD8 ET,22,21233.491502,False
2,ENSG00000100181,ENSG00000100181,DNA_LM_influence_score,0.596951,-0.000224,CD8 ET,22,15088.544000,False
3,ENSG00000100181,ENSG00000100181,MAF_beta_1.25,0.896485,-0.000055,CD8 ET,22,22659.563673,False
0,ENSG00000237438,ENSG00000237438,DISTANCE,0.521166,0.000112,CD8 ET,22,13172.988603,False
...,...,...,...,...,...,...,...,...,...
3,ENSG00000100288,ENSG00000100288,MAF_beta_1.25,0.093014,-0.030436,Erythrocytes,22,2351.011086,False
0,ENSG00000079974,ENSG00000079974,DISTANCE,1.000000,,Erythrocytes,22,25276.000000,False
1,ENSG00000079974,ENSG00000079974,CADD_PHRED,1.000000,,Erythrocytes,22,25276.000000,False
2,ENSG00000079974,ENSG00000079974,DNA_LM_influence_score,1.000000,,Erythrocytes,22,25276.000000,False


In [26]:
# Burdens precomputed for the 10-gene test set; the path is built from
# base_data_dir like the other load cells instead of repeating the absolute prefix.
all_burdens = pd.read_parquet(base_data_dir / "output/burdens/chr22_all_burdens_test_10_genes.parquet")

In [44]:
all_burdens

Unnamed: 0_level_0,CADD_PHRED,DNA_LM_up,DNA_LM_down,MAF_beta_1.25,DNA_LM_mixed,GENE_TSS_DISTANCE,GENE_TSS_DISTANCE_SAIGE,Geneid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1_1,9350.801,13.224558,22.916096,43612.917035,13.224558,5.974337e+10,0.000005,ENSG00000233866
2_2,10775.218,16.520484,28.387657,52127.930292,16.520484,6.255511e+10,0.000030,ENSG00000233866
3_3,8327.426,12.240323,21.604692,39069.213569,12.240323,5.302204e+10,0.000003,ENSG00000233866
4_4,9439.675,18.346526,32.699742,44983.557785,18.346526,6.431704e+10,0.000004,ENSG00000233866
6_6,9054.896,13.340017,23.019223,42525.702648,13.340017,5.468808e+10,0.000004,ENSG00000233866
...,...,...,...,...,...,...,...,...
1077_1078,9227.155,15.636985,27.382034,42677.295555,15.636985,5.295985e+10,0.006536,ENSG00000267338
1078_1079,9411.798,14.626046,25.363997,44142.005875,14.626046,5.983927e+10,0.105210,ENSG00000267338
1079_1080,9862.022,13.470969,23.919521,44060.525919,13.470969,5.902827e+10,0.005629,ENSG00000267338
1080_1081,8392.390,13.858525,24.546299,40866.836483,13.858525,5.283878e+10,0.003227,ENSG00000267338


In [None]:
# Reload the genotype principal components (space-separated, no header row) and
# keep only the rows whose index also appears in the burden table.
# The path is built from base_data_dir instead of repeating the absolute prefix.
eigenvec = pd.read_csv(base_data_dir / "input_data/pcdir/wgs.dose.filtered.R2_0.8.filtered.pruned.eigenvec", sep=' ', header=None)
eigenvec.index = eigenvec[1]  # column 1 becomes the row index -- presumably the individual id, TODO confirm
eigenvec = eigenvec.iloc[:, 2:]  # drop the first two columns, keep the remaining ones
eigenvec = eigenvec[eigenvec.index.isin(all_burdens.index.unique())]

In [18]:
#data = pd.read_pickle("/s/project/sys_gen_students/2024_2025/project04_rare_variant_sc/output/pb_data_object_test_before.pkl")

In [41]:
# Load the pickled data object for the 10-gene test run.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
# NOTE(review): this rebinds `data`, which an earlier cell assigns to a freshly built
# cl.DonorData; later cells depend on which of the two assignments ran last.
data = pd.read_pickle(base_data_dir / "output/annotations/chr22_data_test_10_genes.pkl")

In [42]:
data.adata.var_names

Index(['ENSG00000233866', 'ENSG00000225255', 'ENSG00000198062',
       'ENSG00000236666', 'ENSG00000230471', 'ENSG00000130538',
       'ENSG00000273362', 'ENSG00000198445', 'ENSG00000100181',
       'ENSG00000267338',
       ...
       'ENSG00000254413', 'ENSG00000100288', 'ENSG00000205559',
       'ENSG00000008735', 'ENSG00000100299', 'ENSG00000251322',
       'ENSG00000225929', 'ENSG00000100312', 'ENSG00000254499',
       'ENSG00000079974'],
      dtype='object', name='Geneid', length=655)

In [24]:
target_genes = ['ENSG00000233866', 'ENSG00000225255', 'ENSG00000198062', 'ENSG00000236666',
 'ENSG00000230471', 'ENSG00000130538', 'ENSG00000273362', 'ENSG00000198445',
 'ENSG00000100181', 'ENSG00000267338']

In [35]:
scdata_22 = scdata[:, scdata.var["chromosome"] == "22"]

In [37]:
def preprocess_scdata(scdata):
    """Return a normalized, log-transformed copy of ``scdata``.

    The input is copied first, so a view passed in is never modified. On the
    copy, total counts per cell are normalized to a target sum of 1e4 and a
    log1p transform is applied afterwards.
    """
    processed = scdata.copy()
    # Each step modifies `processed` in place; order matters (normalize, then log).
    pipeline = (
        (sc.pp.normalize_total, {"target_sum": 1e4}),
        (sc.pp.log1p, {}),
    )
    for step, options in pipeline:
        step(processed, **options)
    return processed

scdata_22_pp = preprocess_scdata(scdata_22)

In [43]:
# Sanity check: print every target gene that is present among the gene ids
# (var_names) of the loaded data object's expression matrix.
for gene in target_genes:
    if gene in data.adata.var_names:
        print(gene)

ENSG00000233866
ENSG00000225255
ENSG00000198062
ENSG00000236666
ENSG00000230471
ENSG00000130538
ENSG00000273362
ENSG00000198445
ENSG00000100181
ENSG00000267338


In [None]:
gdata.uns["gene_burdens"]

## Plotting

In [6]:
result_500_path = base_data_dir / "output/association/chr22_all_results_test_500_genes.pkl"

In [7]:
result_500 = pd.read_pickle(result_500_path)
result_500

Unnamed: 0,burden_gene,target_gene,burden_type,pvalue,beta,cell_type,chromosome
0,ENSG00000100181,ENSG00000100181,CADD_PHRED,0.410408,-0.173248,CD8 ET,22
1,ENSG00000100181,ENSG00000100181,DNA_LM_up,0.455557,-0.157398,CD8 ET,22
2,ENSG00000100181,ENSG00000100181,DNA_LM_down,0.405714,-0.175694,CD8 ET,22
3,ENSG00000100181,ENSG00000100181,MAF_beta_1.25,0.438648,-0.164011,CD8 ET,22
4,ENSG00000100181,ENSG00000100181,DNA_LM_mixed,0.455557,-0.157398,CD8 ET,22
...,...,...,...,...,...,...,...
2,ENSG00000100138,ENSG00000100138,DNA_LM_down,0.805454,-0.001872,CD8 NC,22
3,ENSG00000100138,ENSG00000100138,MAF_beta_1.25,0.706450,-0.002880,CD8 NC,22
4,ENSG00000100138,ENSG00000100138,DNA_LM_mixed,0.545695,-0.004589,CD8 NC,22
5,ENSG00000100138,ENSG00000100138,GENE_TSS_DISTANCE,0.408981,-0.006333,CD8 NC,22


In [8]:
result_test_10_path = base_data_dir / "output/association/chr22_all_results_test_10_genes.pkl"
result_test_10 = pd.read_pickle(result_test_10_path)
result_test_10

Unnamed: 0,burden_gene,target_gene,burden_type,pvalue,beta,cell_type,chromosome
0,ENSG00000100181,ENSG00000100181,CADD_PHRED,0.410408,-0.173248,CD8 ET,22
1,ENSG00000100181,ENSG00000100181,DNA_LM_up,0.455557,-0.157398,CD8 ET,22
2,ENSG00000100181,ENSG00000100181,DNA_LM_down,0.405714,-0.175694,CD8 ET,22
3,ENSG00000100181,ENSG00000100181,MAF_beta_1.25,0.438648,-0.164011,CD8 ET,22
4,ENSG00000100181,ENSG00000100181,DNA_LM_mixed,0.455557,-0.157398,CD8 ET,22
5,ENSG00000100181,ENSG00000100181,GENE_TSS_DISTANCE,0.072062,-0.3789,CD8 ET,22
6,ENSG00000100181,ENSG00000100181,GENE_TSS_DISTANCE_SAIGE,0.086121,0.361357,CD8 ET,22
0,ENSG00000100181,ENSG00000100181,CADD_PHRED,0.326078,0.213591,CD8 NC,22
1,ENSG00000100181,ENSG00000100181,DNA_LM_up,0.197995,0.278324,CD8 NC,22
2,ENSG00000100181,ENSG00000100181,DNA_LM_down,0.186317,0.286095,CD8 NC,22


In [9]:
result_10_path = base_data_dir / "output/association/chr22_all_results_10_genes.pkl"
result_10 = pd.read_pickle(result_10_path)
result_10

Unnamed: 0,burden_gene,target_gene,burden_type,pvalue,beta,cell_type,chromosome
0,ENSG00000100181,ENSG00000100181,CADD_PHRED,0.410408,-0.173248,CD8 ET,22
1,ENSG00000100181,ENSG00000100181,DNA_LM_up,0.455557,-0.157398,CD8 ET,22
2,ENSG00000100181,ENSG00000100181,DNA_LM_down,0.405714,-0.175694,CD8 ET,22
3,ENSG00000100181,ENSG00000100181,MAF_beta_1.25,0.438648,-0.164011,CD8 ET,22
4,ENSG00000100181,ENSG00000100181,DNA_LM_mixed,0.455557,-0.157398,CD8 ET,22
5,ENSG00000100181,ENSG00000100181,GENE_TSS_DISTANCE,0.072062,-0.3789,CD8 ET,22
6,ENSG00000100181,ENSG00000100181,GENE_TSS_DISTANCE_SAIGE,0.086121,0.361357,CD8 ET,22
0,ENSG00000100181,ENSG00000100181,CADD_PHRED,0.326078,0.213591,CD8 NC,22
1,ENSG00000100181,ENSG00000100181,DNA_LM_up,0.197995,0.278324,CD8 NC,22
2,ENSG00000100181,ENSG00000100181,DNA_LM_down,0.186317,0.286095,CD8 NC,22


In [10]:
CD8_ET_Chrom22_path = base_data_dir / "output/association/dump/burden_CellTypeCD8-ET_Chrom22.csv"
CD8_ET_Chrom22 = pd.read_csv(CD8_ET_Chrom22_path)
CD8_ET_Chrom22

Unnamed: 0,burden_gene,target_gene,burden_type,pvalue,beta,cell_type,chromosome
0,ENSG00000100181,ENSG00000100181,CADD_PHRED,0.410408,-0.173248,CD8 ET,22
1,ENSG00000100181,ENSG00000100181,DNA_LM_up,0.455557,-0.157398,CD8 ET,22
2,ENSG00000100181,ENSG00000100181,DNA_LM_down,0.405714,-0.175694,CD8 ET,22
3,ENSG00000100181,ENSG00000100181,MAF_beta_1.25,0.438648,-0.164011,CD8 ET,22
4,ENSG00000100181,ENSG00000100181,DNA_LM_mixed,0.455557,-0.157398,CD8 ET,22
...,...,...,...,...,...,...,...
3229,ENSG00000079974,ENSG00000079974,DNA_LM_down,0.853328,0.015616,CD8 ET,22
3230,ENSG00000079974,ENSG00000079974,MAF_beta_1.25,0.984727,-0.001621,CD8 ET,22
3231,ENSG00000079974,ENSG00000079974,DNA_LM_mixed,0.858854,0.015022,CD8 ET,22
3232,ENSG00000079974,ENSG00000079974,GENE_TSS_DISTANCE,0.969589,0.003215,CD8 ET,22


In [11]:
burdens_500_path = base_data_dir / "output/burdens/chr22_all_burdens_test_500_genes.parquet"
burdens_500 = pd.read_parquet(burdens_500_path)
burdens_500

Unnamed: 0_level_0,CADD_PHRED,DNA_LM_up,DNA_LM_down,MAF_beta_1.25,DNA_LM_mixed,GENE_TSS_DISTANCE,GENE_TSS_DISTANCE_SAIGE,Geneid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1_1,9350.801,13.224558,22.916096,43612.917035,13.224558,5.974337e+10,0.000005,ENSG00000233866
2_2,10775.218,16.520484,28.387657,52127.930292,16.520484,6.255511e+10,0.000030,ENSG00000233866
3_3,8327.426,12.240323,21.604692,39069.213569,12.240323,5.302204e+10,0.000003,ENSG00000233866
4_4,9439.675,18.346526,32.699742,44983.557785,18.346526,6.431704e+10,0.000004,ENSG00000233866
6_6,9054.896,13.340017,23.019223,42525.702648,13.340017,5.468808e+10,0.000004,ENSG00000233866
...,...,...,...,...,...,...,...,...
1077_1078,9205.759,15.563891,27.192362,42441.879194,24.147251,3.164140e+10,4.141776,ENSG00000100138
1078_1079,9408.366,14.626046,25.363997,44120.015877,21.432226,3.088100e+10,2.944592,ENSG00000100138
1079_1080,9847.835,13.436251,23.860703,43963.871811,21.113776,3.274119e+10,4.741015,ENSG00000100138
1080_1081,8382.267,13.858525,24.546299,40791.120729,21.272972,2.853632e+10,3.845786,ENSG00000100138


In [None]:
burdens_500

In [12]:
burdens_10_path = base_data_dir / "output/chr22_computed_burdens_test_10_genes.pkl"
burdens_10 = pd.read_pickle(burdens_10_path)
burdens_10

Unnamed: 0_level_0,CADD_PHRED,DNA_LM_up,DNA_LM_down,MAF_beta_1.25,DNA_LM_mixed,Geneid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1_1,9350.801,13.224558,22.916096,43612.917035,13.224558,ENSG00000233866
2_2,10775.218,16.520484,28.387657,52127.930292,16.520484,ENSG00000233866
3_3,8327.426,12.240323,21.604692,39069.213569,12.240323,ENSG00000233866
4_4,9439.675,18.346526,32.699742,44983.557785,18.346526,ENSG00000233866
6_6,9054.896,13.340017,23.019223,42525.702648,13.340017,ENSG00000233866
...,...,...,...,...,...,...
1077_1078,9227.155,15.636985,27.382034,42677.295555,15.636985,ENSG00000267338
1078_1079,9411.798,14.626046,25.363997,44142.005875,14.626046,ENSG00000267338
1079_1080,9862.022,13.470969,23.919521,44060.525919,13.470969,ENSG00000267338
1080_1081,8392.390,13.858525,24.546299,40866.836483,13.858525,ENSG00000267338


In [14]:
burdens_old_path = base_data_dir / "output/all_results_DNA_LM_and_MAF_100k.pkl"
burdens_old = pd.read_pickle(burdens_old_path)
burdens_old

Unnamed: 0_level_0,DISTANCE,CADD_PHRED,DNA_LM_influence_score,MAF_beta_1.25,Geneid
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1_1,0.0,0.000,0.000000,0.000000,ENSG00000198445
2_2,0.0,0.000,0.000000,0.000000,ENSG00000198445
3_3,0.0,0.000,0.000000,0.000000,ENSG00000198445
4_4,0.0,0.000,0.000000,0.000000,ENSG00000198445
6_6,0.0,0.000,0.000000,0.000000,ENSG00000198445
...,...,...,...,...,...
1077_1078,13629.0,61.807,0.342296,186.380891,ENSG00000079974
1078_1079,54694.0,330.785,1.901447,1011.361912,ENSG00000079974
1079_1080,11470.0,50.978,0.202178,118.862321,ENSG00000079974
1080_1081,10698.0,45.424,0.155111,98.863939,ENSG00000079974


## Check OneK1K data from download compared to what we got

In [4]:
download_adata = sc.read_h5ad(base_data_dir/"input_data/08984b3c-3189-4732-be22-62f1fe8f15a4.h5ad")

In [None]:
download_adata