In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import argparse
from pathlib import Path
import os
import pandas as pd
import scanpy as sc
import cellink as cl
from cellink.tl._burden_testing import *
from scipy.stats import beta

  from .autonotebook import tqdm as notebook_tqdm


# TEST

In [6]:
# Chromosome whose genotype store is loaded below; change it here to load another one.
CHROMOSOME = 22

# Project root and the single-cell expression matrix.
base_data_dir = Path("/s/project/sys_gen_students/2024_2025/project04_rare_variant_sc/")
scdata_path = base_data_dir / "input_data/OneK1K_cohort_gene_expression_matrix_14_celltypes.h5ad.gz"
# NOTE(review): gdata_dir uses a different absolute root than base_data_dir —
# confirm that both refer to the same project tree.
gdata_dir = "/data/ceph/hdd/project/node_09/sys_gen_students/2024_2025/project04_rare_variant_sc/input_data/filter_vcf_r08/"

# Variant-level annotation tables (DNA language-model influence scores and VEP output).
DNA_LM_upstream = base_data_dir / "input_data/annotations/onek1k_inf_scores_upstream_model.tsv"
DNA_LM_downstream = base_data_dir / "input_data/annotations/onek1k_inf_scores_downstream_model.tsv"
vep_scores = base_data_dir / "input_data/annotations/onek1k1_all_variants_annotated_vep.txt"

# Per-chromosome genotype store; the file name is derived from CHROMOSOME.
zarr_file = os.path.join(gdata_dir, f"chr{CHROMOSOME}.dose.filtered.R2_0.8.vcz")

# Load the space-separated eigenvector table, the expression data and the genotypes.
eigenvec = pd.read_csv(base_data_dir / "input_data/pcdir/wgs.dose.filtered.R2_0.8.filtered.pruned.eigenvec", sep=" ")
scdata = sc.read_h5ad(scdata_path)
gdata = cl.io.read_sgkit_zarr(zarr_file)

In [7]:
def add_maf_annotation(gdata):
    """Add a Beta(1, 25) density weight of the minor allele frequency per variant.

    Reads ``gdata.var["maf"]``, evaluates the Beta(a=1, b=25) probability
    density at each value and stores the result in the ``"MAF_beta_1.25"``
    column of ``gdata.varm["annotations_0"]``. The density decreases with
    increasing MAF, so rarer variants receive larger weights. The column
    label reads "1.25" although the parameters are a=1 and b=25.

    Parameters
    ----------
    gdata
        Object exposing ``var`` (with a ``"maf"`` column) and
        ``varm["annotations_0"]``; it is modified in place.

    Returns
    -------
    The same ``gdata`` object.
    """
    maf_values = gdata.var["maf"]
    gdata.varm["annotations_0"]["MAF_beta_1.25"] = beta.pdf(maf_values, 1, 25)
    return gdata

def add_DNA_LM(gdata, file, chromosome, colname):
    """Attach DNA language-model influence scores to ``gdata.varm["annotations_0"]``.

    The score table is read from ``file``, indexed by SNP id via
    ``add_snp_id``, restricted to the requested chromosome, and its ids are
    aligned to the annotation index via ``reverse_and_update_snp_ids`` (which
    also tries the ref/alt-swapped id and prints a report). The
    ``"influence_score"`` values are then written to a new column, aligned on
    the annotation index; variants without a score get NaN.

    Parameters
    ----------
    gdata
        Object whose ``varm["annotations_0"]`` is a DataFrame indexed by SNP
        id; it is modified in place.
    file
        Path to a tab-separated table with the columns ``Chromosome``,
        ``pos``, ``ref``, ``alt`` and ``influence_score``.
    chromosome
        Chromosome label without the ``"chr"`` prefix; rows are kept where
        ``Chromosome == f"chr{chromosome}"``.
    colname
        Name of the annotation column to create.

    Returns
    -------
    The same ``gdata`` object.
    """
    annotations = gdata.varm["annotations_0"]

    scores = pd.read_csv(file, sep="\t")
    scores = add_snp_id(scores)
    # Boolean filtering returns a new frame, so re-indexing it below does not
    # touch the full table that was read from disk.
    scores = scores[scores["Chromosome"] == f"chr{chromosome}"]
    scores = reverse_and_update_snp_ids(annotations, scores)

    # The column is created directly under its final name. The former
    # rename of "influence_score" was a leftover no-op that would have
    # produced a duplicate column if such a column already existed.
    annotations[colname] = scores["influence_score"].reindex(annotations.index)
    return gdata

def add_snp_id(DNA_LM):
    """Index ``DNA_LM`` by a ``<chrom>_<pos>_<ref>_<alt>`` SNP identifier.

    The identifier is built from the ``Chromosome``, ``pos``, ``ref`` and
    ``alt`` columns, with every occurrence of ``"chr"`` removed. The frame
    is modified in place: the identifier becomes the index (named
    ``snp_id``) and is not kept as a column.

    Returns
    -------
    The same ``DNA_LM`` frame.
    """
    id_parts = [
        DNA_LM['Chromosome'],
        DNA_LM['pos'].astype(str),
        DNA_LM['ref'],
        DNA_LM['alt'],
    ]
    snp_ids = id_parts[0]
    for part in id_parts[1:]:
        snp_ids = snp_ids + "_" + part

    DNA_LM['snp_id'] = snp_ids.str.replace('chr', '')

    # Move the identifier from a column into the index.
    DNA_LM.set_index('snp_id', inplace=True)
    return DNA_LM

def reverse_and_update_snp_ids(gdata_df, dna_df):
    """Align the SNP ids indexing ``dna_df`` with those indexing ``gdata_df``.

    Each id in ``dna_df.index`` is kept if it is present in
    ``gdata_df.index``. Otherwise the id is split on ``"_"`` into
    chrom/pos/ref/alt and the ref/alt-swapped id is used when that one is
    present. Ids matching neither way are reported with a printed message
    and kept unchanged.

    ``dna_df`` is modified in place (its index is replaced) and the new
    index is printed.

    Returns
    -------
    The same ``dna_df`` frame.
    """
    known_ids = gdata_df.index

    def _resolve(snp_id):
        # Direct match: keep the id as it is.
        if snp_id in known_ids:
            return snp_id

        chrom, pos, ref, alt = snp_id.split("_")
        swapped_id = f"{chrom}_{pos}_{alt}_{ref}"  # ref and alt exchanged
        if swapped_id in known_ids:
            return swapped_id

        print(f"Error, unknown snp_id {snp_id}")
        return snp_id

    dna_df.index = [_resolve(snp_id) for snp_id in dna_df.index]

    print("\nUpdated DNA_LM index:")
    print(dna_df.index)
    return dna_df


In [8]:
# Attach the VEP annotations from the `vep_scores` file to `gdata`; the
# "Consequence" column is passed both as a column to explode and as a column
# to dummy-encode.
# NOTE(review): the return value is discarded, so this presumably updates
# `gdata` in place — confirm against cellink's implementation.
cl.tl.add_vep_annos_to_gdata(vep_scores, gdata,
                             cols_to_explode=["Consequence"],
                             cols_to_dummy=["Consequence"])

  annos = pd.read_csv(vep_anno_file, sep="\t", skiprows=_get_vep_start_row(vep_anno_file))


: 