### Test for intersecting variants and genes 

In [None]:
import pandas as pd 
import pysam 
from io import StringIO
from pybedtools import BedTool
from pathlib import Path

In [None]:
# Project working directory and the chromosome this test cell operates on.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)
chrom = "chr21"

# Root of the carrier-count step: the input BED files are read from its
# subdirectories and the results below are written beneath it.
root_dir_path = wkdir_path.joinpath("4_vrnt_enrich/snp_indel_count_carriers")

# Input BED files are derived from `chrom` so that changing the chromosome
# in one place keeps the inputs and the output file names in sync.
vrnts_bed_path = root_dir_path.joinpath("vrnts_bed", f"{chrom}_vrnts.bed")
genes_bed_path = root_dir_path.joinpath("genes_bed", f"{chrom}_genes.bed")

# Passed as `w` to BedTool.window when intersecting genes with variants.
window = 1000000

# constants
# Column position -> column name for the gene/variant window output.
# NOTE(review): assumes a 6-column genes BED followed by a 5-column variants
# BED - confirm against the input files.
window_intersect_cols = {
    0: "chrom_gene",
    1: "start_gene",
    2: "end_gene",
    3: "gene_id",
    4: "score",
    5: "strand",
    6: "chrom_vrnt",
    7: "start_vrnt",
    8: "end_vrnt",
    9: "vrnt_id",
    10: "AF",
}

# check root exists
# root_dir_path is already a Path, so it is used directly.
root_dir_path.mkdir(parents=True, exist_ok=True)

### Find variants intersecting gene windows
# Pairs every gene with the variants returned by BedTool.window for the
# configured window size, labels the columns and writes the table as a TSV.
intersect_bed_dir = root_dir_path.joinpath("intersect_beds")
intersect_bed_dir.mkdir(parents=True, exist_ok=True)
intersect_bed_path = intersect_bed_dir.joinpath(f"{chrom}_gene_vrnts_intersect.tsv")
vrnts_bed = BedTool(vrnts_bed_path)
gene_bed = BedTool(genes_bed_path)
print("Finding variant IDs in gene windows ...")
window_intersect_vrnts_str = StringIO(str(gene_bed.window(vrnts_bed, w=window)))
try:
    window_intersect_vrnts_df = pd.read_csv(
        window_intersect_vrnts_str, sep="\t", header=None
    ).rename(columns=window_intersect_cols)
except pd.errors.EmptyDataError:
    # No variant fell inside any gene window: keep going with an empty table
    # that still carries the expected columns, so later steps that read
    # `gene_id` work instead of the cell crashing here.
    window_intersect_vrnts_df = pd.DataFrame(columns=list(window_intersect_cols.values()))
window_intersect_vrnts_df.to_csv(intersect_bed_path, sep="\t", index=False)
print("Completed.")
# Collect the distinct gene IDs that have at least one variant in their
# window and save them, one ID per line.
intersect_genes_dir = root_dir_path.joinpath("intersect_genes")
intersect_genes_dir.mkdir(parents=True, exist_ok=True)
intersect_genes_path = intersect_genes_dir.joinpath(f"{chrom}_gene_ids_intersect.txt")
intersect_gene_ids_list = window_intersect_vrnts_df["gene_id"].unique()
n_intersect_genes = len(intersect_gene_ids_list)
print(f"{chrom} number of genes with variant in windows: {n_intersect_genes}")

# Write the gene IDs in the order pandas returned them.
print("Writing genes to file ...")
with intersect_genes_path.open("w") as out_handle:
    out_handle.writelines(f"{gene}\n" for gene in intersect_gene_ids_list)
print("Completed.")