### Filtering criteria for deletion gene fusions 

* Deletion partially overlaps the misexpressed gene at its 5' end only
* Overlapping gene is on the same strand as the misexpressed gene
* Overlapping gene is expressed (median TPM > 0.5)
* Overlapping gene is only partially deleted (the deletion removes its 3' end)

In [3]:
import pandas as pd
from pathlib import Path
from pybedtools import BedTool
from io import StringIO
import pysam
import numpy as np

In [4]:
# constants
# genes whose median TPM is strictly above this value are treated as expressed
tpm_cutoff = 0.5
# variables
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)
# paths (all relative to the working directory above)
misexp_vrnt_feat_path = wkdir_path.joinpath("6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv")
gencode_bed_path = wkdir_path.joinpath("3_misexp_genes/bed_files/all_genes.bed")
tpm_mtx_path = wkdir_path.joinpath("1_rna_seq_qc/tpm_mtx/tpm_4568samples_59144genes_smpl_qc.csv")

In [5]:
# load misexpression-associated variants
# vrnt_id mixes numeric-looking IDs (e.g. "284739") with string IDs such as
# "DEL_chr4_...". Read it explicitly as str so chunked type inference cannot
# yield a mixed int/str column; the SV-gene intersect table is also read with
# vrnt_id as str, and the two are later merged on this key.
misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep="\t", dtype={"vrnt_id": str})
# misexpression-associated deletions only
misexp_del_feat_df = misexp_vrnt_feat_df[misexp_vrnt_feat_df.SVTYPE == "DEL"]
# unique deletion IDs (a variant can appear in several variant-gene rows)
misexp_dels = misexp_del_feat_df.vrnt_id.unique()
print(f"Number of misexpression-associated deletions: {len(misexp_dels)}")

Number of misexpression-associated deletions: 87


In [6]:
# Gencode gene annotation (.bed) wrapped as a BedTool; intersected with the SVs further down
gencode_bed = BedTool(fn=gencode_bed_path)

In [7]:
# Overlapping genes are only considered if they are transcribed (median TPM > 0.5).
# Read the gene expression matrix (one row per gene_id).
tpm_mtx_df = pd.read_csv(tpm_mtx_path)
# Per-gene median across every remaining column, i.e. across INTERVAL samples.
median_tpm_df = (
    tpm_mtx_df
    .set_index("gene_id")
    .median(axis=1)
    .rename("median_tpm")
    .reset_index()
)
# Gene IDs whose median TPM is strictly above the cutoff.
genes_median_tpm05 = median_tpm_df.loc[median_tpm_df["median_tpm"] > tpm_cutoff, "gene_id"].unique()
print(f"Number of genes with median TPM > {tpm_cutoff}: {len(genes_median_tpm05)}")

Number of genes with median TPM > 0.5: 17418


In [8]:
### keep variant-gene pairs where the deletion partially overlaps the 5' end of the misexpressed gene
is_5prime_partial = misexp_del_feat_df["position"] == "Partial overlap 5' end"
misexp_del_5prime_overlap_df = misexp_del_feat_df.loc[is_5prime_partial]
# unique deletions among those pairs
misexp_del_5prime_overlap = misexp_del_5prime_overlap_df["vrnt_id"].unique()
n_5prime_overlap = len(misexp_del_5prime_overlap)
print(f"Number of DELs overlapping 5' end of misexpressed gene: {n_5prime_overlap}")
print(f"DELs overlapping 5' end of misexpressed gene: {', '.join(misexp_del_5prime_overlap)}")
# share of all misexpression-associated deletions, in percent
print(f"Percengate overlap 5' end of misexpressed gene: {n_5prime_overlap/len(misexp_dels) * 100}")

Number of DELs overlapping 5' end of misexpressed gene: 7
DELs overlapping 5' end of misexpressed gene: DEL_chr4_13529219_13608506, 284739, DEL_chr9_135511950_135523352, 143283, 173184, 240913, DEL_chr20_63291972_63297412
Percengate overlap 5' end of misexpressed gene: 8.045977011494253


In [9]:
# variants located in the tested windows (.bed)
vrnt_id_in_window_chr_path = wkdir_path / "5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed"
vrnt_id_in_window_chr_bed = BedTool(vrnt_id_in_window_chr_path)
# intersect SVs with genes; wo=True keeps both records and appends the overlap
# size as the final column (labelled "gene_overlap" below)
sv_gene_intersect_bed = vrnt_id_in_window_chr_bed.intersect(gencode_bed, wo=True)
sv_overlap_gene_str = StringIO(str(sv_gene_intersect_bed))
# positional column index -> label for the headerless intersect output
columns = dict(enumerate([
    "sv_chrom", "sv_start", "sv_end", "vrnt_id",
    "gene_chrom", "overlap_gene_start", "overlap_gene_end", "overlap_gene_id",
    "score", "overlap_gene_strand", "gene_overlap",
]))
# column 3 (vrnt_id) is forced to str because some variant IDs are purely numeric
sv_overlap_gene_raw_df = pd.read_csv(sv_overlap_gene_str, sep="\t", header=None, dtype={3: str})
sv_overlap_gene_df = sv_overlap_gene_raw_df.rename(columns=columns)
# keep only what is needed to characterise the overlapping gene
overlap_gene_feature_cols = ["vrnt_id", "overlap_gene_start", "overlap_gene_end", "overlap_gene_id", "overlap_gene_strand"]
sv_overlap_gene_feat_df = sv_overlap_gene_df[overlap_gene_feature_cols]

In [10]:
# Attach every gene overlapped by each deletion that overlaps the 5' end of a
# misexpressed gene. A left join keeps deletions with no row in the intersect table.
misexp_del_5prime_overlap_df = misexp_del_5prime_overlap_df.merge(
    sv_overlap_gene_feat_df,
    how="left",
    on="vrnt_id",
)

In [11]:
def overlaps_gene_3_prime(row):
    """Tell whether the SV covers the 3' end of the overlapping gene but not its 5' end.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "overlap_gene_strand"; for "+" / "-" strands it must also
        provide "sv_start", "sv_end", "overlap_gene_start" and "overlap_gene_end".

    Returns
    -------
    bool or float
        "+" strand: True when the SV starts strictly inside the gene and ends
        strictly past the gene end.
        "-" strand: True when the SV starts strictly before the gene start and
        ends strictly inside the gene.
        Any other strand value (including a missing one): ``np.nan``.
    """
    strand = row["overlap_gene_strand"]
    if strand == "+":
        # gene_start < sv_start < gene_end < sv_end
        return row["overlap_gene_start"] < row["sv_start"] < row["overlap_gene_end"] < row["sv_end"]
    if strand == "-":
        # sv_start < gene_start < sv_end < gene_end
        return row["sv_start"] < row["overlap_gene_start"] < row["sv_end"] < row["overlap_gene_end"]
    # strand unknown or absent: the overlap orientation cannot be determined
    return np.nan

In [12]:
# Annotate each deletion / overlapping-gene row with:
#  - overlap_gene_3_prime: does the deletion cover the overlapping gene's 3' end
#    (NaN where the overlapping gene's strand is not "+" or "-")
#  - overlap_gene_expressed: is the overlapping gene among those with median TPM above the cutoff
misexp_del_5prime_overlap_df = misexp_del_5prime_overlap_df.assign(
    overlap_gene_3_prime=lambda df: df.apply(overlaps_gene_3_prime, axis=1),
    overlap_gene_expressed=lambda df: df["overlap_gene_id"].isin(genes_median_tpm05),
)

In [13]:
# Transcript fusion prioritisation: keep rows where the overlapping gene
#  (1) is on the same strand as the misexpressed gene,
#  (2) has its 3' end covered by the deletion, and
#  (3) is expressed.
same_strand = misexp_del_5prime_overlap_df["overlap_gene_strand"] == misexp_del_5prime_overlap_df["gene_strand"]
fusion_mask = (
    same_strand
    & misexp_del_5prime_overlap_df["overlap_gene_3_prime"]
    & misexp_del_5prime_overlap_df["overlap_gene_expressed"]
)
misexp_del_fusion_candidates_df = misexp_del_5prime_overlap_df.loc[fusion_mask]

In [14]:
# Report the unique deletions that passed every fusion filter
misexp_del_fusion_candidates = misexp_del_fusion_candidates_df["vrnt_id"].unique()
n_fusion_candidates = len(misexp_del_fusion_candidates)
print(f"Number of DEL transcript fusion candidates: {n_fusion_candidates}")
print("Tx fusion candidate DELs:")
# one bullet line per candidate
for vrnt_id in misexp_del_fusion_candidates:
    print(" - " + f"{vrnt_id}")

Number of DEL transcript fusion candidates: 3
Tx fusion candidate DELs:
 - DEL_chr4_13529219_13608506
 - 284739
 - 143283
