### Identification of DUP transcript fusion candidates

Criteria: 

* Duplication partially overlaps the 3' end of the misexpressed gene 
* Duplication partially overlaps the 5' end of an expressed gene (median TPM > 0.5) 
* Misexpressed gene and the overlapped expressed gene are on the same strand 
* Misexpressed gene is upstream of the overlapped expressed gene 

In [1]:
import pandas as pd
from pathlib import Path
from pybedtools import BedTool
from io import StringIO
import pysam
import numpy as np

In [2]:
# Project working directory; every input below is resolved relative to it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# inputs
# variant-level features of misexpression-associated variants (TSV)
misexp_vrnt_feat_path = wkdir_path / "6_misexp_dissect" / "vrnt_features" / "misexp_vrnt_features.tsv"
# gene annotation in BED format
gencode_bed_path = wkdir_path / "3_misexp_genes" / "bed_files" / "all_genes.bed"
# gene-by-sample TPM matrix (CSV)
tpm_mtx_path = wkdir_path / "1_rna_seq_qc" / "tpm_mtx" / "tpm_4568samples_59144genes_smpl_qc.csv"

# constants
# a gene counts as transcribed when its median TPM exceeds this cutoff
tpm_cutoff = 0.5

In [3]:
# Overlapping genes only count as transcribed when their median TPM across
# samples exceeds `tpm_cutoff`; build that gene list here.
# load gene expression matrix
tpm_mtx_df = pd.read_csv(tpm_mtx_path)
# per-gene median across all remaining columns of the matrix
median_tpm_df = (
    tpm_mtx_df
    .set_index("gene_id")
    .median(axis=1)
    .to_frame(name="median_tpm")
    .reset_index()
)
# gene IDs whose median TPM is strictly above the cutoff
is_transcribed = median_tpm_df["median_tpm"] > tpm_cutoff
genes_median_tpm05 = median_tpm_df.loc[is_transcribed, "gene_id"].unique()
print(f"Number of genes with median TPM > {tpm_cutoff}: {len(genes_median_tpm05)}")

Number of genes with median TPM > 0.5: 17418


In [7]:
# Load variant-level features of misexpression-associated variants.
# vrnt_id is read explicitly as a string: the SV/gene intersect table built
# later in this notebook also reads vrnt_id as str and the two tables are merged
# on that column, so letting pandas infer the dtype here (e.g. integers for
# numeric SV IDs) could make the merge fail or silently match nothing.
misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep="\t", dtype={"vrnt_id": str})
# keep duplications only
misexp_dup_feat_df = misexp_vrnt_feat_df[misexp_vrnt_feat_df.SVTYPE == "DUP"]
# prefix gene columns so they stay distinguishable from the columns of the
# overlapped genes added downstream
misexp_gene_cols = {"gene_id": "misexp_gene_id", "gene_start": "misexp_gene_start", 
                    "gene_end": "misexp_gene_end", "gene_strand": "misexp_gene_strand"}
misexp_dup_feat_df = misexp_dup_feat_df.rename(columns=misexp_gene_cols)

misexp_dups = misexp_dup_feat_df.vrnt_id.unique()
print(f"Number of misexpression-associated duplications: {len(misexp_dups)}")

Number of misexpression-associated duplications: 16


In [8]:
# Keep DUPs annotated as partially overlapping the 3' end of the misexpressed gene
is_3prime_overlap = misexp_dup_feat_df["position"] == "Partial overlap 3' end"
dup_overlap_3prime_gene_df = misexp_dup_feat_df.loc[is_3prime_overlap]
dup_overlap_3prime_gene_vrnt_id = dup_overlap_3prime_gene_df["vrnt_id"].unique()
print(f"Number of DUP overlapping the gene 3' end: {len(dup_overlap_3prime_gene_vrnt_id)}")

Number of DUP overlapping the gene 3' end: 5


In [9]:
# Reduce to the variant- and misexpressed-gene-level columns needed downstream,
# collapsing rows that become identical once the other columns are dropped.
vrnt_features_cols = [
    "vrnt_id",
    "misexp_gene_id",
    "chrom",
    "sv_start",
    "sv_end",
    "misexp_gene_start",
    "misexp_gene_end",
    "misexp_gene_strand",
]
dup_overlap_3prime_gene_trunc_df = dup_overlap_3prime_gene_df.loc[:, vrnt_features_cols].drop_duplicates()

**Select DUPs that partially overlap the 5' end of a gene**

In [10]:
# load variants in tested windows and the gene annotation as BED intervals
vrnt_id_in_window_chr_path = wkdir_path.joinpath("5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed")
vrnt_id_in_window_chr_bed = BedTool(str(vrnt_id_in_window_chr_path))
gencode_bed = BedTool(str(gencode_bed_path))
# intersect SVs and genes; the result is rendered to text so pandas can parse it
# NOTE(review): the column layout below assumes a 4-column variant BED and a
# 6-column gene BED, followed by the overlap length reported with wo=True -- confirm
sv_gene_intersect = vrnt_id_in_window_chr_bed.intersect(gencode_bed, wo=True)
sv_overlap_gene_str = StringIO(str(sv_gene_intersect))
# positional column index -> column name for the intersect output
columns = dict(enumerate([
    "sv_chrom", "sv_start", "sv_end", "vrnt_id",
    "gene_chrom", "overlap_gene_start", "overlap_gene_end", "overlap_gene_id",
    "score", "overlap_gene_strand", "gene_overlap",
]))
# column 3 (the variant ID) is forced to string before the columns are named
sv_overlap_gene_df = pd.read_csv(sv_overlap_gene_str, sep="\t", header=None, dtype={3: str})
sv_overlap_gene_df = sv_overlap_gene_df.rename(columns=columns)
# subset to relevant features
overlap_gene_feat_cols = ["vrnt_id", "overlap_gene_start", "overlap_gene_end", "overlap_gene_id", "overlap_gene_strand"]
sv_overlap_gene_feat_df = sv_overlap_gene_df[overlap_gene_feat_cols]

In [11]:
def overlaps_gene_5_prime(row):
    """Tell whether an SV partially overlaps the 5' end of the overlapped gene.

    `row` must provide "sv_start", "sv_end", "overlap_gene_start",
    "overlap_gene_end" and "overlap_gene_strand".

    For a "+" strand gene the SV has to start before the gene start and end
    strictly inside the gene. For a "-" strand gene the SV has to start
    strictly inside the gene and end after the gene end. All comparisons are
    strict. Any other strand value yields NaN.
    """
    strand = row["overlap_gene_strand"]
    if strand not in ("+", "-"):
        # strand missing or unrecognised: cannot decide
        return np.nan

    sv_start, sv_end = row["sv_start"], row["sv_end"]
    gene_start, gene_end = row["overlap_gene_start"], row["overlap_gene_end"]

    if strand == "+":
        # 5' end is the gene start: the SV spans it but stops before the gene end
        starts_before_gene = sv_start < gene_start
        ends_after_gene_start = sv_end > gene_start
        ends_before_gene_end = sv_end < gene_end
        return starts_before_gene & ends_after_gene_start & ends_before_gene_end

    # "-" strand: 5' end is the gene end; the SV begins inside the gene and runs past it
    starts_after_gene_start = sv_start > gene_start
    starts_before_gene_end = gene_end > sv_start
    ends_after_gene_end = gene_end < sv_end
    return starts_after_gene_start & starts_before_gene_end & ends_after_gene_end

In [12]:
# attach every overlapped gene to the misexpression DUPs that overlap the 3' end
# of their misexpressed gene; the left join keeps DUPs with no row in the
# intersect table (their overlap_* columns are then missing)
misexp_dup_overlap_genes_df = dup_overlap_3prime_gene_trunc_df.merge(
    sv_overlap_gene_feat_df,
    on="vrnt_id",
    how="left",
)
# flag DUPs that partially overlap the 5' end of the overlapped gene
# (NaN where the overlapped gene has no usable strand)
misexp_dup_overlap_genes_df["overlap_gene_5_prime"] = misexp_dup_overlap_genes_df.apply(overlaps_gene_5_prime, axis=1)
# flag overlapped genes whose median TPM exceeds the expression cutoff
misexp_dup_overlap_genes_df["overlap_gene_expressed"] = misexp_dup_overlap_genes_df["overlap_gene_id"].isin(genes_median_tpm05)

In [13]:
# A fusion candidate must (1) partially overlap the 5' end of the overlapped gene,
# (2) have that gene expressed, and (3) have it on the same strand as the
# misexpressed gene.
is_5prime_overlap = misexp_dup_overlap_genes_df["overlap_gene_5_prime"]
is_expressed = misexp_dup_overlap_genes_df["overlap_gene_expressed"]
is_same_strand = (
    misexp_dup_overlap_genes_df["misexp_gene_strand"]
    == misexp_dup_overlap_genes_df["overlap_gene_strand"]
)
misexp_dup_candidates_df = (
    misexp_dup_overlap_genes_df[is_5prime_overlap & is_expressed & is_same_strand]
    .reset_index(drop=True)
)

In [14]:
# distinct variant IDs that passed every fusion-candidate filter
misexp_dup_fusion_candidates = misexp_dup_candidates_df["vrnt_id"].unique()
print(f"Number of DUP fusion candidates: {len(misexp_dup_fusion_candidates)}")
print("Tx fusion candidate DUPs:")
# one bullet line per candidate
for vrnt_id in misexp_dup_fusion_candidates:
    print(f" - {vrnt_id}")

Number of DUP fusion candidates: 4
Tx fusion candidate DUPs:
 - 422023
 - 397951
 - 401916
 - 402648
