### Filter for duplications leading to transcriptional (tx) readthrough

Criteria: 

* Duplication overlaps entire misexpressed gene 
* Duplication partially overlaps 5' end of an expressed gene (median TPM > 0.5)
* Misexpressed and overlapping gene on the same strand 
* No active gene located in the expected readthrough region on same strand 
* Misexpressed gene upstream of transcribed gene 

In [1]:
import pandas as pd
from pathlib import Path
from pybedtools import BedTool
from io import StringIO
import pysam
import numpy as np

In [2]:
# Project root; every input and output path below is resolved relative to it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# Inputs: variant-level features, TPM expression matrix and the gene BED file.
misexp_vrnt_feat_path = wkdir_path / "6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv"
tpm_mtx_path = wkdir_path / "1_rna_seq_qc/tpm_mtx/tpm_4568samples_59144genes_smpl_qc.csv"
gencode_bed_path = wkdir_path / "3_misexp_genes/bed_files/all_genes.bed"

# Output directory for the duplication readthrough results (created if missing).
out_dir = wkdir_path / "6_misexp_dissect/tx_readthrough/duplications/"
out_dir_path = Path(out_dir)
out_dir_path.mkdir(parents=True, exist_ok=True)

# A gene counts as expressed when its median TPM is strictly above this value.
tpm_cutoff = 0.5

In [3]:
# Build the list of genes treated as transcribed: median TPM above `tpm_cutoff`.
# The matrix has a `gene_id` column; the median is taken row-wise over every other
# column (presumably one column per sample - verify against the input file).
tpm_mtx_df = pd.read_csv(tpm_mtx_path)
median_tpm_df = (
    tpm_mtx_df
    .set_index("gene_id")
    .median(axis=1)
    .rename("median_tpm")
    .reset_index()
)
# Strict inequality: a gene sitting exactly at the cutoff is not kept.
genes_median_tpm05 = median_tpm_df.loc[median_tpm_df["median_tpm"] > tpm_cutoff, "gene_id"].unique()

In [4]:
# Gene annotation as a BedTool; reused by every intersect in this notebook.
gencode_bed = BedTool(gencode_bed_path)

In [5]:
# Variant-level features of misexpression-associated variants.
# `vrnt_id` is read explicitly as a string: it is later used as a join key against
# variant IDs parsed from a BED intersect, which are also forced to `str`. Leaving the
# dtype to inference would make that join depend on what the ID column happens to contain.
misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep="\t", dtype={"vrnt_id": str})
# Keep duplications only.
misexp_dup_feat_df = misexp_vrnt_feat_df[misexp_vrnt_feat_df.SVTYPE == "DUP"]
# Prefix the gene columns so they stay distinguishable from the columns of the
# overlapping genes that are merged in later.
misexp_gene_cols = {"gene_id": "misexp_gene_id", "gene_start": "misexp_gene_start",
                    "gene_end": "misexp_gene_end", "gene_strand": "misexp_gene_strand"}
misexp_dup_feat_df = misexp_dup_feat_df.rename(columns=misexp_gene_cols)

misexp_dups = misexp_dup_feat_df.vrnt_id.unique()
print(f"Number of misexpression-associated duplications: {len(misexp_dups)}")

Number of misexpression-associated duplications: 16


In [6]:
# Keep DUP rows whose `position` annotation is exactly "Entire gene", i.e. the
# duplication spans the whole misexpressed gene.
is_entire_gene = misexp_dup_feat_df["position"].eq("Entire gene")
dup_overlap_entire_gene_df = misexp_dup_feat_df.loc[is_entire_gene]
dup_overlap_entire_gene = dup_overlap_entire_gene_df["vrnt_id"].unique()
print(f"Number of variants overlapping the entire gene: {len(dup_overlap_entire_gene)}")

Number of variants overlapping the entire gene: 5


In [7]:
# Coordinates and identifiers needed downstream; one row per distinct combination.
vrnt_features_cols = [
    "vrnt_id",
    "misexp_gene_id",
    "chrom",
    "sv_start",
    "sv_end",
    "misexp_gene_start",
    "misexp_gene_end",
    "misexp_gene_strand",
]
dup_entire_gene_overlap_trunc_df = dup_overlap_entire_gene_df.loc[:, vrnt_features_cols].drop_duplicates()

**Select DUPs that overlap 5' end of a gene**

In [8]:
# Variants located in the tested windows, as a BED file.
vrnt_id_in_window_chr_path = wkdir_path / "5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed"
vrnt_id_in_window_chr_bed = BedTool(vrnt_id_in_window_chr_path)
# Intersect variants with genes; `wo=True` reports both records plus the overlap size.
sv_overlap_gene_str = StringIO(str(vrnt_id_in_window_chr_bed.intersect(gencode_bed, wo=True)))
# Positional labels for the intersect output: variant record, gene record, overlap size.
columns = dict(enumerate([
    "sv_chrom", "sv_start", "sv_end", "vrnt_id",
    "gene_chrom", "overlap_gene_start", "overlap_gene_end", "overlap_gene_id",
    "score", "overlap_gene_strand", "gene_overlap",
]))
# Column 3 (the variant ID) is kept as a string.
sv_overlap_gene_df = (
    pd.read_csv(sv_overlap_gene_str, sep="\t", header=None, dtype={3: str})
    .rename(columns=columns)
)
# Only the variant ID and the overlapped gene's coordinates, ID and strand are needed later.
sv_overlap_gene_feat_df = sv_overlap_gene_df[
    ["vrnt_id", "overlap_gene_start", "overlap_gene_end", "overlap_gene_id", "overlap_gene_strand"]
]

In [9]:
def overlaps_gene_5_prime(row):
    """
    Test whether an SV partially overlaps the 5' end of the overlapping gene.

    "+" strand: the SV starts before the gene start and ends strictly inside the gene.
    "-" strand: the SV starts strictly inside the gene and ends after the gene end.
    All comparisons are strict, so an SV sharing a boundary with the gene does not count.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) with keys "overlap_gene_strand", "sv_start",
        "sv_end", "overlap_gene_start" and "overlap_gene_end".

    Returns
    -------
    bool-like for "+" / "-" strands; ``np.nan`` for any other strand value
    (including a missing strand).
    """
    strand = row["overlap_gene_strand"]
    if strand not in ("+", "-"):
        return np.nan

    sv_start, sv_end = row["sv_start"], row["sv_end"]
    gene_start, gene_end = row["overlap_gene_start"], row["overlap_gene_end"]

    if strand == "+":
        # 5' end is the gene start: SV must cross it and stop inside the gene.
        starts_before_gene = sv_start < gene_start
        ends_inside_gene = (sv_end > gene_start) & (sv_end < gene_end)
        return starts_before_gene & ends_inside_gene

    # "-" strand: 5' end is the gene end: SV must begin inside the gene and cross it.
    starts_inside_gene = (sv_start > gene_start) & (gene_end > sv_start)
    ends_after_gene = gene_end < sv_end
    return starts_inside_gene & ends_after_gene

In [10]:
# Attach every gene overlapped by a duplication to the DUPs spanning an entire
# misexpressed gene. Left join: DUPs without an overlap record are kept, with the
# overlap_* columns missing.
misexp_dup_overlap_genes_df = dup_entire_gene_overlap_trunc_df.merge(
    sv_overlap_gene_feat_df,
    on="vrnt_id",
    how="left",
)
# Does the DUP partially overlap the 5' end of the overlapped gene?
misexp_dup_overlap_genes_df["overlap_gene_5_prime"] = misexp_dup_overlap_genes_df.apply(
    overlaps_gene_5_prime,
    axis=1,
)
# Is the overlapped gene in the expressed set (median TPM above the cutoff)?
misexp_dup_overlap_genes_df["overlap_gene_expressed"] = (
    misexp_dup_overlap_genes_df["overlap_gene_id"].isin(genes_median_tpm05)
)

In [11]:
# Column labels for the readthrough-region x gene intersect parsed in
# no_active_intervening_gene: six fields for the query region, six for the
# intersected gene, then the overlap column.
# NOTE(review): the original author left an open question here on whether
# entire-gene overlap should be required for intervening genes.
region_gene_cols = dict(enumerate([
    "sv_chrom", "sv_start", "sv_end", "name", "score", "strand",
    "gene_chrom", "gene_start", "gene_end", "gene_id", "gene_score", "gene_strand",
    "overlap",
]))

def no_active_intervening_gene(row):
    """
    Test whether the expected readthrough region contains no active gene.

    The region runs from the SV start to the misexpressed gene start for "+" strand
    genes, and from the misexpressed gene end to the SV end for "-" strand genes.
    Only genes lying entirely inside that region and on the same strand as the
    misexpressed gene are considered. A gene counts as active when its ID is in
    `genes_median_tpm05`.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) with keys "chrom", "vrnt_id",
        "misexp_gene_id", "misexp_gene_strand", "sv_start", "sv_end",
        "misexp_gene_start" and "misexp_gene_end".

    Returns
    -------
    bool
        True when no active same-strand gene is contained in the region
        (including when the region contains no gene at all), False otherwise.

    Raises
    ------
    ValueError
        If the misexpressed gene strand is neither "+" nor "-".
    """
    chrom = row["chrom"]
    gene_strand = row["misexp_gene_strand"]
    vrnt_gene_id = f'{row["vrnt_id"]}_{row["misexp_gene_id"]}'

    # Region boundaries depend on the orientation of the misexpressed gene.
    if gene_strand == "+":
        region_start, region_end = row["sv_start"], row["misexp_gene_start"]
    elif gene_strand == "-":
        region_start, region_end = row["misexp_gene_end"], row["sv_end"]
    else:
        raise ValueError("Strand not recognised.")

    # One-record BED built from a string and intersected with the gene annotation.
    region_bed = BedTool(f"{chrom} {region_start} {region_end} {vrnt_gene_id} 0 {gene_strand}", from_string=True)
    # F=1: a gene is reported only when it is 100% contained in the region.
    intersect_txt = str(region_bed.intersect(gencode_bed, wo=True, F=1))

    # Empty intersect output: no gene inside the region, so nothing can be active.
    if not intersect_txt.strip():
        return True

    contained_genes_df = pd.read_csv(
        StringIO(intersect_txt), sep="\t", header=None, dtype={3: str}
    ).rename(columns=region_gene_cols)
    # Only genes on the same strand as the misexpressed gene are relevant.
    same_strand_df = contained_genes_df[contained_genes_df.gene_strand == gene_strand]
    same_strand_gene_ids = set(same_strand_df.gene_id.unique())
    # True when none of the contained same-strand genes is in the expressed set.
    return same_strand_gene_ids.isdisjoint(genes_median_tpm05)

In [12]:
# Row-wise flag: True when the readthrough region holds no active same-strand gene.
misexp_dup_overlap_genes_df["no_active_gene"] = misexp_dup_overlap_genes_df.apply(
    no_active_intervening_gene,
    axis=1,
)

In [13]:
# Readthrough candidates must satisfy all four conditions:
#   1. the DUP partially overlaps the 5' end of the overlapped gene
#   2. the overlapped gene is expressed (median TPM above the cutoff)
#   3. the misexpressed gene and the overlapped gene are on the same strand
#   4. no active same-strand gene lies in the readthrough region
candidate_mask = (
    misexp_dup_overlap_genes_df["overlap_gene_5_prime"]
    & misexp_dup_overlap_genes_df["overlap_gene_expressed"]
    & misexp_dup_overlap_genes_df["misexp_gene_strand"].eq(misexp_dup_overlap_genes_df["overlap_gene_strand"])
    & misexp_dup_overlap_genes_df["no_active_gene"]
)
misexp_dup_candidates_df = misexp_dup_overlap_genes_df.loc[candidate_mask].reset_index(drop=True)

In [14]:
# Distinct duplications that passed every readthrough filter.
misexp_tx_readthrough_dups = misexp_dup_candidates_df["vrnt_id"].unique()
print(f"Number of DUP tx readthrough candidates: {len(misexp_tx_readthrough_dups)}")
print(f"DUP tx readthrough candidates: {misexp_tx_readthrough_dups}")

Number of DUP tx readthrough candidates: 5
DUP tx readthrough candidates: ['408686' '425231' '397101' '414685' '414879']


In [15]:
# Distance used to pick the closest misexpressed gene per duplication.
def gene_distance(row):
    """
    Distance between the SV boundary and the misexpressed gene, by strand.

    "+" strand: misexpressed gene start minus SV start.
    "-" strand: SV end minus misexpressed gene end.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) with key "misexp_gene_strand" plus
        "misexp_gene_start" and "sv_start" ("+" strand) or "sv_end" and
        "misexp_gene_end" ("-" strand).

    Raises
    ------
    ValueError
        If the strand is neither "+" nor "-".
    """
    strand = row["misexp_gene_strand"]
    if strand not in ("+", "-"):
        raise ValueError("Strand not recognised.")
    if strand == "+":
        return row["misexp_gene_start"] - row["sv_start"]
    return row["sv_end"] - row["misexp_gene_end"]
        
    
# Strand-aware distance from the DUP boundary to the misexpressed gene, per candidate row.
misexp_dup_candidates_df["distance"] = misexp_dup_candidates_df.apply(
    gene_distance,
    axis=1,
)

In [16]:
# For each duplication keep the single candidate row with the smallest distance
# (idxmin returns one row label per variant).
idx_closest_distance = misexp_dup_candidates_df.groupby("vrnt_id")["distance"].idxmin()
misexp_dup_candidates_closest_df = (
    misexp_dup_candidates_df
    .loc[idx_closest_distance]
    .reset_index(drop=True)
)

**Metrics of transcriptional readthrough candidate DUPs**

In [17]:
# Variant-gene pairs of the readthrough candidates. `misexp_gene_id` is renamed back
# to `gene_id` so the table can be joined to the variant feature table on that key.
misexp_tx_read_vrnt_gene_df = (
    misexp_dup_candidates_closest_df
    .loc[:, ["vrnt_id", "misexp_gene_id", "overlap_gene_id", "distance"]]
    .drop_duplicates()
    .rename(columns={"misexp_gene_id": "gene_id"})
)
print(f"Number of transcriptional readthrough candidate variant-gene pairs: {len(misexp_tx_read_vrnt_gene_df)}")

Number of transcriptional readthrough candidate variant-gene pairs: 5


In [18]:
# Pull the full variant-level features for each candidate variant-gene pair
# (inner join: only pairs present in both tables are kept).
misexp_tx_read_vrnt_feat_df = misexp_tx_read_vrnt_gene_df.merge(
    misexp_vrnt_feat_df,
    on=["vrnt_id", "gene_id"],
    how="inner",
)
# Save as a tab-separated table without the row index.
misexp_tx_read_vrnt_feat_path = out_dir_path / "misexp_dup_tx_readthrough_candidates.tsv"
misexp_tx_read_vrnt_feat_df.to_csv(misexp_tx_read_vrnt_feat_path, sep="\t", index=False)

In [19]:
# Candidate variant IDs, one per line, in a plain text file.
misexp_tx_read_vrnts = misexp_dup_candidates_closest_df["vrnt_id"].unique()
misexp_tx_read_vrnts_path = out_dir_path / "misexp_dup_tx_vrnts.txt"
with misexp_tx_read_vrnts_path.open("w") as f_out:
    f_out.writelines(f"{vrnt_id}\n" for vrnt_id in misexp_tx_read_vrnts)