### Filtering criteria for transcriptional readthrough SVs 

Filtering criteria:
* SV is located upstream of misexpressed gene
* SV overlaps 3' end of upstream gene. Upstream gene: 
    * On the same strand as the misexpressed gene
    * Expressed in whole blood (median TPM > 0.5)
    * SV overlaps a terminal exon polyA site 
* No intervening expressed genes between misexpressed gene and upstream gene
    * Intervening genes are not overlapped by SV or misexpressed gene 
* If an SV is linked to multiple misexpressed genes, keep only the misexpressed gene closest to the SV

In [1]:
import pandas as pd
from pathlib import Path
from pybedtools import BedTool
from io import StringIO
import pysam
import numpy as np

In [2]:
# Project root; every input and output location below is resolved against it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# --- inputs ---
misexp_vrnt_feat_path = wkdir_path / "6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv"
tpm_mtx_path = wkdir_path / "1_rna_seq_qc/tpm_mtx/tpm_4568samples_59144genes_smpl_qc.csv"
polya_site_atlas_path = wkdir_path / "reference/polyA_site/atlas.clusters.2.0.GRCh38.96.tsv.gz"
polya_site_atlas_bed_path = wkdir_path / "reference/polyA_site/atlas.clusters.2.0.GRCh38.96.bed.gz"
vrnt_id_in_window_path = wkdir_path / "5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_chr_num_misexp_genes.bed"
gencode_bed_path = wkdir_path / "3_misexp_genes/bed_files/all_genes.bed"

# --- outputs ---
# The directory (and any missing parents) is created up front so later write cells cannot fail on it.
out_dir = wkdir_path / "6_misexp_dissect/tx_readthrough/deletions"
out_dir_path = Path(out_dir)
out_dir_path.mkdir(parents=True, exist_ok=True)

In [3]:
# Load misexpression-associated variant features.
# vrnt_id is read explicitly as a string: identifiers mix purely numeric values and
# text such as "DEL_chr..." and every other table in this notebook stores vrnt_id as str,
# so relying on dtype inference here could break the later isin()/merge() calls.
misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep="\t", dtype={"vrnt_id": str})
# Restrict to deletions and collect their unique variant IDs.
misexp_del_feat_df = misexp_vrnt_feat_df[misexp_vrnt_feat_df.SVTYPE == "DEL"]
misexp_dels = misexp_del_feat_df.vrnt_id.unique()
print(f"Number of misexpression-associated deletions: {len(misexp_dels)}")
# Gene coordinates as a BedTool, used for all gene intersections below.
gencode_bed = BedTool(gencode_bed_path)
# Gene expression TPM matrix (has a gene_id column; see the median TPM cell).
tpm_mtx_df = pd.read_csv(tpm_mtx_path)

Number of misexpression-associated deletions: 87


In [4]:
# Active genes in INTERVAL are those whose median TPM exceeds this cutoff.
tpm_cutoff = 0.5
# Row-wise median over every non-gene_id column of the TPM matrix.
gene_median_tpm = tpm_mtx_df.set_index("gene_id").median(axis=1)
median_tpm_df = (
    pd.DataFrame(gene_median_tpm)
    .rename(columns={0: "median_tpm"})
    .reset_index()
)
active_gene_mask = median_tpm_df.median_tpm > tpm_cutoff
genes_median_tpm05 = set(median_tpm_df.loc[active_gene_mask, "gene_id"].unique())
print(f"Number of genes with median TPM > {tpm_cutoff}: {len(genes_median_tpm05)}")

Number of genes with median TPM > 0.5: 17418


**Annotate DELs positioned upstream of misexpressed gene**

In [5]:
# One row per unique (deletion, misexpressed gene) pair.
vrnt_gene_pair_cols = [
    "vrnt_id", "chrom", "sv_start", "sv_end",
    "gene_id", "gene_start", "gene_end", "gene_strand",
]
misexp_del_gene_df = misexp_del_feat_df[vrnt_gene_pair_cols].drop_duplicates()

vrnt_gene_dist_df_list = []
for index, row in misexp_del_gene_df.iterrows():
    vrnt_id = row["vrnt_id"]
    chrom = row["chrom"]
    sv_start = row["sv_start"]
    sv_end = row["sv_end"]
    gene_id = row["gene_id"]
    gene_start = row["gene_start"]
    gene_end = row["gene_end"]
    gene_strand = row["gene_strand"]
    # Single-feature BED records for the SV and for the (stranded) misexpressed gene.
    sv_bed = BedTool(f"{chrom} {sv_start} {sv_end} {vrnt_id}", from_string=True)
    gene_bed = BedTool(f"{chrom} {gene_start} {gene_end} {gene_id} 0 {gene_strand}", from_string=True)
    # bedtools closest -D a: signed distance relative to the gene's strand,
    # with negative values marking an SV that lies upstream of the gene.
    closest_hit = gene_bed.closest(sv_bed, D="a")
    vrnt_gene_dist_df_list.append(
        pd.read_csv(StringIO(str(closest_hit)), sep="\t", header=None)
    )

vrnt_gene_dist_cols = dict(enumerate([
    "gene_chrom", "gene_start", "gene_end", "misexp_gene_id", "score", "gene_strand",
    "sv_chrom", "sv_start", "sv_end", "vrnt_id", "distance",
]))
vrnt_gene_dist_df = pd.concat(vrnt_gene_dist_df_list).rename(columns=vrnt_gene_dist_cols)
# Each record was parsed on its own, so numeric-looking IDs were inferred as ints; normalise to str.
vrnt_gene_dist_df['vrnt_id'] = vrnt_gene_dist_df['vrnt_id'].astype(str)
# Sanity check: exactly one closest record per input pair.
if len(misexp_del_gene_df) != len(vrnt_gene_dist_df):
    raise ValueError("Variant-gene pair number in input does not match output.")
# Keep pairs where the SV is upstream of the gene (negative signed distance).
vrnt_id_upstream_df = vrnt_gene_dist_df[vrnt_gene_dist_df["distance"] < 0]

In [6]:
# Upstream deletions according to the bedtools-derived signed distance.
vrnt_id_upstream = set(vrnt_id_upstream_df["vrnt_id"].unique())
print(f"Number of deletions upstream of misexpressed genes: {len(vrnt_id_upstream)}")
# Cross-check against the pre-computed "position" annotation in the feature table.
is_annotated_upstream = misexp_del_feat_df["position"] == "Upstream"
vrnt_id_upstream_feat_df = misexp_del_feat_df[is_annotated_upstream]
vrnt_id_upstream_feat = set(vrnt_id_upstream_feat_df["vrnt_id"].unique())
print(f"Upstream variants identical: {vrnt_id_upstream==vrnt_id_upstream_feat}")

Number of deletions upstream of misexpressed genes: 54
Upstream variants identical: True


In [7]:
# Prefix the gene columns with "misexp_" so they stay distinguishable from the
# overlapping-gene columns merged in later.
misexp_gene_cols = {
    "gene_id": "misexp_gene_id",
    "gene_start": "misexp_gene_start",
    "gene_end": "misexp_gene_end",
    "gene_strand": "misexp_gene_strand",
}
# Unique SV / misexpressed-gene records, including the position annotation.
misexp_del_sv_feat_df = (
    misexp_del_feat_df[
        ["vrnt_id", "chrom", "sv_start", "sv_end",
         "gene_id", "gene_start", "gene_end", "gene_strand", "position"]
    ]
    .drop_duplicates()
    .rename(columns=misexp_gene_cols)
)
# Keep only SVs annotated as upstream of the misexpressed gene.
is_upstream_sv = misexp_del_sv_feat_df["position"] == "Upstream"
misexp_del_upstream_df = misexp_del_sv_feat_df[is_upstream_sv].copy()

In [8]:
# Attach the signed SV-to-gene distance to each upstream SV / misexpressed-gene pair.
del_upstream_vrnt_gene_distance_df = (
    vrnt_id_upstream_df[["vrnt_id", "misexp_gene_id", "distance"]].drop_duplicates()
)
misexp_del_sv_feat_upstream_df = misexp_del_upstream_df.merge(
    del_upstream_vrnt_gene_distance_df,
    on=["vrnt_id", "misexp_gene_id"],
    how="left",
)
# The left merge must not add rows; displayed value should be True.
misexp_del_sv_feat_upstream_df.shape[0] == misexp_del_upstream_df.shape[0]

True

**Annotate upstream DELs that have no active genes on the same strand as the misexpressed gene in the readthrough region**

In [9]:
def no_active_intervening_gene(row):
    """
    Report whether the readthrough region is free of active same-strand genes.

    The readthrough region is the gap between the SV and the misexpressed gene:
    sv_end -> misexp_gene_start for "+" strand genes, and
    misexp_gene_end -> sv_start for "-" strand genes. Rows must therefore describe
    SVs located upstream of the gene, otherwise the region coordinates are invalid.

    A gene counts as intervening only if it lies entirely inside the region
    (bedtools intersect with F=1), is on the same strand as the misexpressed gene
    and is in the module-level set `genes_median_tpm05`. Gene coordinates come
    from the module-level `gencode_bed`.

    Parameters: row -- mapping/Series with keys chrom, vrnt_id, misexp_gene_id,
        misexp_gene_strand, sv_start, sv_end, misexp_gene_start, misexp_gene_end.
    Returns: True when there is no active intervening gene, False when there is one.
    Raises: ValueError if the strand is neither "+" nor "-".
    """
    chrom = row["chrom"]
    gene_strand = row["misexp_gene_strand"]
    vrnt_gene_id = f'{row["vrnt_id"]}_{row["misexp_gene_id"]}'

    # Orient the region so that start < end on either strand.
    if gene_strand == "+":
        region_start, region_end = row["sv_end"], row["misexp_gene_start"]
    elif gene_strand == "-":
        region_start, region_end = row["misexp_gene_end"], row["sv_start"]
    else:
        raise ValueError("Strand not recognised.")

    region_bed = BedTool(f"{chrom} {region_start} {region_end} {vrnt_gene_id} 0 {gene_strand}", from_string=True)
    # F=1: the whole gene must be contained in the region to be reported.
    intersect_text = str(region_bed.intersect(gencode_bed, wo=True, F=1))

    # No gene enclosed in the region at all -> nothing can be intervening.
    if not intersect_text.strip():
        return True

    region_gene_cols = dict(enumerate([
        "sv_chrom", "sv_start", "sv_end", "name", "score", "strand",
        "gene_chrom", "gene_start", "gene_end", "gene_id", "gene_score",
        "gene_strand", "overlap",
    ]))
    region_gene_intersect_df = pd.read_csv(
        StringIO(intersect_text), sep="\t", header=None, dtype={3: str}
    ).rename(columns=region_gene_cols)
    # Only genes sharing the misexpressed gene's strand are considered.
    on_shared_strand = region_gene_intersect_df.gene_strand == gene_strand
    genes_in_region = set(region_gene_intersect_df[on_shared_strand].gene_id.unique())
    # True exactly when none of the enclosed same-strand genes is active.
    return genes_in_region.isdisjoint(genes_median_tpm05)

In [10]:
# Flag, per upstream SV / misexpressed-gene pair, whether the readthrough region
# is free of active same-strand genes.
no_active_gene_flags = misexp_del_sv_feat_upstream_df.apply(no_active_intervening_gene, axis=1)
misexp_del_sv_feat_upstream_df["no_active_gene"] = no_active_gene_flags

**Annotate SVs that overlap the 3' end of any gene**

In [11]:
# Variants located in the tested windows around misexpressed genes.
vrnt_id_in_window_chr_path = wkdir_path / "5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed"
vrnt_id_in_window_chr_bed = BedTool(vrnt_id_in_window_chr_path)
# Every SV/gene overlap; -wo appends the number of overlapping bases as the last column.
sv_overlap_gene_str = StringIO(str(vrnt_id_in_window_chr_bed.intersect(gencode_bed, wo=True)))
# Column layout: 4 SV fields, 6 gene fields, then the overlap size.
columns = dict(enumerate([
    "sv_chrom", "sv_start", "sv_end", "vrnt_id",
    "gene_chrom", "overlap_gene_start", "overlap_gene_end", "overlap_gene_id",
    "score", "overlap_gene_strand", "gene_overlap",
]))
# vrnt_id (column 3) is kept as a string so numeric-looking IDs are not parsed as ints.
sv_overlap_gene_df = pd.read_csv(
    sv_overlap_gene_str, sep="\t", header=None, dtype={3: str}
).rename(columns=columns)
# Keep only the fields needed to annotate the overlapped gene.
sv_overlap_gene_feat_df = sv_overlap_gene_df[
    ["vrnt_id", "overlap_gene_start", "overlap_gene_end", "overlap_gene_id", "overlap_gene_strand"]
]

In [12]:
def overlaps_gene_3_prime(row):
    """
    Test whether the SV partially overlaps the overlapped gene across its 3' end.

    "+" strand: the SV starts strictly inside the gene and ends strictly beyond
    overlap_gene_end. "-" strand: the SV starts strictly before overlap_gene_start
    and ends strictly inside the gene. All comparisons are strict, so an SV that
    covers the whole gene or lies entirely within it is not a 3' overlap.

    Parameters: row -- mapping/Series with keys overlap_gene_strand, sv_start,
        sv_end, overlap_gene_start, overlap_gene_end.
    Returns: a boolean for "+"/"-" strands; np.nan for any other strand value
        (e.g. missing strand when the SV overlaps no gene).
    """
    strand = row["overlap_gene_strand"]
    if strand not in ("+", "-"):
        return np.nan

    sv_start, sv_end = row["sv_start"], row["sv_end"]
    gene_start, gene_end = row["overlap_gene_start"], row["overlap_gene_end"]
    if strand == "+":
        # 3' end is the gene end: SV begins inside the gene and runs past it.
        return (sv_start < gene_end) & (gene_start < sv_start) & (gene_end < sv_end)
    # 3' end is the gene start: SV begins before the gene and stops inside it.
    return (sv_start < gene_start) & (sv_end < gene_end) & (gene_start < sv_end)

In [13]:
# Attach every gene overlapped by each upstream misexpression DEL
# (left merge: SVs overlapping no gene are kept, with missing gene fields).
misexp_del_overlap_genes_df = misexp_del_sv_feat_upstream_df.merge(
    sv_overlap_gene_feat_df,
    on="vrnt_id",
    how="left",
)
# Does the SV overlap the 3' end of the overlapped gene?
misexp_del_overlap_genes_df["overlap_gene_3_prime"] = misexp_del_overlap_genes_df.apply(
    overlaps_gene_3_prime, axis=1
)
# Is the overlapped gene expressed in blood (member of the active-gene set)?
misexp_del_overlap_genes_df["overlap_gene_expressed"] = (
    misexp_del_overlap_genes_df["overlap_gene_id"].isin(genes_median_tpm05)
)

**Annotate SVs that overlap a terminal exon (TE) polyA site in the 3' overlapping gene**

In [14]:
### polyA site (PAS) overlap

# PolyA site atlas table (provides the gene_id for each site name) and its BED counterpart.
polya_site_df = pd.read_csv(polya_site_atlas_path, sep="\t", dtype={0: str})
polya_site_atlas_bed = BedTool(polya_site_atlas_bed_path)
# Variants in the tested windows.
vrnt_id_in_window_bed = BedTool(vrnt_id_in_window_path)
# Every SV / polyA site overlap; -wo appends the overlap size as the last column.
sv_overlap_polya_str = StringIO(str(vrnt_id_in_window_bed.intersect(polya_site_atlas_bed, wo=True)))
polya_intersect_columns = dict(enumerate([
    "sv_chrom", "sv_start", "sv_end", "vrnt_id",
    "polya_chrom", "polya_start", "polya_end", "name",
    "polya_mean_tpm_1", "polya_strand", "polya_frac", "polya_support",
    "polya_mean_tpm_2", "polya_annotation", "polya_signals",
    "overlap",
]))
sv_overlap_polya_df = pd.read_csv(
    sv_overlap_polya_str, sep="\t", header=None, dtype={3: str}
).rename(columns=polya_intersect_columns)
# Restrict to misexpression-associated deletions.
is_misexp_del = sv_overlap_polya_df["vrnt_id"].isin(misexp_dels)
misexp_dels_polya_df = sv_overlap_polya_df[is_misexp_del]
# Look up the gene ID(s) of each polyA site by site name.
polya_site_gene_id_df = polya_site_df[["name", "gene_id"]]
misexp_del_polya_gene_id_df = misexp_dels_polya_df.merge(
    polya_site_gene_id_df, on="name", how="left"
)
# Keep terminal exon (TE) polyA sites only.
is_terminal_exon = misexp_del_polya_gene_id_df["polya_annotation"] == "TE"
misexp_del_polya_te_gene_id_df = misexp_del_polya_gene_id_df[is_terminal_exon]
# A site can list several genes separated by "|": split and emit one row per gene.
misexp_del_overlap_gene_polya_te_df = (
    misexp_del_polya_te_gene_id_df[["vrnt_id", "gene_id", "polya_strand"]]
    .drop_duplicates()
    .assign(overlap_gene_id=lambda sites: sites["gene_id"].str.split("|"))
    .explode("overlap_gene_id", ignore_index=True)
)
# Drop the combined gene ID, de-duplicate, and flag the remaining SV/gene pairs.
misexp_del_overlap_gene_polya_te_drop_dups_df = (
    misexp_del_overlap_gene_polya_te_df
    .drop(columns=["gene_id"])
    .drop_duplicates()
    .assign(polya_te_overlap=True)
)

In [15]:
# Add the TE polyA overlap flag to each SV / overlapped-gene row
# (rows without a TE polyA site in the overlapped gene get a missing flag).
misexp_del_overlap_genes_polya_df = misexp_del_overlap_genes_df.merge(
    misexp_del_overlap_gene_polya_te_drop_dups_df,
    on=["vrnt_id", "overlap_gene_id"],
    how="left",
)
# The left merge must not add rows; displayed value should be True.
misexp_del_overlap_genes_polya_df.shape[0] == misexp_del_overlap_genes_df.shape[0]

True

In [16]:
### Transcriptional readthrough variant prioritisation
# Filters are applied in sequence; the number of distinct DELs left is printed after each.

# Starting point: all DELs in the annotated table (unique variant IDs).
num_upstream_misexp_dels = misexp_del_overlap_genes_polya_df["vrnt_id"].unique()
print(f"Number of DELs upstream of misexpressed gene: {len(num_upstream_misexp_dels)}")

# Filter 1: SV is upstream and no active same-strand gene sits between SV and misexpressed gene.
upstream_mask = misexp_del_overlap_genes_polya_df["position"] == "Upstream"
no_active_gene_mask = misexp_del_overlap_genes_polya_df["no_active_gene"]
filter_1_df = misexp_del_overlap_genes_polya_df[upstream_mask & no_active_gene_mask]
filter_1_vrnt_ids = set(filter_1_df["vrnt_id"].unique())
print(f"Remaining DELs upstream of misexpressed gene with no intervening active gene on same strand: {len(filter_1_vrnt_ids)}")

# Filter 2: SV overlaps the 3' end of a gene.
# Explicit "== True" because the column can also hold NaN (unrecognised/missing strand).
filter_2_df = filter_1_df[filter_1_df["overlap_gene_3_prime"] == True]
filter_2_vrnt_ids = set(filter_2_df["vrnt_id"].unique())
print(f"Remaining DELs overlapping 3' end of upstream gene: {len(filter_2_vrnt_ids)}")

# Filter 3: SV overlaps a terminal exon polyA site of that gene (flag is NaN when absent).
filter_3_df = filter_2_df[filter_2_df["polya_te_overlap"] == True]
filter_3_vrnt_ids = set(filter_3_df["vrnt_id"].unique())
print(f"Remaining DELs overlap polyA site in upstream gene: {len(filter_3_vrnt_ids)}")

# Filter 4: the overlapped (upstream) gene is expressed.
filter_4_df = filter_3_df[filter_3_df["overlap_gene_expressed"] == True]
filter_4_vrnt_ids = set(filter_4_df["vrnt_id"].unique())
print(f"Upstream gene is expressed: {len(filter_4_vrnt_ids)}")

# Filter 5: the overlapped gene and the misexpressed gene are on the same strand.
same_strand_mask = filter_4_df["misexp_gene_strand"] == filter_4_df["overlap_gene_strand"]
filter_5_df = filter_4_df[same_strand_mask]
filter_5_vrnt_ids = set(filter_5_df["vrnt_id"].unique())
print(f"Upstream gene is on same strand: {len(filter_5_vrnt_ids)}")

# Final candidate set; the index is reset so it is unique for the idxmax lookup downstream.
misexp_tx_read_vrnt_ids = filter_5_vrnt_ids
misexp_tx_read_vrnts_df = filter_5_df.reset_index(drop=True)
print(f"Final set of candidate tx readthrough DELs: {len(misexp_tx_read_vrnt_ids)}")

Number of DELs upstream of misexpressed gene: 54
Remaining DELs upstream of misexpressed gene with no intervening active gene on same strand: 52
Remaining DELs overlapping 3' end of upstream gene: 13
Remaining DELs overlap polyA site in upstream gene: 13
Upstream gene is expressed: 12
Upstream gene is on same strand: 12
Final set of candidate tx readthrough DELs: 12


**Select the closest gene to the SV if multiple genes are misexpressed**

In [17]:
# Upstream distances are negative, so the closest gene per SV is the row with the
# largest (least negative) distance -> idxmax rather than idxmin.
idx_closest_distance = misexp_tx_read_vrnts_df.groupby("vrnt_id")["distance"].idxmax()
misexp_tx_read_vrnts_closest_df = (
    misexp_tx_read_vrnts_df.loc[idx_closest_distance].reset_index(drop=True)
)

**Metrics of transcriptional readthrough candidate DELs**

In [18]:
# Unique candidate variant / misexpressed-gene pairs; misexp_gene_id is renamed back to
# gene_id so the pairs can be joined to the variant feature table.
misex_tx_read_vrnt_gene_df = (
    misexp_tx_read_vrnts_closest_df[["vrnt_id", "misexp_gene_id", "overlap_gene_id", "distance"]]
    .drop_duplicates()
    .rename(columns={"misexp_gene_id": "gene_id"})
)
print(f"Number of transcriptional readthrough candidate variant-gene pairs: {len(misex_tx_read_vrnt_gene_df)}")
print(f"Transcriptional readthrough candidate variants: {set(misex_tx_read_vrnt_gene_df.vrnt_id.tolist())}")

Number of transcriptional readthrough candidate variant-gene pairs: 12
Transcriptional readthrough candidate variants: {'293588', 'DEL_chr2_100567565_100578556', '252546', '179078', '284739', 'DEL_chr12_102008109_102015494', 'DEL_chr17_58705468_58742168', 'DEL_chr8_103436284_103461045', '201247', '123122', '188910', 'DEL_chr3_187069321_187094542'}


In [19]:
# Bring in the full variant feature set for each candidate variant-gene pair
# (inner join: only pairs present in the feature table are kept).
misexp_tx_read_vrnt_feat_df = misex_tx_read_vrnt_gene_df.merge(
    misexp_vrnt_feat_df,
    on=["vrnt_id", "gene_id"],
    how="inner",
)

In [20]:
# Save the candidate table as a tab-separated file without the index column.
misexp_tx_read_vrnt_feat_path = out_dir_path / "misexp_del_tx_readthrough_candidates.tsv"
misexp_tx_read_vrnt_feat_df.to_csv(misexp_tx_read_vrnt_feat_path, sep="\t", index=False)

In [21]:
# Save the unique candidate variant IDs, one per line.
misexp_tx_read_vrnts = misexp_tx_read_vrnt_feat_df["vrnt_id"].unique()
misexp_tx_read_vrnts_path = out_dir_path / "misexp_del_tx_vrnts.txt"
with misexp_tx_read_vrnts_path.open("w") as f_out:
    for vrnt_id in misexp_tx_read_vrnts:
        f_out.write(f"{vrnt_id}\n")