### STAR fusion stringent QC 

In [1]:
import pandas as pd 
from pathlib import Path 

In [2]:
# Root of the analysis tree (absolute, cluster-specific path)
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# inputs: tab-separated variant feature table; later cells read its
# vrnt_id, gene_id and rna_id columns
misexp_vrnt_feat_path = wkdir_path / "6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv"
misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep="\t")

In [3]:
# Directory holding one sub-directory of STAR-Fusion/FusionInspector results per RNA ID;
# the output files of this notebook are written here as well
star_fusion_stringent_fusion_path = wkdir_path / "6_misexp_dissect/star_fusion/results_stringent_qc"

In [4]:
# RNA IDs to inspect: first column of a headerless, tab-separated file.
# NOTE(review): the file name says "max_sensitivity" although this notebook covers the
# stringent QC run — confirm this is the intended sample list.
star_fusion_stringent_rna_path = wkdir_path / "6_misexp_dissect/star_fusion/star_fusion_rna_ids_max_sensitivity.txt"
star_fusion_stringent_rna = (
    pd.read_csv(star_fusion_stringent_rna_path, sep="\t", header=None)
    .iloc[:, 0]
    .tolist()
)
print(f"Number of RNA IDs: {len(star_fusion_stringent_rna)}")
# keep only the variant-feature rows that belong to those samples
misexp_vrnt_feat_fusion_df = misexp_vrnt_feat_df.loc[
    misexp_vrnt_feat_df["rna_id"].isin(star_fusion_stringent_rna)
]

Number of RNA IDs: 14


In [5]:
# For every variant-gene-sample row, collect the FusionInspector fusion calls of that
# sample in which the misexpressed gene appears as EITHER fusion partner.
# Each collected table is annotated with the rna_id / vrnt_id / gene_id it was found for.
misexp_tx_fusion_stringent_df_list = []
# rna_id -> predictions table, so each sample's file is parsed once even when the
# sample appears in several variant-gene rows
star_fusion_predict_cache = {}
for index, row in misexp_vrnt_feat_fusion_df.iterrows():
    vrnt_id = row["vrnt_id"]
    misexp_gene_id = row["gene_id"]
    rna_id = row["rna_id"]
    # load STAR fusion predictions (a missing file still raises, as before)
    if rna_id not in star_fusion_predict_cache:
        star_fusion_predict_path = star_fusion_stringent_fusion_path.joinpath(f"{rna_id}/FusionInspector-validate/finspector.FusionInspector.fusions.abridged.tsv")
        star_fusion_predict_cache[rna_id] = pd.read_csv(star_fusion_predict_path, sep="\t")
    star_fusion_predict_df = star_fusion_predict_cache[rna_id]
    # literal substring match (regex=False) of the gene ID against each partner column;
    # na=False keeps the mask strictly boolean if a gene name is missing
    in_left_gene = star_fusion_predict_df.LeftGene.str.contains(misexp_gene_id, regex=False, na=False)
    in_right_gene = star_fusion_predict_df.RightGene.str.contains(misexp_gene_id, regex=False, na=False)
    # The previous if/elif only looked at RightGene when no LeftGene row matched, which
    # dropped RightGene hits for samples that also had a LeftGene hit. Keep both.
    misexp_fusion_df = star_fusion_predict_df[in_left_gene | in_right_gene].copy()
    if misexp_fusion_df.empty:
        continue
    misexp_fusion_df["rna_id"] = rna_id
    misexp_fusion_df["vrnt_id"] = vrnt_id
    misexp_fusion_df["gene_id"] = misexp_gene_id
    misexp_tx_fusion_stringent_df_list.append(misexp_fusion_df)

In [6]:
# stack the per-row fusion hits into a single table with a fresh 0..n-1 index
misexp_tx_fusion_stringent_df = pd.concat(misexp_tx_fusion_stringent_df_list, ignore_index=True)

In [7]:
# check that fusion events occur in all samples carrying variant 
vrnts_consistent_fusion = []
misexp_tx_fusion_vrnt_ids = misexp_tx_fusion_stringent_df.vrnt_id.unique()
print(f"Number of misexpression-associated SVs in cis to fusion event: {len(misexp_tx_fusion_vrnt_ids)}")
for vrnt_id in misexp_tx_fusion_vrnt_ids: 
    fusion_rna_ids = set(misexp_tx_fusion_stringent_df[misexp_tx_fusion_stringent_df.vrnt_id == vrnt_id].rna_id.unique())
    misexp_rna_ids = set(misexp_vrnt_feat_df[misexp_vrnt_feat_df.vrnt_id == vrnt_id].rna_id.unique())
    if fusion_rna_ids == misexp_rna_ids: 
        vrnts_consistent_fusion.append(vrnt_id)
print(f"Number of misexpression-associated SVs with consistent fusion events: {len(vrnts_consistent_fusion)}")

Number of misexpression-associated SVs in cis to fusion event: 16
Number of misexpression-associated SVs with consistent fusion events: 16


In [8]:
# distinct fusion gene pairs among the collected calls
fusion_stringent_names = misexp_tx_fusion_stringent_df["#FusionName"].unique()
print(f"Number of different fusion gene pairs (stringent QC): {len(fusion_stringent_names)}")
# some fusion events have the same name but involve different breakpoints,
# so the ID is built as name|left breakpoint|right breakpoint
# (this adds the fusion_id column to misexp_tx_fusion_stringent_df in place)
misexp_tx_fusion_stringent_df["fusion_id"] = (
    misexp_tx_fusion_stringent_df["#FusionName"]
    + "|" + misexp_tx_fusion_stringent_df["LeftBreakpoint"]
    + "|" + misexp_tx_fusion_stringent_df["RightBreakpoint"]
)
fusion_ids_stringent = misexp_tx_fusion_stringent_df["fusion_id"].unique()
print(f"Number of different types of fusion event (stringent QC): {len(fusion_ids_stringent)}")

# select fusion events that are consistent across all variant carriers
misexp_tx_fusion_stringent_carriers_df = misexp_tx_fusion_stringent_df.loc[
    :, ["vrnt_id", "rna_id", "fusion_id", "LeftBreakpoint", "RightBreakpoint"]
].drop_duplicates()
# count carriers per variant / fusion event
misexp_tx_fusion_stringent_carriers_count_df = (
    misexp_tx_fusion_stringent_carriers_df
    .groupby(["vrnt_id", "fusion_id", "LeftBreakpoint", "RightBreakpoint"], as_index=False)
    .agg(carrier_count_fusion=("rna_id", "count"))
)
# number of carriers per misexpression-associated SV, taken from the full feature table
misexp_vrnt_carrier_df = misexp_vrnt_feat_df.loc[:, ["vrnt_id", "rna_id"]].drop_duplicates()
misexp_vrnt_carrier_count_df = (
    misexp_vrnt_carrier_df
    .groupby("vrnt_id", as_index=False)
    .agg(carrier_count_total=("rna_id", "nunique"))
)
# put both counts side by side; the inner join keeps only variants that have a fusion call
misexp_fusion_stringent_vrnt_carrier_df = misexp_vrnt_carrier_count_df.merge(
    misexp_tx_fusion_stringent_carriers_count_df,
    on="vrnt_id",
    how="inner",
)
# restrict to fusion events observed across all carriers
misexp_fusion_stringent_vrnt_consistent_df = misexp_fusion_stringent_vrnt_carrier_df.loc[
    lambda counts: counts.carrier_count_total == counts.carrier_count_fusion
].copy()

vrnt_id_consitent_fusion_stringent = misexp_fusion_stringent_vrnt_consistent_df["vrnt_id"].unique()
print(f"Number of variant IDs where all carriers have a high evidence fusion: {len(vrnt_id_consitent_fusion_stringent)}")
fusion_id_consistent_stringent = misexp_fusion_stringent_vrnt_consistent_df["fusion_id"].unique()
print(f"Number of fusion IDs where all carriers have a high evidence fusion: {len(fusion_id_consistent_stringent)}")

Number of different fusion gene pairs (stringent QC): 10
Number of different types of fusion event (stringent QC): 12
Number of variant IDs where all carriers have a high evidence fusion: 16
Number of fusion IDs where all carriers have a high evidence fusion: 12


In [9]:
# subset to variants only observed in cis to fusion event 
misexp_tx_fusion_stringent_consistent_df = misexp_tx_fusion_stringent_df[misexp_tx_fusion_stringent_df.vrnt_id.isin(vrnt_id_consitent_fusion_stringent)]
fusion_names_consistent = misexp_tx_fusion_stringent_consistent_df["#FusionName"].unique()
print(f"Number of different fusion gene pairs (stringent QC) in cis to variant: {len(fusion_names_consistent)}") 

Number of different fusion gene pairs (stringent QC) in cis to variant: 10


In [10]:
# write variant-gene-sample pairs supported by fusion event 
fusion_vrnt_gene_smpl_df = misexp_tx_fusion_stringent_consistent_df[["vrnt_id", "gene_id", "rna_id"]].drop_duplicates()
# write to file 
fusion_vrnt_gene_smpl_path = star_fusion_stringent_fusion_path.joinpath("tx_fusion_vrnts_gene_smpl.tsv")
fusion_vrnt_gene_smpl_df.to_csv(fusion_vrnt_gene_smpl_path, sep="\t", index=False)

In [11]:
print(f"Total number of transcript fusion candidate variants: {len(vrnt_id_consitent_fusion_stringent)}")
# write the candidate variant IDs as a plain list, one ID per line
tx_fusion_vrnts_path = star_fusion_stringent_fusion_path / "tx_fusion_vrnts_list.txt"
with tx_fusion_vrnts_path.open("w") as f_out:
    for vrnt_id in vrnt_id_consitent_fusion_stringent:
        print(vrnt_id, file=f_out)

Total number of transcript fusion candidate variants: 16


Examined FusionInspector results in IGV: 

### Deletions

**Prioritised by fusion criteria pipeline**
* BOD1L1--LINC01097
    * Variant ID: DEL_chr4_13529219_13608506
    * Gene ID: ENSG00000281202 (LINC01097) misexpressed 
    * RNA ID: INT_RNA7960028 

* FBXO8--LINC02268 fusion 
    * 284739 (likely causal variant)
        * Three other variants in cis window: 
            * 284735 intronic variant in both genes 
            * DEL_chr4_174181857_174193905 intronic variant ENSG00000248174, no predicted effect ENSG00000250957
            * DEL_chr4_174236814_174243134 upstream variant (not tx readthrough prioritised)
    * 2 genes misexpressed: ENSG00000248174 & ENSG00000250957
        * ENSG00000250957 is located inside the other gene ENSG00000248174 (LINC02268). 
        * ENSG00000250957 misexpressed via transcriptional readthrough. 
    * 2 supporting reads for fusion transcript 
    * Variant prioritised in both fusion transcript workflow and transcriptional readthrough pipeline 
        * ENSG00000248174 misexpressed via fusion 
        * ENSG00000250957 misexpressed via readthrough

**Unclear mechanism**
* CPPED1--AC010333.2
    * RNA ID: INT_RNA7960040
    * Variant ID: 158491
        * Intronic variant in the misexpressed gene involved in fusion
    * Two genes misexpressed at locus: ENSG00000259876 (novel gene) & ENSG00000259899 (novel gene)
    * ENSG00000259899 is involved in fusion 
    * Fusion transcript linking 3rd exon to last exon - 4 reads spanning splice site 
    * Potential cryptic splicing event generating novel splice site (not possible to confirm)
    
* MAP2K3--LINC01563
    * Variant ID: 173184
        * Affects one gene in one sample 
    * Gene ID: ENSG00000236819 (LINC01563)
    * RNA ID: INT_RNA7960873
    * Two different fusion events, hence duplicated in file
    * This has not been picked up by DEL fusion prioritisation because:
        * Upstream gene is USP22, it is on the wrong strand to form fusion product 
        * LINC01563 is on + strand as is MAP2K3, however it is upstream of MAP2K3, unclear how transcript fusion is generated 
    * Nice example for deletion mechanism - 34 reads spanning breakpoint
    * Appears to be fusion of the 5'UTR of MAP2K3 to the 3' UTR of LINC01563 - novel non-coding transcript
       
* TAF1C--CDH13
    * Variant ID: DEL_chr16_83921765_83922967, DEL_chr16_83997914_83999941, DEL_chr16_83966491_84032889
        * All variants affect same gene in same sample 
    * RNA ID: INT_RNA7959485
    * Gene ID: ENSG00000140945
    * Only one read supporting fusion transcript and no long anchor support
    * Multiple variants, unclear which is causal if any
    
**Readthrough and intergenic splicing**
* ST6GAL1--RTP1 - transcriptional readthrough with intergenic splicing 
    * Variant IDs: DEL_chr3_187079365_187083300 and DEL_chr3_187069321_187094542
        * DEL_chr3_187069321_187094542 is associated with misexpression of two genes: ENSG00000175077 & ENSG00000283175 - one is further away, mechanism is unclear 
        * Variant pair only found in one sample, one is prioritised by tx readthrough while other is not
    * Gene ID: ENSG00000175077 (RTP1)
    * RNA ID: INT_RNA7710961

* TBCA--OTP - 293588 - transcriptional readthrough with intergenic splicing 
    * Variant ID: 293588
        * Only affects one gene in one sample 
    * RNA_ID: INT_RNA7710599
    * Gene ID: ENSG00000171540
    * Transcriptional readthrough event with intergenic fusion 

### Duplications
* STON2--LINC02308 - previously characterised DUP leading to Tx fusion
    * 2 samples: INT_RNA7710877 & INT_RNA7710267
    * Variant ID: 397951 (present in both samples) 
        * Only associated with one misexpressed gene 
    * ENSG00000258675 (LINC02308)
* KARS--CPHXL - previously characterised DUP leading to Tx fusion
    * RNA ID: INT_RNA7710560
    * Variant ID: 401916 
        * Only affects one gene in one sample 
    * Gene ID: ENSG00000283755 (CPHXL)
* AC005747.1--MYH1 - previously characterised DUP leading to Tx fusion
    * RNA ID: INT_RNA7961048
    * Variant ID: 402648
        * Only affects one gene in one sample 
    * Gene ID: ENSG00000109061 (MYH1)