### Process STAR Fusion Max Sensitivity Results

In [1]:
import pandas as pd 
from pathlib import Path 

In [2]:
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/"
wkdir_path = Path(wkdir)
misexp_vrnt_feat_path = wkdir_path.joinpath("6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv")
misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep="\t")
star_fusion_max_sensitivity = wkdir_path.joinpath("6_misexp_dissect/star_fusion/results_max_sensitivity")
star_fusion_max_sensitivity_path = Path(star_fusion_max_sensitivity)

In [3]:
misexp_tx_fusion_df_list = []
for index,row in misexp_vrnt_feat_df.iterrows():
    vrnt_id = row["vrnt_id"]
    misexp_gene_id = row["gene_id"]
    rna_id = row["rna_id"]
    # load STAR fusion predictions 
    star_fusion_predict_path = star_fusion_max_sensitivity_path.joinpath(f"{rna_id}/star-fusion.fusion_predictions.abridged.tsv")
    star_fusion_predict_df = pd.read_csv(star_fusion_predict_path, sep="\t")
    # check if misexpressed gene in LeftGene
    if not star_fusion_predict_df[star_fusion_predict_df.LeftGene.str.contains(misexp_gene_id)].empty:
        misexp_fusion_df = star_fusion_predict_df[star_fusion_predict_df.LeftGene.str.contains(misexp_gene_id)].copy()
        misexp_fusion_df["rna_id"] = rna_id 
        misexp_fusion_df["vrnt_id"] = vrnt_id 
        misexp_fusion_df["gene_id"] = misexp_gene_id 
        misexp_tx_fusion_df_list.append(misexp_fusion_df)
    # check if misexpressed gene in RightGene
    elif not star_fusion_predict_df[star_fusion_predict_df.RightGene.str.contains(misexp_gene_id)].empty:
        misexp_fusion_df = star_fusion_predict_df[star_fusion_predict_df.RightGene.str.contains(misexp_gene_id)].copy()
        misexp_fusion_df["rna_id"] = rna_id 
        misexp_fusion_df["vrnt_id"] = vrnt_id 
        misexp_fusion_df["gene_id"] = misexp_gene_id 
        misexp_tx_fusion_df_list.append(misexp_fusion_df)
    else:
        continue

In [4]:
misexp_tx_fusion_df = pd.concat(misexp_tx_fusion_df_list).reset_index(drop=True)
fusion_names = misexp_tx_fusion_df['#FusionName'].unique()
print(f"Number of different fusion gene pairs: {len(fusion_names)}")

# some fusion events have the same name but involve different breakpoints 
misexp_tx_fusion_df["fusion_id"] = misexp_tx_fusion_df["#FusionName"] + "|" + misexp_tx_fusion_df["LeftBreakpoint"] + "|" + misexp_tx_fusion_df["RightBreakpoint"]
fusion_ids = misexp_tx_fusion_df.fusion_id.unique()
print(f"Number of different types of fusion event: {len(fusion_ids)}")

Number of different fusion gene pairs: 15
Number of different types of fusion event: 21


In [5]:
# check that fusion events occur in all samples carrying variant 
vrnts_consistent_fusion = []
misexp_tx_fusion_vrnt_ids = misexp_tx_fusion_df.vrnt_id.unique()
print(f"Number of misexpression-associated SVs in cis to fusion event: {len(misexp_tx_fusion_vrnt_ids)}")
for vrnt_id in misexp_tx_fusion_vrnt_ids: 
    fusion_rna_ids = set(misexp_tx_fusion_df[misexp_tx_fusion_df.vrnt_id == vrnt_id].rna_id.unique())
    misexp_rna_ids = set(misexp_vrnt_feat_df[misexp_vrnt_feat_df.vrnt_id == vrnt_id].rna_id.unique())
    if fusion_rna_ids == misexp_rna_ids: 
        vrnts_consistent_fusion.append(vrnt_id)
print(f"Number of misexpression-associated SVs with consistent fusion events: {len(vrnts_consistent_fusion)}")

Number of misexpression-associated SVs in cis to fusion event: 24
Number of misexpression-associated SVs with consistent fusion events: 22


In [6]:
# select fusion events that are supported by all variant carriers 
fusion_vrnt_carriers_df = misexp_tx_fusion_df[["vrnt_id", "rna_id", "fusion_id", "LeftBreakpoint", "RightBreakpoint"]].drop_duplicates()
# count number of samples fusion is observed in  
fusion_vrnt_carriers_count_df = fusion_vrnt_carriers_df.groupby(["vrnt_id", "fusion_id", "LeftBreakpoint", "RightBreakpoint"], as_index=False).rna_id.count().rename(columns={"rna_id": "carrier_count_fusion"})
# count carriers 
misexp_vrnt_carrier_df = misexp_vrnt_feat_df[["vrnt_id", "rna_id"]].drop_duplicates()
misexp_vrnt_carrier_count_df = misexp_vrnt_carrier_df.groupby("vrnt_id", as_index=False).rna_id.nunique().rename(columns={"rna_id": "carrier_count_total"})
misexp_fusion_vrnt_carrier_df = pd.merge(misexp_vrnt_carrier_count_df, 
                                         fusion_vrnt_carriers_count_df, 
                                         on="vrnt_id", how="inner")
# restrict to fusion events observed across all carriers 
misexp_fusion_vrnt_consistent_df = misexp_fusion_vrnt_carrier_df[misexp_fusion_vrnt_carrier_df.carrier_count_total == misexp_fusion_vrnt_carrier_df.carrier_count_fusion].copy()

In [7]:
vrnt_id_consitent_fusion = misexp_fusion_vrnt_consistent_df.vrnt_id.unique()
print(f"Number of variant IDs where all carriers have a low evidence fusion: {len(vrnt_id_consitent_fusion)}")
fusion_id_consistent = misexp_fusion_vrnt_consistent_df.fusion_id.unique()
print(f"Number of fusion IDs where all carriers have a low evidence fusion: {len(fusion_id_consistent)}")

Number of variant IDs where all carriers have a low evidence fusion: 22
Number of fusion IDs where all carriers have a low evidence fusion: 17


In [8]:
misexp_tx_fusion_consistent_df = misexp_tx_fusion_df[misexp_tx_fusion_df.fusion_id.isin(fusion_id_consistent)].reset_index(drop=True).copy()
misexp_gene_tx_fusion_events_df = misexp_tx_fusion_consistent_df.groupby("gene_id", as_index=False)[["rna_id", "fusion_id", "vrnt_id"]].nunique()
fusion_events_misexp_gene = misexp_gene_tx_fusion_events_df.gene_id.unique()
print(f"Number of fusion genes involving a misexpressed gene: {len(fusion_events_misexp_gene)}")
# Number of fusion 
low_evidence_fusion_transcripts = misexp_tx_fusion_consistent_df["#FusionName"].unique()
print(f"Number of fusion transcripts: {len(low_evidence_fusion_transcripts)}")

Number of fusion genes involving a misexpressed gene: 12
Number of fusion transcripts: 12


In [9]:
# write RNA IDs to file for stringent filtering 
misexp_rna_id_fusion = misexp_tx_fusion_df.rna_id.unique()
print(f"Number of RNA IDs for stringent STAR fusion filtering: {len(misexp_rna_id_fusion)}")

Number of RNA IDs for stringent STAR fusion filtering: 14


In [10]:
star_fusion_rna_ids_max_sensitivity_path = wkdir_path.joinpath("6_misexp_dissect/star_fusion/star_fusion_rna_ids_max_sensitivity.txt")
with open(star_fusion_rna_ids_max_sensitivity_path, "w") as f_out: 
    for rna_id in misexp_rna_id_fusion: 
        f_out.write(f"{rna_id}\n")