### Comparison of different misexpression mechanisms 

* Assign each misexpressed gene-sample pair to a mechanism 

In [1]:
from pathlib import Path 
import pandas as pd

In [2]:
# Root working directory of the analysis, kept both as a string and as a Path.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# Output directory for this notebook; created (with any missing parents) if absent.
outdir = wkdir_path / "6_misexp_dissect/combined"
outdir.mkdir(parents=True, exist_ok=True)

In [3]:
# Variant feature table (tab-separated). Later cells read its gene_id, rna_id,
# TPM, z-score, SVTYPE, position and annotation-overlap columns.
misexp_vrnt_feat_path = wkdir_path / "6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv"
misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep="\t")

In [4]:
# Transcriptional readthrough candidate SVs: load the variant / gene / sample table,
# then keep one row per distinct (gene_id, rna_id) pair.
tx_read_vrnt_gene_smpl_path = wkdir_path / "6_misexp_dissect/tx_readthrough/combine/tx_read_vrnts_gene_smpl.tsv"
tx_read_vrnt_gene_smpl_df = pd.read_csv(tx_read_vrnt_gene_smpl_path, sep="\t")
tx_read_gene_smpl_df = tx_read_vrnt_gene_smpl_df.loc[:, ["gene_id", "rna_id"]].drop_duplicates()

In [5]:
# Transcript fusion candidates: load the variant / gene / sample table from the
# STAR-Fusion results directory, then keep one row per distinct (gene_id, rna_id) pair.
tx_fusion_vrnt_gene_smpl_path = wkdir_path / "6_misexp_dissect/star_fusion/results_stringent_qc/tx_fusion_vrnts_gene_smpl.tsv"
tx_fusion_vrnt_gene_smpl_df = pd.read_csv(tx_fusion_vrnt_gene_smpl_path, sep="\t")
tx_fusion_gene_smpl_df = tx_fusion_vrnt_gene_smpl_df.loc[:, ["gene_id", "rna_id"]].drop_duplicates()

In [6]:
# Check the overlap between transcript fusion and transcriptional readthrough pairs.
# The left-merge keeps every fusion pair; `_merge` records whether it is also a
# readthrough pair ("both") or fusion-only ("left_only").
tx_read_fusion_merge_df = tx_fusion_gene_smpl_df.merge(
    tx_read_gene_smpl_df,
    on=["gene_id", "rna_id"],
    how="left",
    indicator=True,
)
in_both_mask = tx_read_fusion_merge_df["_merge"] == "both"
fusion_only_mask = tx_read_fusion_merge_df["_merge"] == "left_only"

# Genes found by both approaches (author's note: two intergenic fusion examples,
# RTP1 and OTP). These are labelled as transcriptional readthrough, so they are
# removed from the fusion set below.
tx_read_fusion_gene_id = tx_read_fusion_merge_df.loc[in_both_mask, "gene_id"].unique()
print(tx_read_fusion_gene_id)
tx_fusion_gene_smpl_rmv_tx_read_df = tx_read_fusion_merge_df.loc[fusion_only_mask].drop(columns=["_merge"])

['ENSG00000175077' 'ENSG00000171540']


In [7]:
# All variant IDs implicated by either mechanism table (transcript fusion or readthrough).
mech_vrnts = set(tx_fusion_vrnt_gene_smpl_df["vrnt_id"].unique()) | set(tx_read_vrnt_gene_smpl_df["vrnt_id"].unique())

In [8]:
# Label every gene-sample pair with its putative mechanism.
# `.assign` builds new frames instead of writing a column into frames that were derived
# from a selection / drop_duplicates(), which is what raises pandas' SettingWithCopyWarning.
# The names are rebound so they still carry the "mechanism" column afterwards, and
# re-running this cell gives the same result.
tx_fusion_gene_smpl_rmv_tx_read_df = tx_fusion_gene_smpl_rmv_tx_read_df.assign(mechanism="Transcript fusion")
tx_read_gene_smpl_df = tx_read_gene_smpl_df.assign(mechanism="Transcriptional readthrough")
tx_read_fusion_gene_smpl_df = pd.concat([tx_fusion_gene_smpl_rmv_tx_read_df, tx_read_gene_smpl_df])

# Check no duplicates: each (gene_id, rna_id) pair must carry exactly one mechanism,
# otherwise a later merge on these keys would repeat rows and inflate the counts.
# The assert makes a violation stop a Run All instead of silently displaying False.
tx_read_fusion_pairs_unique = not tx_read_fusion_gene_smpl_df.duplicated(subset=["gene_id", "rna_id"]).any()
assert tx_read_fusion_pairs_unique, "gene-sample pair assigned to more than one mechanism"
tx_read_fusion_pairs_unique

True

In [9]:
# Gene inversion mechanism: rows of the variant feature table where the SV is an
# inversion (SVTYPE == "INV") and the misexpressed gene is ENSG00000114547.
misexp_gene_inv_df = misexp_vrnt_feat_df[
    (misexp_vrnt_feat_df["SVTYPE"] == "INV")
    & (misexp_vrnt_feat_df["gene_id"] == "ENSG00000114547")
]

# One row per (gene_id, rna_id) pair, labelled with the mechanism. `.assign` is used
# rather than column assignment on the de-duplicated selection, which avoids pandas'
# SettingWithCopyWarning.
misexp_gene_inv_no_dups_df = (
    misexp_gene_inv_df[["gene_id", "rna_id"]]
    .drop_duplicates()
    .assign(mechanism="Gene inversion")
)

# Combine misexpression mechanisms (readthrough, fusion, gene inversion).
# NOTE(review): nothing here checks that the inversion pairs are absent from the
# readthrough/fusion pairs; an overlapping pair would appear twice - confirm.
misexp_mech_known_df = pd.concat([tx_read_fusion_gene_smpl_df, misexp_gene_inv_no_dups_df])

In [10]:
# All misexpressed gene-sample pairs with their expression level (TPM) and z-score.
misexp_gene_smpl_all_df = misexp_vrnt_feat_df[["gene_id", "rna_id", "TPM", "z-score"]].drop_duplicates()

# Attach the known mechanism to each pair; pairs without one become "Unknown mechanism".
# Only the `mechanism` column is filled: a frame-wide fillna would also overwrite any
# missing TPM / z-score with the string "Unknown mechanism" and turn those numeric
# columns into object dtype.
misexp_gene_smpl_all_mech_df = pd.merge(
    misexp_gene_smpl_all_df,
    misexp_mech_known_df,
    on=["gene_id", "rna_id"],
    how="left",
).fillna({"mechanism": "Unknown mechanism"})

# Count gene-sample pairs for each mechanism (one row per pair, so the group size is
# the number of pairs). Result columns: "mechanism", "count".
misexp_gene_smpl_mech_count_df = (
    misexp_gene_smpl_all_mech_df
    .groupby("mechanism")
    .size()
    .reset_index(name="count")
)

In [11]:
# Save the per-mechanism counts as a tab-separated file, without the index column.
misexp_gene_smpl_mech_count_path = outdir / "misexp_gene_smpl_mech_count.tsv"
misexp_gene_smpl_mech_count_df.to_csv(
    misexp_gene_smpl_mech_count_path,
    sep="\t",
    index=False,
)

In [12]:
# Proportion of misexpression events (gene-sample pairs with a misexpression-associated SV)
# that have a putative mechanism versus an unknown one.
total_gene_smpl_pair = len(misexp_gene_smpl_all_mech_df)
print(f"Total misexpressed gene sample pairs: {total_gene_smpl_pair}")

# Pairs whose mechanism label is anything other than "Unknown mechanism".
has_putative_mech = misexp_gene_smpl_all_mech_df["mechanism"] != "Unknown mechanism"
total_mech_gene_smpl_pair = int(has_putative_mech.sum())
print(f"Total misexpressed gene-sample pairs with putative mechanism: {total_mech_gene_smpl_pair}")
print(f"Proportion events with putative mechanism: {total_mech_gene_smpl_pair}/{total_gene_smpl_pair}")
print(f"Percentage events with putative mechanism: {(total_mech_gene_smpl_pair/total_gene_smpl_pair) *100}")

# Save the pair-level table (gene, sample, TPM, z-score, mechanism) as a TSV.
misexp_level_gene_smpl_mech_path = outdir / "misexp_level_gene_smpl_mech.tsv"
misexp_gene_smpl_all_mech_df.to_csv(
    misexp_level_gene_smpl_mech_path,
    sep="\t",
    index=False,
)

Total misexpressed gene sample pairs: 98
Total misexpressed gene-sample pairs with putative mechanism: 41
Proportion events with putative mechanism: 41/98
Percentage events with putative mechanism: 41.83673469387755


In [13]:
### Average z-scores across mechanisms
def _zscores_for(mechanism):
    """Return the z-scores of all gene-sample pairs labelled with `mechanism`, as a list."""
    is_mechanism = misexp_gene_smpl_all_mech_df["mechanism"] == mechanism
    return misexp_gene_smpl_all_mech_df.loc[is_mechanism, "z-score"].tolist()


unknown_zscores = _zscores_for("Unknown mechanism")
tx_read_zscores = _zscores_for("Transcriptional readthrough")
tx_fusion_zscores = _zscores_for("Transcript fusion")
gene_inversion_zscores = _zscores_for("Gene inversion")

# Mean z-score per mechanism (plain arithmetic mean of each list).
mean_tx_read = sum(tx_read_zscores) / len(tx_read_zscores)
print(f"Transcriptional readthrough mean z-score: {mean_tx_read}")

mean_unknown = sum(unknown_zscores) / len(unknown_zscores)
print(f"Unknown mechanism mean z-score: {mean_unknown}")

mean_tx_fusion = sum(tx_fusion_zscores) / len(tx_fusion_zscores)
print(f"Transcript fusion mean z-score: {mean_tx_fusion}")

mean_gene_inv = sum(gene_inversion_zscores) / len(gene_inversion_zscores)
print(f"Gene inversion mean z-score: {mean_gene_inv}")

Transcriptional readthrough mean z-score: 33.95421999539768
Unknown mechanism mean z-score: 25.887858193512624
Transcript fusion mean z-score: 52.04248156837565
Gene inversion mean z-score: 15.940637785356975


### Remaining misexpression-associated variants 

* Examine whether the remaining variants overlap TAD boundaries and CTCF cCRE binding sites

In [14]:
# Keep only the variants of gene-sample pairs that have no clear mechanism.
# Variants belonging to pairs with an identified mechanism are dropped, which removes
# suspected non-causal SVs for those pairs.
is_unknown_mech = misexp_gene_smpl_all_mech_df["mechanism"] == "Unknown mechanism"
unknown_mech_gene_smpl_df = misexp_gene_smpl_all_mech_df.loc[is_unknown_mech]

# Inner-join back onto the variant feature table. TPM and z-score are part of the key
# because both frames carry them.
misexp_vrnt_feat_unkown_df = misexp_vrnt_feat_df.merge(
    unknown_mech_gene_smpl_df,
    on=["gene_id", "rna_id", "TPM", "z-score"],
    how="inner",
)

In [15]:
# Variants overlapping TAD boundaries and CTCF binding sites (filters applied in the following cells)

In [16]:
# Duplication criteria, applied to variants of pairs with no assigned mechanism:
#   * SV type is DUP
#   * overlaps a TAD boundary and a CTCF binding site
#   * overlaps a genic enhancer (EnhG) or an enhancer (Enh)
#   * overlaps the entire misexpressed gene
dup_candidate_mask = (
    misexp_vrnt_feat_unkown_df["SVTYPE"].eq("DUP")
    & misexp_vrnt_feat_unkown_df["gm12878_shared_intersect_tad_boundary"].eq(1)
    & misexp_vrnt_feat_unkown_df["CTCFonlyCTCFbound_all"].eq(1)
    & misexp_vrnt_feat_unkown_df["position"].eq("Entire gene")
    & (
        misexp_vrnt_feat_unkown_df["EnhG"].eq(1)
        | misexp_vrnt_feat_unkown_df["Enh"].eq(1)
    )
)
misexp_vrnt_feat_unkown_df.loc[dup_candidate_mask]

Unnamed: 0,vrnt_id,gene_id,gene_msc,SVTYPE,msc,chrom,sv_start,sv_end,gene_start,gene_end,...,oe_lof_upper,approved_target,decipher_gene,omim_gene,egan_id,rna_id,TPM,z-score,genotype,mechanism
4,425231,ENSG00000236714,non_coding_transcript_exon_variant,DUP,coding_sequence_variant,chr5,142642218,142903267,142716228,142761035,...,,0,0,0,EGAN00001585971,INT_RNA7959310,22.477649,67.353095,"(0, 1)",Unknown mechanism


In [17]:
# Deletion criteria, applied to variants of pairs with no assigned mechanism:
#   * SV type is DEL
#   * does not overlap the misexpressed gene (position is none of the overlap categories)
#   * overlaps a TAD boundary
#   * overlaps a CTCF-binding site
gene_overlap_positions = ["Partial overlap 5' end", "Partial overlap 3' end", 'Internal', "Entire gene"]
overlaps_gene = misexp_vrnt_feat_unkown_df["position"].isin(gene_overlap_positions)
del_candidate_mask = (
    misexp_vrnt_feat_unkown_df["SVTYPE"].eq("DEL")
    & misexp_vrnt_feat_unkown_df["gm12878_shared_intersect_tad_boundary"].eq(1)
    & misexp_vrnt_feat_unkown_df["CTCFonlyCTCFbound_all"].eq(1)
    & ~overlaps_gene
)
misexp_vrnt_feat_unkown_df.loc[del_candidate_mask]

Unnamed: 0,vrnt_id,gene_id,gene_msc,SVTYPE,msc,chrom,sv_start,sv_end,gene_start,gene_end,...,oe_lof_upper,approved_target,decipher_gene,omim_gene,egan_id,rna_id,TPM,z-score,genotype,mechanism
7,DEL_chr12_49269781_49291419,ENSG00000258334,downstream_gene_variant,DEL,stop_lost,chr12,49269781,49291419,49292630,49324576,...,,0,0,0,EGAN00001240929,INT_RNA7879099,1.606799,54.427192,"(0, 1)",Unknown mechanism
65,96700,ENSG00000180785,no_predicted_effect,DEL,transcript_ablation,chr11,4495092,4614586,4643419,4655488,...,1.68,0,0,1,EGAN00001361345,INT_RNA7710317,2.233265,45.275653,"(0, 1)",Unknown mechanism
73,96700,ENSG00000180785,no_predicted_effect,DEL,transcript_ablation,chr11,4495092,4614586,4643419,4655488,...,1.68,0,0,1,EGAN00001584413,INT_RNA7879036,2.224477,45.09719,"(0, 1)",Unknown mechanism
