### Calculate FPKM and FBNC (fraction of bases with non-zero coverage) over the expected readthrough region for candidate DELs and DUPs

In [1]:
import pandas as pd
import pysam 
from pathlib import Path 
import numpy as np
from scipy.stats import spearmanr

In [2]:
# Root of the misexpression (v3) analysis tree; project-relative paths below hang off it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# ---- inputs ----
# features of misexpression-associated variants
misexp_vrnt_feat_path = wkdir_path / "6_misexp_dissect/vrnt_features/misexp_vrnt_features.tsv"
# per-sample covariates table (rna_id and RawReadDepth are read from it later)
covariates_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/phenotypes/rna_seq/processed_v97/covariates/master/master_covariates_v97_swapd_depth_fastq_rin_cell_sex_pcs_season_batch_fc_pipelines_updtd.tsv"
# RNA IDs passing sample QC
rna_id_pass_qc_path = wkdir_path / "1_rna_seq_qc/aberrant_smpl_qc/smpls_pass_qc_4568.csv"
# WGS (EGAN ID) <-> RNA ID sample pairing
wgs_rna_paired_smpls_path = wkdir_path / "1_rna_seq_qc/wgs_rna_match/paired_wgs_rna_postqc_prioritise_wgs.tsv"
# structural variant VCF
vcf_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/filtered_merged_gs_svp_10728.vcf.gz"
# TPM expression matrix
tpm_mtx_path = wkdir_path / "1_rna_seq_qc/tpm_mtx/tpm_4568samples_59144genes_smpl_qc.csv"

# ---- deletion and duplication readthrough candidates ----
del_tx_read_vrnt_feat_path = wkdir_path / "6_misexp_dissect/tx_readthrough/deletions/misexp_del_tx_readthrough_candidates.tsv"
dup_tx_read_vrnt_feat_path = wkdir_path / "6_misexp_dissect/tx_readthrough/duplications/misexp_dup_tx_readthrough_candidates.tsv"

# ---- output ----
# created up front (with parents) so later cells can write into it
out_dir = wkdir_path / "6_misexp_dissect/tx_readthrough/combine"
out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
### Deletions
# Candidate deletions for transcriptional readthrough (one row per variant/carrier record)
del_tx_read_vrnt_feat_df = pd.read_csv(del_tx_read_vrnt_feat_path, sep="\t")
del_tx_read = del_tx_read_vrnt_feat_df["vrnt_id"].unique()
print(f"Number of DEL transcriptional readthrough candidates: {len(del_tx_read)}")

# Each variant has its own readthrough-region table:
#   <readthrough_region>/<vrnt_id>/<vrnt_id>_intergenic_cov.tsv
del_readthrough_region_dir = wkdir_path / "6_misexp_dissect/tx_readthrough/deletions/readthrough_region"
all_del_readthrough_region_df_list = []
for vrnt_id in del_tx_read:
    # RNA samples recorded as carrying this deletion
    is_this_vrnt = del_tx_read_vrnt_feat_df["vrnt_id"] == vrnt_id
    carriers = del_tx_read_vrnt_feat_df.loc[is_this_vrnt, "rna_id"].unique()
    # read count / coverage over the readthrough region for this variant
    vrnt_readthrough_region_path = del_readthrough_region_dir / f"{vrnt_id}/{vrnt_id}_intergenic_cov.tsv"
    vrnt_readthrough_region_df = pd.read_csv(vrnt_readthrough_region_path, sep="\t")
    # label each sample row as carrier ('DEL') or "Non-carrier"
    is_carrier = vrnt_readthrough_region_df["rna_id"].isin(carriers)
    vrnt_readthrough_region_df["carrier"] = np.where(is_carrier, 'DEL', "Non-carrier")
    all_del_readthrough_region_df_list.append(vrnt_readthrough_region_df)

all_del_readthrough_region_df = pd.concat(all_del_readthrough_region_df_list)

# Covariates: only the per-sample raw read depth is needed
# (rna_id_read_depth_df is reused by the duplications cell)
covariates_df = pd.read_csv(covariates_path, sep="\t")
rna_id_read_depth_df = covariates_df[["rna_id", "RawReadDepth"]]
# attach read depth to every region row (left join keeps all region rows)
all_del_readthrough_region_df = all_del_readthrough_region_df.merge(
    rna_id_read_depth_df,
    on="rna_id",
    how="left",
)

# FPKM = 1e9 * features / (total_len * RawReadDepth)
len_times_depth = all_del_readthrough_region_df["total_len"] * all_del_readthrough_region_df["RawReadDepth"]
all_del_readthrough_region_df["fpkm"] = 1000000000 * (all_del_readthrough_region_df["features"] / len_times_depth)

# z-score FPKM and FBNC (cov_fraction) within each vrnt_gene_id group
for feature in ["fpkm", "cov_fraction"]:
    feature_by_region = all_del_readthrough_region_df.groupby("vrnt_gene_id")[feature]
    region_mean = feature_by_region.transform("mean")
    region_std = feature_by_region.transform("std")
    all_del_readthrough_region_df[f"{feature}_zscore"] = (
        all_del_readthrough_region_df[feature] - region_mean
    ) / region_std
all_del_readthrough_region_df["SVTYPE"] = "DEL"

Number of DEL transcriptional readthrough candidates: 12


In [4]:
### Duplications
# Candidate duplications for transcriptional readthrough (one row per variant/carrier record)
dup_tx_read_vrnt_feat_df = pd.read_csv(dup_tx_read_vrnt_feat_path, sep="\t")
dup_tx_read = dup_tx_read_vrnt_feat_df["vrnt_id"].unique()
print(f"Number of DUP transcriptional readthrough candidates: {len(dup_tx_read)}")

# Each variant has its own readthrough-region table:
#   <readthrough_region>/<vrnt_id>/<vrnt_id>_intergenic_cov.tsv
dup_readthrough_region_dir = wkdir_path / "6_misexp_dissect/tx_readthrough/duplications/readthrough_region"
all_dup_readthrough_region_df_list = []
for vrnt_id in dup_tx_read:
    # RNA samples recorded as carrying this duplication
    is_this_vrnt = dup_tx_read_vrnt_feat_df["vrnt_id"] == vrnt_id
    carriers = dup_tx_read_vrnt_feat_df.loc[is_this_vrnt, "rna_id"].unique()
    # read count / coverage over the readthrough region for this variant
    vrnt_readthrough_region_path = dup_readthrough_region_dir / f"{vrnt_id}/{vrnt_id}_intergenic_cov.tsv"
    vrnt_readthrough_region_df = pd.read_csv(vrnt_readthrough_region_path, sep="\t")
    # label each sample row as carrier ("DUP") or "Non-carrier"
    is_carrier = vrnt_readthrough_region_df["rna_id"].isin(carriers)
    vrnt_readthrough_region_df["carrier"] = np.where(is_carrier, "DUP", "Non-carrier")
    all_dup_readthrough_region_df_list.append(vrnt_readthrough_region_df)

all_dup_readthrough_region_df = pd.concat(all_dup_readthrough_region_df_list)
# attach per-sample read depth (rna_id_read_depth_df comes from the deletions cell)
all_dup_readthrough_region_df = all_dup_readthrough_region_df.merge(
    rna_id_read_depth_df,
    on="rna_id",
    how="left",
)

# FPKM = 1e9 * features / (total_len * RawReadDepth)
len_times_depth = all_dup_readthrough_region_df["total_len"] * all_dup_readthrough_region_df["RawReadDepth"]
all_dup_readthrough_region_df["fpkm"] = 1000000000 * (all_dup_readthrough_region_df["features"] / len_times_depth)

# z-score FPKM and FBNC (cov_fraction) within each vrnt_gene_id group
for feature in ["fpkm", "cov_fraction"]:
    feature_by_region = all_dup_readthrough_region_df.groupby("vrnt_gene_id")[feature]
    region_mean = feature_by_region.transform("mean")
    region_std = feature_by_region.transform("std")
    all_dup_readthrough_region_df[f"{feature}_zscore"] = (
        all_dup_readthrough_region_df[feature] - region_mean
    ) / region_std
all_dup_readthrough_region_df["SVTYPE"] = "DUP"

Number of DUP transcriptional readthrough candidates: 5


In [5]:
# Combined set of transcriptional readthrough-associated variants (DUP and DEL candidates)
tx_read_vrnts = set(dup_tx_read).union(set(del_tx_read))
print(f"Total number of transcriptional readthrough candidate variants: {len(tx_read_vrnts)}")
tx_read_vrnts_path = out_dir.joinpath("tx_read_vrnts_list.txt")
with open(tx_read_vrnts_path, "w") as f_out:
    # Set iteration order is not stable across kernel sessions (string hashing is
    # randomised), so write the IDs sorted to keep the output file reproducible.
    # key=str keeps the sort well-defined even if the IDs are not all strings.
    for vrnt_id in sorted(tx_read_vrnts, key=str):
        f_out.write(f"{vrnt_id}\n")

Total number of transcriptional readthrough candidate variants: 17


In [6]:
# Stack the DUP and DEL readthrough-region tables (DUP rows first; index is not reset)
all_vrnt_readthrough_region_df = pd.concat(
    [all_dup_readthrough_region_df, all_del_readthrough_region_df]
)
# Carrier rows only -> variant / gene / sample triples, written out for the mechanism count
is_carrier_row = all_vrnt_readthrough_region_df["carrier"] != "Non-carrier"
carrier_rows_df = all_vrnt_readthrough_region_df[is_carrier_row]
tx_read_vrnt_gene_smpl_df = carrier_rows_df[["vrnt_id", "gene_id", "rna_id"]]
tx_read_vrnt_gene_smpl_path = out_dir / "tx_read_vrnts_gene_smpl.tsv"
tx_read_vrnt_gene_smpl_df.to_csv(tx_read_vrnt_gene_smpl_path, sep="\t", index=False)

In [7]:
# Is the fraction of bases with non-zero coverage (FBNC, column cov_fraction)
# related to sequencing depth (RawReadDepth)?
fbnc = all_vrnt_readthrough_region_df["cov_fraction"]
read_depth = all_vrnt_readthrough_region_df["RawReadDepth"]
pearson_fbnc_depth = fbnc.corr(read_depth, method="pearson")
print(f"Pearson correlation FBNC and read depth: {pearson_fbnc_depth}")
spearman_fbnc_depth = fbnc.corr(read_depth, method="spearman")
print(f"Spearman correlation FBNC and read depth: {spearman_fbnc_depth}")
# no correlation, so FBNC is not corrected for read depth

Pearson correlation FBNC and read depth: 0.027865948602230295
Spearman correlation FBNC and read depth: 0.015193064166358943


In [10]:
### subset to WGS samples with SV calls
# RNA IDs passing QC: first column of a headerless file
rna_id_pass_qc_set = set(pd.read_csv(rna_id_pass_qc_path, sep="\t", header=None)[0])
# EGAN ID <-> RNA ID sample links
wgs_rna_paired_smpls_df = pd.read_csv(wgs_rna_paired_smpls_path, sep="\t")
egan_ids_with_rna = wgs_rna_paired_smpls_df[wgs_rna_paired_smpls_df.rna_id.isin(rna_id_pass_qc_set)].egan_id.tolist()
# Load the VCF and subset to EGAN IDs with RNA. Only the header's sample list is
# used (no records are fetched), so the file is opened in a context manager and
# closed as soon as the sample names have been collected.
print("Loading input VCF ...")
with pysam.VariantFile(vcf_path, mode="r") as vcf:
    print("VCF loaded.")
    print("Subset VCF to samples with RNA-seq ...")
    vcf_samples = list(vcf.header.samples)
    vcf_egan_ids_with_rna = set(egan_ids_with_rna).intersection(set(vcf_samples))
    vcf.subset_samples(vcf_egan_ids_with_rna)
    vcf_samples_with_rna = list(vcf.header.samples)
print(f"Number of samples in VCF with RNA ID and passing QC: {len(vcf_samples_with_rna)}")
# subset EGAN ID and RNA ID links to samples with SV calls and passing QC
wgs_rna_paired_smpls_with_sv_calls_df = wgs_rna_paired_smpls_df[wgs_rna_paired_smpls_df.egan_id.isin(vcf_samples_with_rna)]
# RNA IDs paired with a retained EGAN ID.
# NOTE(review): this keeps every rna_id linked to a retained egan_id and does not
# re-apply rna_id_pass_qc_set; confirm each egan_id maps to one QC-passing rna_id.
rna_id_pass_qc_sv_calls = wgs_rna_paired_smpls_with_sv_calls_df.rna_id.unique().tolist()
print(f"Number of RNA IDs passing QC: {len(rna_id_pass_qc_sv_calls)}")

Loading input VCF ...
VCF loaded.
Subset VCF to samples with RNA-seq ...
Number of samples in VCF with RNA ID and passing QC: 2640
Number of RNA IDs passing QC: 2640


In [11]:
# Keep only rows whose RNA sample is in rna_id_pass_qc_sv_calls (samples with SV calls)
has_sv_calls = all_vrnt_readthrough_region_df["rna_id"].isin(rna_id_pass_qc_sv_calls)
all_vrnt_readthrough_region_wgs_df = all_vrnt_readthrough_region_df[has_sv_calls]
# rows per retained RNA sample, reported as the estimated number of variants
n_region_rows = len(all_vrnt_readthrough_region_wgs_df)
num_vrnts = n_region_rows / len(rna_id_pass_qc_sv_calls)
print(f"Estimated number of variants: {num_vrnts}")

Estimated number of variants: 17.0


**Correlation between misexpression and FPKM/coverage z-score for carriers and non-carriers**

In [12]:
# Gene expression z-scores in flat (long) CSV form; merged on rna_id and gene_id downstream
misexp_tpm_zscore_path = wkdir_path / "1_rna_seq_qc/zscore_tpm_flat/tpm_zscore_4568smpls_8739genes_tpm0.1_frac_5.0perc_flat.csv"
misexp_tpm_zscore_df = pd.read_csv(misexp_tpm_zscore_path)  # comma-separated (read_csv default)

In [13]:
# Attach expression z-scores to each readthrough-region row; the inner join drops
# rows with no (rna_id, gene_id) match in the expression table
all_vrnt_readthrough_region_wgs_tpm_df = all_vrnt_readthrough_region_wgs_df.merge(
    misexp_tpm_zscore_df,
    how="inner",
    on=["rna_id", "gene_id"],
)

In [14]:
# Carrier rows only (carrier label is "DEL" or "DUP").
# The following cell re-derives this same frame before computing correlations.
is_sv_carrier = all_vrnt_readthrough_region_wgs_tpm_df["carrier"].isin(["DEL", "DUP"])
carrier_vrnt_readthrough_region_wgs_tpm_df = all_vrnt_readthrough_region_wgs_tpm_df.loc[is_sv_carrier]

In [15]:
# Carriers: Spearman correlation of readthrough-region signal with the expression z-score
is_sv_carrier = all_vrnt_readthrough_region_wgs_tpm_df["carrier"].isin(["DEL", "DUP"])
carrier_vrnt_readthrough_region_wgs_tpm_df = all_vrnt_readthrough_region_wgs_tpm_df.loc[is_sv_carrier]
carrier_misexp_zscores = carrier_vrnt_readthrough_region_wgs_tpm_df["z-score"].tolist()
carrier_fpkm_zscores = carrier_vrnt_readthrough_region_wgs_tpm_df["fpkm_zscore"].tolist()
carrier_cov_zscores = carrier_vrnt_readthrough_region_wgs_tpm_df["cov_fraction_zscore"].tolist()
# FPKM z-score vs expression z-score
spearman_fpkm_carrier, pval_fpkm_carrier = spearmanr(carrier_fpkm_zscores, carrier_misexp_zscores)
# coverage-fraction z-score vs expression z-score
spearman_cov_carrier, pval_cov_carrier = spearmanr(carrier_cov_zscores, carrier_misexp_zscores)
print(f"Correlation between coverage and misexpression level for carriers: {spearman_cov_carrier}, p-value: {pval_cov_carrier}")
print(f"Correlation between FPKM and misexpression level for carriers: {spearman_fpkm_carrier}, p-value: {pval_fpkm_carrier}")

Correlation between coverage and misexpression level for carriers: 0.741784060204143, p-value: 5.096828360863981e-05
Correlation between FPKM and misexpression level for carriers: 0.7190511709507181, p-value: 0.00011060602344143254


In [16]:
# Non-carriers: same correlations on every row whose carrier label is neither "DEL" nor "DUP"
is_sv_carrier = all_vrnt_readthrough_region_wgs_tpm_df["carrier"].isin(["DEL", "DUP"])
noncarrier_vrnt_readthrough_region_wgs_tpm_df = all_vrnt_readthrough_region_wgs_tpm_df.loc[~is_sv_carrier]
noncarrier_misexp_zscores = noncarrier_vrnt_readthrough_region_wgs_tpm_df["z-score"].tolist()
noncarrier_cov_zscores = noncarrier_vrnt_readthrough_region_wgs_tpm_df["cov_fraction_zscore"].tolist()
noncarrier_fpkm_zscores = noncarrier_vrnt_readthrough_region_wgs_tpm_df["fpkm_zscore"].tolist()
# coverage-fraction z-score vs expression z-score
spearman_cov_noncarrier, pval_cov_noncarrier = spearmanr(noncarrier_cov_zscores, noncarrier_misexp_zscores)
# FPKM z-score vs expression z-score
spearman_fpkm_noncarrier, pval_fpkm_noncarrier = spearmanr(noncarrier_fpkm_zscores, noncarrier_misexp_zscores)
print(f"Correlation between coverage and misexpression level for non-carriers: {spearman_cov_noncarrier}, p-value: {pval_cov_noncarrier}")
print(f"Correlation between FPKM and misexpression level for non-carriers: {spearman_fpkm_noncarrier}, p-value: {pval_fpkm_noncarrier}")

Correlation between coverage and misexpression level for non-carriers: 0.05058856493752987, p-value: 8.115974380058901e-27
Correlation between FPKM and misexpression level for non-carriers: 0.11708571679438892, p-value: 1.1347571294237164e-136


**Readthrough region length**

In [17]:
# One row per distinct (vrnt_id, total_len) pair, then summarise the region lengths
vrnt_len_cols = ["vrnt_id", "total_len"]
vrnt_tx_read_len_df = all_vrnt_readthrough_region_wgs_tpm_df[vrnt_len_cols].drop_duplicates()
readthrough_lengths = vrnt_tx_read_len_df["total_len"]
print(f"Median readthrough region length: {readthrough_lengths.median()}")
print(f"Max readthrough region length: {readthrough_lengths.max()}")

Median readthrough region length: 14905.0
Max readthrough region length: 102943


**Consequences of readthrough associated variants**

In [18]:
# load misexpression-associated variant features
misexp_vrnt_feat_df = pd.read_csv(misexp_vrnt_feat_path, sep="\t")

# Unique variant-gene pairs among the readthrough candidates.
# vrnt_id is cast to str via .assign in the same chain, rather than by attribute
# assignment on a frame derived from a slice (which can trigger pandas'
# SettingWithCopyWarning). Presumably the cast is there so the merge key dtype
# matches misexp_vrnt_feat_df.vrnt_id; confirm.
vrnt_gene_pairs_tx_read_df = (
    all_vrnt_readthrough_region_wgs_tpm_df[["vrnt_id", "gene_id"]]
    .drop_duplicates()
    .assign(vrnt_id=lambda pairs_df: pairs_df["vrnt_id"].astype(str))
)

# keep only feature rows for those variant-gene pairs
vrnts_tx_read_feat_df = pd.merge(misexp_vrnt_feat_df,
                                 vrnt_gene_pairs_tx_read_df,
                                 on=["vrnt_id", "gene_id"],
                                 how="inner")
# one row per variant / gene / consequence (gene_msc) / SV type, then count rows
# per (SVTYPE, gene_msc) combination
vrnts_tx_read_info_df = vrnts_tx_read_feat_df[["vrnt_id", "gene_id", "gene_msc", "SVTYPE"]].drop_duplicates()
vrnt_tx_read_count_consq_df = vrnts_tx_read_info_df.groupby(["SVTYPE", "gene_msc"], as_index=False).vrnt_id.count()
vrnt_tx_read_count_consq_df = vrnt_tx_read_count_consq_df.rename(columns={"vrnt_id": "vrnt_count"})

In [19]:
# Display labels for the consequence terms in gene_msc.
# Series.replace leaves any value that is not a key here unchanged.
consequence_names_dict = {
    "no_predicted_effect": "No predicted effect",
    "non_coding_transcript_exon_variant": "Non-coding transcript",
    "upstream_gene_variant": "Upstream (5 kb)",
    "transcript_amplification": "Transcript amplification",
}
consq_terms = vrnt_tx_read_count_consq_df["gene_msc"]
vrnt_tx_read_count_consq_df["consq_name"] = consq_terms.replace(consequence_names_dict)

In [20]:
# Write the per-SVTYPE / per-consequence variant counts as a tab-separated file
vrnt_tx_read_count_consq_path = out_dir / "del_dup_tx_readthrough_consq.tsv"
vrnt_tx_read_count_consq_df.to_csv(
    vrnt_tx_read_count_consq_path,
    sep="\t",
    index=False,
)

**Write results to file**

In [21]:
# Write the readthrough-region table merged with expression z-scores (tab-separated, no index)
all_vrnt_readthrough_region_wgs_tpm_path = out_dir / "del_dup_readthrough_region_wgs.tsv"
all_vrnt_readthrough_region_wgs_tpm_df.to_csv(
    all_vrnt_readthrough_region_wgs_tpm_path,
    sep="\t",
    index=False,
)