### Misexpression metrics 

* Number and proportion of misexpression events across different z-score cutoffs 
* Number and proportion of genes with at least one misexpression event across different cutoffs 
* Number and proportion of samples that have a misexpression event across different cutoffs
* Number of misexpression events in each z-score bin

In [1]:
from pathlib import Path
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Root directory; every input and output path below is resolved against it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/"
wkdir_path = Path(wkdir)

# Input directory, plus the sub-directory that receives the metrics written by this notebook.
input_dir = wkdir_path / "2_misexp_qc"
input_path = Path(input_dir)
output_dir_path = input_path / "misexp_metrics"
output_dir_path.mkdir(exist_ok=True, parents=True)

# Thresholds: TPM cutoff applied to every misexpression call, and the z-score cutoffs to sweep.
mixexp_tpm_cutoff = 0.5
zscore_cutoffs = [2, 10, 20, 30, 40]

In [3]:
# Derive the input/output locations from `input_dir` and make sure the output directory exists
# (safe to re-run: an existing directory is left untouched).
input_path = Path(input_dir)
output_dir_path = input_path / "misexp_metrics"
output_dir_path.mkdir(exist_ok=True, parents=True)

In [4]:
## load the flattened gene expression table
## (later cells read its gene_id, rna_id, TPM and z-score columns)
tpm_zscore_flat_path = input_path / "misexp_gene_cov_corr/tpm_zscore_4568smpls_8610genes_flat_misexp_corr_qc.csv"
tpm_zscore_flat_df = pd.read_csv(tpm_zscore_flat_path)

In [5]:
# all inactive genes passing QC: headerless tab-separated file, gene IDs in the first column
inactive_gene_id_pass_qc_path = wkdir_path / "2_misexp_qc/misexp_gene_cov_corr/gene_id_post_tech_cov_qc_8650.txt"
inactive_gene_id_table = pd.read_csv(inactive_gene_id_pass_qc_path, sep="\t", header=None)
inactive_gene_id_pass_qc = inactive_gene_id_table[0].tolist()
num_inactive_gene_id_pass_qc = len(inactive_gene_id_pass_qc)
print(f"Number of inactive genes passing QC: {num_inactive_gene_id_pass_qc}")

Number of inactive genes passing QC: 8650


In [6]:
# distinct genes and samples present in the expression table
genes_pass_qc = tpm_zscore_flat_df["gene_id"].unique()
smpl_pass_qc = tpm_zscore_flat_df["rna_id"].unique()
gene_number = len(genes_pass_qc)
smpl_number = len(smpl_pass_qc)
print(f"Number of genes in gene expression matrix: {gene_number}") # removed 40 with TPM = 0 across all samples
print(f"Number of samples in gene expression matrix: {smpl_number}")

# write inactive genes with z-scores to file, one gene ID per line
inactive_genes_path = output_dir_path / f"inactive_genes_{gene_number}.txt"
with open(inactive_genes_path, 'w') as f_out:
    f_out.writelines(f"{gene}\n" for gene in genes_pass_qc)

Number of genes in gene expression matrix: 8610
Number of samples in gene expression matrix: 4568


In [7]:
# total number of gene-sample pairs
# (uses the full list of inactive genes passing QC, not only the genes present in the matrix)
num_genes_all = len(inactive_gene_id_pass_qc)
num_smpls_all = len(smpl_pass_qc)
total_events = num_genes_all * num_smpls_all
print(f"Total number of gene-sample pairs: {total_events}")

# number of gene-sample pairs with a TPM > 0.1
tpm_above_threshold = tpm_zscore_flat_df["TPM"] > 0.1
total_events_grtr_tpm1 = int(tpm_above_threshold.sum())
print(f"Total number of gene-sample pairs with TPM > 0.1: {total_events_grtr_tpm1}")

Total number of gene-sample pairs: 39513200
Total number of gene-sample pairs with TPM > 0.1: 363951


In [25]:
### calculate misexpression metrics
#
# For every z-score cutoff this cell:
#   * selects gene-sample pairs with TPM > mixexp_tpm_cutoff and z-score > cutoff,
#   * writes the misexpressed and never-misexpressed gene lists to output_dir_path,
#   * computes sample-, event- and gene-level counts and percentages,
#   * builds a per-sample event count table (samples without events get 0).
# Results are collected in misexp_metrics_dict (one row per cutoff) and
# count_misexp_per_smpl_df_list (one per-sample table per cutoff).

misexp_metrics_dict = {}
count_misexp_per_smpl_df_list = []

for row, zscore in enumerate(zscore_cutoffs):
    print(f"Z-score cutoff: {zscore}")
    # identify misexpression events at z-score cutoff
    misexp_events_df = tpm_zscore_flat_df[(tpm_zscore_flat_df.TPM > mixexp_tpm_cutoff) &
                                          (tpm_zscore_flat_df["z-score"] > zscore)]

    total_misexp_events = misexp_events_df.shape[0]
    print(f"\tNumber of misexpression events: {total_misexp_events}")

    # misexpressed genes and non-misexpressed genes at z-score cutoff
    misexp_genes = misexp_events_df.gene_id.unique()
    # set built once per cutoff: membership tests against the NumPy array would scan it for every gene
    misexp_gene_set = set(misexp_genes)
    num_misexp_genes = len(misexp_genes)
    print(f"\tNumber of misexpressed genes: {num_misexp_genes}")
    misexp_gene_path = output_dir_path.joinpath(f"misexp_genes_tpm{mixexp_tpm_cutoff}_z{zscore}.txt")
    with open(misexp_gene_path, 'w') as misexp_genes_out:
        for gene_id in misexp_genes:
            misexp_genes_out.write(f"{gene_id}\n")

    # note using all 8650 inactive genes not just genes in matrix
    never_misexp_genes = [gene_id for gene_id in inactive_gene_id_pass_qc if gene_id not in misexp_gene_set]
    num_never_misexp_genes = len(never_misexp_genes)
    print(f"\tNumber of never misexpressed genes: {num_never_misexp_genes}")
    never_misexp_gene_path = output_dir_path.joinpath(f"never_misexp_genes_tpm{mixexp_tpm_cutoff}_z{zscore}.txt")
    with open(never_misexp_gene_path, 'w') as never_misexp_genes_out:
        for gene_id in never_misexp_genes:
            never_misexp_genes_out.write(f"{gene_id}\n")

    # percentage of samples with a misexpression event
    smpls_with_misexp = misexp_events_df.rna_id.unique()
    smpls_with_misexp_set = set(smpls_with_misexp)
    num_smpls_with_misexp = len(smpls_with_misexp)
    perc_smpls_with_misexp = (num_smpls_with_misexp/smpl_number) * 100
    print(f"\tNumber of samples with a misexpression event: {num_smpls_with_misexp}/{smpl_number}")
    print(f"\tPercentage of samples with misexpression event: {perc_smpls_with_misexp}")
    # percentage of gene-sample pairs misexpressed
    pc_gene_smpl_misexp = (total_misexp_events/total_events) * 100
    print(f"\tPercentage of genes-sample pairs misexpressed: {pc_gene_smpl_misexp}%")
    # percentage of genes with at least one misexpression event
    pc_genes_misexp = (num_misexp_genes/num_inactive_gene_id_pass_qc) * 100
    print(f"\tPercentage of genes with at least one misexpression event: {pc_genes_misexp}%")

    # median misexpression events per sample
    count_misexp_per_smpl_df = pd.DataFrame(misexp_events_df.groupby(by="rna_id")["gene_id"].count())
    count_misexp_per_smpl_df = count_misexp_per_smpl_df.rename(columns={"gene_id":"misexp_count"})
    # samples without any event are absent from the groupby result; add them with a count of 0
    rna_id_zero_count_dict = {rna_id:0 for rna_id in smpl_pass_qc if rna_id not in smpls_with_misexp_set}
    rna_id_zero_count_df = pd.DataFrame.from_dict(rna_id_zero_count_dict, orient="index")
    rna_id_zero_count_df = rna_id_zero_count_df.rename(columns={0:"misexp_count"})
    count_misexp_per_smpl_df = pd.concat([count_misexp_per_smpl_df, rna_id_zero_count_df])
    # sanity check: per-sample counts must add up to the number of events at this cutoff
    if count_misexp_per_smpl_df.misexp_count.sum() != total_misexp_events:
        raise ValueError("Sum of misexpression events per sample is not equal to total misexpression events.")
    median_misexp = count_misexp_per_smpl_df.misexp_count.median()
    count_misexp_per_smpl_df["zscore"] = f"> {zscore}"
    count_misexp_per_smpl_df_list.append(count_misexp_per_smpl_df)
    print(f"\tMedian number of misexpression events: {median_misexp}")

    # add information (order must match the column names used when the metrics table is built)
    misexp_metrics_dict[row] = [zscore, num_smpls_with_misexp, perc_smpls_with_misexp, total_misexp_events,
                                pc_gene_smpl_misexp, num_misexp_genes, pc_genes_misexp, median_misexp]

Z-score cutoff: 2
	Number of misexpression events: 28956
	Number of misexpressed genes: 4437
	Number of never misexpressed genes: 4213
	Number of samples with a misexpression event: 4386/4568
	Percentage of samples with misexpression event: 96.01576182136601
	Percentage of genes-sample pairs misexpressed: 0.0732818399927113%
	Percentage of genes with at least one misexpression event: 51.29479768786127%
	Median number of misexpression events: 4.0
Z-score cutoff: 10
	Number of misexpression events: 17461
	Number of misexpressed genes: 4437
	Number of never misexpressed genes: 4213
	Number of samples with a misexpression event: 3511/4568
	Percentage of samples with misexpression event: 76.86077057793345
	Percentage of genes-sample pairs misexpressed: 0.04419029590111659%
	Percentage of genes with at least one misexpression event: 51.29479768786127%
	Median number of misexpression events: 2.0
Z-score cutoff: 20
	Number of misexpression events: 7495
	Number of misexpressed genes: 3891
	Numb

In [26]:
# Column names, in the same order as the per-cutoff value lists stored in misexp_metrics_dict.
misexp_metrics_cols = [
    "zscore",
    "smpl_misexp",
    "smpl_misexp_perc",
    "gene_smpl_misexp",
    "gene_smpl_misexp_perc",
    "gene_misexp",
    "gene_misexp_perc",
    "median_misexp",
]
# One row per dictionary key (i.e. per z-score cutoff).
misexp_metrics_df = pd.DataFrame.from_dict(
    data=misexp_metrics_dict,
    orient="index",
    columns=misexp_metrics_cols,
)

In [27]:
# write the per-cutoff metrics table to a CSV file (row index omitted)
misexp_metrics_path = output_dir_path / "misexp_metrics.csv"
misexp_metrics_df.to_csv(path_or_buf=misexp_metrics_path, index=False)

### Number of misexpression events per z-score bin 

In [28]:
# Keep gene-sample pairs above the TPM cutoff with a z-score > 2; copy so that
# columns can be added later without touching the full table.
passes_tpm_cutoff = tpm_zscore_flat_df["TPM"] > mixexp_tpm_cutoff
passes_zscore_cutoff = tpm_zscore_flat_df["z-score"] > 2
misexp_tpm_zscore_df = tpm_zscore_flat_df.loc[passes_tpm_cutoff & passes_zscore_cutoff].copy()

In [29]:
# Unit-width z-score bins (lower, lower + 1], labelled "lower-upper", starting at 2.
# pd.cut assigns NaN to values outside the bin edges, which would silently drop
# those events from the per-bin counts. The upper edge therefore defaults to 68
# and is only extended when the data contain a larger z-score.
zscore_bin_lower = 2
zscore_bin_upper = 68
max_zscore = misexp_tpm_zscore_df["z-score"].max()
if pd.notna(max_zscore) and max_zscore > zscore_bin_upper:
    # ceiling of the maximum, computed with floor division to avoid a new import
    zscore_bin_upper = int(-(-max_zscore // 1))
bins = list(range(zscore_bin_lower, zscore_bin_upper + 1))
bin_names = [f"{lower_edge}-{lower_edge + 1}" for lower_edge in bins[:-1]]
misexp_tpm_zscore_df["z-score_bins"] = pd.cut(misexp_tpm_zscore_df["z-score"], bins=bins, labels=bin_names)

In [30]:
# Count misexpression events per z-score bin.
# "z-score_bins" is categorical; observed=False is passed explicitly so that empty
# bins stay in the table with a count of 0 regardless of the pandas default.
count_misexp_by_bin_df = (
    misexp_tpm_zscore_df
    .groupby(["z-score_bins"], as_index=False, observed=False)["gene_id"]
    .count()
    .rename(columns={"gene_id": "misexp_count"})
)

In [31]:
# write the per-bin event counts to a tab-separated file (row index omitted)
misexp_events_per_zscore_bin_path = output_dir_path / "misexp_events_per_zscore_bin.tsv"
count_misexp_by_bin_df.to_csv(
    path_or_buf=misexp_events_per_zscore_bin_path,
    sep="\t",
    index=False,
)

### Median misexpression events per sample (across z-score cutoffs)

In [32]:
# Stack the per-sample count tables of all cutoffs and turn the sample-ID index
# into a regular "rna_id" column.
count_misexp_per_smpl_all_cutoffs_df = (
    pd.concat(count_misexp_per_smpl_df_list)
    .reset_index()
    .rename(columns={"index": "rna_id"})
)
# write to file
# NOTE(review): the row index is written as an unnamed first column here, unlike the
# other exports in this notebook (index=False) - confirm downstream readers expect it.
count_misexp_per_smpl_all_cutoffs_path = output_dir_path / "misexp_events_per_smpl_zscore.tsv"
count_misexp_per_smpl_all_cutoffs_df.to_csv(path_or_buf=count_misexp_per_smpl_all_cutoffs_path, sep="\t")