### Remove genes correlated with technical or biological covariates 

In [1]:
import pandas as pd 
from scipy.stats import spearmanr
from pathlib import Path
from statsmodels.stats import multitest

In [2]:
# Root directory of the misexpression analysis; every path below is derived from it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# Input locations, all relative to the working directory.
cov_corr_dir = wkdir_path / "2_misexp_qc/misexp_cov_corr"
inactive_genes_path = wkdir_path / "1_rna_seq_qc/gene_sets/inactive_genes_8779.txt"
zscore_tpm_flat_path = wkdir_path / "1_rna_seq_qc/zscore_tpm_flat/tpm_zscore_4568smpls_8739genes_tpm0.1_frac_5.0perc_flat.csv"

# Output directory; created together with any missing parents, no error if it already exists.
out_dir = wkdir_path / "2_misexp_qc/misexp_gene_cov_corr"
out_dir_path = Path(out_dir)
out_dir_path.mkdir(parents=True, exist_ok=True)

# Absolute Spearman rho threshold applied to gene-covariate correlations further down.
spearman_rho_cutoff = 0.2

In [3]:
# Load the inactive gene IDs (tab-separated, no header; IDs are taken from the first column)
# and report how many distinct IDs there are.
inactive_genes = pd.read_csv(inactive_genes_path, sep="\t", header=None)[0].tolist()
num_inactive_genes = len(set(inactive_genes))
print(f"Number of inactive genes: {num_inactive_genes}")

# Load the gene-covariate correlation table. The path is derived from the configured
# `cov_corr_dir` so the input directory is defined in exactly one place.
misexp_cov_corr_path = cov_corr_dir.joinpath("interval_gene_cov_corr.tsv")
misexp_cov_corr_df = pd.read_csv(misexp_cov_corr_path, sep="\t")

Number of inactive genes: 8779


In [4]:
# Summarise the correlation table: distinct genes and distinct covariates it contains.
num_genes_with_corr = misexp_cov_corr_df["gene_id"].unique().size
num_covs_with_corr = misexp_cov_corr_df["covariate"].unique().size
print(f"Number of genes with correlations: {num_genes_with_corr}")
print(f"Number of covariates with correlations: {num_covs_with_corr}")

Number of genes with correlations: 8779
Number of covariates with correlations: 225


In [5]:
# Multiple testing correction, restricted to rows with a defined Spearman correlation.
has_spearman = misexp_cov_corr_df["spearman"].notna()
misexp_cov_corr_nonan_df = misexp_cov_corr_df[has_spearman].copy()
pval_as_array = misexp_cov_corr_nonan_df["pval"].to_numpy()
for method in ("fdr_bh", "bonferroni"):
    # Only the first two returned items are used: the pass/fail flags at alpha=0.05
    # and the adjusted p-values.
    pass_test, pval_adj = multitest.multipletests(pval_as_array, alpha=0.05, method=method)[:2]
    misexp_cov_corr_nonan_df[f"{method}_pass"] = pass_test
    misexp_cov_corr_nonan_df[f"{method}_pval_adj"] = pval_adj

In [6]:
# Flag gene-covariate pairs whose |Spearman rho| exceeds the cutoff AND that pass the
# Benjamini-Hochberg FDR test.
exceeds_rho_cutoff = misexp_cov_corr_nonan_df["spearman"].abs() > spearman_rho_cutoff
passes_fdr_bh = misexp_cov_corr_nonan_df["fdr_bh_pass"]
misexp_cov_corr_nonan_fail_df = misexp_cov_corr_nonan_df[exceeds_rho_cutoff & passes_fdr_bh]
# A gene fails QC as soon as any one of its covariate correlations is flagged.
gene_id_fail_cutoff = misexp_cov_corr_nonan_fail_df["gene_id"].unique()
print(f"Number of genes failing QC: {len(gene_id_fail_cutoff)}")

Number of genes failing QC: 129


In [7]:
# Write the IDs of the genes failing QC to a text file, one ID per line.
genes_corr_tech_covs_path = out_dir_path / "genes_corr_tech_covs.txt"
with genes_corr_tech_covs_path.open("w") as f_out:
    f_out.writelines(f"{gene_id}\n" for gene_id in gene_id_fail_cutoff)

In [8]:
# Load the flat z-score/TPM table and drop every row belonging to a gene that failed
# the covariate-correlation QC.
zscore_tpm_flat_df = pd.read_csv(zscore_tpm_flat_path, sep=",")
is_failing_gene = zscore_tpm_flat_df["gene_id"].isin(gene_id_fail_cutoff)
zscore_tpm_flat_rmvd_genes_df = zscore_tpm_flat_df[~is_failing_gene]

In [9]:
# Count the distinct genes and samples left in the filtered matrix.
num_gene_id_z_tpm_flat_remaining = zscore_tpm_flat_rmvd_genes_df["gene_id"].nunique()
number_smpls = zscore_tpm_flat_rmvd_genes_df["rna_id"].nunique()
# The gene count is lower than the number of inactive genes passing QC (inactive genes
# minus failing genes) because 40 genes with TPM=0 across all samples were removed
# in order to calculate z-scores.
print(f"Number of genes remaining in matrix: {num_gene_id_z_tpm_flat_remaining}")
print(f"Number of samples remaining in matrix: {number_smpls}")

Number of genes remaining in matrix: 8610
Number of samples remaining in matrix: 4568


In [12]:
# Write the inactive genes that pass the covariate-correlation QC to file, one ID per line.
inactive_genes_pass_qc = set(inactive_genes) - set(gene_id_fail_cutoff)
print(f"Number of inactive genes remaining: {len(inactive_genes_pass_qc)}")

# Iterating a set of strings yields an order that can change between kernel sessions,
# so write the genes in the order they appear in the input list (de-duplicated) to keep
# the output file reproducible.
inactive_genes_pass_qc_ordered = [
    gene_id for gene_id in dict.fromkeys(inactive_genes) if gene_id in inactive_genes_pass_qc
]

# Derive the count in the file name from the data instead of hard-coding it, and reuse
# the configured output directory rather than repeating its path.
inactive_genes_pass_qc_path = out_dir_path.joinpath(
    f"gene_id_post_tech_cov_qc_{len(inactive_genes_pass_qc)}.txt"
)
with open(inactive_genes_pass_qc_path, "w") as f_out:
    f_out.writelines(f"{gene_id}\n" for gene_id in inactive_genes_pass_qc_ordered)

Number of inactive genes remaining: 8650


In [10]:
# Save the filtered z-score matrix as CSV without the index; the file name encodes the
# number of samples and genes remaining.
xcell_gene_id_corr_pval = out_dir_path / f"tpm_zscore_{number_smpls}smpls_{num_gene_id_z_tpm_flat_remaining}genes_flat_misexp_corr_qc.csv"
zscore_tpm_flat_rmvd_genes_df.to_csv(xcell_gene_id_corr_pval, index=False)