### Correlate misexpression events with biological and technical covariates 

In [1]:
import pandas as pd 
from scipy.stats import spearmanr
from pathlib import Path
from statsmodels.stats import multitest

In [2]:
# Input and output locations.
# Everything except the covariate table is resolved relative to the working directory.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# TPM matrix of inactive genes (tab-separated, has a gene_id column)
tpm_mtx_inactive_path = wkdir_path / "1_rna_seq_qc/tpm_mtx_inactive/tpm_4568samples_8779genes_inactive.tsv"
# flattened z-score table (comma-separated, has gene_id and rna_id columns)
zscore_tpm_flat_path = wkdir_path / "1_rna_seq_qc/zscore_tpm_flat/tpm_zscore_4568smpls_8739genes_tpm0.1_frac_5.0perc_flat.csv"
# master covariate table (tab-separated, keyed by rna_id); kept as a plain string
covariates_path = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/phenotypes/rna_seq/processed_v97/covariates/master/master_covariates_v97_swapd_depth_fastq_rin_cell_sex_pcs_season_batch_fc_pipelines_updtd.tsv"
# xCell enrichment estimates (tab-separated)
xcell_enrich_path = wkdir_path / "2_misexp_qc/xcell/xCell_estimates.tsv"
# directory that receives every file written by this notebook
out_dir = wkdir_path / "2_misexp_qc/misexp_gene_cov_corr"

In [3]:
# variables 
# Absolute Spearman rho a gene-covariate pair must exceed (in addition to passing
# the FDR test) for the gene to be flagged for removal further down.
spearman_rho_cutoff = 0.2

In [4]:
# Make sure the output directory exists before anything is written to it;
# missing parent directories are created and an existing directory is not an error.
out_dir_path = Path(out_dir)
out_dir_path.mkdir(exist_ok=True, parents=True)

In [5]:
# Load the inactive-gene TPM matrix and the covariate table (both tab-separated).
tpm_mtx_inactive_df = pd.read_csv(tpm_mtx_inactive_path, sep="\t")
covariates_df = pd.read_csv(covariates_path, sep="\t")

# Flip the matrix so that each gene_id becomes a column and each original
# column becomes a row.
tpm_mtx_inactive_tp_df = tpm_mtx_inactive_df.set_index("gene_id").T

# Drop genes whose value is zero in every row: only genes with at least one
# non-zero entry are kept and tested.
gene_has_nonzero = tpm_mtx_inactive_tp_df.ne(0).any(axis=0)
tpm_mtx_inactive_tp_rmv_all_zero_df = tpm_mtx_inactive_tp_df.loc[:, gene_has_nonzero]
gene_test_set = tpm_mtx_inactive_tp_rmv_all_zero_df.columns.unique()

# Turn the row labels into an explicit "rna_id" column used as the merge key later.
tpm_mtx_inactive_tp_clean_df = (
    tpm_mtx_inactive_tp_rmv_all_zero_df
    .reset_index()
    .rename(columns={"index": "rna_id"})
)

In [6]:
# Unique gene ids in the full inactive-gene matrix, i.e. before the all-zero
# genes were filtered out.
inactive_genes = tpm_mtx_inactive_df["gene_id"].unique()
print(f"Total number of inactive genes: {len(inactive_genes)}")

Total number of inactive genes: 8779


In [7]:
### correlation with covariates 

# covariates_df is reused from the data-loading cell, which already parsed
# covariates_path with the same arguments; reading the file a second time here
# was redundant work.

# Covariates are declared in four groups so that each group can be counted and
# reported separately; they are concatenated into covariate_list below.
bio_covariates_list = ['age_RNA', 'height', 'weight', 'BMI', "sex_0_1"]
cell_types_covariates_list = ['BA_D_10_9_L___RNA_imptd', 'BA_D_PCT___RNA_imptd', 'BA_N_10_9_L___RNA_imptd', 
            'BA_N_PCT___RNA_imptd', 'BASO_10_9_L___RNA_imptd', 'BASO_PCT___RNA_imptd', 
            'Delta_He_pg___RNA_imptd', 'EO_10_9_L___RNA_imptd', 'EO_PCT___RNA_imptd', 
            'FRC_10_12_L___RNA_imptd', 'FRC_PCT___RNA_imptd', 'H_IPF___RNA_imptd', 
            'HCT_PCT___RNA_imptd', 'HFLC_10_9_L___RNA_imptd', 'HFLC_PCT___RNA_imptd', 
            'HFR_PCT___RNA_imptd', 'HGB_g_dL___RNA_imptd', 'HYPER_He_PCT___RNA_imptd', 
            'HYPO_He_PCT___RNA_imptd', 'IG_10_9_L___RNA_imptd', 'IG_PCT___RNA_imptd', 
            'IPF___RNA_imptd', 'IPFx_10_9_L___RNA_imptd', 'IRF_PCT___RNA_imptd', 
            'IRF_Y_ch___RNA_imptd', 'LFR_PCT___RNA_imptd', 'LY_WX___RNA_imptd', 
            'LY_WY___RNA_imptd', 'LY_WZ___RNA_imptd', 'LY_X_ch___RNA_imptd', 
            'LY_Y_ch___RNA_imptd', 'LY_Z_ch___RNA_imptd', 'LYMP_10_9_L___RNA_imptd', 
            'LYMP_PCT___RNA_imptd', 'LYMPH_10_9_L___RNA_imptd', 'LYMPH_PCT___RNA_imptd', 
            'MacroR_PCT___RNA_imptd', 'MCH_pg___RNA_imptd', 'MCHC_g_dL___RNA_imptd', 
            'MCV_fL___RNA_imptd', 'MFR_PCT___RNA_imptd', 'MicroR_PCT___RNA_imptd', 
            'MO_WX___RNA_imptd', 'MO_WY___RNA_imptd', 'MO_WZ___RNA_imptd', 'MO_X_ch___RNA_imptd', 
            'MO_Y_ch___RNA_imptd', 'MO_Z_ch___RNA_imptd', 'MONO_10_9_L___RNA_imptd', 
            'MONO_PCT___RNA_imptd', 'MPV_fL___RNA_imptd', 'NE_FSC_ch___RNA_imptd', 
            'NE_SFL_ch___RNA_imptd', 'NE_SSC_ch___RNA_imptd', 'NE_WX___RNA_imptd', 
            'NE_WY___RNA_imptd', 'NE_WZ___RNA_imptd', 'NEUT_10_9_L___RNA_imptd', 
            'NEUT_PCT___RNA_imptd', 'NEUTx_10_9_L___RNA_imptd', 'NEUTx_PCT___RNA_imptd', 
            'NRBC_10_9_L___RNA_imptd', 'NRBC_PCT___RNA_imptd', 'P_LCR_PCT___RNA_imptd', 
            'PCT_PCT___RNA_imptd', 'PDW_fL___RNA_imptd', 'PLT_10_9_L___RNA_imptd', 
            'PLT_F_10_9_L___RNA_imptd', 'PLT_I_10_9_L___RNA_imptd', 'PLT_O_10_9_L___RNA_imptd', 
            'RBC_10_12_L___RNA_imptd', 'RBC_He_pg___RNA_imptd', 'RBC_O_10_12_L___RNA_imptd', 
            'RDW_CV_PCT___RNA_imptd', 'RDW_SD_fL___RNA_imptd', 'RET_10_6_uL___RNA_imptd', 
            'RET_He_pg___RNA_imptd', 'RET_PCT___RNA_imptd', 'RET_RBC_Y_ch___RNA_imptd', 
            'RET_TNC___RNA_imptd', 'RET_UPP___RNA_imptd', 'RET_Y_ch___RNA_imptd', 'RPI___RNA_imptd', 
            'TNC_10_9_L___RNA_imptd', 'TNC_D_10_9_L___RNA_imptd', 'TNC_N_10_9_L___RNA_imptd', 
            'WBC_10_9_L___RNA_imptd', 'WBC_D_10_9_L___RNA_imptd', 'WBC_N_10_9_L___RNA_imptd']
            
tech_covariates_list = ['Conc_ng_ul', 'OD_260_280','OD_260_230','Yield_ng','Agilent_28S_18S',
                        'Agilent_Conc_ng_ul', 'Agilent_Yield_ng',
                        'Agilent_RINe_imptd_by_batch','Assigned', 'Unassigned_MultiMapping', 
                        'Unassigned_NoFeatures', 'Unassigned_Ambiguity','gc_percent_forward_read', 
                        'gc_percent_reverse_read', 'adapters_percent_forward_read', 
                        'adapters_percent_reverse_read', 'percent_mapped', 'percent_duplicate', 
                        'rna_exonic_rate', 'rna_rrna_rate', 'rna_globin_percent_tpm', 
                        'rna_mitochondrial_percent_tpm', 'num_reads', 'RawReadDepth',
                        'RawReadDepth_fromFastQFile']

other_covariates = ['PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9','PC10','PC11','PC12',
                    'PC13','PC14','PC15','PC16','PC17','PC18','PC19','PC20','Season_Winter',
                    'Season_Autumn','Season_Spring', 'Season_Summer','sequencingBatch_10',
                    'sequencingBatch_4','sequencingBatch_15','sequencingBatch_3','sequencingBatch_5',
                    'sequencingBatch_8','sequencingBatch_2', 'sequencingBatch_6','sequencingBatch_14',
                    'sequencingBatch_11','sequencingBatch_1','sequencingBatch_12','sequencingBatch_9',
                    'sequencingBatch_7','sequencingBatch_13',]

# Full, ordered list of covariates to test; the correlation tables use this
# order for their columns.
covariate_list = bio_covariates_list + cell_types_covariates_list + tech_covariates_list + other_covariates

# Keep only the merge key plus the tested covariates, then attach them to the
# per-sample TPM values. The inner join keeps only rna_ids present in both tables.
covariates_susbet_df = covariates_df[["rna_id"] + covariate_list]
tpm_mtx_cov_merged_df = pd.merge(tpm_mtx_inactive_tp_clean_df, 
                                 covariates_susbet_df, 
                                 on="rna_id", 
                                 how="inner")

In [8]:
# Report how many covariates fall into each group and in total.
covariate_count_lines = [
    f"Number of biological covariates: {len(bio_covariates_list)}",
    f"Number of Sysmex cell-types: {len(cell_types_covariates_list)}",
    f"Number of technical covariates: {len(tech_covariates_list)}",
    f"Number of other covariates: {len(other_covariates)}",
    f"Total number of measured covariates: {len(covariate_list)}",
]
print("\n".join(covariate_count_lines))

Number of biological covariates: 5
Number of Sysmex cell-types: 89
Number of technical covariates: 25
Number of other covariates: 39
Total number of measured covariates: 158


In [9]:
# Spearman correlation of every tested gene against every covariate.
# Both dictionaries map gene_id -> list of values ordered like covariate_list.
covariate_arrays = [tpm_mtx_cov_merged_df[covariate].values for covariate in covariate_list]

cov_gene_id_corr_dict, cov_gene_id_pval_dict = {}, {}
for gene_id in gene_test_set:
    gene_expression = tpm_mtx_cov_merged_df[gene_id].values
    # one (rho, p-value) pair per covariate, in covariate_list order
    gene_results = [spearmanr(gene_expression, covariate_array) for covariate_array in covariate_arrays]
    cov_gene_id_corr_dict[gene_id] = [rho for rho, _pval in gene_results]
    cov_gene_id_pval_dict[gene_id] = [pval for _rho, pval in gene_results]

In [10]:
# Covariate rho values: wide table (one row per gene, one column per covariate),
# saved to disk and then reshaped to long format.
cov_gene_id_corr_df = pd.DataFrame.from_dict(
    cov_gene_id_corr_dict,
    orient="index",
    columns=covariate_list,
)
# move the gene ids out of the row labels into a "gene_id" column
cov_gene_id_corr_clean_df = cov_gene_id_corr_df.reset_index().rename(columns={"index": "gene_id"})

# persist the wide table
cov_gene_id_corr = out_dir_path / "misexp_corr_cov.csv"
cov_gene_id_corr_clean_df.to_csv(cov_gene_id_corr, index=False)

# long format: gene_id, variable (covariate name), spearman_rho
cov_gene_id_corr_flat_df = cov_gene_id_corr_clean_df.melt(id_vars="gene_id", value_name="spearman_rho")

In [11]:
# Covariate p-values: same layout as the rho table (one row per gene, one
# column per covariate), saved to disk and then reshaped to long format.
cov_gene_id_pval_df = pd.DataFrame.from_dict(
    cov_gene_id_pval_dict,
    orient="index",
    columns=covariate_list,
)
# move the gene ids out of the row labels into a "gene_id" column
cov_gene_id_pval_clean_df = cov_gene_id_pval_df.reset_index().rename(columns={"index": "gene_id"})

# persist the wide table
cov_gene_id_pval = out_dir_path / "misexp_pval_cov.csv"
cov_gene_id_pval_clean_df.to_csv(cov_gene_id_pval, index=False)

# long format: gene_id, variable (covariate name), pval
cov_gene_id_pval_flat_df = cov_gene_id_pval_clean_df.melt(id_vars="gene_id", value_name="pval")

In [33]:
# correlation with inferred cell enrichments (xCell)
xcell_enrich_df = pd.read_csv(xcell_enrich_path, sep="\t")

# Every column of the loaded table is treated as an xCell feature.
xcell_features = list(xcell_enrich_df.columns)
print(f"Number of xCell features: {len(xcell_features)}")

# The row labels are exposed as "rna_id" for merging.
# NOTE(review): this assumes read_csv placed the sample ids in the index
# (no index_col is passed) - confirm against the xCell file layout.
xcell_enrich_reidx_df = xcell_enrich_df.reset_index().rename(columns={"index": "rna_id"})

# Attach xCell estimates to the TPM values; only rna_ids present in both are kept.
tpm_mtx_xcell_merged_df = tpm_mtx_inactive_tp_clean_df.merge(
    xcell_enrich_reidx_df,
    how="inner",
    on="rna_id",
)
print(f"Total number of covariates: {len(xcell_features) + len(covariate_list)}")

Number of xCell features: 67
Total number of covariates: 225


In [13]:
# Spearman correlation of every tested gene against every xCell feature.
# Each dictionary maps a running row number to [gene_id, value_1, ..., value_n],
# with the values ordered like xcell_features.
xcell_feature_arrays = [tpm_mtx_xcell_merged_df[cell].values for cell in xcell_features]

xcell_gene_id_corr_dict, xcell_gene_id_pval_dict = {}, {}
for row_number, gene_id in enumerate(gene_test_set):
    gene_expression = tpm_mtx_xcell_merged_df[gene_id].values
    # one (rho, p-value) pair per xCell feature
    gene_results = [spearmanr(gene_expression, feature_array) for feature_array in xcell_feature_arrays]
    xcell_gene_id_corr_dict[row_number] = [gene_id] + [rho for rho, _pval in gene_results]
    xcell_gene_id_pval_dict[row_number] = [gene_id] + [pval for _rho, pval in gene_results]

In [14]:
# xCell rho values: wide table with a leading gene_id column followed by one
# column per xCell feature, saved to disk and then reshaped to long format.
xcell_rho_columns = ["gene_id", *xcell_features]
xcell_gene_id_corr_rho_df = pd.DataFrame.from_dict(
    xcell_gene_id_corr_dict,
    orient="index",
    columns=xcell_rho_columns,
)

# persist the wide table
xcell_gene_id_corr = out_dir_path / "misexp_corr_xcell.csv"
xcell_gene_id_corr_rho_df.to_csv(xcell_gene_id_corr, index=False)

# long format: gene_id, variable (xCell feature), spearman_rho
xcell_gene_id_corr_rho_flat_df = xcell_gene_id_corr_rho_df.melt(id_vars="gene_id", value_name="spearman_rho")

In [15]:
# xCell p-values: same layout as the xCell rho table, saved to disk and then
# reshaped to long format.
xcell_pval_columns = ["gene_id", *xcell_features]
xcell_gene_id_corr_p_df = pd.DataFrame.from_dict(
    xcell_gene_id_pval_dict,
    orient="index",
    columns=xcell_pval_columns,
)

# persist the wide table
xcell_gene_id_pval = out_dir_path / "misexp_pval_xcell.csv"
xcell_gene_id_corr_p_df.to_csv(xcell_gene_id_pval, index=False)

# long format: gene_id, variable (xCell feature), pval
xcell_gene_id_corr_p_flat_df = xcell_gene_id_corr_p_df.melt(id_vars="gene_id", value_name="pval")

In [16]:
### remove later - added to not rerun above 
# Shortcut for interactive work: reload the four wide tables written by the
# cells above from disk and rebuild their long-format versions, so the
# correlation loops do not have to be re-run in a fresh session.
cov_gene_id_corr = out_dir_path / "misexp_corr_cov.csv"
cov_gene_id_pval = out_dir_path / "misexp_pval_cov.csv"
xcell_gene_id_corr = out_dir_path / "misexp_corr_xcell.csv"
xcell_gene_id_pval = out_dir_path / "misexp_pval_xcell.csv"

cov_gene_id_corr_clean_df = pd.read_csv(cov_gene_id_corr)
cov_gene_id_pval_clean_df = pd.read_csv(cov_gene_id_pval)
xcell_gene_id_corr_rho_df = pd.read_csv(xcell_gene_id_corr)
xcell_gene_id_corr_p_df = pd.read_csv(xcell_gene_id_pval)

# long format: gene_id, variable, and either spearman_rho or pval
cov_gene_id_corr_flat_df = cov_gene_id_corr_clean_df.melt(id_vars="gene_id", value_name="spearman_rho")
cov_gene_id_pval_flat_df = cov_gene_id_pval_clean_df.melt(id_vars="gene_id", value_name="pval")
xcell_gene_id_corr_rho_flat_df = xcell_gene_id_corr_rho_df.melt(id_vars="gene_id", value_name="spearman_rho")
xcell_gene_id_corr_p_flat_df = xcell_gene_id_corr_p_df.melt(id_vars="gene_id", value_name="pval")

In [17]:
# Combine the covariate and xCell p-values into one long table and apply a
# single Benjamini-Hochberg correction (alpha = 0.05) across all tests.
# ignore_index gives every row a unique label; the two inputs both start at 0.
all_cov_gene_id_pval_df = pd.concat(
    [cov_gene_id_pval_flat_df, xcell_gene_id_corr_p_flat_df],
    ignore_index=True,
)

# spearmanr yields NaN when an input is constant or contains NaN. multipletests
# does not handle NaN, so undefined p-values are left out of the correction
# instead of being counted as tests; they never pass.
has_pval = all_cov_gene_id_pval_df["pval"].notna().to_numpy()
num_undefined_pval = int((~has_pval).sum())
if num_undefined_pval > 0:
    print(f"Excluded {num_undefined_pval} undefined p-values from multiple testing correction")

# multiple testing correction on the defined p-values only
pval_as_array = all_cov_gene_id_pval_df.loc[has_pval, "pval"].to_numpy()
all_cov_gene_id_pval_df["pass"] = False
all_cov_gene_id_pval_df["pval_adj"] = float("nan")
if pval_as_array.size > 0:
    pass_test, pval_adj, _, _ = multitest.multipletests(pval_as_array, alpha=0.05, method="fdr_bh")
    # write results back in the original row order
    all_cov_gene_id_pval_df.loc[has_pval, "pass"] = pass_test
    all_cov_gene_id_pval_df.loc[has_pval, "pval_adj"] = pval_adj

In [18]:
# multiple testing correction across all p-values 
# only remove genes with high correlation and p-value passing threshold 

In [19]:
# Stack the xCell and covariate rho values into one long table ...
all_cov_gene_id_corr_df = pd.concat([xcell_gene_id_corr_rho_flat_df, cov_gene_id_corr_flat_df])

# ... and attach the p-value columns of each (gene_id, variable) pair.
all_cov_gene_id_corr_p_df = all_cov_gene_id_corr_df.merge(
    all_cov_gene_id_pval_df,
    how="inner",
    on=["gene_id", "variable"],
)

In [20]:
# A gene is flagged when at least one of its correlations both exceeds the
# absolute rho cutoff and passes the multiple testing correction.
exceeds_rho_cutoff = all_cov_gene_id_corr_p_df["spearman_rho"].abs() > spearman_rho_cutoff
passes_correction = all_cov_gene_id_corr_p_df["pass"]
genes_to_remove = all_cov_gene_id_corr_p_df.loc[exceeds_rho_cutoff & passes_correction, "gene_id"].unique()
print(f"Number of genes to remove: {len(genes_to_remove)}")

Number of genes to remove: 129


In [21]:
# Write the flagged gene ids, one per line.
genes_corr_tech_covs_path = out_dir_path / "genes_corr_tech_covs.txt"
with open(genes_corr_tech_covs_path, 'w') as f_out:
    f_out.writelines(f"{flagged_gene_id}\n" for flagged_gene_id in genes_to_remove)

# Write the combined long-format rho table (xCell features and covariates).
# NOTE(review): this saves all_cov_gene_id_corr_df, which holds rho only. The
# merged table with p-values (all_cov_gene_id_corr_p_df) is not written
# anywhere in this notebook - confirm which table was intended here.
gene_id_corr_all_pval = out_dir_path / "misexp_corr_corr_all_covs.csv"
all_cov_gene_id_corr_df.to_csv(gene_id_corr_all_pval, index=False)

In [26]:
# read in flat z-score matrix 
# Comma-separated long table; the cells below use its rna_id and gene_id columns.
zscore_tpm_flat_df = pd.read_csv(zscore_tpm_flat_path)

In [34]:
# metrics - how many of the inactive genes are removed by the correlation filter
# number of distinct samples in the z-score table (used in the output file name)
number_smpls = zscore_tpm_flat_df["rna_id"].unique().size

# NOTE: despite its name, number_genes is the set of inactive gene ids, not a count.
number_genes = set(inactive_genes)
print(f"Total number of genes: {len(number_genes)}")

# inactive genes that were not flagged as correlated
genes_remaining = number_genes.difference(set(genes_to_remove))
number_genes_remaining = len(genes_remaining)
print(f"Number of genes remaining: {number_genes_remaining}")
print(f"Percentage of genes removed: {(len(genes_to_remove)/len(number_genes))*100}")

Total number of genes: 8779
Number of genes remaining: 8650
Percentage of genes removed: 1.4694156509853058


In [29]:
# Write the gene ids that survive the correlation filter, one per line.
# genes_remaining is a set, whose iteration order for strings can change from
# one interpreter run to the next; sorting makes the file reproducible.
genes_remaining_path = out_dir_path.joinpath(f"gene_id_post_tech_cov_qc_{number_genes_remaining}.txt")
with open(genes_remaining_path, 'w') as f_out: 
    for gene_id in sorted(genes_remaining): 
        f_out.write(f"{gene_id}\n")

In [30]:
# Drop every row of the z-score table that belongs to a flagged gene.
is_flagged_gene = zscore_tpm_flat_df["gene_id"].isin(genes_to_remove)
zscore_tpm_flat_rmvd_genes_df = zscore_tpm_flat_df.loc[~is_flagged_gene]

In [31]:
# Count the distinct genes left in the filtered z-score table.
gene_id_z_tpm_flat_remaining = zscore_tpm_flat_rmvd_genes_df["gene_id"].unique()
num_gene_id_z_tpm_flat_remaining = len(gene_id_z_tpm_flat_remaining)
print(f"Number of genes remaining in matrix: {num_gene_id_z_tpm_flat_remaining}")
# This count differs from the number of remaining inactive genes reported
# earlier because 40 genes with TPM=0 across all samples were removed before
# z-scores were calculated.

Number of genes remaining in matrix: 8610


In [32]:
# write remaining genes to file 
xcell_gene_id_corr_pval = out_dir_path.joinpath(f"tpm_zscore_{number_smpls}smpls_{num_gene_id_z_tpm_flat_remaining}genes_flat_misexp_corr_qc.csv")
zscore_tpm_flat_rmvd_genes_df.to_csv(xcell_gene_id_corr_pval, index=False)