### INTERVAL covariates 

* Covariates to be correlated with inactive gene expression 

Includes:
* Age
* Height
* Weight
* BMI
* Sex
* Sysmex cell counts
* Technical covariates
* Seasons
* Sequencing batch
* Genetic PCs
* Inferred cell enrichments (xCell)

Some covariates have missing values: `OD_260_230`, `Agilent_28S_18S`, `Agilent_Conc_ng_ul`, and `Agilent_Yield_ng`.

In [1]:
from pathlib import Path 
import pandas as pd

In [2]:
# Project working directory; the xCell input and the output folder are
# both resolved relative to it.
wkdir = "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3"
wkdir_path = Path(wkdir)

# Inputs: master covariates table (absolute path, kept as a plain string)
# and the xCell enrichment estimates stored under the working directory.
covariates_path = (
    "/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/phenotypes/rna_seq/processed_v97/covariates/master/master_covariates_v97_swapd_depth_fastq_rin_cell_sex_pcs_season_batch_fc_pipelines_updtd.tsv"
)
xcell_enrich_path = wkdir_path / "2_misexp_qc/xcell/xCell_estimates.tsv"

# Output: directory for the combined covariates table. Missing parents are
# created and an already-existing directory is not an error.
out_dir = wkdir_path / "2_misexp_qc/covariates"
out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
# Select the covariates of interest from the master covariates table.
#
# The table at `covariates_path` is read as tab-separated text, four groups of
# column names are declared, and the frame is subset to `rna_id` plus those
# columns. The order in which the groups (and the names inside each group) are
# listed is the column order of the resulting subset.

# Read the master covariates table (one `rna_id` column plus covariate columns).
covariates_df = pd.read_csv(covariates_path, sep="\t")

# Biological covariates.
# NOTE(review): "sex_0_1" is presumably sex encoded as 0/1 - confirm the coding.
bio_covariates_list = ['age_RNA', 'height', 'weight', 'BMI', "sex_0_1"]
# Sysmex blood-cell measurements.
# NOTE(review): the "_imptd" suffix presumably marks imputed values - confirm
# against the pipeline that produced the master table.
cell_types_covariates_list = ['BA_D_10_9_L___RNA_imptd', 'BA_D_PCT___RNA_imptd', 'BA_N_10_9_L___RNA_imptd', 
            'BA_N_PCT___RNA_imptd', 'BASO_10_9_L___RNA_imptd', 'BASO_PCT___RNA_imptd', 
            'Delta_He_pg___RNA_imptd', 'EO_10_9_L___RNA_imptd', 'EO_PCT___RNA_imptd', 
            'FRC_10_12_L___RNA_imptd', 'FRC_PCT___RNA_imptd', 'H_IPF___RNA_imptd', 
            'HCT_PCT___RNA_imptd', 'HFLC_10_9_L___RNA_imptd', 'HFLC_PCT___RNA_imptd', 
            'HFR_PCT___RNA_imptd', 'HGB_g_dL___RNA_imptd', 'HYPER_He_PCT___RNA_imptd', 
            'HYPO_He_PCT___RNA_imptd', 'IG_10_9_L___RNA_imptd', 'IG_PCT___RNA_imptd', 
            'IPF___RNA_imptd', 'IPFx_10_9_L___RNA_imptd', 'IRF_PCT___RNA_imptd', 
            'IRF_Y_ch___RNA_imptd', 'LFR_PCT___RNA_imptd', 'LY_WX___RNA_imptd', 
            'LY_WY___RNA_imptd', 'LY_WZ___RNA_imptd', 'LY_X_ch___RNA_imptd', 
            'LY_Y_ch___RNA_imptd', 'LY_Z_ch___RNA_imptd', 'LYMP_10_9_L___RNA_imptd', 
            'LYMP_PCT___RNA_imptd', 'LYMPH_10_9_L___RNA_imptd', 'LYMPH_PCT___RNA_imptd', 
            'MacroR_PCT___RNA_imptd', 'MCH_pg___RNA_imptd', 'MCHC_g_dL___RNA_imptd', 
            'MCV_fL___RNA_imptd', 'MFR_PCT___RNA_imptd', 'MicroR_PCT___RNA_imptd', 
            'MO_WX___RNA_imptd', 'MO_WY___RNA_imptd', 'MO_WZ___RNA_imptd', 'MO_X_ch___RNA_imptd', 
            'MO_Y_ch___RNA_imptd', 'MO_Z_ch___RNA_imptd', 'MONO_10_9_L___RNA_imptd', 
            'MONO_PCT___RNA_imptd', 'MPV_fL___RNA_imptd', 'NE_FSC_ch___RNA_imptd', 
            'NE_SFL_ch___RNA_imptd', 'NE_SSC_ch___RNA_imptd', 'NE_WX___RNA_imptd', 
            'NE_WY___RNA_imptd', 'NE_WZ___RNA_imptd', 'NEUT_10_9_L___RNA_imptd', 
            'NEUT_PCT___RNA_imptd', 'NEUTx_10_9_L___RNA_imptd', 'NEUTx_PCT___RNA_imptd', 
            'NRBC_10_9_L___RNA_imptd', 'NRBC_PCT___RNA_imptd', 'P_LCR_PCT___RNA_imptd', 
            'PCT_PCT___RNA_imptd', 'PDW_fL___RNA_imptd', 'PLT_10_9_L___RNA_imptd', 
            'PLT_F_10_9_L___RNA_imptd', 'PLT_I_10_9_L___RNA_imptd', 'PLT_O_10_9_L___RNA_imptd', 
            'RBC_10_12_L___RNA_imptd', 'RBC_He_pg___RNA_imptd', 'RBC_O_10_12_L___RNA_imptd', 
            'RDW_CV_PCT___RNA_imptd', 'RDW_SD_fL___RNA_imptd', 'RET_10_6_uL___RNA_imptd', 
            'RET_He_pg___RNA_imptd', 'RET_PCT___RNA_imptd', 'RET_RBC_Y_ch___RNA_imptd', 
            'RET_TNC___RNA_imptd', 'RET_UPP___RNA_imptd', 'RET_Y_ch___RNA_imptd', 'RPI___RNA_imptd', 
            'TNC_10_9_L___RNA_imptd', 'TNC_D_10_9_L___RNA_imptd', 'TNC_N_10_9_L___RNA_imptd', 
            'WBC_10_9_L___RNA_imptd', 'WBC_D_10_9_L___RNA_imptd', 'WBC_N_10_9_L___RNA_imptd']
            
# Technical covariates (sample QC, alignment and read-depth metrics).
# The notebook header notes that OD_260_230, Agilent_28S_18S,
# Agilent_Conc_ng_ul and Agilent_Yield_ng contain missing values.
tech_covariates_list = ['Conc_ng_ul', 'OD_260_280','OD_260_230','Yield_ng','Agilent_28S_18S',
                        'Agilent_Conc_ng_ul', 'Agilent_Yield_ng',
                        'Agilent_RINe_imptd_by_batch','Assigned', 'Unassigned_MultiMapping', 
                        'Unassigned_NoFeatures', 'Unassigned_Ambiguity','gc_percent_forward_read', 
                        'gc_percent_reverse_read', 'adapters_percent_forward_read', 
                        'adapters_percent_reverse_read', 'percent_mapped', 'percent_duplicate', 
                        'rna_exonic_rate', 'rna_rrna_rate', 'rna_globin_percent_tpm', 
                        'rna_mitochondrial_percent_tpm', 'num_reads', 'RawReadDepth',
                        'RawReadDepth_fromFastQFile']

# Remaining covariates: 20 genetic PCs, 4 season columns and 15 sequencing
# batch columns.
# NOTE(review): the Season_* / sequencingBatch_* columns look like one-hot
# indicator columns - confirm before treating them as continuous variables.
other_covariates = ['PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9','PC10','PC11','PC12',
                    'PC13','PC14','PC15','PC16','PC17','PC18','PC19','PC20','Season_Winter',
                    'Season_Autumn','Season_Spring', 'Season_Summer','sequencingBatch_10',
                    'sequencingBatch_4','sequencingBatch_15','sequencingBatch_3','sequencingBatch_5',
                    'sequencingBatch_8','sequencingBatch_2', 'sequencingBatch_6','sequencingBatch_14',
                    'sequencingBatch_11','sequencingBatch_1','sequencingBatch_12','sequencingBatch_9',
                    'sequencingBatch_7','sequencingBatch_13',]

# Full list of measured covariates, in group order.
covariate_list = bio_covariates_list + cell_types_covariates_list + tech_covariates_list + other_covariates
# Keep the sample identifier plus the selected covariates; selecting with a
# list of labels raises KeyError if any of them is absent from the table.
covariates_susbet_df = covariates_df[["rna_id"] + covariate_list]

In [4]:
# Report how many covariates each group contributes, followed by the total
# across all measured (non-xCell) covariates.
covariate_groups = (
    ("Number of biological covariates", bio_covariates_list),
    ("Number of Sysmex cell-types", cell_types_covariates_list),
    ("Number of technical covariates", tech_covariates_list),
    ("Number of other covariates", other_covariates),
    ("Total number of measured covariates", covariate_list),
)
for group_label, group_members in covariate_groups:
    print(f"{group_label}: {len(group_members)}")

Number of biological covariates: 5
Number of Sysmex cell-types: 89
Number of technical covariates: 25
Number of other covariates: 39
Total number of measured covariates: 158


In [5]:
# correlation with inferred cell enrichments (xCell)
xcell_enrich_df = pd.read_csv(xcell_enrich_path, sep="\t")
xcell_features = xcell_enrich_df.columns.tolist()
print(f"Number of xCell features: {len(xcell_features)}")
xcell_enrich_reidx_df = xcell_enrich_df.reset_index().rename(columns={"index":"rna_id"})

covariates_xcell_df = pd.merge(covariates_susbet_df, 
                                 xcell_enrich_reidx_df, 
                                 on="rna_id", 
                                 how="inner")

Number of xCell features: 67


In [6]:
# Overall covariate count: xCell features plus the measured covariates.
n_total_covariates = len(xcell_features) + len(covariate_list)
print(f"Total number of covariates: {n_total_covariates}")

Total number of covariates: 225


In [7]:
# Write the merged covariates + xCell table as a tab-separated file in the
# output directory; the positional row index is not written.
covariates_xcell_path = out_dir / "covariates_xcell.tsv"
covariates_xcell_df.to_csv(
    path_or_buf=covariates_xcell_path,
    sep="\t",
    index=False,
)