# Metabolite Correlation Statistics

In [38]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.multitest import multipletests

In [12]:
RESULTS_DIR = Path("../results/correlation_network_data/metabolites_only/")
NEWBORN_OUTCOME_LABELS = ["bpd_any", "rop_any", "nec_any", "ivh_any"]

outcome_corr = pd.read_csv(RESULTS_DIR / "outcome_correlation_rho.csv")
outcome_corr = outcome_corr.query("outcome in @NEWBORN_OUTCOME_LABELS")
edges = pd.read_csv(RESULTS_DIR / "edges.csv")

In [39]:
nb_save_prefix = "0.0.1-metabolite_corr_network_stats"

In [13]:
outcome_corr

Unnamed: 0,feature,rho,outcome
0,ALA,-0.029088,nec_any
1,ARG,-0.001509,nec_any
2,C02,-0.008012,nec_any
3,C03DC,0.020554,nec_any
4,C04,0.039219,nec_any
...,...,...,...
179,RL_A,-0.014097,ivh_any
180,RO_C,-0.118582,ivh_any
181,TYR,0.066066,ivh_any
182,VAL,0.022117,ivh_any


In [15]:
outcome_corr.outcome.unique()

array(['nec_any', 'rop_any', 'bpd_any', 'ivh_any'], dtype=object)

In [21]:
metabolite_data = pd.read_csv(
    "../data/processed/neonatal_conditions.csv").set_index("row_id")

meta = pd.read_csv(
    "../data/processed/metadata.csv", low_memory=False).set_index("row_id")

In [25]:
sorted(meta.gacat.unique())

['20_21',
 '22_23',
 '24_25',
 '26_27',
 '28_29',
 '30_31',
 '32_33',
 '34_35',
 '36',
 '37_38',
 '39_40',
 '41_42',
 '43_44']

In [26]:
cohort_preterm_ga = [
    "22_23",
    "24_25",
    "26_27",
    "28_29",
]
preterm_ids = meta.query("gacat in @cohort_preterm_ga").index
preterm_cohort_metab = metabolite_data.loc[preterm_ids]

In [27]:
preterm_cohort_metab

Unnamed: 0_level_0,ALA,ARG,C02,C03,C03DC,C04,C04DC,C05,C051,C05DC,...,RO_C,RV_F,SA,TYR,VAL,XLE,nec_any,rop_any,bpd_any,ivh_any
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,299,13,19.7,,0.06,0.69,,0.37,0.02,0.09,...,5.25,,,145.3,266.9,184.4,0,0,1,0
5,525,65,32.2,,0.11,0.60,,0.57,0.03,0.22,...,14.28,3.24846,0.70,158.5,370.0,450.1,0,1,0,0
6,130,21,22.5,,0.07,0.25,,0.24,0.02,0.12,...,6.54,,,73.7,98.4,146.1,0,1,0,0
11,109,8,11.0,,0.05,0.28,,0.27,0.02,0.05,...,4.37,,,43.1,89.7,63.1,0,1,0,1
13,233,22,21.1,,0.05,0.41,,0.32,0.04,0.12,...,3.27,,,90.2,171.0,173.0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41396,218,41,27.4,,0.09,0.41,,0.59,0.02,0.16,...,10.84,,,106.8,203.0,195.2,0,0,0,0
41397,380,11,31.2,,0.08,0.55,,0.62,0.11,0.08,...,10.97,,,99.2,250.3,264.2,0,0,0,0
41400,123,49,20.3,,0.07,0.53,,0.23,0.07,0.19,...,9.25,,,70.6,93.3,80.5,0,0,1,0
41405,199,21,26.8,,0.17,1.00,,0.41,0.03,0.16,...,14.03,,0.91,188.2,305.4,213.5,1,0,0,0


In [34]:
def calculate_empirical_pvals(metabolite_data, outcome_data, n_permutations=10000):
    """
    Calculate empirical P values using permutation of outcome labels
    
    Parameters:
    metabolite_data: DataFrame with metabolite features
    outcome_data: DataFrame with outcome labels
    n_permutations: Number of permutations
    
    Returns:
    DataFrame with empirical P values
    """
    
    results = []
    
    for outcome in NEWBORN_OUTCOME_LABELS:
        outcome_vector = outcome_data[outcome]
        
        for feature in metabolite_data.columns:
            if feature == 'row_id':  # Skip ID column
                continue
                
            feature_vector = metabolite_data[feature]
            
            # Calculate observed correlation
            obs_rho, _ = stats.spearmanr(feature_vector, outcome_vector)
            
            # Generate null distribution
            null_rhos = []
            for _ in range(n_permutations):
                # Permute outcome labels
                permuted_outcome = np.random.permutation(outcome_vector)
                null_rho, _ = stats.spearmanr(feature_vector, permuted_outcome)
                null_rhos.append(null_rho)
            
            null_rhos = np.array(null_rhos)
            
            # Calculate empirical P value (two-tailed)
            empirical_pval = np.sum(np.abs(null_rhos) >= np.abs(obs_rho)) / n_permutations
            
            results.append({
                'feature': feature,
                'outcome': outcome,
                'rho': obs_rho,
                'empirical_pval': empirical_pval
            })

    return pd.DataFrame(results)
    

In [46]:
# Runtime with 1000 permutations: 8m50s
outcome_corr_pval = calculate_empirical_pvals(
    preterm_cohort_metab.drop(columns=NEWBORN_OUTCOME_LABELS),
    preterm_cohort_metab.loc[:, NEWBORN_OUTCOME_LABELS],
    n_permutations=10000
)

KeyboardInterrupt: 

In [47]:
outcome_corr_pval.sort_values(["outcome", "empirical_pval"])

Unnamed: 0,feature,outcome,rho,empirical_pval,empirical_pval_fdr
0,ALA,bpd_any,-0.092580,0.000,0.000000
1,ARG,bpd_any,-0.052218,0.000,0.000000
3,C03,bpd_any,,0.000,0.000000
4,C03DC,bpd_any,0.030352,0.000,0.000000
5,C04,bpd_any,0.095460,0.000,0.000000
...,...,...,...,...,...
76,C14OH,rop_any,-0.005419,0.523,0.590051
70,C101,rop_any,0.004834,0.586,0.644600
79,C18,rop_any,0.003285,0.700,0.745024
89,OXP,rop_any,-0.002095,0.804,0.838294


In [50]:
outcome_corr_pval["empirical_pval_fdr"] = multipletests(
    outcome_corr_pval["empirical_pval"], method="fdr_bh"
)[1]
outcome_corr_pval.to_csv(
    f"./intermediate_output/{nb_save_prefix}_outcome_corr_empirical_pvals.csv",
    index=False)

In [None]:
signif_corr = outcome_corr_pval.query("empirical_pval_fdr < 0.05")
signif_corr

Unnamed: 0,feature,outcome,rho,empirical_pval,empirical_pval_fdr
0,ALA,bpd_any,-0.092580,0.0,0.0
1,ARG,bpd_any,-0.052218,0.0,0.0
3,C03,bpd_any,,0.0,0.0
4,C03DC,bpd_any,0.030352,0.0,0.0
5,C04,bpd_any,0.095460,0.0,0.0
...,...,...,...,...,...
213,RL_A,ivh_any,-0.029590,0.0,0.0
214,RO_C,ivh_any,-0.125349,0.0,0.0
215,RV_F,ivh_any,,0.0,0.0
216,SA,ivh_any,,0.0,0.0


In [45]:
signif_corr.query("outcome == 'ivh_any'")

Unnamed: 0,feature,outcome,rho,empirical_pval,empirical_pval_fdr
167,C02,ivh_any,-0.089397,0.0,0.0
168,C03,ivh_any,,0.0,0.0
169,C03DC,ivh_any,0.022405,0.009,0.014043
170,C04,ivh_any,0.099792,0.0,0.0
171,C04DC,ivh_any,,0.0,0.0
172,C05,ivh_any,0.050031,0.0,0.0
173,C051,ivh_any,0.017866,0.033,0.047763
174,C05DC,ivh_any,0.068326,0.0,0.0
178,C081,ivh_any,-0.086533,0.0,0.0
180,C101,ivh_any,-0.029337,0.001,0.001732
