# Correlation Disruption Commonalities
Determining the correlation pairs that perform the most similarly across outcomes.
Should also examine the outcome-specific correlation disruption pairs as well?

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from statsmodels.stats.multitest import multipletests
import seaborn as sns

In [None]:
nb_save_prefix = "0.0.3-metabolic_corr_disrupt_stats"

In [2]:
corr_df = pd.read_csv("../results/correlation_disruption_networks_edges.csv")

In [3]:
corr_df

Unnamed: 0,metabolite_from,metabolite_to,case_spearman_rho,case_pval,case_log10_pval,control_spearman_rho,control_pval,control_log10_pval,metabolite_pair_label,rho_diff,abs_rho_diff,outcome
0,ARG/ORN,OXP,0.100741,6.149821e-11,10.211138,-0.024086,1.097661e-01,0.959532,,0.076655,0.124827,BPD
1,C-10,ARG,0.114193,1.166276e-13,12.933199,-0.007928,5.986706e-01,0.222812,,0.106265,0.122121,BPD
2,C-12:1,ARG,0.076605,6.834579e-07,6.165288,-0.004699,7.550614e-01,0.122018,,0.071905,0.081304,BPD
3,C-16,C-5,0.005384,7.275618e-01,0.138130,-0.116305,8.762844e-15,14.057355,,-0.110921,0.121689,BPD
4,C-8/C-10,PHE,-0.002837,8.543761e-01,0.068351,0.053265,4.010653e-04,3.396785,,0.050429,0.056102,BPD
...,...,...,...,...,...,...,...,...,...,...,...,...
99,PHE/TYR,FC/(C-16 + C-18:1),-0.017385,1.887169e-01,0.724189,0.127542,1.690714e-17,16.771930,,0.110157,0.144927,ROP
100,TYR,C-18:1,-0.035964,6.529541e-03,2.185117,0.005272,7.263622e-01,0.138847,,-0.030692,0.041235,ROP
101,VAL,C14:1,0.010213,4.400463e-01,0.356502,-0.083516,2.741576e-08,7.562000,,-0.073303,0.093729,ROP
102,XLE,C-16,0.012015,3.637084e-01,0.439247,-0.098130,6.342991e-11,10.197706,,-0.086115,0.110145,ROP


In [22]:
# Append case and control numbers from metadata
outcome_labels = {
    'bpd_any': 'BPD',
    'ivh_any': 'IVH',
    'nec_any': 'NEC',
    'rop_any': 'ROP'
}

metab_outcomes = pd.read_csv("../data/processed/neonatal_conditions.csv").set_index("row_id")
meta = pd.read_csv("../data/processed/metadata.csv", low_memory=False)
meta = meta.set_index("row_id")
cohort_preterm_ga = [
    "22_23",
    "24_25",
    "26_27",
    "28_29",
]
preterm_ids = meta.query("gacat in @cohort_preterm_ga").index
preterm_cohort_meta = meta.loc[preterm_ids]
preterm_outcomes = metab_outcomes.loc[preterm_ids, list(outcome_labels.keys())]

In [25]:
preterm_outcomes

Unnamed: 0_level_0,bpd_any,ivh_any,nec_any,rop_any
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,0,0,0
5,0,0,0,1
6,0,0,0,1
11,0,1,0,1
13,0,0,0,1
...,...,...,...,...
41396,0,0,0,0
41397,0,0,0,0
41400,1,0,0,0
41405,0,0,1,0


In [42]:
preterm_counts = preterm_outcomes.melt(var_name="outcome", value_name="status").groupby(
    ["outcome", "status"]
).size().reset_index().pivot(
    index="outcome", columns="status", values=0
).rename(
    columns={0: "control_n", 1: "case_n"}
).rename(
    index=outcome_labels
).reset_index()
preterm_counts.columns.name = None
preterm_counts

Unnamed: 0,outcome,control_n,case_n
0,BPD,9347,4189
1,IVH,9276,4260
2,NEC,12097,1439
3,ROP,7818,5718


In [44]:
corr_df_counts = pd.merge(corr_df, preterm_counts, left_on="outcome", right_on="outcome", how="left")
corr_df_counts

Unnamed: 0,metabolite_from,metabolite_to,case_spearman_rho,case_pval,case_log10_pval,control_spearman_rho,control_pval,control_log10_pval,metabolite_pair_label,rho_diff,abs_rho_diff,outcome,metabolite_pair,control_n,case_n
0,ARG/ORN,OXP,0.100741,6.149821e-11,10.211138,-0.024086,1.097661e-01,0.959532,,0.076655,0.124827,BPD,ARG/ORN :: OXP,9347,4189
1,C-10,ARG,0.114193,1.166276e-13,12.933199,-0.007928,5.986706e-01,0.222812,,0.106265,0.122121,BPD,C-10 :: ARG,9347,4189
2,C-12:1,ARG,0.076605,6.834579e-07,6.165288,-0.004699,7.550614e-01,0.122018,,0.071905,0.081304,BPD,C-12:1 :: ARG,9347,4189
3,C-16,C-5,0.005384,7.275618e-01,0.138130,-0.116305,8.762844e-15,14.057355,,-0.110921,0.121689,BPD,C-16 :: C-5,9347,4189
4,C-8/C-10,PHE,-0.002837,8.543761e-01,0.068351,0.053265,4.010653e-04,3.396785,,0.050429,0.056102,BPD,C-8/C-10 :: PHE,9347,4189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,PHE/TYR,FC/(C-16 + C-18:1),-0.017385,1.887169e-01,0.724189,0.127542,1.690714e-17,16.771930,,0.110157,0.144927,ROP,PHE/TYR :: FC/(C-16 + C-18:1),7818,5718
100,TYR,C-18:1,-0.035964,6.529541e-03,2.185117,0.005272,7.263622e-01,0.138847,,-0.030692,0.041235,ROP,TYR :: C-18:1,7818,5718
101,VAL,C14:1,0.010213,4.400463e-01,0.356502,-0.083516,2.741576e-08,7.562000,,-0.073303,0.093729,ROP,VAL :: C14:1,7818,5718
102,XLE,C-16,0.012015,3.637084e-01,0.439247,-0.098130,6.342991e-11,10.197706,,-0.086115,0.110145,ROP,XLE :: C-16,7818,5718


In [45]:
corr_df_counts['metabolite_pair'] = corr_df_counts[['metabolite_from', 'metabolite_to']].agg(' :: '.join, axis=1)

In [46]:
corr_df_counts

Unnamed: 0,metabolite_from,metabolite_to,case_spearman_rho,case_pval,case_log10_pval,control_spearman_rho,control_pval,control_log10_pval,metabolite_pair_label,rho_diff,abs_rho_diff,outcome,metabolite_pair,control_n,case_n
0,ARG/ORN,OXP,0.100741,6.149821e-11,10.211138,-0.024086,1.097661e-01,0.959532,,0.076655,0.124827,BPD,ARG/ORN :: OXP,9347,4189
1,C-10,ARG,0.114193,1.166276e-13,12.933199,-0.007928,5.986706e-01,0.222812,,0.106265,0.122121,BPD,C-10 :: ARG,9347,4189
2,C-12:1,ARG,0.076605,6.834579e-07,6.165288,-0.004699,7.550614e-01,0.122018,,0.071905,0.081304,BPD,C-12:1 :: ARG,9347,4189
3,C-16,C-5,0.005384,7.275618e-01,0.138130,-0.116305,8.762844e-15,14.057355,,-0.110921,0.121689,BPD,C-16 :: C-5,9347,4189
4,C-8/C-10,PHE,-0.002837,8.543761e-01,0.068351,0.053265,4.010653e-04,3.396785,,0.050429,0.056102,BPD,C-8/C-10 :: PHE,9347,4189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,PHE/TYR,FC/(C-16 + C-18:1),-0.017385,1.887169e-01,0.724189,0.127542,1.690714e-17,16.771930,,0.110157,0.144927,ROP,PHE/TYR :: FC/(C-16 + C-18:1),7818,5718
100,TYR,C-18:1,-0.035964,6.529541e-03,2.185117,0.005272,7.263622e-01,0.138847,,-0.030692,0.041235,ROP,TYR :: C-18:1,7818,5718
101,VAL,C14:1,0.010213,4.400463e-01,0.356502,-0.083516,2.741576e-08,7.562000,,-0.073303,0.093729,ROP,VAL :: C14:1,7818,5718
102,XLE,C-16,0.012015,3.637084e-01,0.439247,-0.098130,6.342991e-11,10.197706,,-0.086115,0.110145,ROP,XLE :: C-16,7818,5718


In [50]:
# Fisher's Z Transformation Test for Reporting Correlation Difference P Values
def fisher_z_test(rho1, n1, rho2, n2):
    """
    Test if two correlations are significantly different using Fisher's Z transformation
    
    Parameters:
    rho1: Spearman rho for cases
    n1: Sample size for cases
    rho2: Spearman rho for controls
    n2: Sample size for controls
    
    Returns:
    P value for difference in correlations
    """
    # Fisher's Z transformation
    z1 = np.arctanh(rho1)
    z2 = np.arctanh(rho2)
    
    # Standard error of the difference
    se_diff = np.sqrt(1/(n1-3) + 1/(n2-3))
    
    # Z statistic
    z_stat = (z1 - z2) / se_diff
    
    # Two-tailed P value
    p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
    
    return p_value

corr_df_counts['disruption_pval'] = corr_df_counts.apply(
    lambda row: fisher_z_test(
        row['case_spearman_rho'], row['case_n'],
        row['control_spearman_rho'], row['control_n']
    ), axis=1
)

In [64]:

corr_df_counts['disruption_fdr'] = multipletests(corr_df_counts['disruption_pval'], method='fdr_bh')[1]
# Filter for significant disruptions
significant_disruptions = corr_df_counts[
    (corr_df_counts['disruption_fdr'] < 0.05) &
    (corr_df_counts['abs_rho_diff'] > 0.12)
]

In [62]:
corr_df_counts

Unnamed: 0,metabolite_from,metabolite_to,case_spearman_rho,case_pval,case_log10_pval,control_spearman_rho,control_pval,control_log10_pval,metabolite_pair_label,rho_diff,abs_rho_diff,outcome,metabolite_pair,control_n,case_n,disruption_pval,disruption_fdr
0,ARG/ORN,OXP,0.100741,6.149821e-11,10.211138,-0.024086,1.097661e-01,0.959532,,0.076655,0.124827,BPD,ARG/ORN :: OXP,9347,4189,1.693623e-11,9.785377e-11
1,C-10,ARG,0.114193,1.166276e-13,12.933199,-0.007928,5.986706e-01,0.222812,,0.106265,0.122121,BPD,C-10 :: ARG,9347,4189,4.310463e-11,1.867867e-10
2,C-12:1,ARG,0.076605,6.834579e-07,6.165288,-0.004699,7.550614e-01,0.122018,,0.071905,0.081304,BPD,C-12:1 :: ARG,9347,4189,1.189068e-05,1.766615e-05
3,C-16,C-5,0.005384,7.275618e-01,0.138130,-0.116305,8.762844e-15,14.057355,,-0.110921,0.121689,BPD,C-16 :: C-5,9347,4189,4.987033e-11,2.074606e-10
4,C-8/C-10,PHE,-0.002837,8.543761e-01,0.068351,0.053265,4.010653e-04,3.396785,,0.050429,0.056102,BPD,C-8/C-10 :: PHE,9347,4189,2.534726e-03,2.929017e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99,PHE/TYR,FC/(C-16 + C-18:1),-0.017385,1.887169e-01,0.724189,0.127542,1.690714e-17,16.771930,,0.110157,0.144927,ROP,PHE/TYR :: FC/(C-16 + C-18:1),7818,5718,0.000000e+00,0.000000e+00
100,TYR,C-18:1,-0.035964,6.529541e-03,2.185117,0.005272,7.263622e-01,0.138847,,-0.030692,0.041235,ROP,TYR :: C-18:1,7818,5718,1.778592e-02,1.926808e-02
101,VAL,C14:1,0.010213,4.400463e-01,0.356502,-0.083516,2.741576e-08,7.562000,,-0.073303,0.093729,ROP,VAL :: C14:1,7818,5718,6.799497e-08,1.571439e-07
102,XLE,C-16,0.012015,3.637084e-01,0.439247,-0.098130,6.342991e-11,10.197706,,-0.086115,0.110145,ROP,XLE :: C-16,7818,5718,2.202079e-10,7.633872e-10


In [65]:
significant_disruptions

Unnamed: 0,metabolite_from,metabolite_to,case_spearman_rho,case_pval,case_log10_pval,control_spearman_rho,control_pval,control_log10_pval,metabolite_pair_label,rho_diff,abs_rho_diff,outcome,metabolite_pair,control_n,case_n,disruption_pval,disruption_fdr
0,ARG/ORN,OXP,0.100741,6.149821e-11,10.211138,-0.024086,0.1097661,0.959532,,0.076655,0.124827,BPD,ARG/ORN :: OXP,9347,4189,1.693623e-11,9.785377e-11
1,C-10,ARG,0.114193,1.166276e-13,12.933199,-0.007928,0.5986706,0.222812,,0.106265,0.122121,BPD,C-10 :: ARG,9347,4189,4.310463e-11,1.867867e-10
3,C-16,C-5,0.005384,0.7275618,0.13813,-0.116305,8.762844e-15,14.057355,,-0.110921,0.121689,BPD,C-16 :: C-5,9347,4189,4.987033e-11,2.074606e-10
9,ORN,C-10,0.126851,1.56254e-16,15.806169,-0.009654,0.5215866,0.282674,,0.117197,0.136505,BPD,ORN :: C-10,9347,4189,1.625367e-13,1.126921e-12
13,OXP,C-5,0.125302,3.646447e-16,15.43813,-0.026064,0.0835127,1.078247,,0.099238,0.151366,BPD,OXP :: C-5,9347,4189,2.220446e-16,2.309264e-15
14,PHE,C-10,0.102213,3.217458e-11,10.492487,-0.020279,0.1781692,0.749167,,0.081934,0.122492,BPD,PHE :: C-10,9347,4189,3.963496e-11,1.828606e-10
18,PHE/TYR,C-4,-0.111129,5.206502e-13,12.283454,0.034702,0.02118795,1.673911,,-0.076427,0.145831,BPD,PHE/TYR :: C-4,9347,4189,3.552714e-15,2.639159e-14
19,PHE/TYR,C-5,-0.02643,0.08718869,1.05954,0.148436,3.166602e-23,22.499406,,0.122006,0.174866,BPD,PHE/TYR :: C-5,9347,4189,0.0,0.0
20,PHE/TYR,FC/(C-16 + C-18:1),-0.034909,0.02385041,1.622504,0.127542,1.690714e-17,16.77193,,0.092633,0.162452,BPD,PHE/TYR :: FC/(C-16 + C-18:1),9347,4189,0.0,0.0
21,PHE/TYR,PHE,-0.002828,0.8548048,0.068133,0.121362,5.653958e-16,15.247647,,0.118534,0.12419,BPD,PHE/TYR :: PHE,9347,4189,1.950484e-11,1.067633e-10


In [67]:
eval_df = corr_df_counts[['metabolite_pair', 'outcome', 'case_pval', 'control_pval', 'abs_rho_diff', 'disruption_fdr']].sort_values('metabolite_pair')
eval_df

Unnamed: 0,metabolite_pair,outcome,case_pval,control_pval,abs_rho_diff,disruption_fdr
49,ARG/ORN :: C-14,NEC,7.650927e-01,3.245395e-16,0.130245,4.772372e-06
50,ARG/ORN :: C-8:1,NEC,9.214281e-01,5.432993e-05,0.063327,2.478341e-02
51,ARG/ORN :: ORN,NEC,7.002675e-01,4.265489e-07,0.086184,2.341720e-03
83,ARG/ORN :: OXP,ROP,3.326055e-09,1.097661e-01,0.102165,1.126679e-08
52,ARG/ORN :: OXP,NEC,2.398443e-04,1.097661e-01,0.120707,2.043091e-05
...,...,...,...,...,...,...
23,XLE :: C-16,BPD,1.264236e-01,6.342991e-11,0.121748,2.104823e-10
48,XLE :: PHE/TYR,IVH,2.493070e-01,4.637352e-14,0.130775,8.196528e-12
82,XLE :: PHE/TYR,NEC,6.978556e-01,4.637352e-14,0.123364,1.435987e-05
24,XLE :: PHE/TYR,BPD,8.688624e-01,4.637352e-14,0.115671,1.358561e-09


In [68]:
counts_df = eval_df.groupby('metabolite_pair').count().sort_values(['outcome'], ascending=False)
top_counts = counts_df.query('outcome == 4')
counts_df

Unnamed: 0_level_0,outcome,case_pval,control_pval,abs_rho_diff,disruption_fdr
metabolite_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XLE :: PHE/TYR,4,4,4,4,4
PHE/TYR :: FC/(C-16 + C-18:1),4,4,4,4,4
ORN :: C-10,4,4,4,4,4
OXP :: C-5,4,4,4,4,4
PHE :: C-10,4,4,4,4,4
PHE/TYR :: ARG,4,4,4,4,4
PHE/TYR :: C-16,4,4,4,4,4
PHE/TYR :: C-4,4,4,4,4,4
CIT/ARG :: C-10,4,4,4,4,4
CIT :: C14:1,4,4,4,4,4


In [78]:
top_counts_df = eval_df[eval_df['metabolite_pair'].isin(top_counts.index)]
top_counts_df = top_counts_df.sort_values(['metabolite_pair', 'outcome'])
top_counts_df.to_csv(
    f"./intermediate_output/{nb_save_prefix}_common_metabolite_pairs.csv",
    index=False)
top_counts_df

Unnamed: 0,metabolite_pair,outcome,case_pval,control_pval,abs_rho_diff,disruption_fdr
0,ARG/ORN :: OXP,BPD,6.149821e-11,1.097661e-01,0.124827,9.785377e-11
25,ARG/ORN :: OXP,IVH,2.395023e-09,1.097661e-01,0.115308,1.358561e-09
52,ARG/ORN :: OXP,NEC,2.398443e-04,1.097661e-01,0.120707,2.043091e-05
83,ARG/ORN :: OXP,ROP,3.326055e-09,1.097661e-01,0.102165,1.126679e-08
1,C-10 :: ARG,BPD,1.166276e-13,5.986706e-01,0.122121,1.867867e-10
...,...,...,...,...,...,...
102,XLE :: C-16,ROP,3.637084e-01,6.342991e-11,0.110145,7.633872e-10
24,XLE :: PHE/TYR,BPD,8.688624e-01,4.637352e-14,0.115671,1.358561e-09
48,XLE :: PHE/TYR,IVH,2.493070e-01,4.637352e-14,0.130775,8.196528e-12
82,XLE :: PHE/TYR,NEC,6.978556e-01,4.637352e-14,0.123364,1.435987e-05


In [15]:
var_df = top_counts_df.groupby('metabolite_pair').var(numeric_only=True)
var_df.sort_values('abs_rho_diff')

Unnamed: 0_level_0,case_pval,control_pval,abs_rho_diff
metabolite_pair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
C-12:1 :: ARG,5.013419e-06,0.0,4.1e-05
XLE :: C-16,0.009952703,0.0,5e-05
XLE :: PHE/TYR,0.08439666,0.0,5.3e-05
PHE/TYR :: C-5,0.08813878,0.0,9.7e-05
ARG/ORN :: OXP,1.438109e-08,0.0,9.7e-05
PHE/TYR :: FC/(C-16 + C-18:1),0.1247768,0.0,0.000145
C-10 :: ARG,7.665134e-11,0.0,0.000174
CIT/ARG :: C-10,6.871349e-07,0.0,0.000215
PHE/TYR :: C-4,1.797083e-06,0.0,0.000231
PHE/TYR :: ARG,0.005930776,0.0,0.000314


# Analytic Approach for Outcome-Specific Disruptions

In [70]:
specific_counts = counts_df.query('outcome == 1')

In [74]:
specific_edges = eval_df[eval_df['metabolite_pair'].isin(specific_counts.index)]
specific_edges = specific_edges.sort_values(["outcome", "disruption_fdr"])
specific_edges

Unnamed: 0,metabolite_pair,outcome,case_pval,control_pval,abs_rho_diff,disruption_fdr
21,PHE/TYR :: PHE,BPD,0.854805,5.653958e-16,0.12419,1.067633e-10
8,LEU/ALA :: C-10,BPD,0.966242,2.524887e-10,0.095651,5.182731e-07
10,ORN :: C-14OH,BPD,0.405873,4.529318e-07,0.0887,3.190207e-06
6,CIT/ARG :: C14:1,BPD,0.716442,4.243133e-06,0.074782,7.171641e-05
12,ORN/CIT :: C-14OH,BPD,0.963127,8.266118e-06,0.06777,0.0003218464
4,C-8/C-10 :: PHE,BPD,0.854376,0.0004010653,0.056102,0.002929017
46,VAL :: C-14OH,IVH,0.81902,8.771005e-07,0.077445,3.759684e-05
32,FC/(C-16 + C-18:1) :: ORN,IVH,0.789873,1.117386e-06,0.077309,3.837252e-05
33,LEU/ALA :: C-18,IVH,0.740413,0.006633338,0.045948,0.01440907
55,C-12 :: ARG,NEC,0.055142,4.1700949999999996e-19,0.184312,1.62007e-10


In [77]:
specific_edges.to_csv(f"intermediate_output/{nb_save_prefix}_outcome_specific_edges.csv", index=False)