# 1. Import Libraries and Load Data

In [31]:
import numpy as np
import pandas as pd

# Read the pairwise correlation matrix from disk.
# Column 0 of the CSV becomes the row index (assumed to hold the metabolite names).
df = pd.read_csv(
    filepath_or_buffer="data/cleaned_correlation_matrix.csv",
    index_col=0,
)

# Report the dimensions, then preview the top rows as a loading sanity check
print(f"Matrix Shape: {df.shape}")
df.head()

Matrix Shape: (76, 76)


Unnamed: 0,creatinine,ethanol,trimethylaminenoxide,alanine,creatine,glutamic acid,glutamine,glycine,histidine,isoleucine,...,Inosine,Butyrylcarnitine,7-Methylguanine,N1-methyl-2-pyridone-5-carboxamide,N1-methyl-4-pyridone-3-carboxamide,Cortisol,N4-Acetylcytidine,1-Methyl-2-piperidinecarboxylate,Tetradecanoylcarnitine,Tetradecadienoylcarnitine
creatinine,1.0,0.07814,0.03168,0.14584,-0.37902,0.07273,0.21373,0.04715,-0.10091,0.16251,...,0.01048,0.17717,0.16461,0.13251,0.14086,-0.06632,0.10905,0.10998,-0.00485,-0.00593
ethanol,0.07814,1.0,0.06636,0.08009,0.03469,0.19316,-0.21366,0.01039,0.08311,0.17847,...,-0.10151,-0.07398,0.0733,-0.04453,-0.01756,-0.22178,-0.12426,-0.06994,0.01798,0.00486
trimethylaminenoxide,0.03168,0.06636,1.0,0.01601,0.05544,0.0238,-0.01576,0.00279,0.00994,0.07539,...,-0.09184,0.08325,0.01668,-0.0074,-0.02001,-0.07739,0.00742,-0.00403,-0.02292,-0.07747
alanine,0.14584,0.08009,0.01601,1.0,0.06048,0.17056,0.17464,0.2696,0.26182,0.41513,...,-0.00564,0.00111,0.0984,0.01961,0.04237,-0.05606,0.00366,-0.03541,-0.1787,-0.29331
creatine,-0.37902,0.03469,0.05544,0.06048,1.0,0.00776,-0.15962,0.12851,0.15773,0.10401,...,0.00079,0.02455,-0.10936,0.20354,0.17871,0.03309,-0.0046,-0.00684,-0.02481,-0.13354


# 2. Extract Non-Redundant Correlation Pairs

In [32]:
# Boolean mask selecting the strict lower triangle (k=-1 leaves out the diagonal),
# so each unordered pair is kept exactly once and self-pairs are dropped.
mask = np.tril(np.ones(df.shape), k=-1).astype(bool)

# Blank out everything outside the mask (-> NaN), then reshape to long format.
# The explicit dropna() discards the masked-out cells regardless of pandas version:
# the legacy stack() removed NaN implicitly, whereas the newer implementation
# (future_stack=True) keeps them, which would inflate the table to every cell of
# the matrix.
df_pairs = df.where(mask).stack().dropna().reset_index()

# Rename columns for clarity
df_pairs.columns = ["Metabolite_1", "Metabolite_2", "Correlation"]

print("Pairs extracted successfully.")

Pairs extracted successfully.


In [4]:
# Inspect the long-format pair table (one row per metabolite pair).
# NOTE(review): this cell's execution count is out of sequence with the cells
# around it, so the rendered output may be stale - re-run after the extraction cell.
df_pairs

Unnamed: 0,Metabolite_1,Metabolite_2,Correlation
0,Ethanol,Creatinine,0.07814
1,Trimethylaminenoxide,Creatinine,0.03168
2,Trimethylaminenoxide,Ethanol,0.06636
3,Alanine,Creatinine,0.14584
4,Alanine,Ethanol,0.08009
...,...,...,...
2845,Tetradecadienoylcarnitine,N1-methyl-4-pyridone-3-carboxamide,0.08971
2846,Tetradecadienoylcarnitine,Cortisol,0.13880
2847,Tetradecadienoylcarnitine,N4-Acetylcytidine,0.16579
2848,Tetradecadienoylcarnitine,1-Methyl-2-piperidinecarboxylate,0.02331


# 3. Filter for High Correlations (0.7 ≤ |r| < 1.0)

In [33]:
# Minimum absolute correlation for a pair to count as "highly correlated"
threshold = 0.7

# Compute |r| once; it drives the filter for both positive and negative correlations
abs_corr = df_pairs["Correlation"].abs()

# Keep pairs with threshold <= |r| < 1.0.
# NOTE(review): the upper bound also removes any distinct pair whose |r| is exactly
# 1.0 (e.g. after rounding) - confirm that this is intended.
high_corr = df_pairs[(abs_corr >= threshold) & (abs_corr < 1.0)].copy()

# Sort by strongest correlation, i.e. by |r| rather than by the signed value,
# so that strong negative correlations are not pushed to the bottom of the table.
high_corr = high_corr.sort_values(
    by="Correlation", key=lambda col: col.abs(), ascending=False
)

# Reset index for the final table
high_corr = high_corr.reset_index(drop=True)

print(f"Found {len(high_corr)} highly correlated pairs.")

Found 19 highly correlated pairs.


# 4. Display Results

In [34]:
# Report how many pairs passed the filter: each row of high_corr is one
# (Metabolite_1, Metabolite_2) pair, so the count is of pairs, not metabolites.
print(
    f"Total number of highly correlated metabolite pairs (|r| >= {threshold}): {len(high_corr)}"
)

# Show the table of pairs, strongest first
high_corr

Total number of highly correlated metabolites (|r| >= 0.7): 19


Unnamed: 0,Metabolite_1,Metabolite_2,Correlation
0,Inosine,Hypoxanthine,0.99999
1,1-methylhistidine,3-methylhistidine,0.99789
2,N1-methyl-4-pyridone-3-carboxamide,N1-methyl-2-pyridone-5-carboxamide,0.93884
3,Tetradecadienoylcarnitine,Hydroxydecanoylcarnitine,0.85685
4,Hydroxydecanoylcarnitine,Octanoylcarnitine,0.84801
5,N-methyl proline,Proline betaine,0.8381
6,Octanoylcarnitine,Hexanoylcarnitine,0.82867
7,Tetradecadienoylcarnitine,Octanoylcarnitine,0.82407
8,Hexadecenoylcarnitine,Oleoylcarnitine,0.79527
9,Tetradecanoylcarnitine,Hexadecenoylcarnitine,0.79198


# Creating an Adjacency Matrix of Highly Correlated Metabolites

# 1. Identify Unique High-Correlation Metabolites

In [35]:
import numpy as np
import pandas as pd

# Re-read the full correlation matrix (first CSV column becomes the index)
df = pd.read_csv("data/cleaned_correlation_matrix.csv", index_col=0)

# Strict lower-triangle mask: one entry per unordered pair, diagonal left out
mask = np.tri(*df.shape, k=-1, dtype=bool)

# Long-format table of the masked-in pairs
pairs = (
    df.where(mask)
    .stack()
    .rename_axis(["Met1", "Met2"])
    .reset_index(name="Corr")
)

# Retain the pairs with 0.7 <= |r| < 1.0
corr_strength = pairs["Corr"].abs()
high_corr_pairs = pairs[corr_strength.between(0.7, 1.0, inclusive="left")]

# Sorted list of every metabolite appearing on either side of a retained pair
unique_mets = sorted(set(high_corr_pairs["Met1"]) | set(high_corr_pairs["Met2"]))

print(f"Number of unique metabolites: {len(unique_mets)}")

Number of unique metabolites: 20


# 2. Create the Weighted Adjacency Matrix

In [36]:
# Extract the sub-matrix restricted to the metabolites that take part in at least
# one high-correlation pair
adj_matrix = df.loc[unique_mets, unique_mets].copy()

# Weighted adjacency logic (vectorised):
# keep the correlation value where 0.7 <= |r| < 1.0, otherwise set the weight to 0.
# Entries equal to exactly 1.0 fail the upper bound, which is how the diagonal
# (self-correlation) is zeroed: a node has no edge to itself in this context.
# Using where() instead of an element-wise lambda avoids a Python call per cell
# and always yields float columns.
abs_weights = adj_matrix.abs()
weighted_adj = adj_matrix.where((abs_weights >= 0.7) & (abs_weights < 1.0), 0.0)

# Display the dimensions
print(f"Weighted Adjacency Matrix Shape: {weighted_adj.shape}")

Weighted Adjacency Matrix Shape: (20, 20)


# 3. Save and Inspect the Result

In [37]:
# Persist the weighted adjacency matrix as CSV (row labels are written as the first column)
weighted_adj.to_csv(path_or_buf="data/weighted_high_corr_adjacency_matrix.csv")

# Render the matrix for visual inspection
weighted_adj

Unnamed: 0,1-methylhistidine,3-methylhistidine,Caffeine,Hexadecenoylcarnitine,Hexanoylcarnitine,Hydroxydecanoylcarnitine,Hypoxanthine,Inosine,Linoleoylcarnitine,N-methyl proline,N1-methyl-2-pyridone-5-carboxamide,N1-methyl-4-pyridone-3-carboxamide,Octanoylcarnitine,Oleoylcarnitine,Palmitoylcarnitine,Paraxanthine,Proline betaine,Stearoylcarnitine,Tetradecadienoylcarnitine,Tetradecanoylcarnitine
1-methylhistidine,0.0,0.99789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3-methylhistidine,0.99789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Caffeine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75058,0.0,0.0,0.0,0.0
Hexadecenoylcarnitine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.79527,0.76541,0.0,0.0,0.0,0.0,0.79198
Hexanoylcarnitine,0.0,0.0,0.0,0.0,0.0,0.74311,0.0,0.0,0.0,0.0,0.0,0.0,0.82867,0.0,0.0,0.0,0.0,0.0,0.74511,0.0
Hydroxydecanoylcarnitine,0.0,0.0,0.0,0.0,0.74311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84801,0.0,0.0,0.0,0.0,0.0,0.85685,0.0
Hypoxanthine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.99999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Inosine,0.0,0.0,0.0,0.0,0.0,0.0,0.99999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Linoleoylcarnitine,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.70041,0.0,0.0,0.0,0.0,0.0,0.0
N-methyl proline,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8381,0.0,0.0,0.0
