In [14]:
import os
import pandas as pd
import yaml
import matplotlib.pyplot as plt
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    precision_recall_curve,
    average_precision_score
)

In [2]:
import os
parent_dir = "../oi2-baseline-outputs-test-split"
subdirs = [d for d in os.listdir(parent_dir) if os.path.isdir(os.path.join(parent_dir, d))]

In [3]:
# Load the YAML config
with open("../processed-data-files/all_datasets_config.txt", "r") as f:
    cfg = yaml.safe_load(f)

# Build a lookup from label → node_files[0]
label_to_nodefile = {}
for ds in cfg.get("datasets", []):
    label = ds.get("label")
    label = label + "-ml"
    if label in subdirs:
        # grab the first node file, or None if the list is empty
        node_files = ds.get("node_files", [])
        if node_files:
            pc_pathway = str(node_files[0])[:-21] + '.txt'
            label_to_nodefile[label] = pc_pathway
        else:
            label_to_nodefile[label] = None

# Now `label_to_nodefile` holds your desired map
print(label_to_nodefile)

{'alpha_lino_4939-ml': 'alpha_linolenic_acid.txt', 'aminoacyl_2796-ml': 'aminoacyl_trna_biosy.txt', 'aminoacyl_5354-ml': 'aminoacyl_trna_biosy.txt', 'aminoacyl_8954-ml': 'aminoacyl_trna_biosy.txt', 'arachidoni_3652-ml': 'arachidonic_acid_met.txt', 'arginine_a_3611-ml': 'arginine_and_proline.txt', 'arginine_a_7047-ml': 'arginine_and_proline.txt', 'ascorbate_3317-ml': 'ascorbate_and_aldara.txt', 'ascorbate_8407-ml': 'ascorbate_and_aldara.txt', 'beta_alani_4217-ml': 'beta_alanine_metabol.txt', 'biosynthes_2748-ml': 'biosynthesis_of_unsa.txt', 'butanoate_4190-ml': 'butanoate_metabolism.txt', 'butanoate_5885-ml': 'butanoate_metabolism.txt', 'citrate_cy_5392-ml': 'citrate_cycle__tca_c.txt', 'citrate_cy_5519-ml': 'citrate_cycle__tca_c.txt', 'citrate_cy_6720-ml': 'citrate_cycle__tca_c.txt', 'drug_metab_1576-ml': 'drug_metabolism___cy.txt', 'drug_metab_2581-ml': 'drug_metabolism___cy.txt', 'drug_metab_2665-ml': 'drug_metabolism___cy.txt', 'drug_metab_7291-ml': 'drug_metabolism___cy.txt', 'drug_

In [4]:
test_oi2_prc_df = pd.DataFrame(columns=['Node1','Node2','Frequency','y_true'])

In [5]:
for label in label_to_nodefile.keys():
    ensemble_pathway = '../oi2-baseline-outputs-test-split/' + str(label) + '/ensemble-pathway.txt'
    ensemble_df = pd.read_csv(ensemble_pathway, sep='\t')
    # MAKE THE PRC TABLE:
    ensemble_df.drop(columns=['Direction'], inplace=True)
    ensemble_df['y_true'] = 0
    # comparing ensemble df to pathway commons --> set y_true to 1 and freq to 0 for missing pairs
    pc_true_path = '../oi2-baseline-outputs-test-split/' + str(label) + '/' + str(label_to_nodefile[label])
    pc_true_df = pd.read_csv(pc_true_path, sep='\t')
    pc_ytrue_pairs = set(zip(pc_true_df['Node1'], pc_true_df['Node2']))
    ensemble_df.loc[ensemble_df[['Node1', 'Node2']].apply(tuple, axis=1).isin(pc_ytrue_pairs), 'y_true'] = 1
    pc_missing_pairs = pc_ytrue_pairs.difference(zip(ensemble_df['Node1'], ensemble_df['Node2']))
    if pc_missing_pairs:                      
        new_rows = pd.DataFrame(list(pc_missing_pairs), columns=['Node1', 'Node2'])
        new_rows['Frequency'] = 0          
        new_rows['y_true']   = 1
        temp_df = pd.concat([ensemble_df, new_rows], ignore_index=True)
    test_oi2_prc_df = pd.concat([test_oi2_prc_df, temp_df], ignore_index=True)

  test_oi2_prc_df = pd.concat([test_oi2_prc_df, temp_df], ignore_index=True)


In [6]:
union_ppi = pd.read_csv('../processed-data-files/union_ppi.txt', sep='\t', header=None)
union_ppi.rename(columns={0:'Node1', 1:'Node2', 3:'Prize',4:'Directionality'}, inplace=True)

union_ppi_length = len(union_ppi)
union_ppi_length

202090

In [7]:
# comparing ensemble df to union ppi --> set y_true to 0 and freq to 0 for missing pairs
union_ppi_pairs = set(zip(union_ppi['Node1'], union_ppi['Node2']))
ensemble_pairs  = set(zip(ensemble_df['Node1'],  ensemble_df['Node2']))
union_missing_pairs = union_ppi_pairs.difference(ensemble_pairs)
missing_df = pd.DataFrame(list(union_missing_pairs), columns=['Node1','Node2'])
missing_df['y_true'] = 0
missing_df['Frequency']   = 0
test_oi2_prc_df = pd.concat([test_oi2_prc_df, missing_df], ignore_index=True)

In [10]:
from sklearn.utils.multiclass import type_of_target

print(test_oi2_prc_df['y_true'].dtype)
print(test_oi2_prc_df['y_true'].unique())
print(type_of_target(test_oi2_prc_df['y_true']))
print(test_oi2_prc_df['y_true'].isna().sum(), " missing y_true values")

object
[1 0]
unknown
0  missing y_true values


In [11]:
# cast the labels to integer
test_oi2_prc_df['y_true'] = test_oi2_prc_df['y_true'].astype(int)

# (optionally) ensure your scores are floats
test_oi2_prc_df['Frequency'] = test_oi2_prc_df['Frequency'].astype(float)

In [13]:
# plot and save precision recall curve
precision, recall, _ = precision_recall_curve(test_oi2_prc_df['y_true'], test_oi2_prc_df['Frequency'])
plt.figure()
plt.plot(recall, precision, linewidth=2)
# force both axes from 0 to 1
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('OmicsIntegrator2: Precision-Recall Curve for GNN Test split')
plt.grid(True)
save_path = os.path.join('../processed-data-files', "oi2_test_prc_curve.png")
plt.savefig(save_path)
plt.close()

In [15]:
# make sure your columns are numeric
y_true   = test_oi2_prc_df['y_true'].astype(int)
y_scores = test_oi2_prc_df['Frequency'].astype(float)

# choose your classification threshold
threshold = 0.5
y_pred    = (y_scores >= threshold).astype(int)

# compute metrics
precision_val = precision_score(y_true, y_pred)
recall_val    = recall_score(y_true, y_pred)
f1_val        = f1_score(y_true, y_pred)
roc_auc       = roc_auc_score(y_true, y_scores)

print(f"Precision (th={threshold}): {precision_val:.4f}")
print(f"Recall    (th={threshold}): {recall_val:.4f}")
print(f"F1-score  (th={threshold}): {f1_val:.4f}")
print(f"ROC-AUC                : {roc_auc:.4f}")

Precision (th=0.5): 0.6803
Recall    (th=0.5): 0.0163
F1-score  (th=0.5): 0.0319
ROC-AUC                : 0.5094
