### CREATE PRECISION RECALL CURVES FOR ALL SPRAS ENSEMBLE PATHWAYS

- loop through all SPRAS ensemble pathways
    - match the ensemble pathway to the processed PC (Pathway Commons) individual pathway
    - make the PRC table:
        - remove direction from the ensemble pathway
        - add a y_true column and set all rows to 0
        - if the ensemble pathway has rows from the processed PC individual pathway, set their y_true to 1
        - if there are rows in the processed PC individual pathway that are not in the ensemble pathway, add them to the PRC table with y_true 1 and frequency 0
        - add the union PPI edges that are still missing from the PRC table with y_true 0 and frequency 0
    - use the PRC table to make the PR curve image

In [1]:
import pandas as pd
from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt
import os
import yaml
import re
import shutil
import glob

In [2]:
# Folder holding the processed Pathway Commons pathway files (one .txt per pathway).
processed_pc_pathways_folder = '../processed-pc-individual-pathways'
# Folder whose subfolders are scanned for an "ensemble-pathway.txt" file.
oi2_baseline_outputs_folder = '../oi2-baseline-outputs'

In [3]:
# Every processed pathway file (.txt only) in the PC folder.
pc_processed_list = [
    entry
    for entry in os.listdir(processed_pc_pathways_folder)
    if entry.endswith('.txt')
]

# Map a cleaned 10-character file-name prefix to the first file that produced it.
# Prefixes shared by several files are reported so they can be checked by hand;
# only the first file seen for such a prefix is kept in the map.
file_map = {}
for pc_filename in pc_processed_list:
    prefix_label = re.sub(r'\W+', '_', pc_filename[:10]).strip('_')
    if prefix_label in file_map:
        print("duplicate found - verify manually")
        print(prefix_label)
        continue
    file_map[prefix_label] = pc_filename
print(len(file_map))

duplicate found - verify manually
drug_metab
duplicate found - verify manually
fatty_acid
duplicate found - verify manually
fatty_acid
duplicate found - verify manually
glycosamin
duplicate found - verify manually
glycosamin
duplicate found - verify manually
glycosphin
duplicate found - verify manually
glycosphin
duplicate found - verify manually
phenylalan
duplicate found - verify manually
valine__le
73


In [4]:
# Prefixes reported as duplicates above cannot be matched automatically,
# so they are removed from the map (a missing key raises KeyError, as before).
ambiguous_labels = (
    'drug_metab',
    'fatty_acid',
    'glycosamin',
    'glycosphin',
    'phenylalan',
    'valine__le',
)
for ambiguous_label in ambiguous_labels:
    del file_map[ambiguous_label]
print(len(file_map))

67


In [5]:
# Load the header-less, tab-separated union interactome and name its columns.
# NOTE(review): positional column 2 is not renamed while 3 and 4 are -- confirm
# that this matches the actual column layout of union_ppi.txt.
union_ppi = pd.read_csv(
    '../processed-data-files/union_ppi.txt', sep='\t', header=None
).rename(columns={0: 'Node1', 1: 'Node2', 3: 'Prize', 4: 'Directionality'})

# Number of interactome rows; used later as the denominator of the prevalence.
union_ppi_length = union_ppi.shape[0]
union_ppi_length

202090

In [6]:
# Copy the matching processed PC pathway file into every output subfolder.
for folder_name in os.listdir(oi2_baseline_outputs_folder):
    folder_path = os.path.join(oi2_baseline_outputs_folder, folder_name)
    if not os.path.isdir(folder_path):
        continue

    # The label is the leading run of letters/underscores of the folder name,
    # with surrounding underscores removed.
    label_match = re.match(r'([a-zA-Z_]+)', folder_name)
    if label_match is None:
        continue
    label = label_match.group(1).strip('_')

    if label not in file_map:
        print(f"No matching file for folder: {folder_name}")
        continue

    pc_filename = file_map[label]
    shutil.copy(
        os.path.join(processed_pc_pathways_folder, pc_filename),
        os.path.join(folder_path, pc_filename),
    )

No matching file for folder: drug_metab_1108-ml
No matching file for folder: drug_metab_1722-ml
No matching file for folder: drug_metab_3545-ml
No matching file for folder: drug_metab_4503-ml
No matching file for folder: drug_metab_5565-ml
No matching file for folder: drug_metab_6155-ml
No matching file for folder: drug_metab_6253-ml
No matching file for folder: drug_metab_6262-ml
No matching file for folder: drug_metab_7207-ml
No matching file for folder: drug_metab_7643-ml
No matching file for folder: drug_metab_7684-ml
No matching file for folder: drug_metab_8702-ml
No matching file for folder: valine__le_1008-ml
No matching file for folder: valine__le_1385-ml
No matching file for folder: valine__le_1950-ml
No matching file for folder: valine__le_3551-ml
No matching file for folder: valine__le_3704-ml
No matching file for folder: valine__le_4369-ml
No matching file for folder: valine__le_5467-ml
No matching file for folder: valine__le_6172-ml
No matching file for folder: valine__le_

In [13]:
# Accumulators filled by the PRC loops:
#   avg_prec_results        - one record per automatically matched output folder
#   skipped_subfolders      - output folders without a matching PC pathway file
#   duplicate_label_results - one record per folder matched through the config
avg_prec_results = list()
skipped_subfolders = list()
duplicate_label_results = list()

In [8]:
# Build a precision-recall table, an average-precision score and a PR curve
# image for every output subfolder that contains a matching PC pathway file.
# Subfolders without such a file are recorded in `skipped_subfolders`.

# The interactome pair set is identical for every subfolder, so build it once.
union_ppi_pairs = set(zip(union_ppi['Node1'], union_ppi['Node2']))

# Make sure the image folder exists before the first figure is saved.
prc_output_folder = '../spras-baseline-eval-prc'
os.makedirs(prc_output_folder, exist_ok=True)

for training_sample_subfolder in os.listdir(oi2_baseline_outputs_folder):
    folder_path = os.path.join(oi2_baseline_outputs_folder, training_sample_subfolder)
    if not os.path.isdir(folder_path):
        continue

    # get the matching processed pc pathway: the first .txt file in the
    # subfolder whose name starts with the folder's 10-character prefix
    prefix = training_sample_subfolder[:10]
    matched_pc_pathway = None
    for file_name in os.listdir(folder_path):
        if file_name.startswith(prefix) and file_name.endswith(".txt"):
            matched_pc_pathway = os.path.join(folder_path, file_name)
            break
    # if there's no matching pc pathway, report and skip
    if not matched_pc_pathway or not os.path.isfile(matched_pc_pathway):
        print(f"Skipping subfolder '{training_sample_subfolder}': no matching .txt file with prefix '{prefix}'")
        skipped_subfolders.append(training_sample_subfolder)
        continue

    pc_df = pd.read_csv(matched_pc_pathway, sep="\t")

    # get ensemble pathway file; fail loudly instead of silently reusing the
    # ensemble table left over from the previous subfolder
    file_path = os.path.join(folder_path, "ensemble-pathway.txt")
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Could not find {file_path}")
    ensemble_df = pd.read_csv(file_path, sep='\t')

    # MAKE THE PRC TABLE:
    ensemble_df = ensemble_df.drop(columns=['Direction'])
    ensemble_df['y_true'] = 0

    # mark ensemble edges that also occur in the PC pathway as positives
    pc_ytrue_pairs = set(zip(pc_df['Node1'], pc_df['Node2']))
    in_pc = pd.Series(
        [pair in pc_ytrue_pairs
         for pair in zip(ensemble_df['Node1'], ensemble_df['Node2'])],
        index=ensemble_df.index,
        dtype=bool,
    )
    ensemble_df.loc[in_pc, 'y_true'] = 1

    # add PC edges missing from the ensemble as positives with frequency 0
    pc_missing_pairs = pc_ytrue_pairs.difference(
        zip(ensemble_df['Node1'], ensemble_df['Node2'])
    )
    if pc_missing_pairs:
        new_rows = pd.DataFrame(list(pc_missing_pairs), columns=['Node1', 'Node2'])
        new_rows['Frequency'] = 0
        new_rows['y_true'] = 1
        prc_df = pd.concat([ensemble_df, new_rows], ignore_index=True)
    else:
        # nothing to add; without this branch the table of the previous
        # subfolder would be reused
        prc_df = ensemble_df.copy()

    # add union PPI edges not yet in the table as negatives with frequency 0;
    # compare against prc_df so PC edges added above are not duplicated as negatives
    scored_pairs = set(zip(prc_df['Node1'], prc_df['Node2']))
    union_missing_pairs = union_ppi_pairs.difference(scored_pairs)
    if union_missing_pairs:
        missing_df = pd.DataFrame(list(union_missing_pairs), columns=['Node1', 'Node2'])
        missing_df['Frequency'] = 0
        missing_df['y_true'] = 0
        prc_df = pd.concat([prc_df, missing_df], ignore_index=True)

    # prevalence: number of PC pathway rows divided by the number of union PPI rows
    num_pos = len(pc_df)
    prevalence = num_pos / union_ppi_length

    # calculate avg precision score and record it for the summary table
    ap = average_precision_score(prc_df['y_true'], prc_df['Frequency'])
    trimmed = os.path.basename(matched_pc_pathway)
    avg_prec_results.append({
        'Sample': trimmed,
        'AveragePrecision': ap,
        'dataset_label': training_sample_subfolder,
    })

    # plot and save precision recall curve
    precision, recall, _ = precision_recall_curve(prc_df['y_true'], prc_df['Frequency'])
    plt.figure()
    plt.plot(recall, precision, linewidth=2)
    # add prevalence as a horizontal dotted line
    plt.axhline(prevalence, color='gray', linestyle='--', label=f'Prevalence = {prevalence:.5f}')
    # force both axes from 0 to 1
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve for {training_sample_subfolder}')
    plt.grid(True)
    plt.legend(loc='lower left')
    clean_name = training_sample_subfolder.replace('-ml', '')
    save_path = os.path.join(prc_output_folder, f"{clean_name}_prc_curve.png")
    plt.savefig(save_path)
    plt.close()

# The summary is not written here; this frame is only kept in memory.
avg_prec_df = pd.DataFrame(avg_prec_results)


Skipping subfolder 'drug_metab_1108-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_1722-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_3545-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_4503-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_5565-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_6155-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_6253-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_6262-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_7207-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_7643-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_7684-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_8

#### If a label matches multiple processed PC pathways, the PRC must be created manually:

Map each skipped subfolder to its node file using the config file.

In [9]:
# Parse the dataset definitions (YAML content stored in a .txt file).
with open("../processed-data-files/datasets_config.txt", "r") as f:
    cfg = yaml.safe_load(f)

# For every skipped output folder, derive the PC pathway file name from the
# first node file listed for that dataset (None when no node file is listed).
label_to_nodefile = {}
for ds in cfg.get("datasets", []):
    label = ds.get("label") + "-ml"
    if label not in skipped_subfolders:
        continue
    node_files = ds.get("node_files", [])
    # NOTE(review): dropping the last 21 characters presumably removes a
    # fixed-length node-file suffix -- confirm against the config entries.
    label_to_nodefile[label] = (
        str(node_files[0])[:-21] + '.txt' if node_files else None
    )

# Show the resulting folder -> pathway file mapping.
print(label_to_nodefile)

{'drug_metab_3545-ml': 'drug_metabolism___cy.txt', 'drug_metab_4503-ml': 'drug_metabolism___cy.txt', 'drug_metab_6262-ml': 'drug_metabolism___cy.txt', 'drug_metab_7207-ml': 'drug_metabolism___cy.txt', 'drug_metab_7643-ml': 'drug_metabolism___cy.txt', 'drug_metab_7684-ml': 'drug_metabolism___cy.txt', 'drug_metab_1108-ml': 'drug_metabolism___ot.txt', 'drug_metab_1722-ml': 'drug_metabolism___ot.txt', 'drug_metab_5565-ml': 'drug_metabolism___ot.txt', 'drug_metab_6155-ml': 'drug_metabolism___ot.txt', 'drug_metab_6253-ml': 'drug_metabolism___ot.txt', 'drug_metab_8702-ml': 'drug_metabolism___ot.txt', 'valine__le_1385-ml': 'valine__leucine_and.txt', 'valine__le_1950-ml': 'valine__leucine_and.txt', 'valine__le_3704-ml': 'valine__leucine_and.txt', 'valine__le_4369-ml': 'valine__leucine_and.txt', 'valine__le_6172-ml': 'valine__leucine_and.txt', 'valine__le_7204-ml': 'valine__leucine_and.txt', 'valine__le_7495-ml': 'valine__leucine_and.txt', 'valine__le_7776-ml': 'valine__leucine_and.txt', 'valine

In [14]:
# Build the PRC table, average-precision score and PR curve image for the
# output folders whose PC pathway had to be looked up through the config,
# then write the combined summary of both loops to disk.

# Loop-invariant work: the interactome pair set and the image folder.
union_ppi_pairs = set(zip(union_ppi['Node1'], union_ppi['Node2']))
out_dir = '../spras-baseline-eval-prc'
os.makedirs(out_dir, exist_ok=True)

for label, nodefile in label_to_nodefile.items():
    if nodefile is None:
        print(f"{label!r} has no node file, skipping.")
        continue

    # get the matching processed pc pathway
    training_sample_subfolder = label
    folder_path = os.path.join(oi2_baseline_outputs_folder, training_sample_subfolder)
    matched_pc_pathway = os.path.join(processed_pc_pathways_folder, nodefile)
    pc_df = pd.read_csv(matched_pc_pathway, sep='\t')

    # get ensemble pathway file
    file_path = os.path.join(folder_path, "ensemble-pathway.txt")
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Could not find {file_path}")
    ensemble_df = pd.read_csv(file_path, sep='\t')

    # MAKE THE PRC TABLE:
    ensemble_df = ensemble_df.drop(columns=['Direction'])
    ensemble_df['y_true'] = 0

    # mark true positives from PC
    pc_ytrue_pairs = set(zip(pc_df['Node1'], pc_df['Node2']))
    in_pc = pd.Series(
        [pair in pc_ytrue_pairs
         for pair in zip(ensemble_df['Node1'], ensemble_df['Node2'])],
        index=ensemble_df.index,
        dtype=bool,
    )
    ensemble_df.loc[in_pc, 'y_true'] = 1

    # add missing PC edges with freq=0
    pc_missing_pairs = pc_ytrue_pairs.difference(
        zip(ensemble_df['Node1'], ensemble_df['Node2'])
    )
    if pc_missing_pairs:
        new_rows = pd.DataFrame(list(pc_missing_pairs), columns=['Node1', 'Node2'])
        new_rows['Frequency'] = 0
        new_rows['y_true'] = 1
        prc_df = pd.concat([ensemble_df, new_rows], ignore_index=True)
    else:
        prc_df = ensemble_df.copy()

    # add missing union PPI edges with y_true=0, freq=0; compared against
    # prc_df so the PC edges added above are not added a second time
    scored_pairs = set(zip(prc_df['Node1'], prc_df['Node2']))
    union_missing_pairs = union_ppi_pairs.difference(scored_pairs)
    if union_missing_pairs:
        missing_df = pd.DataFrame(list(union_missing_pairs), columns=['Node1', 'Node2'])
        missing_df['Frequency'] = 0
        missing_df['y_true'] = 0
        prc_df = pd.concat([prc_df, missing_df], ignore_index=True)

    # prevalence: number of PC pathway rows divided by the number of union PPI rows
    num_pos = len(pc_df)
    prevalence = num_pos / union_ppi_length

    ap = average_precision_score(prc_df['y_true'], prc_df['Frequency'])
    trimmed = os.path.basename(matched_pc_pathway)
    duplicate_label_results.append({
        'Sample': trimmed,
        'AveragePrecision': ap,
        'dataset_label': training_sample_subfolder,
    })

    # plot and save precision-recall curve
    precision, recall, _ = precision_recall_curve(prc_df['y_true'], prc_df['Frequency'])
    plt.figure()
    plt.plot(recall, precision, linewidth=2)
    # prevalence as a horizontal dashed line, styled like the first PRC loop
    plt.axhline(prevalence, color='gray', linestyle='--', label=f'Prevalence = {prevalence:.5f}')
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve for {training_sample_subfolder}')
    plt.grid(True)
    plt.legend(loc='lower left')

    clean_name = training_sample_subfolder.replace('-ml', '')
    save_path = os.path.join(out_dir, f"{clean_name}_prc_curve.png")
    plt.savefig(save_path)
    plt.close()

# Combine the automatically matched scores with the config-matched ones and
# write them as a tab-separated file (the .csv extension is kept as is).
duplicate_label_prec_df = pd.DataFrame(duplicate_label_results)
overall_prec_df = pd.concat([
    avg_prec_df,
    duplicate_label_prec_df
], ignore_index=True)
out_path = os.path.join('../processed-data-files', 'oi2_summary_ap_scores.csv')
overall_prec_df.to_csv(out_path, index=False, sep='\t')
print(f"Wrote summary of average-precision scores to {out_path}")

Wrote summary of average-precision scores to ../processed-data-files\oi2_summary_ap_scores.csv


#### code to match the Oi2 baseline outputs (i.e. dataset label) back to the respective node file

In [15]:
# Count the distinct dataset labels in the combined summary, then preview it.
distinct_dataset_labels = set(overall_prec_df['dataset_label'])
print(len(distinct_dataset_labels))
overall_prec_df.head()

337


Unnamed: 0,Sample,AveragePrecision,dataset_label
0,alanine__aspartate_a.txt,0.016665,alanine__a_1510-ml
1,alanine__aspartate_a.txt,0.023841,alanine__a_2535-ml
2,alanine__aspartate_a.txt,0.020378,alanine__a_3273-ml
3,alanine__aspartate_a.txt,0.027565,alanine__a_3711-ml
4,alanine__aspartate_a.txt,0.005766,alanine__a_3781-ml


In [16]:
# Attach the originating node file to every row of the average-precision
# summary and write the augmented table to a new tab-separated file.

# --- Parameters ---
csv_path     = '../processed-data-files/oi2_summary_ap_scores.csv'  # tab-separated summary with a "dataset_label" column
nodes_folder = '../baseline-spras-nodes'     # folder containing *_nodes.txt

suffix_pattern = re.compile(r'^(.*?)(_train_\d+\.txt)$')  # not used in this cell; kept in case other cells rely on it
# Node files are expected to end in "_<4 digits>_nodes.txt"; compiled once for the loop below.
node_file_pattern = re.compile(r'_(\d{4})_nodes\.txt$')

# --- Step 1: Read the summary ---
df = pd.read_csv(csv_path, sep='\t')
print(f"Loaded {len(df)} rows; unique labels: {df['dataset_label'].nunique()}")

# --- Step 2: Strip the trailing "-ml" to get the actual label ---
df["clean_label"] = df["dataset_label"].str.replace(r"-ml$", "", regex=True)

# --- Step 3: Build label -> filename map ---
# A label is the cleaned 10-character file-name prefix plus the 4-digit id.
label_to_file = {}
for fname in os.listdir(nodes_folder):
    match = node_file_pattern.search(fname)
    if not match:
        # file name does not follow the expected pattern; show it and move on
        print(fname)
        continue
    sample_id = match.group(1)
    base = fname[:10]
    lbl = re.sub(r'\W+', '_', base).strip('_') + f"_{sample_id}"
    if lbl in label_to_file and label_to_file[lbl] != fname:
        # Two node files collapse to the same label; the later one still wins
        # (as before), but the collision is no longer silent.
        print(f"Warning: label {lbl!r} maps to both {label_to_file[lbl]!r} and {fname!r}; keeping the latter")
    label_to_file[lbl] = fname

# --- Step 4: Lookup and attach ---
df["node_file"] = df["clean_label"].map(label_to_file)

# Report any that still didn't match
missing = df[df["node_file"].isna()]["dataset_label"].unique()
if len(missing):
    print(f"Warning: {len(missing)} labels still had no match. Examples: {missing[:5]}")

# --- Step 5: Save the augmented table ---
out_csv = '../processed-data-files/oi2_apscore_nodefile.csv'
df.drop(columns="clean_label").to_csv(out_csv, index=False, sep='\t')
print(f"Done! Wrote augmented CSV to {out_csv}")

Loaded 337 rows; unique labels: 337
Done! Wrote augmented CSV to ../processed-data-files/oi2_apscore_nodefile.csv
