### CREATE PRECISION RECALL CURVES FOR ALL SPRAS ENSEMBLE PATHWAYS

- loop through all SPRAS ensemble pathways
    - match the ensemble pathway to the processed pc individual pathway
    - make the prc table:
        - remove direction from ensemble pathway
        - add y_true column and set all rows to 0
        - if ensemble pathway has rows from processed pc individual pathway, set those y_true to 1
        - if there are rows in processed pc individual pathway that are not in ensemble pathway, add to prc df with y_true 1 and frequency 0
    - use prc table to make the pr curve image

In [1]:
import pandas as pd
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
import os
import re
import shutil

In [2]:
# Folder that is scanned for processed pc individual pathway files (*.txt).
processed_pc_pathways_folder = 'processed-data/processed-pc-individual-pathways'
# Folder whose subfolders are walked below; each subfolder is expected to
# hold an "ensemble-pathway.txt" file.
oi2_baseline_outputs_folder = 'oi2-baseline-outputs'

In [3]:
# Collect every .txt file in the processed pc pathways folder.
pc_processed_list = [
    entry
    for entry in os.listdir(processed_pc_pathways_folder)
    if entry.endswith('.txt')
]

# Map each label to a file name. A label is the first 10 characters of the
# file name with every run of non-word characters collapsed to "_" and any
# leading/trailing "_" stripped. The first file seen for a label is kept;
# later files with the same label are only reported.
file_map = {}
for pc_file in pc_processed_list:
    label = re.sub(r'\W+', '_', pc_file[:10]).strip('_')
    if label in file_map:
        print("duplicate found - verify manually")
        print(label)
        continue
    file_map[label] = pc_file
print(len(file_map))

duplicate found - verify manually
drug_metab
duplicate found - verify manually
fatty_acid
duplicate found - verify manually
fatty_acid
duplicate found - verify manually
glycosamin
duplicate found - verify manually
glycosamin
duplicate found - verify manually
glycosphin
duplicate found - verify manually
glycosphin
duplicate found - verify manually
phenylalan
duplicate found - verify manually
valine__le
73


In [4]:
# Labels reported as duplicates when file_map was built: several files share
# the same 10-character label, so the single entry kept in file_map is
# ambiguous and is removed here.
ambiguous_labels = (
    'drug_metab',
    'fatty_acid',
    'glycosamin',
    'glycosphin',
    'phenylalan',
    'valine__le',
)
for ambiguous_label in ambiguous_labels:
    # pop() with a default instead of `del`, so re-running this cell (or
    # running it when a label is already absent) does not raise KeyError.
    file_map.pop(ambiguous_label, None)
print(len(file_map))

67


In [5]:
# Copy the matching processed pc pathway file into every output subfolder.
for folder_name in os.listdir(oi2_baseline_outputs_folder):
    folder_path = os.path.join(oi2_baseline_outputs_folder, folder_name)
    if not os.path.isdir(folder_path):
        continue

    # The label is the leading run of letters/underscores of the subfolder
    # name, with surrounding underscores stripped.
    label_match = re.match(r'([a-zA-Z_]+)', folder_name)
    if label_match is None:
        continue
    label = label_match.group(1).strip('_')

    # Subfolders whose label is not in file_map are left untouched.
    if label not in file_map:
        continue

    matched_name = file_map[label]
    src_file = os.path.join(processed_pc_pathways_folder, matched_name)
    dest_file = os.path.join(folder_path, matched_name)
    shutil.copy(src_file, dest_file)

In [8]:
def _build_prc_table(ensemble_df, pc_df):
    """Build the precision-recall input table for one ensemble pathway.

    Parameters
    ----------
    ensemble_df : pandas.DataFrame
        Ensemble pathway; the columns 'Node1', 'Node2' and 'Direction' are
        read here, and 'Frequency' is used as the score by the caller.
    pc_df : pandas.DataFrame
        Processed pc pathway with 'Node1' and 'Node2' columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``ensemble_df`` without 'Direction' and with a 'y_true'
        column (1 if the (Node1, Node2) pair occurs in ``pc_df``, else 0),
        followed by one row per pair of ``pc_df`` that is absent from the
        ensemble, with Frequency 0 and y_true 1.

    Raises
    ------
    KeyError
        If a required column is missing from either frame.
    """
    # drop() returns a new frame, so the caller's ensemble_df is not mutated.
    prc_df = ensemble_df.drop(columns=['Direction'])

    # NOTE(review): pairs are compared as ordered (Node1, Node2) tuples, so a
    # reversed pair does not match even though Direction is dropped - confirm
    # this is intended.
    pc_ytrue_pairs = set(zip(pc_df['Node1'], pc_df['Node2']))
    ensemble_pairs = list(zip(prc_df['Node1'], prc_df['Node2']))

    in_pc = pd.Series(
        [pair in pc_ytrue_pairs for pair in ensemble_pairs],
        index=prc_df.index,
        dtype=bool,
    )
    prc_df['y_true'] = in_pc.astype(int)

    # Pairs only present in the pc pathway were never predicted: add them as
    # positives with Frequency 0.
    missing_pairs = pc_ytrue_pairs.difference(ensemble_pairs)
    if missing_pairs:
        new_rows = pd.DataFrame(list(missing_pairs), columns=['Node1', 'Node2'])
        new_rows['Frequency'] = 0
        new_rows['y_true'] = 1
        prc_df = pd.concat([prc_df, new_rows], ignore_index=True)
    return prc_df


# Make sure the output folder exists before any figure is saved.
prc_output_folder = 'spras-baseline-eval-prc'
os.makedirs(prc_output_folder, exist_ok=True)

for training_sample_subfolder in os.listdir(oi2_baseline_outputs_folder):
    # get subfolder
    folder_path = os.path.join(oi2_baseline_outputs_folder, training_sample_subfolder)
    if not os.path.isdir(folder_path):
        continue

    # get the processed pc pathway: a .txt file in the subfolder that shares
    # the subfolder's first 10 characters (sorted so the pick is deterministic)
    prefix = training_sample_subfolder[:10]
    matched_pc_pathway = None
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.startswith(prefix) and file_name.endswith(".txt"):
            matched_pc_pathway = os.path.join(folder_path, file_name)
            break

    # get ensemble pathway file
    file_path = os.path.join(folder_path, "ensemble-pathway.txt")

    # Skip subfolders that lack either input instead of silently reusing the
    # frames loaded for a previous subfolder.
    if matched_pc_pathway is None or not os.path.isfile(matched_pc_pathway):
        print(f"Skipping {training_sample_subfolder}: no processed pc pathway file found")
        continue
    if not os.path.isfile(file_path):
        print(f"Skipping {training_sample_subfolder}: no ensemble-pathway.txt found")
        continue

    pc_df = pd.read_csv(matched_pc_pathway, sep='\t')
    ensemble_df = pd.read_csv(file_path, sep='\t')

    # MAKE THE PRC TABLE (always rebuilt, even when no pc pair is missing)
    prc_df = _build_prc_table(ensemble_df, pc_df)

    # plot and save precision recall curve
    precision, recall, _ = precision_recall_curve(prc_df['y_true'], prc_df['Frequency'])
    plt.figure()
    plt.plot(recall, precision, linewidth=2)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve for {training_sample_subfolder}')
    plt.grid(True)
    clean_name = training_sample_subfolder.replace('-ml', '')
    save_path = os.path.join(prc_output_folder, f"{clean_name}_prc_curve.png")
    plt.savefig(save_path)
    plt.close()

In [None]:
# TODO: handle files whose labels are the same but whose datasets are different!