### CREATE PRECISION RECALL CURVES FOR ALL SPRAS ENSEMBLE PATHWAYS

- loop through all SPRAS ensemble pathways
    - match each ensemble pathway to its processed PC individual pathway
    - make the PRC table:
        - remove direction from the ensemble pathway
        - add a y_true column and set all rows to 0
        - if the ensemble pathway has rows from the processed PC individual pathway, set y_true to 1 for those rows
        - if there are rows in the processed PC individual pathway that are not in the ensemble pathway, add them to the PRC df with y_true 1 and frequency 0
        - add the union PPI rows that are not in the ensemble pathway with y_true 0 and frequency 0
    - use the PRC table to make the PR curve image

In [9]:
import pandas as pd
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
import os
import re
import shutil

In [10]:
# Folder whose .txt files are listed to build the label -> file name map and
# from which the reference pathway files are copied.
processed_pc_pathways_folder = 'processed-data/processed-pc-individual-pathways'
# Folder whose subfolders are scanned one by one; each subfolder is expected
# to hold an "ensemble-pathway.txt" file.
oi2_baseline_outputs_folder = 'oi2-baseline-outputs'

In [11]:
# All tab-separated pathway files available in the processed PC folder.
pc_processed_list = [
    entry for entry in os.listdir(processed_pc_pathways_folder)
    if entry.endswith('.txt')
]

# Map a cleaned 10-character label to the first file name that produces it.
# Later files that collapse to an already-seen label are only reported; the
# first mapping is kept so it can be reviewed by hand.
non_word_run = re.compile(r'\W+')
file_map = {}
for filename in pc_processed_list:
    base = filename[:10]
    label = non_word_run.sub('_', base).strip('_')
    if label in file_map:
        print("duplicate found - verify manually")
        print(label)
        continue
    file_map[label] = filename
print(len(file_map))

duplicate found - verify manually
drug_metab
duplicate found - verify manually
fatty_acid
duplicate found - verify manually
fatty_acid
duplicate found - verify manually
glycosamin
duplicate found - verify manually
glycosamin
duplicate found - verify manually
glycosphin
duplicate found - verify manually
glycosphin
duplicate found - verify manually
phenylalan
duplicate found - verify manually
valine__le
73


In [12]:
# Labels reported as duplicates are ambiguous (several pathway files share
# the same 10-character prefix), so drop them from the automatic mapping.
ambiguous_labels = (
    'drug_metab',
    'fatty_acid',
    'glycosamin',
    'glycosphin',
    'phenylalan',
    'valine__le',
)
for ambiguous_label in ambiguous_labels:
    del file_map[ambiguous_label]
print(len(file_map))

67


In [13]:
# Load the headerless, tab-separated union PPI table and name its columns.
# NOTE(review): the rename keys skip positional column 2 (0, 1, 3, 4). Keys
# that are not present are silently ignored by rename, so confirm that the
# file's column layout really matches these positions.
union_ppi = pd.read_csv(
    'processed-data/union_ppi.txt', sep='\t', header=None
).rename(columns={0: 'Node1', 1: 'Node2', 3: 'Prize', 4: 'Directionality'})

# Number of rows in the union PPI; used later as the prevalence denominator.
union_ppi_length = len(union_ppi)
union_ppi_length

202090

In [14]:
# Copy the matching processed PC pathway file into every output subfolder.
for folder_name in os.listdir(oi2_baseline_outputs_folder):
    folder_path = os.path.join(oi2_baseline_outputs_folder, folder_name)
    if not os.path.isdir(folder_path):
        continue

    # The label is the leading run of letters/underscores of the subfolder
    # name, with surrounding underscores removed.
    label_match = re.match(r'([a-zA-Z_]+)', folder_name)
    if label_match is None:
        continue
    label = label_match.group(1).strip('_')

    pc_file_name = file_map.get(label)
    if pc_file_name is None:
        print(f"No matching file for folder: {folder_name}")
        continue

    shutil.copy(
        os.path.join(processed_pc_pathways_folder, pc_file_name),
        os.path.join(folder_path, pc_file_name),
    )

No matching file for folder: drug_metab_1108-ml
No matching file for folder: drug_metab_1722-ml
No matching file for folder: drug_metab_3545-ml
No matching file for folder: drug_metab_4503-ml
No matching file for folder: drug_metab_5565-ml
No matching file for folder: drug_metab_6155-ml
No matching file for folder: drug_metab_6253-ml
No matching file for folder: drug_metab_6262-ml
No matching file for folder: drug_metab_7207-ml
No matching file for folder: drug_metab_7643-ml
No matching file for folder: drug_metab_7684-ml
No matching file for folder: drug_metab_8702-ml


In [15]:
# Build and save one precision-recall curve per output subfolder.
#
# The union PPI pair set and the output folder do not depend on the subfolder,
# so they are prepared once instead of on every iteration.
union_ppi_pairs = set(zip(union_ppi['Node1'], union_ppi['Node2']))
prc_output_folder = 'spras-baseline-eval-prc'
# savefig does not create missing folders.
os.makedirs(prc_output_folder, exist_ok=True)

for training_sample_subfolder in os.listdir(oi2_baseline_outputs_folder):
    # get subfolder
    folder_path = os.path.join(oi2_baseline_outputs_folder, training_sample_subfolder)
    if not os.path.isdir(folder_path):
        continue

    # get the matching processed pc pathway: the first .txt file in the
    # subfolder whose name starts with the subfolder's 10-character prefix
    prefix = training_sample_subfolder[:10]
    matched_pc_pathway = None
    for file_name in os.listdir(folder_path):
        if file_name.startswith(prefix) and file_name.endswith(".txt"):
            matched_pc_pathway = os.path.join(folder_path, file_name)
            break
    # if there's no matching pc pathway, report and skip
    if not matched_pc_pathway or not os.path.isfile(matched_pc_pathway):
        print(f"Skipping subfolder '{training_sample_subfolder}': no matching .txt file with prefix '{prefix}'")
        continue

    # get ensemble pathway file; skip the subfolder when it is absent rather
    # than silently reusing the table loaded for a previous subfolder
    file_path = os.path.join(folder_path, "ensemble-pathway.txt")
    if not os.path.isfile(file_path):
        print(f"Skipping subfolder '{training_sample_subfolder}': no ensemble-pathway.txt found")
        continue

    pc_df = pd.read_csv(matched_pc_pathway, sep="\t")
    ensemble_df = pd.read_csv(file_path, sep='\t')

    # MAKE THE PRC TABLE:
    # NOTE(review): pairs are compared as ordered (Node1, Node2) tuples, so
    # (A, B) and (B, A) are treated as different edges - confirm that the
    # inputs use a consistent node order.
    ensemble_df = ensemble_df.drop(columns=['Direction'])
    pc_ytrue_pairs = set(zip(pc_df['Node1'], pc_df['Node2']))
    ensemble_pair_list = list(zip(ensemble_df['Node1'], ensemble_df['Node2']))
    ensemble_pairs = set(ensemble_pair_list)

    # ensemble rows that also occur in the reference pathway are positives
    in_reference = pd.Series(
        [pair in pc_ytrue_pairs for pair in ensemble_pair_list],
        index=ensemble_df.index,
        dtype=bool,
    )
    ensemble_df['y_true'] = 0
    ensemble_df.loc[in_reference, 'y_true'] = 1

    # The table is rebuilt from scratch for every subfolder so that nothing
    # leaks over from the previous iteration.
    prc_frames = [ensemble_df]

    # reference pairs missing from the ensemble --> y_true 1 and frequency 0
    pc_missing_pairs = pc_ytrue_pairs - ensemble_pairs
    if pc_missing_pairs:
        new_rows = pd.DataFrame(list(pc_missing_pairs), columns=['Node1', 'Node2'])
        new_rows['Frequency'] = 0
        new_rows['y_true'] = 1
        prc_frames.append(new_rows)

    # union ppi pairs missing from the ensemble --> y_true 0 and frequency 0.
    # Reference pairs are excluded here: they were already added above as
    # positives and must not appear a second time as negatives.
    union_missing_pairs = union_ppi_pairs - ensemble_pairs - pc_ytrue_pairs
    if union_missing_pairs:
        missing_df = pd.DataFrame(list(union_missing_pairs), columns=['Node1', 'Node2'])
        missing_df['y_true'] = 0
        missing_df['Frequency'] = 0
        prc_frames.append(missing_df)

    prc_df = pd.concat(prc_frames, ignore_index=True)

    # calculate prevalence --> probability of randomly drawing a positive example from your dataset
    # num positives --> taken from pathwaycommons individual
    num_pos = len(pc_df)
    prevalence = num_pos/union_ppi_length

    # plot and save precision recall curve
    precision, recall, _ = precision_recall_curve(prc_df['y_true'], prc_df['Frequency'])
    plt.figure()
    plt.plot(recall, precision, linewidth=2)
    # add prevalence as a horizontal dotted line
    plt.axhline(prevalence, color='gray', linestyle='--', label=f'Prevalence = {prevalence:.5f}')
    # force both axes from 0 to 1
    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve for {training_sample_subfolder}')
    plt.grid(True)
    plt.legend(loc='lower left')
    clean_name = training_sample_subfolder.replace('-ml', '')
    save_path = os.path.join(prc_output_folder, f"{clean_name}_prc_curve.png")
    plt.savefig(save_path)
    plt.close()

Skipping subfolder 'drug_metab_1108-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_1722-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_3545-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_4503-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_5565-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_6155-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_6253-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_6262-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_7207-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_7643-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_7684-ml': no matching .txt file with prefix 'drug_metab'
Skipping subfolder 'drug_metab_8

If a label matches more than one processed PC pathway, the PRC has to be created manually:

In [22]:
# Manually build the precision-recall curve for one subfolder whose label
# matches several processed PC pathway files.
training_sample_subfolder = 'drug_metab_6262-ml'
folder_path = os.path.join(oi2_baseline_outputs_folder, training_sample_subfolder)
# Build the reference path with os.path.join: the backslash-separated literal
# used before contained invalid escape sequences and only worked on Windows.
matched_pc_pathway = os.path.join(processed_pc_pathways_folder, 'drug_metabolism___cy.txt')
pc_df = pd.read_csv(matched_pc_pathway, sep="\t")

# get ensemble pathway file; read_csv raises FileNotFoundError when it is
# missing, instead of silently reusing a table left over from another cell
file_path = os.path.join(folder_path, "ensemble-pathway.txt")
ensemble_df = pd.read_csv(file_path, sep='\t')

# MAKE THE PRC TABLE:
# NOTE(review): pairs are compared as ordered (Node1, Node2) tuples, so
# (A, B) and (B, A) are treated as different edges - confirm that the inputs
# use a consistent node order.
ensemble_df = ensemble_df.drop(columns=['Direction'])
pc_ytrue_pairs = set(zip(pc_df['Node1'], pc_df['Node2']))
ensemble_pair_list = list(zip(ensemble_df['Node1'], ensemble_df['Node2']))
ensemble_pairs = set(ensemble_pair_list)

# ensemble rows that also occur in the reference pathway are positives
in_reference = pd.Series(
    [pair in pc_ytrue_pairs for pair in ensemble_pair_list],
    index=ensemble_df.index,
    dtype=bool,
)
ensemble_df['y_true'] = 0
ensemble_df.loc[in_reference, 'y_true'] = 1

# Always start the table from this subfolder's ensemble rows so that no
# previously built prc_df can leak into the result.
prc_frames = [ensemble_df]

# reference pairs missing from the ensemble --> y_true 1 and frequency 0
pc_missing_pairs = pc_ytrue_pairs - ensemble_pairs
if pc_missing_pairs:
    new_rows = pd.DataFrame(list(pc_missing_pairs), columns=['Node1', 'Node2'])
    new_rows['Frequency'] = 0
    new_rows['y_true'] = 1
    prc_frames.append(new_rows)

# union ppi pairs missing from the ensemble --> y_true 0 and frequency 0.
# Reference pairs are excluded here: they were already added above as
# positives and must not appear a second time as negatives.
union_ppi_pairs = set(zip(union_ppi['Node1'], union_ppi['Node2']))
union_missing_pairs = union_ppi_pairs - ensemble_pairs - pc_ytrue_pairs
if union_missing_pairs:
    missing_df = pd.DataFrame(list(union_missing_pairs), columns=['Node1', 'Node2'])
    missing_df['y_true'] = 0
    missing_df['Frequency'] = 0
    prc_frames.append(missing_df)

prc_df = pd.concat(prc_frames, ignore_index=True)

# calculate prevalence --> probability of randomly drawing a positive example from your dataset
# num positives --> taken from pathwaycommons individual
num_pos = len(pc_df)
prevalence = num_pos/union_ppi_length

# plot and save precision recall curve
precision, recall, _ = precision_recall_curve(prc_df['y_true'], prc_df['Frequency'])
plt.figure()
plt.plot(recall, precision, linewidth=2)
# add prevalence as a horizontal dotted line
plt.axhline(prevalence, color='gray', linestyle='--', label=f'Prevalence = {prevalence:.5f}')
# force both axes from 0 to 1
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title(f'Precision-Recall Curve for {training_sample_subfolder}')
plt.grid(True)
plt.legend(loc='lower left')
clean_name = training_sample_subfolder.replace('-ml', '')
prc_output_folder = 'spras-baseline-eval-prc'
# savefig does not create missing folders.
os.makedirs(prc_output_folder, exist_ok=True)
save_path = os.path.join(prc_output_folder, f"{clean_name}_prc_curve.png")
plt.savefig(save_path)
plt.close()