In [1]:
import pandas as pd
import os
import random

In [2]:
def create_training_sample(individual_pathway, all_nodes, union_ppi, ind_pathway_name):
    """
    create the training samples from a processed single pathway
    """
    sample_size = max(1, len(all_nodes) // 4)  # TODO: change this to multiplication
    # randomly sample nodes
    random_sample = random.sample(all_nodes, sample_size)
    
    # Filter the edges that contain the sampled nodes
    mask = individual_pathway[0].isin(random_sample) & individual_pathway[2].isin(random_sample)
    matching_rows = individual_pathway[mask]
    
    # Generate the training sample
    training_sample = union_ppi.copy()
    training_sample["label"] = training_sample[[0, 1]].apply(tuple, axis=1).isin(matching_rows.apply(tuple, axis=1)).astype(int)
    
    if 1 in training_sample['label'].tolist():
        print("True labels in PPI - found")
    else:
        print("No True Label found in PPI")
        
    # save the training sample
    filename = ind_pathway_name + str(random.randint(1000, 9999)) + '.csv'
    training_sample.to_csv(f'training-samples/{filename}', index=False, sep='\t')

In [3]:
def process_individual_pathway(file_path, union_ppi, id_map):
    """
    Process a single pathway file and generate training samples
    
    Parameters:
    file_path (str): Path to the pathway file
    union_ppi (pd.DataFrame): The union PPI dataset
    id_map (pd.DataFrame): Mapping between gene names and entry names
    """
    # extract filename without extension for naming training samples
    path_name = os.path.splitext(os.path.basename(file_path))[0]
    path_name = path_name.replace(' ', '_').lower() + '_train_'
    
    # read + preprocess the pathway file
    individual_pathway = pd.read_csv(file_path, sep='\t', header=None)
    # remove the reaction column
    individual_pathway.drop(columns=[1], inplace=True)
    
    # create mapping dictionary from id map
    mapping_dict = dict(zip(id_map["gene_name"], id_map["entry_name"]))
    # replace gene names w entry names
    individual_pathway[0] = individual_pathway[0].replace(mapping_dict)
    individual_pathway[2] = individual_pathway[2].replace(mapping_dict)
    
    # get all unique nodes in the individual pathway
    all_nodes = list(set(individual_pathway[0].tolist() + individual_pathway[2].tolist()))
    
    print(f"\nProcessing pathway: {path_name}")
    print(f"Total nodes in pathway: {len(all_nodes)}")
    
    # Generate 10 training samples for each pathway
    for i in range(10):
        create_training_sample(individual_pathway, all_nodes, union_ppi, path_name)

In [4]:
def process_all_pathways(folder_path, union_ppi, id_map):
    """
    Process all pathway files in the specified folder
    
    Parameters:
    folder_path (str): Path to the folder containing pathway files
    union_ppi (pd.DataFrame): The union PPI dataset
    id_map (pd.DataFrame): Mapping between gene names and entry names
    """

    os.makedirs('training-samples', exist_ok=True)
    # get all .txt files in the folder
    pathway_files = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
    print(f"Found {len(pathway_files)} pathway files to process")
    
    # Process each pathway file
    for file_name in pathway_files:
        file_path = os.path.join(folder_path, file_name)
        try:
            process_individual_pathway(file_path, union_ppi, id_map)
        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")
            continue

In [5]:
union_ppi = pd.read_csv('processed-data/union_ppi.txt', sep='\t', header=None)
union_ppi.head()

Unnamed: 0,0,1,2,3
0,TACC1_HUMAN,RUXG_HUMAN,0.736771,U
1,TACC1_HUMAN,KAT2A_HUMAN,0.292198,U
2,TACC1_HUMAN,CKAP5_HUMAN,0.724783,U
3,TACC1_HUMAN,YETS4_HUMAN,0.542597,U
4,TACC1_HUMAN,LSM7_HUMAN,0.714823,U


In [6]:
id_map = pd.read_csv('processed-data/mapped_ids_reviewed.csv', sep='\t')
id_map.head()

Unnamed: 0,gene_name,entry_name
0,PLA2G10,PA2GX_HUMAN
1,FDFT1,FDFT_HUMAN
2,UGCG,CEGT_HUMAN
3,CYP1A2,CP1A2_HUMAN
4,RARS2,SYRM_HUMAN


In [7]:
folder_path = 'pathwaycommons-individual'
process_all_pathways(folder_path, union_ppi, id_map)

Found 82 pathway files to process

Processing pathway: vitamin_b6_metabolis_train_
Total nodes in pathway: 12
True labels in PPI - found
True labels in PPI - found
No True Label found in PPI
True labels in PPI - found
True labels in PPI - found
No True Label found in PPI
No True Label found in PPI
True labels in PPI - found
True labels in PPI - found
No True Label found in PPI

Processing pathway: valine,_leucine_and__train_
Total nodes in pathway: 22
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
No True Label found in PPI
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found

Processing pathway: valine,_leucine_and__(1)_train_
Total nodes in pathway: 77
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True 

True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found

Processing pathway: phenylalanine_metabo_train_
Total nodes in pathway: 25
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
No True Label found in PPI
True labels in PPI - found

Processing pathway: phenylalanine,_tyros_train_
Total nodes in pathway: 15
No True Label found in PPI
No True Label found in PPI
True labels in PPI - found
True labels in PPI - found
No True Label found in PPI
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found

Processing pathway: pentose_phosphate_pa_train_
Total nodes in pathway: 36
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in P

True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found

Processing pathway: d-arginine_and_d-orn_train_
Total nodes in pathway: 7
No True Label found in PPI
No True Label found in PPI
No True Label found in PPI
No True Label found in PPI
No True Label found in PPI
No True Label found in PPI
No True Label found in PPI
No True Label found in PPI
No True Label found in PPI
No True Label found in PPI

Processing pathway: drug_metabolism_-_ot_train_
Total nodes in pathway: 96
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found

Processing pathway: d-glutamine_and_d-gl_train_
Total nodes in pathway: 6
No True Label foun

True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found

Processing pathway: histidine_metabolism_train_
Total nodes in pathway: 41
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found

Processing pathway: inositol_phosphate_m_train_
Total nodes in pathway: 64
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found

Processing pathway: linoleic_acid_met

In [9]:
folder_path = 'training-samples'
files = os.listdir(folder_path)
num_files = len(files)
print(f"Total files: {num_files}")

Total files: 820
