In [1]:
import pandas as pd
import os
import re

In [2]:
# Names (not full paths) of every entry in pathwaycommons-individual/ whose name ends in '.txt'.
ind_pathway_files = list(
    filter(lambda entry: entry.endswith('.txt'), os.listdir('pathwaycommons-individual'))
)

In [3]:
# Load the reviewed ID mapping table. The file is parsed as tab-separated
# even though its name ends in .csv.
mapped_ids_reviewed = pd.read_csv(
    'processed-data/mapped_ids_reviewed.csv',
    delimiter='\t',
)
# Preview the first rows as the cell output.
mapped_ids_reviewed.head()

Unnamed: 0,gene_name,entry_name
0,PLA2G10,PA2GX_HUMAN
1,FDFT1,FDFT_HUMAN
2,UGCG,CEGT_HUMAN
3,CYP1A2,CP1A2_HUMAN
4,RARS2,SYRM_HUMAN


In [4]:
# Relation types labelled as undirected.
undirected = ['in-complex-with', 'interacts-with', 'neighbor-of', 'reacts-with']

# Relation types this notebook knows about.
# NOTE(review): a relation type that is missing from this list can never end up
# in `directed` — confirm the list covers every type present in the input files.
all_relations = [
    'catalysis-precedes',
    'interacts-with',
    'consumption-controlled-by',
    'controls-production-of',
    'reacts-with',
    'used-to-produce',
]

# Every known relation type that is not listed as undirected, in the order
# it appears in all_relations.
directed = list(filter(lambda relation: relation not in undirected, all_relations))

In [5]:
# Lookup table from each `gene_name` value to the `entry_name` value on the same row.
# If a gene_name occurs more than once, the last row wins.
mapping_dict = {
    gene: entry
    for gene, entry in zip(mapped_ids_reviewed["gene_name"], mapped_ids_reviewed["entry_name"])
}

In [6]:
# Convert every per-pathway file into a tab-separated edge list with the columns
# Node1, Node2 and Directionality.

# Create the output folder up front so to_csv cannot fail on a missing directory.
os.makedirs('processed-pc-individual-pathways', exist_ok=True)

for ind_pathway in ind_pathway_files:
    read_file = os.path.join('pathwaycommons-individual', ind_pathway)
    # File name without its extension; used to build the output file name below.
    pathway_name = os.path.splitext(ind_pathway)[0]

    # The file is read without a header row, so columns are addressed by position.
    # NOTE(review): assumes the layout is node / relation type / node — confirm
    # against the input files.
    pc_df = pd.read_csv(read_file, sep='\t', header=None)

    # Translate node identifiers through mapping_dict; values without an entry
    # in the mapping are left unchanged.
    pc_df[0] = pc_df[0].replace(mapping_dict)
    pc_df[2] = pc_df[2].replace(mapping_dict)

    # Direction is decided by the relation type in column 1. The previous code
    # tested column 2 (a node column), which never matches a relation name and
    # therefore labelled every edge 'U'.
    pc_df[4] = pc_df[1].apply(lambda relation: 'D' if relation in directed else 'U')

    # The relation column is no longer needed once the direction flag exists.
    pc_df = pc_df.drop(columns=[1]).rename(
        columns={0: 'Node1', 2: 'Node2', 4: 'Directionality'}
    )

    # Output name: runs of non-word characters become "_", leading/trailing "_"
    # are trimmed, and the result is lowercased.
    output_name = re.sub(r'\W+', '_', pathway_name).strip('_').lower() + '.txt'
    output_path = os.path.join('processed-pc-individual-pathways', output_name)
    pc_df.to_csv(output_path, index=False, sep='\t')

In [7]:
import os
import re

src_dir = 'training-samples/'
# Split a file name into an arbitrary prefix and a "_train_<digits>.csv" suffix.
suffix_pattern = re.compile(r'^(.*?)(_train_\d+\.csv)$')


def _slugify(text):
    """Replace each run of non-word characters with "_", trim "_" from both ends, lowercase."""
    return re.sub(r'\W+', '_', text).strip('_').lower()


for fname in os.listdir(src_dir):
    old = os.path.join(src_dir, fname)

    # Only visible regular files are renamed; sub-directories and hidden
    # entries returned by os.listdir are left alone.
    if fname.startswith('.') or not os.path.isfile(old):
        continue

    m = suffix_pattern.match(fname)
    if m:
        # Normalize only the prefix so the "_train_<digits>.csv" tail is kept verbatim.
        new_fname = _slugify(m.group(1)) + m.group(2)
    else:
        # Fallback for any file not ending in _train_<digits>.csv: normalize the
        # stem only. Substituting over the whole name would turn the "." before
        # the extension into "_" and destroy the extension.
        stem, ext = os.path.splitext(fname)
        new_fname = _slugify(stem) + ext.lower()

    if new_fname == fname:
        continue

    new = os.path.join(src_dir, new_fname)
    # os.rename can silently replace an existing file, so refuse to clobber a
    # different file. samefile() still allows case-only renames on
    # case-insensitive file systems, where `new` resolves to `old` itself.
    if os.path.exists(new) and not os.path.samefile(old, new):
        print(f"Skipping: {fname} → {new_fname} (target already exists)")
        continue

    print(f"Renaming: {fname} → {new_fname}")
    os.rename(old, new)


Renaming: alanine,_aspartate_a_train_1510.csv → alanine__aspartate_a_train_1510.csv
Renaming: alanine,_aspartate_a_train_2535.csv → alanine__aspartate_a_train_2535.csv
Renaming: alanine,_aspartate_a_train_3273.csv → alanine__aspartate_a_train_3273.csv
Renaming: alanine,_aspartate_a_train_3711.csv → alanine__aspartate_a_train_3711.csv
Renaming: alanine,_aspartate_a_train_3781.csv → alanine__aspartate_a_train_3781.csv
Renaming: alanine,_aspartate_a_train_6130.csv → alanine__aspartate_a_train_6130.csv
Renaming: alanine,_aspartate_a_train_6672.csv → alanine__aspartate_a_train_6672.csv
Renaming: alanine,_aspartate_a_train_6727.csv → alanine__aspartate_a_train_6727.csv
Renaming: alanine,_aspartate_a_train_8527.csv → alanine__aspartate_a_train_8527.csv
Renaming: alanine,_aspartate_a_train_9941.csv → alanine__aspartate_a_train_9941.csv
Renaming: alpha-linolenic_acid_train_1854.csv → alpha_linolenic_acid_train_1854.csv
Renaming: alpha-linolenic_acid_train_2371.csv → alpha_linolenic_acid_train_2