In [2]:
import pandas as pd
import os
import re

In [3]:
mapped_ids_reviewed = pd.read_csv('processed-data/mapped_ids_reviewed.csv', sep='\t')
mapped_ids_reviewed.head()

Unnamed: 0,gene_name,entry_name
0,PLA2G10,PA2GX_HUMAN
1,FDFT1,FDFT_HUMAN
2,UGCG,CEGT_HUMAN
3,CYP1A2,CP1A2_HUMAN
4,RARS2,SYRM_HUMAN


In [4]:
pathway_commons = pd.read_csv('PathwayCommons12.kegg.hgnc.sif', sep='\t', header=None)
pathway_commons.head()

Unnamed: 0,0,1,2
0,A4GALT,catalysis-precedes,ABO
1,A4GALT,catalysis-precedes,AK3
2,A4GALT,catalysis-precedes,ALG13
3,A4GALT,catalysis-precedes,ALG14
4,A4GALT,catalysis-precedes,ALG5


In [5]:
mapping_dict = dict(zip(mapped_ids_reviewed["gene_name"], mapped_ids_reviewed["entry_name"]))

pathway_commons[0] = pathway_commons[0].replace(mapping_dict)
pathway_commons[2] = pathway_commons[2].replace(mapping_dict)

In [6]:
all_relations = list(set(pathway_commons[1]))
all_relations

['used-to-produce',
 'interacts-with',
 'reacts-with',
 'consumption-controlled-by',
 'controls-production-of',
 'catalysis-precedes']

In [7]:
undirected = ['in-complex-with', 'interacts-with', 'neighbor-of', 'reacts-with']
directed = [item for item in all_relations if item not in undirected]
directed

['used-to-produce',
 'consumption-controlled-by',
 'controls-production-of',
 'catalysis-precedes']

In [8]:
new_rows = pd.DataFrame({
    0: pathway_commons[0],  # First column
    1: pathway_commons[2],  # Second column
    2: [0.75] * len(pathway_commons),  # Placeholder for column 3
    3: pathway_commons[1].apply(lambda x: 'D' if x in directed else 'U')
})

In [9]:
# Function to replace CHEBI entries efficiently
def replace_chebi(value):
    if isinstance(value, str) and value.startswith("CHEBI:"):
        match = re.match(r"CHEBI:\s*(\d+)", value)
        if match:
            return f"chebi:{match.group(1)}"
    return value

# Apply function to the entire DataFrame
new_rows = new_rows.applymap(replace_chebi)

  new_rows = new_rows.applymap(replace_chebi)


In [10]:
new_rows.head()

Unnamed: 0,0,1,2,3
0,A4GAT_HUMAN,BGAT_HUMAN,0.75,D
1,A4GAT_HUMAN,KAD3_HUMAN,0.75,D
2,A4GAT_HUMAN,ALG13_HUMAN,0.75,D
3,A4GAT_HUMAN,ALG14_HUMAN,0.75,D
4,A4GAT_HUMAN,ALG5_HUMAN,0.75,D


In [11]:
new_rows.to_csv('processed-bulk-pc-pathway.txt', sep='\t')