In [1]:
import pandas as pd
import re

### phosphosite-irefindex13.0-uniprot.txt (SPRAS PPI)

In [2]:
# Load the SPRAS PPI edge list. The file is tab-separated with no header row,
# so pandas labels the columns by position (0-3).
initial_ppi = pd.read_csv(
    'phosphosite-irefindex13.0-uniprot.txt',
    header=None,
    sep='\t',
)
initial_ppi.head()

Unnamed: 0,0,1,2,3
0,TACC1_HUMAN,RUXG_HUMAN,0.736771,U
1,TACC1_HUMAN,KAT2A_HUMAN,0.292198,U
2,TACC1_HUMAN,CKAP5_HUMAN,0.724783,U
3,TACC1_HUMAN,YETS4_HUMAN,0.542597,U
4,TACC1_HUMAN,LSM7_HUMAN,0.714823,U


In [3]:
# Summary statistics for the directed edges only (column 3 == 'D'); these
# inform the weight assigned to the KEGG edges.
directed_initial_ppi = initial_ppi.loc[initial_ppi[3].eq('D')]
directed_initial_ppi.describe()

Unnamed: 0,2
count,3917.0
mean,0.69713
std,0.114483
min,0.553333
25%,0.553333
50%,0.666667
75%,0.773333
max,0.996667


### load mapped IDs

In [4]:
# Lookup table pairing gene names with entry names. Note that the file is
# tab-separated even though it carries a .csv suffix.
mapped_ids_reviewed = pd.read_csv(
    'processed-data/mapped_ids_reviewed.csv',
    sep='\t',
)
mapped_ids_reviewed.head()

Unnamed: 0,gene_name,entry_name
0,PLA2G10,PA2GX_HUMAN
1,FDFT1,FDFT_HUMAN
2,UGCG,CEGT_HUMAN
3,CYP1A2,CP1A2_HUMAN
4,RARS2,SYRM_HUMAN


### Add PathwayCommons KEGG edges to SPRAS PPI - make Union PPI

In [5]:
# Load the PathwayCommons KEGG SIF file: tab-separated, no header row, so the
# columns are labelled 0 (first participant), 1 (relation type), 2 (second participant).
pathway_commons = pd.read_csv(
    'PathwayCommons12.kegg.hgnc.sif',
    header=None,
    sep='\t',
)
pathway_commons.head()

Unnamed: 0,0,1,2
0,A4GALT,catalysis-precedes,ABO
1,A4GALT,catalysis-precedes,AK3
2,A4GALT,catalysis-precedes,ALG13
3,A4GALT,catalysis-precedes,ALG14
4,A4GALT,catalysis-precedes,ALG5


**Map gene names to UniProt entry names to match the SPRAS PPI**

In [6]:
# Build a gene-name -> entry-name lookup from the reviewed ID table.
# If a gene name occurs more than once, the last row wins (dict semantics).
mapping_dict = dict(zip(mapped_ids_reviewed["gene_name"], mapped_ids_reviewed["entry_name"]))

# Translate both participant columns (0 and 2) of the SIF table.
# A per-value dict lookup is a single hashed pass over each column, whereas
# Series.replace with a large dict is a known slow path. Names that are missing
# from the lookup are kept unchanged, exactly as .replace would leave them.
for endpoint_col in (0, 2):
    pathway_commons[endpoint_col] = pathway_commons[endpoint_col].map(
        lambda name: mapping_dict.get(name, name)
    )

In [7]:
# Preview the first five rows after the ID translation.
pathway_commons.head(5)

Unnamed: 0,0,1,2
0,A4GAT_HUMAN,catalysis-precedes,BGAT_HUMAN
1,A4GAT_HUMAN,catalysis-precedes,KAD3_HUMAN
2,A4GAT_HUMAN,catalysis-precedes,ALG13_HUMAN
3,A4GAT_HUMAN,catalysis-precedes,ALG14_HUMAN
4,A4GAT_HUMAN,catalysis-precedes,ALG5_HUMAN


**directionality**

In [8]:
# Distinct relation types, in order of first appearance in the file.
# Series.unique() is deterministic, unlike list(set(...)), whose ordering of
# strings changes between kernel restarts because of hash randomisation.
all_relations = pathway_commons[1].unique().tolist()
all_relations

['catalysis-precedes',
 'interacts-with',
 'consumption-controlled-by',
 'controls-production-of',
 'reacts-with',
 'used-to-produce']

In [9]:
# Relation types treated as undirected; every other relation type present in
# the data is treated as directed.
undirected = [
    'in-complex-with',
    'interacts-with',
    'neighbor-of',
    'reacts-with',
]
directed = [relation for relation in all_relations if relation not in undirected]
directed

['catalysis-precedes',
 'consumption-controlled-by',
 'controls-production-of',
 'used-to-produce']

**Make new rows based on the PathwayCommons PPI**

In [10]:
# Re-shape the PathwayCommons edges into the same four-column layout as the
# SPRAS PPI: participant, participant, weight, direction flag.
new_rows = pd.DataFrame({
    0: pathway_commons[0],
    1: pathway_commons[2],
    # Every added edge gets the same fixed weight (scalar is broadcast to all rows).
    # NOTE(review): presumably chosen with reference to the directed-edge weight
    # summary of the SPRAS PPI - confirm.
    2: 0.75,
    # 'D' when the relation type is in `directed`, otherwise 'U'.
    3: pathway_commons[1].isin(directed).map({True: 'D', False: 'U'}),
})

**concat + clean the union ppi**

In [11]:
def replace_chebi(value):
    """Normalise a ChEBI identifier to the form ``chebi:<digits>``.

    Strings that begin with ``CHEBI:`` followed by optional whitespace and at
    least one digit are rewritten as ``chebi:`` plus that leading run of
    digits; anything after the digits is dropped. Every other value, including
    non-strings, is returned unchanged.
    """
    if not isinstance(value, str):
        return value
    # re.match anchors at the start of the string, so this also rejects
    # strings that do not begin with the "CHEBI:" prefix.
    match = re.match(r"CHEBI:\s*(\d+)", value)
    if match is None:
        return value
    return f"chebi:{match.group(1)}"

# Apply replace_chebi to every cell of the frame, one column at a time.
# DataFrame.applymap is deprecated (it emits a FutureWarning on recent pandas);
# Series.map makes the same element-wise call and is available in all versions.
new_rows = new_rows.apply(lambda column: column.map(replace_chebi))

  new_rows = new_rows.applymap(replace_chebi)


In [12]:
# Stack the PathwayCommons-derived rows underneath the original PPI rows and
# renumber the index from 0.
updated_ppi = pd.concat(
    [initial_ppi, new_rows],
    ignore_index=True,
)
# Preview the rows that were appended.
new_rows.head()

Unnamed: 0,0,1,2,3
0,A4GAT_HUMAN,BGAT_HUMAN,0.75,D
1,A4GAT_HUMAN,KAD3_HUMAN,0.75,D
2,A4GAT_HUMAN,ALG13_HUMAN,0.75,D
3,A4GAT_HUMAN,ALG14_HUMAN,0.75,D
4,A4GAT_HUMAN,ALG5_HUMAN,0.75,D


In [13]:
# Preview the first five rows of the combined (union) PPI.
updated_ppi.head(5)

Unnamed: 0,0,1,2,3
0,TACC1_HUMAN,RUXG_HUMAN,0.736771,U
1,TACC1_HUMAN,KAT2A_HUMAN,0.292198,U
2,TACC1_HUMAN,CKAP5_HUMAN,0.724783,U
3,TACC1_HUMAN,YETS4_HUMAN,0.542597,U
4,TACC1_HUMAN,LSM7_HUMAN,0.714823,U


In [14]:
# Write the union PPI as a tab-separated file without header row or index
# column, i.e. the same layout the input PPI file was read with.
updated_ppi.to_csv(
    'processed-data/union_ppi.txt',
    sep='\t',
    header=False,  # documented way to omit the header; header=None only worked by being falsy
    index=False,
)