In [1]:
import pandas as pd
import random
import re

### Load the union ppi and ID map

In [2]:
union_ppi = pd.read_csv('processed-data/union_ppi.txt', sep='\t', header=None)
union_ppi.head()

Unnamed: 0,0,1,2,3
0,TACC1_HUMAN,RUXG_HUMAN,0.736771,U
1,TACC1_HUMAN,KAT2A_HUMAN,0.292198,U
2,TACC1_HUMAN,CKAP5_HUMAN,0.724783,U
3,TACC1_HUMAN,YETS4_HUMAN,0.542597,U
4,TACC1_HUMAN,LSM7_HUMAN,0.714823,U


In [3]:
id_map = pd.read_csv('processed-data/mapped_ids_reviewed.csv', sep='\t')
id_map.head()

Unnamed: 0,gene_name,entry_name
0,PLA2G10,PA2GX_HUMAN
1,FDFT1,FDFT_HUMAN
2,UGCG,CEGT_HUMAN
3,CYP1A2,CP1A2_HUMAN
4,RARS2,SYRM_HUMAN


### function to create the training sample data

In [4]:
def create_training_sample(individual_pathway, all_nodes, union_ppi, ind_pathway_name):
    sample_size = max(1, len(all_nodes) // 4)  # TODO: change this to multiplication
    # Randomly sample nodes
    random_sample = random.sample(all_nodes, sample_size)
    # Filter the edges that contain the sampled nodes
    mask = individual_pathway[0].isin(random_sample) & individual_pathway[2].isin(random_sample)
    matching_rows = individual_pathway[mask]
    # Generate the training sample
    training_sample = union_ppi.copy()
    training_sample["label"] = training_sample[[0, 1]].apply(tuple, axis=1).isin(matching_rows.apply(tuple, axis=1)).astype(int)
    # print(training_sample[training_sample['label']==1])
    if 1 in training_sample['label'].tolist():
        print("True labels in PPI - found")
    else:
        print("No True Label found in PPI")
    filename = ind_pathway_name + str(random.randint(1000, 9999)) + '.csv'
    training_sample.to_csv(f'training-samples/{filename}', index=False, sep='\t')

### Individual Pathway: Glycolysis___Glucone.txt

**load the pathway + map IDs using ID map + preprocessing**

In [5]:
glycolysis_glucone = pd.read_csv('pathwaycommons-individual/Glycolysis___Glucone.txt', sep='\t',header=None)
print(set(glycolysis_glucone[1].tolist()))
glycolysis_glucone.head()

{'controls-production-of', 'consumption-controlled-by', 'reacts-with', 'catalysis-precedes', 'used-to-produce'}


Unnamed: 0,0,1,2
0,ACSS2,catalysis-precedes,ADPGK
1,ACSS2,catalysis-precedes,DLAT
2,ACSS2,controls-production-of,chebi:15351
3,ACSS2,controls-production-of,chebi:16027
4,ADH1A,catalysis-precedes,ALDH3A1


In [6]:
glycolysis_glucone.drop(columns=[1],inplace=True)
print(len(glycolysis_glucone))
glycolysis_glucone.head()

454


Unnamed: 0,0,2
0,ACSS2,ADPGK
1,ACSS2,DLAT
2,ACSS2,chebi:15351
3,ACSS2,chebi:16027
4,ADH1A,ALDH3A1


In [7]:
mapping_dict = dict(zip(id_map["gene_name"], id_map["entry_name"]))

glycolysis_glucone[0] = glycolysis_glucone[0].replace(mapping_dict)
glycolysis_glucone[2] = glycolysis_glucone[2].replace(mapping_dict)
glycolysis_glucone.head()

Unnamed: 0,0,2
0,ACSA_HUMAN,ADPGK_HUMAN
1,ACSA_HUMAN,ODP2_HUMAN
2,ACSA_HUMAN,chebi:15351
3,ACSA_HUMAN,chebi:16027
4,ADH1A_HUMAN,AL3A1_HUMAN


**get all nodes + apply the training sample generating function - 10 times**

In [8]:
all_gg_nodes = list(set(glycolysis_glucone[0])) + list(set(glycolysis_glucone[2]))
len(all_gg_nodes)

141

In [9]:
gg_path_name = 'glycolysis_glucone_train_'

In [10]:
for _ in range(10):
    create_training_sample(glycolysis_glucone, all_gg_nodes, union_ppi, gg_path_name)

True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found


### Individual Pathway: Butirosin_and_neomyc.txt

**load + preprocess individual pathway**

In [11]:
butirosin_neomyc = pd.read_csv('pathwaycommons-individual/Butirosin_and_neomyc.txt', sep='\t',header=None)
print(set(butirosin_neomyc[1].tolist()))
butirosin_neomyc.head()

{'controls-production-of', 'consumption-controlled-by', 'used-to-produce'}


Unnamed: 0,0,1,2
0,CHEBI:4167,consumption-controlled-by,GCK
1,CHEBI:4167,consumption-controlled-by,HK1
2,CHEBI:4167,consumption-controlled-by,HK2
3,CHEBI:4167,consumption-controlled-by,HK3
4,CHEBI:4167,consumption-controlled-by,HKDC1


In [12]:
butirosin_neomyc.drop(columns=[1],inplace=True)
print(len(butirosin_neomyc))
butirosin_neomyc.head()

11


Unnamed: 0,0,2
0,CHEBI:4167,GCK
1,CHEBI:4167,HK1
2,CHEBI:4167,HK2
3,CHEBI:4167,HK3
4,CHEBI:4167,HKDC1


In [13]:
def replace_chebi(value):
    if isinstance(value, str) and value.startswith("CHEBI:"):
        match = re.match(r"CHEBI:\s*(\d+)", value)
        if match:
            return f"chebi:{match.group(1)}"
    return value

butirosin_neomyc = butirosin_neomyc.applymap(replace_chebi)
butirosin_neomyc.head()

Unnamed: 0,0,2
0,chebi:4167,GCK
1,chebi:4167,HK1
2,chebi:4167,HK2
3,chebi:4167,HK3
4,chebi:4167,HKDC1


In [14]:
mapping_dict = dict(zip(id_map["gene_name"], id_map["entry_name"]))

butirosin_neomyc[0] = butirosin_neomyc[0].replace(mapping_dict)
butirosin_neomyc[2] = butirosin_neomyc[2].replace(mapping_dict)
butirosin_neomyc.head()

Unnamed: 0,0,2
0,chebi:4167,HXK4_HUMAN
1,chebi:4167,HXK1_HUMAN
2,chebi:4167,HXK2_HUMAN
3,chebi:4167,HXK3_HUMAN
4,chebi:4167,HKDC1_HUMAN


In [15]:
all_bn_nodes = list(set(butirosin_neomyc[0])) + list(set(butirosin_neomyc[2]))
len(all_bn_nodes)

12

In [16]:
bn_path_name = 'butirosin_neomyc_train_'

In [17]:
for _ in range(10):
    create_training_sample(butirosin_neomyc, all_bn_nodes, union_ppi, bn_path_name)

True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
True labels in PPI - found
No True Label found in PPI
True labels in PPI - found
True labels in PPI - found
No True Label found in PPI
No True Label found in PPI


In [None]:
#TODO: training loop for all the individual pathways in training-samples folder