In [1]:
import pandas as pd

**Load SPRAS ensemble pathway output**

- currently contains:
    - false positives (in the ensemble, but not in the individual Pathway Commons graph)
    - true positives (in the ensemble and also in the individual Pathway Commons graph)

In [None]:
# Read the tab-separated SPRAS ensemble edge sample, discard its 'Direction'
# column, and give every edge an initial label of y_true = 0.
ensemble_sample = (
    pd.read_csv('processed-data/ensemble-pathway-sample.txt', sep='\t')
    .drop(columns=['Direction'])
    .assign(y_true=0)
)
ensemble_sample.head()

Unnamed: 0,Node1,Node2,Frequency,y_true
0,DCE2_HUMAN,GLSK_HUMAN,0.04,0
1,SSDH_HUMAN,chebi:16265,0.04,0
2,chebi:16265,SSDH_HUMAN,0.04,0
3,TACC1_HUMAN,RUXG_HUMAN,0.04,0
4,TACC1_HUMAN,KAT2A_HUMAN,0.04,0


**Add edges not in ensemble, but in pathway commons graph**

- i.e. add the false negatives (each is appended with `Frequency` 0 and `y_true` 1)

In [20]:
# Build the Pathway Commons edge list for this pathway with endpoint
# identifiers rewritten through the reviewed ID mapping.

# Reviewed ID mapping table (tab-separated despite the .csv extension).
mapped_ids_reviewed = pd.read_csv('processed-data/mapped_ids_reviewed.csv', sep='\t')

# gene_name -> entry_name lookup. If a gene_name occurs more than once, the
# last row wins (dict construction semantics).
mapping_dict = dict(zip(mapped_ids_reviewed["gene_name"], mapped_ids_reviewed["entry_name"]))

# The edge file has no header row, so pandas labels its columns 0, 1, 2, ...
# Columns 0 and 2 become the edge endpoints Node1/Node2; column 1 is dropped.
# Endpoint values found in mapping_dict are replaced; all others are kept as-is.
pathwaycommons_ind = (
    pd.read_csv('pathway-commons-individual/Alanine,_aspartate_a.txt', sep='\t', header=None)
    .drop(columns=[1])
    .rename(columns={0: 'Node1', 2: 'Node2'})
    .assign(
        Node1=lambda edges: edges['Node1'].replace(mapping_dict),
        Node2=lambda edges: edges['Node2'].replace(mapping_dict),
    )
)

# NOTE(review): the displayed frame mixes 'CHEBI:...' and 'chebi:...' prefixes.
# Pairs are later matched by exact string equality, so confirm whether the
# casing should be normalised before comparing against the ensemble edges.
pathwaycommons_ind

Unnamed: 0,Node1,Node2
0,AGT1_HUMAN,chebi:15428
1,AGT1_HUMAN,chebi:16891
2,AGT1_HUMAN,chebi:16977
3,AGT1_HUMAN,chebi:32816
4,AL4A1_HUMAN,CHEBI:371
...,...,...
258,chebi:30915,DHE4_HUMAN
259,chebi:30915,chebi:16015
260,chebi:32816,AGT1_HUMAN
261,chebi:32816,chebi:16891


In [None]:
# Label the ensemble edges against the Pathway Commons edge list, then append
# the edges that only Pathway Commons contains:
#   - ensemble edge also in Pathway Commons -> y_true set to 1
#   - ensemble edge not in Pathway Commons  -> y_true left unchanged
#   - Pathway Commons edge not in ensemble  -> appended with Frequency 0, y_true 1
# Edges are compared as ordered (Node1, Node2) pairs, so A->B and B->A differ.
truth_pairs_in_file_order = list(zip(pathwaycommons_ind['Node1'], pathwaycommons_ind['Node2']))
ytrue_pairs = set(truth_pairs_in_file_order)

# One pass over the ensemble edges instead of a row-wise DataFrame.apply.
ensemble_pairs = list(zip(ensemble_sample['Node1'], ensemble_sample['Node2']))
in_pathway_commons = pd.Series(
    [pair in ytrue_pairs for pair in ensemble_pairs],
    index=ensemble_sample.index,
    dtype=bool,
)
ensemble_sample.loc[in_pathway_commons, 'y_true'] = 1

# --- find pairs that are *only* in Pathway Commons and append them ---------
missing_pairs = ytrue_pairs.difference(ensemble_pairs)
if missing_pairs:                      # only build a frame if something is missing
    # Iterating a set of strings is not order-stable across kernel restarts
    # (string hashing is randomised per process), so emit the missing edges in
    # Pathway Commons file order. dict.fromkeys drops duplicate pairs while
    # keeping first-seen order.
    missing_in_file_order = [
        pair for pair in dict.fromkeys(truth_pairs_in_file_order) if pair in missing_pairs
    ]
    new_rows = pd.DataFrame(missing_in_file_order, columns=['Node1', 'Node2'])
    new_rows['Frequency'] = 0          # edge never appears in the ensemble
    new_rows['y_true'] = 1
    ensemble_sample = pd.concat([ensemble_sample, new_rows], ignore_index=True)

In [24]:
# Show the labelled edge table: the original ensemble rows followed by any
# appended Pathway Commons-only rows (pandas elides the middle of long frames).
ensemble_sample

Unnamed: 0,Node1,Node2,Frequency,y_true
0,DCE2_HUMAN,GLSK_HUMAN,0.04,1
1,SSDH_HUMAN,chebi:16265,0.04,1
2,chebi:16265,SSDH_HUMAN,0.04,1
3,TACC1_HUMAN,RUXG_HUMAN,0.04,0
4,TACC1_HUMAN,KAT2A_HUMAN,0.04,0
...,...,...,...,...
259,DCE2_HUMAN,GNPI2_HUMAN,0.00,1
260,chebi:15428,chebi:32816,0.00,1
261,GLSK_HUMAN,chebi:16015,0.00,1
262,chebi:16467,chebi:15682,0.00,1
