In [39]:
import csv
import pandas as pd
import numpy as np
import torch

## Read graph data and save as undirected edge list

In [40]:
raw_data = pd.read_csv('PP-Pathways_ppi.csv')

In [41]:
raw_data.shape     # Edge list format

(342352, 2)

Example entry

In [42]:
raw_data.loc[raw_data[0] == 7349]

KeyError: 0

Append the other direction for the edgelist to be undirected (per convention of py geometric)

In [None]:
rest = raw_data.copy()
rest = rest.rename(columns={0: 1, 1: 0})
full_data =raw_data.append(rest)

In [None]:
full_data.loc[full_data[0] == 7349]  #Check that both directions are added, compared from previous entry checking for 7349 in first position

Unnamed: 0,0,1
154630,7349,58959
159230,7349,1394
247936,7349,64680
260335,7349,3952
266208,7349,1395
290792,7349,1393


Check that the number of nodes correspond with dataset website 

In [None]:
N = full_data[0]
N = N.unique()
N.shape

Saving the edge list

In [None]:
np.savetxt(r'ppi_edge_list', full_data.values, fmt='%d')

## Read deepwalk output node embeddings

In [None]:
with open('ppi.embeddings','r') as f:
    embeddings = f.read().splitlines()

In [None]:
embeddings_input = pd.read_table('ppi.embeddings',sep=" ",skiprows=[0],header=None)

In [None]:
embeddings_input

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,7316,0.189910,-0.064698,-0.599348,0.596876,-0.838050,-0.193471,0.492390,0.251973,-0.932927,...,-0.137484,0.302540,0.129742,-0.229662,0.427583,-0.077384,-0.213938,-0.305025,0.363385,0.399331
1,9515,0.691739,-0.180698,0.202206,0.538861,-0.366840,-0.147514,0.105208,1.165225,-0.449967,...,-0.804815,-0.047687,0.438935,0.829328,0.760427,0.299100,1.313166,-0.487873,-0.168249,0.337687
2,1994,0.912802,-0.272483,0.724075,-0.242186,-0.178778,-0.460282,0.554090,0.067562,-0.295401,...,0.176077,0.284037,0.053238,0.682973,0.038556,-0.093938,1.068039,-0.122543,0.573283,0.425981
3,6622,-0.021047,0.794369,0.241550,0.203650,-0.227179,-0.715000,0.652274,-0.701848,-0.970441,...,-0.437235,0.226226,0.791789,0.495407,0.490702,0.602886,1.386207,-0.369359,0.453405,-0.212723
4,801,0.496900,-0.728646,0.106540,0.380249,0.203935,-0.303260,0.056357,0.233541,-0.701761,...,-0.245434,0.066865,0.077869,0.407522,0.290492,-0.236038,0.425556,0.005502,0.580945,0.939810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21552,9043,0.014123,0.004329,-0.015429,0.009397,-0.014265,0.010961,-0.013164,-0.005495,-0.011759,...,-0.008984,-0.008643,-0.013500,-0.004089,0.006694,0.006382,0.004451,-0.003720,-0.000940,0.004938
21553,114836,-0.012383,0.007339,0.013936,0.008257,0.012825,0.013396,-0.011796,0.005186,-0.000722,...,0.002209,0.007570,-0.002435,0.011183,-0.005994,-0.009608,0.008898,-0.011565,-0.011611,0.014662
21554,25769,-0.009213,-0.005118,0.010256,0.006941,-0.004975,0.001724,0.004428,0.000953,0.008670,...,0.000323,0.008156,0.011999,0.011780,-0.012374,0.002535,-0.012322,0.010036,0.008177,0.009573
21555,57586,-0.005197,-0.005457,-0.006769,0.000308,0.007852,-0.008098,-0.000677,-0.001047,0.014665,...,-0.001947,-0.014540,-0.002869,-0.006419,0.012517,-0.013187,-0.002361,0.007360,0.004088,0.007637


In [None]:
embeddings_input = embeddings_input.sort_values(by=[0])  #sort by node number

In [None]:
embeddings = embeddings_input.to_numpy()

In [None]:
embeddings_features=embeddings[:,1:]    # Splice to have features only (first column is node names)
embeddings_features.shape

(21557, 64)

In [None]:
embeddings_features  #feature for the nodes arranged in the order of increasing node number

array([[-0.36996102,  0.09315004,  1.402932  , ...,  0.38436455,
        -0.653469  , -0.18093751],
       [-0.49422905,  0.61262727, -0.7776907 , ..., -1.1173348 ,
         0.63795   ,  0.95317674],
       [ 0.85459   , -0.4973924 ,  0.59411734, ...,  0.7508602 ,
         0.2990249 ,  1.6158814 ],
       ...,
       [ 0.5960941 ,  0.54542536,  0.05377278, ..., -0.5385075 ,
        -1.775665  , -0.5381145 ],
       [ 0.81639177, -0.17730135,  0.69772583, ..., -0.70117736,
         1.1467463 ,  1.0267804 ],
       [ 1.2118686 , -0.47343823, -0.0856882 , ..., -0.33950025,
         1.095661  , -0.51299816]])

## Map edge list to have continuous node numbering

In [None]:
import torch
from torch_geometric.data import Data
import pandas as pd
import numpy as np

In [None]:
edge_index = pd.read_table('ppi_edge_list',sep=" ").to_numpy()

In [None]:
edge_index.shape

(684705, 2)

In [None]:
all_nodes = np.unique(edge_index.T[0])   # We just need to find all unique occurences in one column only since the matrix contains undirected entries (each edge appears twice in reversed direction)
all_nodes.shape

(21557,)

In [None]:
dict_map = {k: v for v, k in enumerate(all_nodes)}

In [None]:
for i in range(edge_index.shape[0]):
    for j in range(edge_index.shape[1]):
        edge_index[i,j] = dict_map[edge_index[i,j]]

In [None]:
t = torch.from_numpy(edge_index)

## Create PyGeometric Dataset from embedding and edge list

In [None]:
data = Data(x = embeddings_features,edge_index=t.t().contiguous())

## Testing deepwalk embedding similarities 

deepwalk --input example_list --number-walks 10 --representation-size 128 --walk-length 6 --window-size 2 --output example.embeddings

deepwalk --input example_list --number-walks 50 --representation-size 128 --walk-length 6 --window-size 2 --output example_1.embeddings

In [44]:
import numpy as np

In [45]:
embeddings = pd.read_table('example.embeddings',sep=" ",skiprows=[0],header=None)
embeddings=embeddings.set_index(0)
embeddings = embeddings.sort_index()

In [46]:
embeddings_1 = pd.read_table('example_1.embeddings',sep=" ",skiprows=[0],header=None)
embeddings_1=embeddings_1.set_index(0)
embeddings_1 = embeddings_1.sort_index()

In [47]:
def Euclidean_dist(a, b):
    return round(np.linalg.norm(a-b),3)

In [48]:
def fill_distance(embeddings_table):
    length = len(embeddings_table)
    distance_matrix = np.zeros([length, length])
    for i in range(length):
        for j in range(i,length):
            row_i = embeddings_table[i:i+1].values
            row_j = embeddings_table[j:j+1].values
            distance_matrix[i,j] = Euclidean_dist(row_i,row_j)
    return distance_matrix

In [49]:
fill_distance(embeddings)

array([[0.   , 0.07 , 0.072, 0.075, 0.078, 0.076, 0.071, 0.077, 0.077],
       [0.   , 0.   , 0.068, 0.068, 0.071, 0.07 , 0.075, 0.067, 0.072],
       [0.   , 0.   , 0.   , 0.074, 0.071, 0.073, 0.076, 0.073, 0.073],
       [0.   , 0.   , 0.   , 0.   , 0.069, 0.073, 0.072, 0.075, 0.077],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.068, 0.073, 0.073, 0.069],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.075, 0.07 , 0.073],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.074, 0.078],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.079],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ]])

In [50]:
fill_distance(embeddings_1)

array([[0.   , 0.143, 0.072, 0.144, 0.175, 0.149, 0.165, 0.149, 0.157],
       [0.   , 0.   , 0.132, 0.071, 0.12 , 0.068, 0.11 , 0.078, 0.082],
       [0.   , 0.   , 0.   , 0.134, 0.164, 0.137, 0.158, 0.137, 0.145],
       [0.   , 0.   , 0.   , 0.   , 0.116, 0.08 , 0.11 , 0.079, 0.092],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.124, 0.074, 0.108, 0.108],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.113, 0.076, 0.087],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.102, 0.102],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.077],
       [0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   , 0.   ]])

most similar  
0: 2  
1: 3,5  
2: 3, 6, 7  
3: 5,7  
4: 6  
5: 7, 8  
6: same  
7: 8  
