# Mount Drive

In [None]:
# import packages
## for mount drive purpose
import os
from google.colab import drive

In [None]:
# mount drive
# Mount Google Drive into the Colab runtime (forcing a re-mount if it is
# already mounted), then switch the working directory to the project folder
# so the relative paths used later in this notebook ('input/...', 'emb/...')
# resolve against it.
drive.mount('/content/drive/', force_remount=True)
os.chdir('/content/drive/My Drive/Colab_Notebooks/TSPE/')

Mounted at /content/drive/


# PPI graph embedding

In [None]:
from utils import (get_graph_from_file, get_gene_idx_dict_from_file)

In [None]:
input_folder = 'input'

edge_list_file_path = f'{input_folder}/interactom_edges.txt' # stores the edges for the largest connected component in human Interactome
node_file_path = f'{input_folder}/interactom_nodes.txt'   # stores the nodes for the largest connected component in human Interactome

## Graph nodes with mapped ids.
Map each gene id to a consecutive index first, so that the graph nodes are 0, 1, ..., number_of_nodes - 1. Then use this re-indexed graph for the following tasks.

In [None]:
import networkx as nx
from utils import (get_graph_from_file, get_gene_idx_dict_from_file)
import numpy as np
import scipy.sparse as sp

In [None]:
def get_graph_from_file_and_map_ids(network_file, node_dict, **kwargs):
    """
        Build an undirected graph from a tab-separated edge-list file, replacing
        every gene by its mapped id from ``node_dict``.

        The input file is provided by Joerg Menche et al. in their paper's
        supplementary; this is a modified version of their parsing function.

        Args:
            network_file: path to the edge-list file. Lines starting with '#'
                and blank lines are ignored; the first two tab-separated
                fields of every other line are the two endpoints of an edge.
            node_dict: {gene: mapped_id}. A gene that is missing from this
                dict raises KeyError.
            **kwargs:
                self_link (bool, default True): keep self-loops when True;
                    pass ``self_link=False`` to remove them.

        Returns:
            G: nx.Graph whose nodes are the mapped ids.
    """

    defaultKwargs = {'self_link': True}
    kwargs = {**defaultKwargs, **kwargs}

    G = nx.Graph()
    # The context manager guarantees the file is closed even if a line fails
    # to parse or a gene is missing from node_dict.
    with open(network_file, 'r') as edge_file:
        for line in edge_file:
            # skip comment lines and blank lines
            if line.startswith('#') or not line.strip():
                continue
            line_data = line.strip().split('\t')
            gene1, gene2 = line_data[0], line_data[1]

            G.add_edge(node_dict[gene1], node_dict[gene2])

    # remove self links only when explicitly requested
    if not kwargs['self_link']:
        remove_self_links(G)
    return G
#------------------------------------------------------------------------------#
def remove_self_links(G):
    """Delete every self-loop edge (u, u) from G in place."""
    # Materialise the self-loops first, then drop them in one call.
    loops = list(nx.selfloop_edges(G))
    G.remove_edges_from(loops)

In [None]:
node_idx_dict = get_gene_idx_dict_from_file(node_file_path)

In [None]:
G_sub = get_graph_from_file_and_map_ids(edge_list_file_path, node_idx_dict)

## Node2Vec

In [None]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5 (from node2vec)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.3
    Uninstalling networkx-3.3:
      Successfully uninstalled networkx-3.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.3.0+cu121 requires nvidia-cublas-cu12==12.1.3.1; platform_system == "Linux" and platform_machine == "x86_64", which is not installed.
torch 2.3.0+cu121 requires nvidia-cuda-cupti-cu12==12.1.105; platform_system == "Linux" and platform_machine == "x86_64", which is not installed.
torch 2.3.0+cu121 requires nvidia-

In [None]:
from node2vec import Node2Vec

In [None]:
def node2vec_embed(G_sub, dim):
  """
  Embed the nodes of ``G_sub`` with node2vec.

  Args:
    G_sub: networkx graph whose nodes are sortable ids (in this notebook the
      mapped ids 0 .. number_of_nodes - 1).
    dim: embedding dimension.

  Returns:
    Z: array of shape (number_of_nodes, dim). Row i is the embedding of the
      i-th node in sorted node order, so with nodes 0..n-1 row i belongs to
      node i.
  """
  import numpy as np

  # walk settings taken from https://github.com/eliorc/node2vec
  node2vec = Node2Vec(G_sub, dimensions = dim, walk_length=30, num_walks=200, workers=4)
  # Embed nodes; fit() forwards these keyword arguments to gensim's Word2Vec
  model = node2vec.fit(window=2, min_count=1, batch_words=4)

  # model.wv.vectors is stored in gensim's vocabulary order (by frequency in
  # the walks), NOT in node-id order, so returning it directly would misalign
  # rows with node ids. node2vec stores nodes as strings, hence str(node).
  ordered_nodes = sorted(G_sub.nodes())
  Z = np.vstack([model.wv[str(node)] for node in ordered_nodes])
  return Z

In [None]:
Z = node2vec_embed(G_sub, 64)

Computing transition probabilities:   0%|          | 0/13329 [00:00<?, ?it/s]

### write to the file

In [None]:
import pickle

In [None]:
# write Z to the file
# Serialise the node2vec embedding matrix Z with pickle.
# NOTE(review): 'node2nev' in the file name looks like a typo for 'node2vec';
# left unchanged because other code may already read this exact path.
with open('emb/node2nev_emb_64', "wb") as fp:
    pickle.dump(Z, fp)

## LPE

In [None]:
import numpy as np
from scipy import sparse as sp
from utils import get_gene_idx_dict_from_file
import networkx as nx

In [None]:
# Both files are read from the input folder, using the same edge-list path as
# the node2vec section above.
edge_list_file_path = f'{input_folder}/interactom_edges.txt' # stores the edges for the largest connected component in human Interactome
node_file_path = f'{input_folder}/interactom_nodes.txt'

In [None]:
# 1. get graph nodes
node_idx_dict = get_gene_idx_dict_from_file(node_file_path)

In [None]:
G_sub = get_graph_from_file_and_map_ids(edge_list_file_path, node_idx_dict)
print(G_sub)    # Graph with 13329 nodes and 138356 edges

Graph with 13329 nodes and 138356 edges


In [None]:
degrees = [G_sub.degree[k] for k in range(G_sub.number_of_nodes())]

In [None]:
N = sp.diags(np.asarray(degrees) ** -0.5, dtype=float)

In [None]:
# Adjacency matrix with rows/columns in sorted node order (0, 1, ..., n-1 for
# the mapped ids) so it lines up with `degrees`, which is indexed by node id.
# Without `nodelist`, networkx orders rows by G.nodes() (insertion order).
A = nx.adjacency_matrix(G_sub, nodelist=sorted(G_sub.nodes()))

# Symmetric normalized Laplacian: L = I - D^{-1/2} A D^{-1/2}.
# `@` is a matrix product for both scipy sparse matrices and sparse arrays
# (on sparse arrays `*` would be element-wise).
L = sp.eye(G_sub.number_of_nodes()) - N @ A @ N

In [None]:
# Eigenvectors with numpy
# L is symmetric (diagonal scaling of a symmetric adjacency matrix), so use the
# symmetric solver: it returns real eigenvalues and orthonormal eigenvectors,
# and avoids the spurious complex parts that the general `eig` can produce.
EigVal, EigVec = np.linalg.eigh(L.toarray())
idx = EigVal.argsort() # increasing order
EigVal, EigVec = EigVal[idx], np.real(EigVec[:,idx])

In [None]:
pos_enc_dim = 64

In [None]:
LPE = EigVec[:,1:pos_enc_dim+1]

In [None]:
LPE_file = "emb/LPE.tsv"

In [None]:
# Import here so this cell works on a fresh kernel: matrix_to_file is not
# imported anywhere earlier in the notebook.
from utils import matrix_to_file

matrix_to_file(LPE, LPE_file)

## GPE

In [None]:
input_folder = 'input'
node_file_path = f'{input_folder}/interactom_nodes.txt'   # stores the nodes for the largest connected component in human Interactome
train_file_path = f'{input_folder}/train_set.tsv'
test_file_path = f'{input_folder}/test_set.tsv'

In [None]:
from utils import get_gene_idx_dict_from_file
import numpy as np

In [None]:
def get_disease_dict(train_file_path, test_file_path):
    """
    Build disease lookup tables from the train and test disease-pair files.

    Both files are tab-separated with one header line, which is skipped.
    Every data row has five columns: ``disease_a&disease_b``, the
    comma-separated genes of disease a, the comma-separated genes of
    disease b, and two further columns that are not used here.

    Args:
        train_file_path: path of the training-set file.
        test_file_path: path of the test-set file.

    Returns:
        dis_id_dis_dict: {disease_id: disease_name}
        dis_dis_id_dict: {disease_name: disease_id}; ids are assigned
            0, 1, 2, ... in order of first appearance.
        dis_cnt_dict: {disease_name: number of genes listed for the disease
            the first time it is seen}
        dis_gene_dict: {disease_name: [gene1, gene2, ...]}; if a disease
            appears in several rows, the last row wins.
    """
    dis_id_dis_dict = {}  # {disease_id: disease_name}
    dis_dis_id_dict = {}  # {disease_name: disease_id}
    dis_cnt_dict = {}     # {disease_name: gene count}
    dis_gene_dict = {}    # {disease_name: [gene1, gene2, ...]}

    next_id = 0
    for file_path in [train_file_path, test_file_path]:
        # The context manager closes the file even if a row is malformed.
        with open(file_path, "r") as f:
            next(f, None)  # skip the header line
            for line in f:
                if not line.strip():
                    continue  # tolerate blank lines

                row = line.strip().split("\t")
                dis_pair, disease_a_genes, disease_b_genes, all_genes, rr = row

                disease_a, disease_b = dis_pair.split("&")
                dis_gene_dict[disease_a] = disease_a_genes.split(",")
                dis_gene_dict[disease_b] = disease_b_genes.split(",")

                for dis in [disease_a, disease_b]:
                    if dis not in dis_dis_id_dict:
                        # Diseases repeat across pairs and across the two
                        # files; register id and gene count only the first
                        # time a disease is seen.
                        dis_dis_id_dict[dis] = next_id
                        dis_id_dis_dict[next_id] = dis
                        dis_cnt_dict[dis] = len(dis_gene_dict[dis])
                        next_id += 1

    return dis_id_dis_dict, dis_dis_id_dict, dis_cnt_dict, dis_gene_dict

def get_W(dis_cnt_dict, dis_gene_dict, dis_dis_id_dict, node_idx_dict):
  """
    Build the gene-by-disease weight matrix W.

    Args:
      dis_cnt_dict: {disease_name: gene count}; its size gives the number of columns.
      dis_gene_dict: {disease_name: [gene1, gene2, ...]}
      dis_dis_id_dict: {disease_name: disease_id}, used as the column index.
      node_idx_dict: {gene: mapped_gene_id_from_0}, used as the row index;
        its size gives the number of rows.

    Returns:
      W: float array of shape (n_genes, n_diseases) with
        W[gene_id, disease_id] = 1 / gene count of the disease for every
        listed gene that is present in node_idx_dict, and 0 elsewhere.
  """
  n_rows = len(node_idx_dict)
  n_cols = len(dis_cnt_dict)
  W = np.zeros((n_rows, n_cols))

  for disease_name in dis_gene_dict:
    column = dis_dis_id_dict[disease_name]
    gene_total = dis_cnt_dict[disease_name]
    # genes that are not nodes of the graph are ignored
    rows = [node_idx_dict[g] for g in dis_gene_dict[disease_name] if g in node_idx_dict]
    if rows:
      W[rows, column] = 1 / gene_total

  return W

In [None]:
node_idx_dict = get_gene_idx_dict_from_file(node_file_path)

In [None]:
dis_id_dis_dict, dis_dis_id_dict, dis_cnt_dict, dis_gene_dict = get_disease_dict(train_file_path, test_file_path)
W = get_W(dis_cnt_dict, dis_gene_dict, dis_dis_id_dict, node_idx_dict)

In [None]:
print(f'how many diseases: {len(dis_cnt_dict)}')

how many diseases: 153


In [None]:
print(W.shape)

(13329, 153)


### GEE embed: Z = AW

In [None]:
# Order rows/columns by sorted node id (0, 1, ..., n-1 for the mapped ids) so
# that A lines up with W, whose rows are indexed by mapped gene id. Without
# `nodelist`, networkx orders rows by G.nodes() (edge insertion order).
A = nx.adjacency_matrix(G_sub, nodelist=sorted(G_sub.nodes()))

In [None]:
Z = A.dot(W)

In [None]:
print(Z.shape)

(13329, 153)


### GPE

In [None]:
from scipy.linalg import svd
# take 64 dims of U
# Reduced SVD: only the first min(Z.shape) left singular vectors are computed.
# The default full_matrices=True would build an (n_nodes x n_nodes) U, of which
# only the first len(s) columns are kept by the indexing below anyway.
U, s, VT = svd(Z, full_matrices=False)
idx = s.argsort()[::-1] # decreasing order
s, U = s[idx], U[:,idx]

In [None]:
pos_enc_dim = 64
GPE = U[:,1:pos_enc_dim+1]
print(GPE.shape)

(13329, 64)


#### write to file

In [None]:
from utils import matrix_to_file, array_to_file

In [None]:
GPE_file = 'emb/GPE.tsv' # The file that only contains the row has dis info

In [None]:
matrix_to_file(GPE, GPE_file)