This notebook demonstrates how to use the ETNA algorithm to generate aligned embeddings for two species (<em>S. cerevisiae</em> and <em>S. pombe</em>).

In [1]:
import numpy as np
import networkx as nx
import random
import torch
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn import metrics
import importlib

import load_data as ld
import func
import evaluation
import algorithms.ETNA as etna
import algorithms.helper as helper

In [2]:
# species codes used throughout the notebook:
# 'sce' for S. cerevisiae and 'spo' for S. pombe
org0, org1 = 'sce', 'spo'

# read the PPI network of each species, in order (org0 first, then org1)
# NOTE(review): k_core=None / lcc=False presumably turn off k-core pruning and
# largest-connected-component filtering -- confirm in load_data.load_ppi
g0, g1 = [ld.load_ppi(org, k_core=None, lcc=False) for org in (org0, org1)]

load the ppi network of sce
read as directed: 7073 nodes,          116326 edges
remove selfloop edges: 7072 nodes, 114578 edges
convert to undirected: 7072 nodes, 114578 edges
return the distinct nodes: 6284 nodes,                113394 edges
load the ppi network of spo
read as directed: 3573 nodes,          13122 edges
remove selfloop edges: 3508 nodes, 12567 edges
convert to undirected: 3508 nodes, 12567 edges
return the distinct nodes: 2433 nodes,                11126 edges


In [3]:
# node name -> node index for each network; wrapping the mapping in a
# defaultdict makes a lookup of an unknown name return -1 instead of raising
g0_node2index = defaultdict(lambda: -1, func.node2index(g0))
g1_node2index = defaultdict(lambda: -1, func.node2index(g1))

# node index -> node name for each network
g0_index2node = func.index2node(g0)
g1_index2node = func.index2node(g1)

In [4]:
# --- ontology pairs (evaluation labels) ---
ontology_file = '{}_{}_ontology_pairs_expert.txt'.format(org0, org1)
# NOTE(review): filter_anchor receives both name->index lookups; presumably it
# drops pairs whose genes are missing from either network -- confirm in load_data.
# If it does not, the -1 default of the lookups would silently address the last
# row/column of the matrix below.
ontology = ld.filter_anchor(
    ld.load_go_pairs(org0, org1, ontology_file),
    g0_node2index,
    g1_node2index,
)
print('ontology', len(ontology))

# binary label matrix for evaluation: one row per g0 node, one column per g1
# node, with a 1 wherever the pair appears in the ontology list
label_shape = (len(g0.nodes()), len(g1.nodes()))
ontology_matrix = np.zeros(label_shape, dtype=int)
for gene0, gene1 in ontology:
    ontology_matrix[g0_node2index[gene0], g1_node2index[gene1]] = 1

# --- ortholog pairs (anchors for cross training) ---
ortholog = ld.filter_anchor(
    ld.load_anchor(org0, org1),
    g0_node2index,
    g1_node2index,
)
print('ortholog', len(ortholog))

# ortholog pairs expressed as (g0 index, g1 index) tuples; the third field of
# each ortholog record is not used here
ortholog_set = {
    (g0_node2index[gene0], g1_node2index[gene1])
    for gene0, gene1, _unused in ortholog
}





(244188, 2)
ontology 197576
ortholog 1599


In [5]:
# number of ontology partners per gene: row sums for org0, column sums for org1
org0_annotations = ontology_matrix.sum(axis=1)
org1_annotations = ontology_matrix.sum(axis=0)

# keep only the genes that have at least one annotation (as plain int lists)
org0_ontology_indexes = np.flatnonzero(org0_annotations > 0).tolist()
org1_ontology_indexes = np.flatnonzero(org1_annotations > 0).tolist()

# label sub-matrix restricted to annotated genes: select rows, then columns
annotated_rows = ontology_matrix[org0_ontology_indexes]
test_matrix = annotated_rows[:, org1_ontology_indexes]

In [6]:
# pre-calculate matrices (adjacency matrix and normalized deep walk matrix) needed for ETNA
def precompute_matrices(graph, window=10):
    """Build the dense matrices that are handed to ETNA for one network.

    Parameters
    ----------
    graph : networkx graph
        Network whose adjacency matrix is obtained with ``nx.adjacency_matrix``.
    window : int, optional
        Second argument forwarded to ``helper.direct_compute_deepwalk_matrix``
        (presumably the DeepWalk window size -- confirm in algorithms.helper).

    Returns
    -------
    tuple
        ``(adj, dw, norms, normalized)``: the dense adjacency matrix, the dense
        deep walk matrix, the per-row L2 norms of ``dw`` (zeros replaced by 1),
        and ``dw`` divided row-wise by those norms.
    """
    adj_sparse = nx.adjacency_matrix(graph)
    dw = helper.direct_compute_deepwalk_matrix(adj_sparse, window).toarray()
    norms = np.linalg.norm(dw, axis=1, keepdims=True)
    # an all-zero row has norm 0; dividing by 1 instead keeps it all-zero
    # rather than producing NaNs
    norms[norms == 0] = 1
    return adj_sparse.toarray(), dw, norms, dw / norms


# same pipeline for both networks; every name the cell used to define is kept
g0_adj, g0_dw, g0_norms, g0_normalized = precompute_matrices(g0)
g1_adj, g1_dw, g1_norms, g1_normalized = precompute_matrices(g1)


In [7]:
# NOTE(review): `device` is not passed to ETNA in this cell -- confirm whether
# the model picks its device some other way
device = 'cpu'

# Seed every random number generator before the model is built: the ETNA
# constructor initialises weights randomly (xavier_uniform), so without a
# fixed seed the scores below change on every Restart & Run All.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# initialize model
# ETNA's keyword names are 1-based while this notebook's variables are 0-based:
# g1_matrices takes the matrices of g0 and g2_matrices takes those of g1.
model = etna.ETNA(g0, g1, ortholog_set, precal=True,
             g1_matrices=(g0_adj, g0_dw, g0_normalized),
             g2_matrices=(g1_adj, g1_dw, g1_normalized))

  torch.nn.init.xavier_uniform(m.weight)


In [8]:
# train the model
model.fit()

# score matrix between the embeddings of the two networks
S = model.get_score_matrix()

# compare the scores of annotated genes against the GO-derived labels
# (rows: annotated org0 genes, columns: annotated org1 genes)
S_annotated = S[org0_ontology_indexes][:, org1_ontology_indexes]
# NOTE(review): the third argument is an all-ones array shaped like the labels,
# presumably a mask that keeps every pair -- confirm in evaluation.evaluate_all
eval_mask = np.ones(test_matrix.shape)
auroc, auprc = evaluation.evaluate_all(S_annotated, test_matrix, eval_mask)
print(auroc, auprc)

0.7216661736014969 0.1817707376284373
