In [None]:
import numpy as np
import pandas as pd
import torch
from torch_geometric.nn import Node2Vec
from torch_geometric.data import Data
import networkx as nx
from torch_geometric.utils import from_networkx
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Prefer the GPU when CUDA is available; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
#load PPI network
def load_network(file_path):
    """Read a two-column, tab-separated edge list and list its distinct nodes.

    Args:
        file_path: path (or buffer) of a header-less TSV whose two columns are
            the edge endpoints; both are read as strings.

    Returns:
        A tuple ``(edges, nodes)``:
        * ``edges`` - DataFrame with columns ``'source'`` and ``'target'``.
        * ``nodes`` - DataFrame with a single ``'nodes'`` column holding every
          distinct endpoint in order of first appearance (all sources first,
          then targets), indexed 0..k-1.
    """
    edges = pd.read_table(
        file_path,
        sep='\t',
        header=None,
        index_col=None,
        names=['source', 'target'],
        dtype='str',
    )

    # Stack both endpoint columns into one long series before de-duplicating.
    endpoints = pd.concat([edges['source'], edges['target']], ignore_index=True)
    nodes = (
        pd.DataFrame(endpoints, columns=['nodes'])
        .drop_duplicates()
        .reset_index(drop=True)
    )
    return edges, nodes

In [4]:
#load gene association network
def load_featured_graph(network, name=None):
    """Convert an adjacency-matrix DataFrame into a PyTorch Geometric graph.

    Args:
        network: pandas DataFrame adjacency matrix; it is handed to
            ``nx.from_pandas_adjacency``.
        name: optional label for the network (e.g. ``'go'`` or ``'path'``).
            It is only used to prefix the diagnostic printouts. The parameter
            is optional so both ``load_featured_graph(net)`` and
            ``load_featured_graph(net, 'go')`` are valid calls.

    Returns:
        The object produced by ``from_networkx`` for the relabelled graph.

    Raises:
        AssertionError: if the converted graph is not undirected.
    """
    # Empty prefix when no name is given, so the messages match the
    # one-argument form exactly.
    tag = '' if name is None else f'[{name}] '

    G = nx.from_pandas_adjacency(network)

    # Relabel nodes with consecutive integers in sorted-label order; the
    # original label is preserved in the 'label' node attribute.
    G_adj = nx.convert_node_labels_to_integers(G, ordering='sorted', label_attribute='label')

    print(f'{tag}If the graph is connected graph: {nx.is_connected(G_adj)}')
    print(f'{tag}The number of connected components: {nx.number_connected_components(G_adj)}')

    graph = from_networkx(G_adj)
    assert graph.is_undirected(), 'expected an undirected graph after conversion'
    print(f'{tag}The edge index is {graph.edge_index}')

    return graph

In [5]:
#create gene dataset and multi-dimensional gene association network
def create_dataset(data_dir='../data/pan/string_850'):
    """Assemble the gene feature matrix and its three edge sets into one Data object.

    Reads, from ``data_dir``:
      * ``mut_features_miRNA_sub_du_2.0.txt`` - feature table, first column is the row index
      * ``mut_PPI_2.0.txt``                  - PPI edge list (see ``load_network``)
      * ``go_2.0.tsv`` / ``path_2.0.tsv``    - adjacency tables (see ``load_featured_graph``)

    Args:
        data_dir: directory holding the four input files. The default is the
            location the notebook originally hard-coded.

    Returns:
        A ``Data`` object with ``x`` (float32 features), ``edge_index`` (PPI
        edges expressed as feature-table row numbers), ``go_index`` and
        ``path_index`` (edge indices of the two adjacency-table graphs).
    """
    #load feature matrix
    df = pd.read_table(f'{data_dir}/mut_features_miRNA_sub_du_2.0.txt', index_col=0, header=0, sep="\t")
    gene_index = list(df.index.values)
    features = torch.tensor(np.array(df)).to(torch.float32)
    print(features)

    # The node table returned alongside the edges is not needed here.
    ppi, _ = load_network(f'{data_dir}/mut_PPI_2.0.txt')
    sou_index, tar_index, n_dropped = _map_edges_to_rows(
        ppi['source'].values, ppi['target'].values, gene_index
    )
    if n_dropped:
        print(f'Dropped {n_dropped} PPI edges with an endpoint missing from the feature table')

    edge_index = torch.tensor([sou_index, tar_index], dtype=torch.long)
    print(edge_index.shape)
    print(features.shape)
    print(edge_index)

    # load_featured_graph is called with the network only, which is valid for
    # every signature of that helper.
    go_network = pd.read_csv(f'{data_dir}/go_2.0.tsv', sep='\t', index_col=0, header=0)
    go_graph = load_featured_graph(go_network)
    print(go_graph)

    path_network = pd.read_csv(f'{data_dir}/path_2.0.tsv', sep='\t', index_col=0, header=0)
    path_graph = load_featured_graph(path_network)
    print(path_graph)

    # NOTE(review): go/path node ids come from sorted labels inside
    # load_featured_graph, while x rows follow the feature-file order -
    # presumably both files list genes in the same order; confirm.
    data = Data(x=features,edge_index=edge_index,go_index=go_graph.edge_index,path_index=path_graph.edge_index)
    return data


def _map_edges_to_rows(sources, targets, gene_index):
    """Translate (source, target) gene names into row positions in ``gene_index``.

    An edge is kept only when BOTH endpoints are found, so the two returned
    lists always have equal length and stay aligned pairwise.

    Returns:
        ``(source_rows, target_rows, dropped)`` where ``dropped`` counts the
        edges skipped because an endpoint was not in ``gene_index``.
    """
    # One dict lookup per endpoint instead of rescanning gene_index per edge.
    row_of = {}
    for row, gene in enumerate(gene_index):
        row_of.setdefault(gene, row)  # first occurrence wins on duplicate names

    source_rows, target_rows = [], []
    dropped = 0
    for src, dst in zip(sources, targets):
        src_row = row_of.get(src)
        dst_row = row_of.get(dst)
        if src_row is None or dst_row is None:
            dropped += 1
            continue
        source_rows.append(src_row)
        target_rows.append(dst_row)
    return source_rows, target_rows, dropped

In [None]:
# Build the dataset, then persist it to disk.
mut_dataset_path = '../data/pan/string_850/mut_data_miRNA_sub_du_go_path_2.0.pkl'
data1 = create_dataset()
torch.save(data1, mut_dataset_path)

In [7]:
#create models for obtaining positional embeddings
def _build_node2vec(edge_index, num_nodes=None, embedding_dim=16, lr=0.0001, batch_size=128):
    """Create one Node2Vec model together with its walk loader and optimizer.

    Args:
        edge_index: edge index tensor of the graph to embed.
        num_nodes: number of nodes to allocate embeddings for; ``None`` leaves
            it to Node2Vec to work out from ``edge_index``.
        embedding_dim: size of each node embedding.
        lr: learning rate for the SparseAdam optimizer.
        batch_size: batch size for the random-walk loader.

    Returns:
        ``(model, loader, optimizer)``; the model has been moved to ``device``.
    """
    model = Node2Vec(edge_index, embedding_dim=embedding_dim, walk_length=80,
                     context_size=5, walks_per_node=10,
                     num_negative_samples=1, p=1, q=1,
                     num_nodes=num_nodes, sparse=True).to(device)
    loader = model.loader(batch_size=batch_size, shuffle=True)
    # sparse=True above is paired with SparseAdam here.
    optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=lr)
    return model, loader, optimizer


# Node count passed explicitly for the GO and pathway graphs.
# NOTE(review): presumably this equals the number of rows in data1.x - confirm,
# and consider deriving it from the data instead of hard-coding it.
NUM_GENES = 10743

# NOTE(review): the PPI model gets no explicit node count, so its embedding
# table may be shorter than data1.x if the highest-numbered genes have no PPI
# edge - verify against the downstream consumer.
model1, loader1, optimizer1 = _build_node2vec(data1.edge_index)
model2, loader2, optimizer2 = _build_node2vec(data1.go_index, num_nodes=NUM_GENES)
model3, loader3, optimizer3 = _build_node2vec(data1.path_index, num_nodes=NUM_GENES)

In [8]:
def train(model, optimizer, loader, device=None):
    """Run one optimisation pass over ``loader`` and return the mean batch loss.

    Args:
        model: module exposing ``train()``, ``parameters()`` and
            ``loss(pos_rw, neg_rw)``.
        optimizer: optimizer stepping the model's parameters.
        loader: iterable yielding ``(pos_rw, neg_rw)`` tensor pairs.
        device: device the batches are moved to before computing the loss.
            Defaults to the device of the model's first parameter, so the
            function does not rely on a notebook-level global.

    Returns:
        Average of ``loss.item()`` over all batches, as a float.

    Raises:
        ValueError: if ``loader`` yields no batches.
    """
    if device is None:
        # Batches must live where the model's weights live.
        device = next(model.parameters()).device

    model.train()
    running_loss = 0.0
    num_batches = 0
    for pos_rw, neg_rw in loader:
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError("train() received a loader that yielded no batches")
    return running_loss / num_batches

In [None]:
#obtain PPI's positional embeddings
NUM_EPOCHS = 1000  # number of passes over the random-walk loader
LOG_EVERY = 100    # report the mean loss at this interval

for epoch in range(1, NUM_EPOCHS + 1):
    loss = train(model1,optimizer1,loader1)
    # Surface the training loss periodically so convergence can be checked.
    if epoch == 1 or epoch % LOG_EVERY == 0:
        print(f'[PPI] epoch {epoch:04d}  loss {loss:.4f}')

model1.eval()
# Calling the model with no arguments is used here to obtain the embeddings
# for all nodes.
# NOTE(review): the result is presumably still on `device` and attached to the
# model's parameters - confirm the consumer of the saved file expects that.
str_features1 = model1()
print(str_features1.shape)

torch.save(str_features1, '../data/pan/string_850/mut_str_features_16_0.0001_2.0.pkl')

In [None]:
#obtain gene semantic similarity network's positional embeddings
NUM_EPOCHS = 1000  # number of passes over the random-walk loader
LOG_EVERY = 100    # report the mean loss at this interval

for epoch in range(1, NUM_EPOCHS + 1):
    loss = train(model2,optimizer2,loader2)
    # Surface the training loss periodically so convergence can be checked.
    if epoch == 1 or epoch % LOG_EVERY == 0:
        print(f'[GO] epoch {epoch:04d}  loss {loss:.4f}')

model2.eval()
# Calling the model with no arguments is used here to obtain the embeddings
# for all nodes.
# NOTE(review): the result is presumably still on `device` and attached to the
# model's parameters - confirm the consumer of the saved file expects that.
str_features2 = model2()
print(str_features2.shape)

torch.save(str_features2, '../data/pan/string_850/mut_str_features_16_0.0001_go_2.0.pkl')

In [None]:
#obtain gene pathway co-occurrence network's positional embeddings
NUM_EPOCHS = 1000  # number of passes over the random-walk loader
LOG_EVERY = 100    # report the mean loss at this interval

for epoch in range(1, NUM_EPOCHS + 1):
    loss = train(model3,optimizer3,loader3)
    # Surface the training loss periodically so convergence can be checked.
    if epoch == 1 or epoch % LOG_EVERY == 0:
        print(f'[pathway] epoch {epoch:04d}  loss {loss:.4f}')

model3.eval()
# Calling the model with no arguments is used here to obtain the embeddings
# for all nodes.
# NOTE(review): the result is presumably still on `device` and attached to the
# model's parameters - confirm the consumer of the saved file expects that.
str_features3 = model3()
print(str_features3.shape)

torch.save(str_features3, '../data/pan/string_850/mut_str_features_16_0.0001_path_2.0.pkl')