In [1]:
import os
import gc
import time
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Linear
from torch_geometric.nn import ChebConv
from torch_geometric.utils import dropout_edge
import pickle
from sklearn import metrics

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the required data for the model.
# SECURITY: torch.load and pickle.load unpickle arbitrary Python objects and can run
# code embedded in the file -- only point them at files you trust.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which
# may reject non-tensor objects such as Y_data / e_data -- confirm against the installed
# version and pass weights_only=False explicitly if required.
# Everything is first materialised on the CPU (map_location='cpu') so the notebook also
# runs on machines without CUDA and the NumPy call below never receives a GPU tensor;
# tensors are moved to `device` explicitly afterwards.
data = torch.load('../data/pan/string_850/data_67_0.0001_go_2.0.pkl', map_location='cpu')
data = data.float().to(device)

Y_data = torch.load('../data/pan/string_850/Y_796_data_2.0.pkl', map_location='cpu')
# Labels are the element-wise OR of Y_data.y and Y_data.y_te, stored as float32.
# torch.as_tensor accepts both ndarrays and tensors; if np.logical_or hands back a
# tensor (presumably the case when Y_data.y is a tensor -- verify), torch.tensor(...)
# would emit a "To copy construct from a tensor" UserWarning, as_tensor does not.
Y = torch.as_tensor(np.logical_or(Y_data.y, Y_data.y_te)).float().to(device)

# Cross-validation splits, indexed later as k_sets[round][fold][0 or 1].
with open("../data/pan/string_850/k_sets_796_2.0.pkl", 'rb') as handle:
    k_sets = pickle.load(handle)

e_data = torch.load("../data/pan/string_850/mut_data_miRNA_sub_du_go_path_2.0.pkl", map_location='cpu')
# Edge index of the graph consumed by the graph convolution in the model.
e_edge_index = e_data.go_index.to(device)

# Transformer layer 1 settings
d_model1 = 67  # dimension of the input features
d_ff1 = 268  # hidden width of the feed-forward network (4 * d_model1)
d_k1 = d_v1 = 67  # per-head dimension of K (= Q) and V

# Transformer layer 2 settings
d_model2 = 300  # dimension of the input features
d_ff2 = 1200  # hidden width of the feed-forward network (4 * d_model2)
d_k2 = d_v2 = 300  # per-head dimension of K (= Q) and V

n_heads = 6  # number of heads in multi-head attention (used by both layers)


In [3]:
#the framework of Transformer layer 1
class ScaledDotProductAttention1(nn.Module):
    """Scaled dot-product attention used by Transformer layer 1.

    The module has no parameters. The scaling factor is derived from the last
    dimension of ``Q`` rather than from the module-level ``d_k1``. For the tensors
    MultiHeadAttention1 passes in (reshaped to ``d_k1`` per head) the value is the
    same, but the class no longer depends on a notebook global and stays correct
    if the head size changes.
    """

    def __init__(self):
        super(ScaledDotProductAttention1, self).__init__()

    def forward(self, Q, K, V):
        """Return ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

        Args:
            Q: queries, shape (..., n_q, d_k).
            K: keys, shape (..., n_k, d_k).
            V: values, shape (..., n_k, d_v).

        Returns:
            Attention-weighted values, shape (..., n_q, d_v).
        """
        d_k = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)
        # Functional softmax gives the same result as nn.Softmax(dim=-1) without
        # constructing a throw-away module on every forward pass.
        attn = F.softmax(scores, dim=-1)
        return torch.matmul(attn, V)

class MultiHeadAttention1(nn.Module):
    """Multi-head self-attention block of Transformer layer 1.

    Queries, keys and values are linear projections of the same input. Each of the
    ``n_heads`` heads uses ``d_k1``-dimensional queries/keys and ``d_v1``-dimensional
    values. The concatenated heads are projected back to ``d_model1``, added to the
    input (residual connection) and passed through a ReLU.
    """

    def __init__(self):
        super().__init__()
        qk_width = n_heads * d_k1
        v_width = n_heads * d_v1
        self.W_Q = nn.Linear(d_model1, qk_width)
        self.W_K = nn.Linear(d_model1, qk_width)
        self.W_V = nn.Linear(d_model1, v_width)
        self.linear = nn.Linear(v_width, d_model1)

    def forward(self, H):
        """Apply self-attention to ``H`` and return a tensor of the same shape."""
        n_batch = H.size(0)

        def split_heads(projected, head_dim):
            # (batch, seq, heads * dim) -> (batch, heads, seq, dim)
            return projected.view(n_batch, -1, n_heads, head_dim).transpose(1, 2)

        queries = split_heads(self.W_Q(H), d_k1)
        keys = split_heads(self.W_K(H), d_k1)
        values = split_heads(self.W_V(H), d_v1)

        attended = ScaledDotProductAttention1()(queries, keys, values)
        # (batch, heads, seq, dim) -> (batch, seq, heads * dim)
        merged = attended.transpose(1, 2).contiguous().view(n_batch, -1, n_heads * d_v1)
        # Residual connection followed by ReLU.
        return torch.relu(self.linear(merged) + H)
    
class PoswiseFeedForwardNet1(nn.Module):
    """Feed-forward network of Transformer layer 1.

    Maps ``d_model1`` input features to ``d_ff1`` hidden units, applies ELU, then
    maps to a 300-dimensional output. Neither linear layer has a bias term.
    """

    def __init__(self):
        super().__init__()
        expand = nn.Linear(d_model1, d_ff1, bias=False)
        # 300 is the output width of this stage (equal to d_model2 in this notebook).
        project = nn.Linear(d_ff1, 300, bias=False)
        self.fc = nn.Sequential(expand, nn.ELU(), project)

    def forward(self, inputs):
        """Return the feed-forward transform of ``inputs`` (last dim becomes 300)."""
        return self.fc(inputs)

class TransformerLayer1(nn.Module):
    """Transformer layer 1: multi-head self-attention followed by the feed-forward net."""

    def __init__(self):
        super().__init__()
        self.enc_self_attn = MultiHeadAttention1()
        self.pos_ffn = PoswiseFeedForwardNet1()

    def forward(self, enc_inputs):
        """Run attention, then the feed-forward network, and return the result."""
        attended = self.enc_self_attn(enc_inputs)
        return self.pos_ffn(attended)

class TL1(nn.Module):
    """Applies Transformer layer 1 to a matrix with one row per sample.

    A sequence axis of length 1 is inserted before the transformer and flattened
    away afterwards, so the output again has one row per input row.

    NOTE(review): if ``inputs`` is 2-D, the sequence length seen by the attention
    is 1, so its softmax runs over a single key and each row only attends to
    itself -- confirm this is intended.
    """

    def __init__(self):
        super(TL1, self).__init__()
        self.trans = TransformerLayer1()

    def forward(self, inputs):
        """Return the transformer output reshaped to ``(inputs.size(0), -1)``."""
        trans_inputs = inputs.unsqueeze(1)
        trans_outputs = self.trans(trans_inputs)
        # The row count is read from the input instead of a fixed 10743, so the
        # module works for any number of rows (identical result for 10743 rows).
        return trans_outputs.view(inputs.size(0), -1)

In [4]:
#the framework of Transformer layer 2
class ScaledDotProductAttention2(nn.Module):
    """Scaled dot-product attention used by Transformer layer 2.

    The module has no parameters. The scaling factor is derived from the last
    dimension of ``Q`` rather than from the module-level ``d_k2``. For the tensors
    MultiHeadAttention2 passes in (reshaped to ``d_k2`` per head) the value is the
    same, but the class no longer depends on a notebook global and stays correct
    if the head size changes.
    """

    def __init__(self):
        super(ScaledDotProductAttention2, self).__init__()

    def forward(self, Q, K, V):
        """Return ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

        Args:
            Q: queries, shape (..., n_q, d_k).
            K: keys, shape (..., n_k, d_k).
            V: values, shape (..., n_k, d_v).

        Returns:
            Attention-weighted values, shape (..., n_q, d_v).
        """
        d_k = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-1, -2)) / np.sqrt(d_k)
        # Functional softmax gives the same result as nn.Softmax(dim=-1) without
        # constructing a throw-away module on every forward pass.
        attn = F.softmax(scores, dim=-1)
        return torch.matmul(attn, V)

class MultiHeadAttention2(nn.Module):
    """Multi-head self-attention block of Transformer layer 2.

    Queries, keys and values are linear projections of the same input. Each of the
    ``n_heads`` heads uses ``d_k2``-dimensional queries/keys and ``d_v2``-dimensional
    values. The concatenated heads are projected back to ``d_model2``, added to the
    input (residual connection) and passed through a ReLU.
    """

    def __init__(self):
        super().__init__()
        qk_width = n_heads * d_k2
        v_width = n_heads * d_v2
        self.W_Q = nn.Linear(d_model2, qk_width)
        self.W_K = nn.Linear(d_model2, qk_width)
        self.W_V = nn.Linear(d_model2, v_width)
        self.linear = nn.Linear(v_width, d_model2)

    def forward(self, H):
        """Apply self-attention to ``H`` and return a tensor of the same shape."""
        n_batch = H.size(0)

        def split_heads(projected, head_dim):
            # (batch, seq, heads * dim) -> (batch, heads, seq, dim)
            return projected.view(n_batch, -1, n_heads, head_dim).transpose(1, 2)

        queries = split_heads(self.W_Q(H), d_k2)
        keys = split_heads(self.W_K(H), d_k2)
        values = split_heads(self.W_V(H), d_v2)

        attended = ScaledDotProductAttention2()(queries, keys, values)
        # (batch, heads, seq, dim) -> (batch, seq, heads * dim)
        merged = attended.transpose(1, 2).contiguous().view(n_batch, -1, n_heads * d_v2)
        # Residual connection followed by ReLU.
        return torch.relu(self.linear(merged) + H)

class PoswiseFeedForwardNet2(nn.Module):
    """Feed-forward network of Transformer layer 2 with a projected skip path.

    The main path maps ``d_model2`` features to ``d_ff2`` hidden units (ELU) and then
    to 100 outputs, without bias terms. A separate biased linear layer projects the
    input to 100 features so it can be added to the main path before the ReLU.
    """

    def __init__(self):
        super(PoswiseFeedForwardNet2, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(d_model2, d_ff2, bias=False),
            nn.ELU(),
            nn.Linear(d_ff2, 100, bias=False))
        # The skip projection consumes the same input as `fc`, so its input width
        # must be d_model2 as well. It was previously a literal 300, which only
        # worked while d_model2 happened to be 300.
        self.lin = nn.Linear(d_model2, 100)

    def forward(self, inputs):
        """Return ``relu(fc(inputs) + lin(inputs))`` (last dim becomes 100)."""
        residual = self.lin(inputs)
        output = self.fc(inputs)
        return torch.relu(output + residual)  # residual enhanced activation

class TransformerLayer2(nn.Module):
    """Transformer layer 2: multi-head self-attention followed by the feed-forward net."""

    def __init__(self):
        super().__init__()
        self.enc_self_attn = MultiHeadAttention2()
        self.pos_ffn = PoswiseFeedForwardNet2()

    def forward(self, enc_inputs):
        """Run attention, then the feed-forward network, and return the result."""
        attended = self.enc_self_attn(enc_inputs)
        return self.pos_ffn(attended)

class TL2(nn.Module):
    """Applies Transformer layer 2 to a matrix with one row per sample.

    A sequence axis of length 1 is inserted before the transformer and flattened
    away afterwards, so the output again has one row per input row.

    NOTE(review): if ``inputs`` is 2-D, the sequence length seen by the attention
    is 1, so its softmax runs over a single key and each row only attends to
    itself -- confirm this is intended.
    """

    def __init__(self):
        super(TL2, self).__init__()
        self.trans = TransformerLayer2()

    def forward(self, inputs):
        """Return the transformer output reshaped to ``(inputs.size(0), -1)``."""
        trans_inputs = inputs.unsqueeze(1)
        trans_outputs = self.trans(trans_inputs)
        # The row count is read from the input instead of a fixed 10743, so the
        # module works for any number of rows (identical result for 10743 rows).
        return trans_outputs.view(inputs.size(0), -1)

In [5]:
class net(nn.Module):
    """Full model: two transformer stages followed by a Chebyshev graph convolution.

    ``forward`` takes no arguments; it reads the feature matrix ``data`` and the
    edge index ``e_edge_index`` from module scope. The returned values are used as
    logits by the training and evaluation code in this notebook.
    """

    def __init__(self):
        super().__init__()
        self.T1 = TL1()
        self.T2 = TL2()
        self.GNN = ChebConv(100, 1, K=2, normalization="sym")

        # Projects the raw 67-d features to 300-d for the skip path around T1.
        self.lin11 = Linear(67, 300)

    def forward(self):
        """Run the whole pipeline on the global inputs and return the GNN output."""
        is_training = self.training

        # Random ops are issued in this order: edge dropout, then feature dropout.
        edge_index, _ = dropout_edge(
            e_edge_index, p=0.5, force_undirected=True, training=is_training)
        features = F.dropout(data, training=is_training)

        # Transformer stage 1 with a projected skip connection and ReLU.
        stage1 = self.T1(features)
        stage1 = torch.relu(stage1 + self.lin11(features))  # residual enhanced activation

        # Transformer stage 2 on the dropped-out stage-1 representation.
        stage2 = self.T2(F.dropout(stage1, training=is_training))

        # Chebyshev graph convolution produces the final prediction.
        return self.GNN(F.dropout(stage2, training=is_training), edge_index)

In [6]:
def train(mask):
    """Run one optimisation step on the entries selected by ``mask``.

    Uses the module-level ``model``, ``optimizer`` and label tensor ``Y``. The loss
    is binary cross-entropy on logits, and the gradient norm is clipped to 1.0
    before the optimizer update. Returns nothing.
    """
    model.train()
    optimizer.zero_grad()

    logits = model()
    step_loss = F.binary_cross_entropy_with_logits(logits[mask], Y[mask])
    step_loss.backward()

    # Clip the total gradient norm over all parameters before stepping.
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()

@torch.no_grad()
def test(mask):
    """Evaluate the module-level ``model`` on the entries selected by ``mask``.

    Puts the model in eval mode, turns its outputs into probabilities with a
    sigmoid and compares them with the matching rows of ``Y``.

    Returns:
        Tuple ``(roc_auc, pr_auc)``: the ROC AUC score and the area under the
        precision-recall curve.
    """
    model.eval()
    logits = model()

    scores = torch.sigmoid(logits[mask]).cpu().detach().numpy()
    labels = Y[mask].cpu().numpy()

    precision, recall, _ = metrics.precision_recall_curve(labels, scores)
    roc_auc = metrics.roc_auc_score(labels, scores)
    pr_auc = metrics.auc(recall, precision)
    return roc_auc, pr_auc

In [None]:
epochs = 550
time_start = time.time()
# Ten repetitions of five-fold cross-validation: one score per (repetition, fold).
AUC = np.zeros((10, 5))
AUPR = np.zeros_like(AUC)

for i in range(AUC.shape[0]):
    for cv_run in range(AUC.shape[1]):
        tr_mask, te_mask = k_sets[i][cv_run][0], k_sets[i][cv_run][1]

        # Fresh model and optimizer for every fold; train() and test() read them
        # from module scope.
        model = net().to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=5e-4)
        for t in range(epochs):
            train(tr_mask)

        AUC[i, cv_run], AUPR[i, cv_run] = test(te_mask)
        print("round %d and %d times cross-validations:" %(i+1,cv_run+1))
        print(AUC[i, cv_run], AUPR[i, cv_run])

        # Release cached memory before the next fold is built.
        gc.collect()
        torch.cuda.empty_cache()

# Total wall-clock time, then the mean AUC and AUPR over all 50 runs.
print(time.time() - time_start)
print(AUC.mean())
print(AUPR.mean())