# Select GPU

Runtime -> Change Runtime type -> T4 GPU

# Check nvidia

## install if needed

In [None]:
!pip install --upgrade nvidia-pyindex

Collecting nvidia-pyindex
  Downloading nvidia-pyindex-1.0.9.tar.gz (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nvidia-pyindex
  Building wheel for nvidia-pyindex (setup.py) ... [?25l[?25hdone
  Created wheel for nvidia-pyindex: filename=nvidia_pyindex-1.0.9-py3-none-any.whl size=8419 sha256=9c77ce6b59f64da924cb407482ae7f84ba351c73c8f2edc6c8ad4efc81f532f6
  Stored in directory: /root/.cache/pip/wheels/2c/af/d0/7a12f82cab69f65d51107f48bcd6179e29b9a69a90546332b3
Successfully built nvidia-pyindex
Installing collected packages: nvidia-pyindex
Successfully installed nvidia-pyindex-1.0.9


## check

In [None]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found


# Mount Drive

In [1]:
# Colab-only setup: mount Google Drive and make the project folder the working directory.
# Everything below relies on this chdir: the local `utils` module import and the relative
# `input` / `embedding` folders are all resolved against the current working directory.
import os
from google.colab import drive
# force_remount=True re-mounts even when Drive is already mounted in this session.
drive.mount('/content/drive/', force_remount=True)
# NOTE(review): hard-coded, user-specific Drive path — adjust when running from another account/layout.
os.chdir('/content/drive/My Drive/Colab_Notebooks/Graph_transformer/')

Mounted at /content/drive/


# Packages

In [2]:
from utils import get_gene_idx_dict_from_file, file_to_matrix
import pickle
import numpy as np

import torch
import torch.utils.data as data

from torch.utils.tensorboard import SummaryWriter
import torch.optim as optim
import torch.nn as nn

from sklearn.metrics import roc_auc_score
from sklearn import metrics

from tqdm import tqdm as prog_bar #The progress bar
import math, copy, time
import torch.nn.functional as F

from torch.nn.utils.rnn import pad_sequence

# Graph Transformer Framework

## Harvard Transformer

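The basic Transformer building blocks below are taken from the Harvard Transformer code. The only modification is in `LayerNorm.forward`, which guards the standard-deviation computation against the degrees-of-freedom issue (the sample standard deviation is undefined when it is computed over a single element).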


@inproceedings{opennmt, author = {Guillaume Klein and Yoon Kim and Yuntian Deng and Jean Senellart and Alexander M. Rush}, title = {OpenNMT: Open-Source Toolkit for Neural Machine Translation}, booktitle = {Proc. ACL}, year = {2017}, url = {https://doi.org/10.18653/v1/P17-4012}, doi = {10.18653/v1/P17-4012} }

In [3]:
def clones(module, N):
    """Return an ``nn.ModuleList`` holding N independent deep copies of ``module``.

    Each copy owns its own parameters (initially equal to those of ``module``).
    """
    copies = []
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return nn.ModuleList(copies)


class LayerNorm(nn.Module):
    """Layer normalisation over the last dimension with a learnable gain and bias.

    Computes ``a_2 * (x - mean) / (std + eps) + b_2`` where ``mean`` and ``std`` are
    taken along the last axis of ``x``.

    Args:
        features: size of the last dimension (length of the gain/bias vectors).
        eps: small constant added to the standard deviation to avoid division by zero.
    """
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # gain
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension.

        The sample standard deviation (Bessel-corrected, as ``Tensor.std`` computes by
        default) is undefined for a single value and evaluates to NaN. What matters is
        the number of elements along the *reduced* axis, not the total number of
        elements in the tensor, so the guard checks ``x.size(-1)``; when that axis has
        at most one element the standard deviation is taken to be 0.
        """
        mean = x.mean(-1, keepdim=True)
        if x.dim() > 0 and x.size(-1) > 1:
            std = x.std(-1, keepdim=True)
        else:
            # Same shape/device/dtype as `mean`, so broadcasting below is unchanged.
            std = torch.zeros_like(mean)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

class SublayerConnection(nn.Module):
    """Pre-norm residual wrapper around an arbitrary sublayer.

    The input is layer-normalised first, passed through the sublayer, dropped out,
    and finally added back to the un-normalised input.
    """
    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Return ``x + dropout(sublayer(norm(x)))``; ``sublayer`` must preserve the size of ``x``."""
        normed = self.norm(x)
        transformed = self.dropout(sublayer(normed))
        return x + transformed


class Encoder(nn.Module):
    """Stack of N copies of an encoder layer followed by a final layer norm."""
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        """Feed ``x`` (with the same ``mask``) through every layer in order, then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, mask)
        return self.norm(hidden)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention then position-wise feed-forward,
    each wrapped in its own ``SublayerConnection``."""
    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        """Apply masked self-attention, then the feed-forward network."""
        def attend(normed):
            # Query, key and value are all the (normalised) layer input.
            return self.self_attn(normed, normed, normed, mask)

        attended = self.sublayer[0](x, attend)
        return self.sublayer[1](attended, self.feed_forward)


class Decoder(nn.Module):
    """Stack of N copies of a decoder layer followed by a final layer norm."""
    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Feed ``x`` through every decoder layer (each also sees ``memory`` and both masks), then normalise."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, attention over the encoder memory,
    then feed-forward, each wrapped in its own ``SublayerConnection``."""
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super().__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        """Run the three sublayers in order and return the result."""
        def self_attend(normed):
            # Target attends to itself under tgt_mask.
            return self.self_attn(normed, normed, normed, tgt_mask)

        def cross_attend(normed):
            # Target queries the encoder output (keys and values) under src_mask.
            return self.src_attn(normed, memory, memory, src_mask)

        hidden = self.sublayer[0](x, self_attend)
        hidden = self.sublayer[1](hidden, cross_attend)
        return self.sublayer[2](hidden, self.feed_forward)

def attention(query, key, value, mask=None, dropout=None):
    """Scaled dot-product attention.

    Args:
        query, key, value: tensors whose last two dimensions are (length, d_k).
        mask: optional tensor broadcastable to the score matrix; positions where
            ``mask == 0`` are filled with -1e9 before the softmax.
        dropout: optional callable applied to the attention weights.

    Returns:
        (weighted values, attention weights).
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return torch.matmul(weights, value), weights

class MultiHeadedAttention(nn.Module):
    """Multi-head attention with ``h`` heads of width ``d_model // h``.

    ``self.linears`` holds four ``d_model -> d_model`` projections: the first three
    project query, key and value; the last one projects the concatenated heads.
    The attention weights of the most recent call are kept in ``self.attn``.
    """
    def __init__(self, h, d_model, dropout=0.1):
        super().__init__()
        # The model width must split evenly across heads (d_v is taken equal to d_k).
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, and merge the heads back to ``d_model``."""
        if mask is not None:
            # Insert a head axis so one mask broadcasts over all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        def split_heads(projection, tensor):
            # One d_model-wide projection, then reshape to (batch, h, length, d_k).
            projected = projection(tensor)
            return projected.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        query = split_heads(self.linears[0], query)
        key = split_heads(self.linears[1], key)
        value = split_heads(self.linears[2], value)

        context, self.attn = attention(query, key, value, mask=mask,
                                       dropout=self.dropout)

        # Undo the head split: (batch, h, length, d_k) -> (batch, length, h * d_k).
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(nbatches, -1, self.h * self.d_k)

        return self.linears[-1](merged)

class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: ``w_2(dropout(relu(w_1(x))))``.

    Args:
        d_model: input and output width.
        d_ff: hidden width.
        dropout: dropout probability applied after the ReLU.
    """
    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

## TransformerGO & TransformerCPI


No positional encoding + no mask in Decoder + binary classification.
-- Methods and utilities are adopted from TransformerGO and TransformerCPI, then modified for our case.

In [4]:
class TransformerDis(nn.Module):
    """Encoder-decoder Transformer that scores a disease pair with a single logit.

    disA: protein node set related to disease A (fed to the encoder)
    disB: protein node set related to disease B (fed to the decoder)

    Args:
        d_model: feature width of every node vector.
        nhead: number of attention heads.
        num_layers: number of encoder layers and of decoder layers.
        dim_feedforward: hidden width of the feed-forward sublayers.
        dropout: dropout probability used throughout.
    """
    def __init__(self ,d_model, nhead, num_layers, dim_feedforward, dropout = 0.1):
        super().__init__()

        # Prototype sublayers; every layer receives its own deep copy.
        attn = MultiHeadedAttention(nhead, d_model, dropout)
        ff = PositionwiseFeedForward(d_model, dim_feedforward, dropout)

        encoder_layer = EncoderLayer(d_model, copy.deepcopy(attn), copy.deepcopy(ff), dropout)
        self.encoder = Encoder(encoder_layer, num_layers)

        decoder_layer = DecoderLayer(d_model, copy.deepcopy(attn), copy.deepcopy(attn),
                                     copy.deepcopy(ff), dropout)
        self.decoder = Decoder(decoder_layer, num_layers)

        self.linear = nn.Linear(d_model, 1)

    # inputs: batch * max_seq_len * node2vec_dim
    def forward(self, emb_disA, emb_disB, disA_mask, disB_mask):
        """Encode disease A, decode disease B against it, pool, and return one logit per pair.

        The decoder receives ``disA_mask`` as its source mask and ``disB_mask`` as its
        target mask.
        """
        memory = self.encoder(emb_disA, disA_mask)
        decoded = self.decoder(emb_disB, memory, disA_mask, disB_mask)
        # decoded: batch * seqLen * embDim

        # Pool batch * seqLen * embDim --> batch * 1 * embDim (TransformerCPI paper):
        # each position is weighted by the softmax of its L2 norm.
        position_norms = torch.linalg.norm(decoded, dim = 2)
        position_weights = F.softmax(position_norms, dim = 1).unsqueeze(1)
        pooled = torch.bmm(position_weights, decoded)

        return self.linear(pooled).squeeze(1)


In [5]:
def transformerGO_collate_fn(batch, max_size_set, emb_size = 64, pytorch_pad = False):
    """Pad one batch of disease pairs and build its padding masks.

    Args:
        batch: iterable of (features, label, ids) samples, where ``features`` is a
            pair ``[disA_matrix, disB_matrix]`` of per-gene feature matrices.
        max_size_set: sequence length every feature matrix is padded to (the masks
            are allocated with this length).
        emb_size: width of the feature vectors; used for the all-zero sentinel row
            block that forces padding up to ``max_size_set``.
        pytorch_pad: if True return the boolean key-padding mask instead of the
            square mask.

    Returns:
        padded_pairs: tensor N * 2 * L * emb_size of zero-padded features.
        labels: float tensor of length N.
        mask: N * 2 * L * L tensor built by ``get_padd_mask_transformer`` (default), or
            the N * 2 * L boolean tensor in which True marks padding (``pytorch_pad``).
    """
    batch_features, batch_labels, batch_ids  = zip(*batch)
    n_pairs = len(batch_features)

    # True everywhere = "all padding"; real positions are cleared below.
    key_padding_mask = torch.ones((n_pairs, 2, max_size_set), dtype=torch.bool)
    square_mask = torch.empty((n_pairs, 2, max_size_set, max_size_set))

    sequences = []
    for row, pair in enumerate(batch_features):
        for side in (0, 1):
            matrix = pair[side]
            sequences.append(torch.FloatTensor(matrix))
            key_padding_mask[row][side][0:len(matrix)] = False

    # Append an all-zero sentinel of the target length so pad_sequence pads every
    # sequence up to max_size_set, then drop the sentinel again.
    sequences.append(torch.zeros(max_size_set, emb_size))
    padded = pad_sequence(sequences, batch_first = True)[:-1]

    # Even entries are disease A, odd entries are disease B.
    seq_len, width = padded.shape[1], padded.shape[2]
    padded_pairs = torch.empty((n_pairs, 2, seq_len, width))
    padded_pairs[:,0] = padded[0::2]
    padded_pairs[:,1] = padded[1::2]

    for row in range(padded_pairs.shape[0]):
        for side in (0, 1):
            square_mask[row][side] = get_padd_mask_transformer(padded_pairs[row][side])

    labels = torch.FloatTensor(batch_labels)
    chosen_mask = key_padding_mask if pytorch_pad else square_mask
    return padded_pairs, labels, chosen_mask

In [6]:
def get_padd_mask_transformer(emb):
    """Build the square padding mask of one padded embedding matrix.

    Args:
        emb (torch.Tensor): tensor of shape (seqLen, emb_dim); it is converted with
            ``.numpy()``.

    Returns:
        torch.Tensor: boolean matrix of size (seqLen, seqLen); entry (i, j) is True
        when rows i and j have a non-zero value in at least one common column.
    """
    nonzero = emb.numpy() != 0
    pairwise = nonzero @ nonzero.T
    return torch.from_numpy(pairwise)

In [7]:
def binary_accuracy(preds, y):
    """Fraction of correct binary predictions in a batch (e.g. 8/10 right -> 0.8).

    ``preds`` are logits: they go through a sigmoid and are rounded to 0/1 before
    being compared with ``y``. Returns a 0-dim tensor.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = (predicted_labels == y).float()  # float so the ratio below is a true division
    return hits.sum() / len(hits)

In [8]:
def epoch_time(start_time, end_time):
    """Split the elapsed time between two timestamps (in seconds) into whole minutes and seconds."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - 60 * minutes)
    return minutes, seconds

In [9]:
def print_status(epoch, epoch_mins, epoch_secs, train_loss, train_acc, valid_loss, valid_acc, roc_train, roc_val, optimizer):
    """Print a one-line summary of an epoch.

    The line ends with a carriage return instead of a newline, and the learning rate
    shown is the one of the optimizer's first parameter group.
    """
    fields = [
        f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%',
        f'\t Roc Train: {roc_train:.3f}',
        f'\t Roc Valid: {roc_val:.3f}',
        ",  ",
        optimizer.param_groups[0]['lr'],
        "--LR",
    ]
    print(*fields, end='\r')

In [10]:
def write_scalars_tensorboard(writer, train_loss, valid_loss, train_acc, valid_acc, epoch):
    """Log the train/valid loss and accuracy of ``epoch`` under the tags 'Loss' and 'Acc'."""
    curves = (
        ('Loss', train_loss, valid_loss),
        ('Acc', train_acc, valid_acc),
    )
    for tag, train_value, valid_value in curves:
        writer.add_scalars(tag, {'train': train_value, 'valid': valid_value}, epoch)

# Set info


In [11]:
# Run configuration. All paths are relative to the working directory set in the mount cell.
input_folder = 'input'      # folder holding the node list and the <dataset>/train|test TSV files
dataset = 'RR1' # alternative: 'RR0'
emb_folder = "embedding"    # folder holding the node embedding and positional-encoding files


# Positional/structural encoding variant; exactly one line below should be active.
# PSE = 'NoPE' # opt0: node embedding only
PSE = 'LPE' # opt1: node embedding + LPE (added)
# PSE = 'SPE' # opt2: node embedding + LPE (added), then GPE (concatenated)

# Functions

In [12]:
########################PrepData################################################
def get_disease_sets(file_path):
    """Parse a tab-separated disease-pair file.

    The first line is treated as a header and skipped. Every following non-blank
    line must have five tab-separated columns:
    ``disA&disB``, comma-separated genes of A, comma-separated genes of B,
    all genes (unused here), and an integer label.

    Args:
        file_path: path of the TSV file.

    Returns:
        dis_pairs: list of ``(disease_a, disease_b)`` tuples, in file order.
        labels: list of integer labels, aligned with ``dis_pairs``.
        disease_genes_dict: ``{disease: [gene_id, ...]}``; when a disease occurs on
            several lines the gene list of the last occurrence wins.

    Raises:
        ValueError: if a non-blank data line does not have exactly five columns or a
            gene id / label is not an integer.
    """
    dis_pairs = []              # [(disA, disB), ...]
    labels = []                 # [label, ...]
    disease_genes_dict = {}     # {disease: [gene_1, gene_2, ...]}

    # The context manager closes the file even when a malformed line raises.
    with open(file_path, "r") as handle:
        next(handle, None)  # skip the header line (no-op on an empty file)
        for line in handle:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no record.
                continue

            dis_pair, disease_a_genes, disease_b_genes, all_genes, rr = line.split("\t")
            disease_a, disease_b = dis_pair.split("&")

            dis_pairs.append((disease_a, disease_b))
            labels.append(int(rr))

            disease_genes_dict[disease_a] = [int(gene) for gene in disease_a_genes.split(",")]
            disease_genes_dict[disease_b] = [int(gene) for gene in disease_b_genes.split(",")]

    return dis_pairs, labels, disease_genes_dict


class Dataset_torch(torch.utils.data.Dataset):
    """PyTorch dataset of labelled disease pairs.

    Args:
        dis_pairs: list of ``(disA, disB)`` tuples.
        labels: list of labels aligned with ``dis_pairs``.
        dis_genes_dict: ``{disease: [gene, ...]}``.
        node_idx_dict: mapping from gene id (as string) to row index of ``emb``/``lpe``.
        emb: node embedding matrix, indexed by row.
        lpe: positional-encoding matrix indexed like ``emb``, or None.
        combine_opt: how ``get_features`` combines ``emb`` and ``lpe``.
    """
    def __init__(self, dis_pairs, labels, dis_genes_dict, node_idx_dict, emb, lpe, combine_opt):
        self.dis_pairs = dis_pairs
        self.labels = labels
        self.dis_genes_dict = dis_genes_dict
        self.node_idx_dict = node_idx_dict
        self.emb = emb
        self.lpe = lpe
        self.opt = combine_opt

    def __len__(self):
        """Number of disease pairs."""
        return len(self.dis_pairs)

    def __getitem__(self, index):
        """Return the object array ``(features, label, dis_pair_genes)`` of one pair.

        ``features`` is ``[features_of_disA, features_of_disB]`` and ``dis_pair_genes``
        is ``[(disA, disA_gene_list), (disB, disB_gene_list)]``.
        """
        label = self.labels[index]
        disA, disB = self.dis_pairs[index]
        gene_lists = [self.dis_genes_dict[disA], self.dis_genes_dict[disB]]
        dis_pair_genes = [(disA, gene_lists[0]), (disB, gene_lists[1])]
        # Use the mapping this dataset was constructed with, not a notebook global of
        # the same name: the dataset must work (and stay consistent) regardless of
        # what `node_idx_dict` happens to be bound to in the kernel.
        features = [
            get_features(gene_list, self.node_idx_dict, self.emb, self.lpe, self.opt)
            for gene_list in gene_lists
        ]

        return np.array((features, label, dis_pair_genes), dtype=object)

#------------------------------------------------------------------------------#
def get_features(gene_list, node_idx_dict, emb, lpe, combine_opt):
    """Look up the feature rows of the genes in ``gene_list``.

    Genes whose string form is not a key of ``node_idx_dict`` are skipped. Without a
    positional encoding (``lpe is None``) the plain ``emb`` rows are returned;
    otherwise the ``lpe`` rows are added element-wise when ``combine_opt == "add"``
    and concatenated column-wise for any other value.
    """
    # The keys of node_idx_dict are strings.
    node_idices = []
    for gene in gene_list:
        key = str(gene)
        if key in node_idx_dict:
            node_idices.append(node_idx_dict[key])

    base = emb[node_idices, :]
    if lpe is None:
        return base

    positional = lpe[node_idices, :]
    if combine_opt == "add":
        return np.add(base, positional)
    return np.concatenate((base, positional), axis=1)

#------------------------------------------------------------------------------#
def split_train_valid(ori_dataset, ratio):
    """Randomly split ``ori_dataset`` into a training and a validation subset.

    ``ratio[0]`` is the training fraction; the training size is truncated to an
    integer and the validation subset receives all remaining samples.
    """
    total = len(ori_dataset)
    n_train = int(ratio[0] * total)
    n_valid = total - n_train
    train_set, valid_set = data.random_split(ori_dataset, [n_train, n_valid])
    return train_set, valid_set

#------------------------------------------------------------------------------#
def getEmbeddingAndVars(PSE):
    """Load the node embedding and the positional encoding selected by ``PSE``.

    Files are read from the notebook-level ``emb_folder`` (and ``dataset`` for the
    GPE file).

    Args:
        PSE: one of
            'NoPE' - node embedding only;
            'LPE'  - node embedding, with the first 64 LPE columns returned as the
                     encoding to be *added* per sample;
            'SPE'  - LPE is added to the embedding here, and the first 8 GPE columns
                     are returned as the encoding to be *concatenated* per sample.

    Returns:
        (emb_dim, pe_dim, emb, pe, combine_opt) where ``pe`` is None and
        ``combine_opt`` is None for 'NoPE', and ``combine_opt`` is "add" or "concat"
        otherwise.

    Raises:
        ValueError: if ``PSE`` is not one of the supported options.
    """
    supported = ('NoPE', 'LPE', 'SPE')
    if PSE not in supported:
        # Fail early with a clear message instead of an UnboundLocalError at return.
        raise ValueError(f"Unknown PSE option {PSE!r}; expected one of {supported}")

    # All options start from the same 64-d node embedding (graph with id mapping).
    emb_dim = 64
    emb_file = f'{emb_folder}/node2nev_emb_64_for_PE'
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open(emb_file, 'rb') as f:
        emb = pickle.load(f)

    #### opt0: node embedding only
    if PSE == 'NoPE':
        return emb_dim, 0, emb, None, None

    # Both remaining options use the LPE matrix.
    lpe_file = f'{emb_folder}/LPE.tsv'
    lpe = file_to_matrix(lpe_file)

    #### opt1: node embedding + LPE (added per sample in get_features)
    if PSE == 'LPE':
        lpe_dim = 64
        return emb_dim, lpe_dim, emb, lpe[:, :lpe_dim], "add"

    #### opt2: node embedding + LPE (added here) + GPE (concatenated per sample)
    # NOTE(review): unlike opt1, the full LPE matrix is added without slicing, so its
    # width is presumably 64 — confirm.
    emb = np.add(emb, lpe)

    gpe_dim = 8
    gpe_file = f'{emb_folder}/{dataset}/GEE_Z_U.tsv'
    # Read the GPE file itself (the LPE file was being re-read here by mistake, so
    # "GPE" was silently the first 8 LPE columns).
    gpe = file_to_matrix(gpe_file)

    return emb_dim, gpe_dim, emb, gpe[:, :gpe_dim], "concat"

########################Transformer_ Train&Valid################################
##----adopt from TransformerGO and Modifed for our case------------------------#
def get_max_len_seq(dataset):
    """Return the largest gene-set length over both diseases of every sample.

    Each sample is indexable with its features at position 0, and the features are a
    pair ``[disA_features, disB_features]``. Lengths are read directly from the
    samples, so no (potentially ragged) NumPy array has to be built.

    Returns 0 for an empty ``dataset``.
    """
    max_len = 0
    for sample in dataset:
        features = sample[0]
        max_len = max(max_len, len(features[0]), len(features[1]))
    return max_len

def helper_collate(batch):
    """DataLoader collate function: pad every sequence to the longest one in this batch.

    Reads the notebook-level ``EMB_DIM`` at call time, so that name must be defined
    before the first batch is drawn.
    """
    return transformerGO_collate_fn(batch, get_max_len_seq(batch), EMB_DIM, pytorch_pad = False)

def train(model, iterator, optimizer, criterion,  torch_vers = False):
    """Train ``model`` for one pass over ``iterator``.

    Tensors are moved to the notebook-level ``device``. Returns the mean batch loss,
    the mean batch accuracy, and the ROC AUC of the raw model outputs against the
    labels collected over the whole pass.
    """
    model.train()

    running_loss = 0
    running_acc = 0
    all_scores = []
    all_labels = []

    for batch in prog_bar(iterator):
        optimizer.zero_grad()

        # batch[0]: padded pairs, N * 2(disease pair) * L(longest seq) * Emb dim
        padded_pairs = batch[0].to(device)
        labels = batch[1].to(device)
        mask = batch[2].to(device)

        # One slice per disease of the pair.
        disA_batch, disB_batch = padded_pairs[:,0], padded_pairs[:,1]

        # The native PyTorch transformer expects the sequence axis first.
        if torch_vers:
            disA_batch = disA_batch.permute(1,0,2)
            disB_batch = disB_batch.permute(1,0,2)

        predictions = model(disA_batch, disB_batch, mask[:,0], mask[:,1]).squeeze(1)
        loss = criterion(predictions, labels)
        acc = binary_accuracy(predictions, labels)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        running_acc += acc.item()
        all_scores.extend(predictions.cpu().data.numpy())
        all_labels.extend(labels.cpu().data.numpy())

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches, roc_auc_score(all_labels, all_scores)

def evaluate(model, iterator, criterion, torch_vers = False):
    """Evaluate ``model`` on ``iterator`` without computing gradients.

    Tensors are moved to the notebook-level ``device``. Returns the mean batch loss,
    the mean batch accuracy, the ROC AUC of the raw model outputs, and the collected
    labels and outputs as lists.
    """
    running_loss = 0
    running_acc = 0
    all_scores = []
    all_labels = []

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            # batch[0]: padded pairs, N * 2(disease pair) * L(longest seq) * Emb dim
            padded_pairs = batch[0].to(device)
            labels = batch[1].to(device)
            mask = batch[2].to(device)

            # One slice per disease of the pair.
            disA_batch, disB_batch = padded_pairs[:,0], padded_pairs[:,1]

            # The native PyTorch transformer expects the sequence axis first.
            if torch_vers:
                disA_batch = disA_batch.permute(1,0,2)
                disB_batch = disB_batch.permute(1,0,2)

            predictions = model(disA_batch, disB_batch, mask[:,0], mask[:,1]).squeeze(1)
            loss = criterion(predictions, labels)
            acc = binary_accuracy(predictions, labels)

            running_loss += loss.item()
            running_acc += acc.item()
            all_scores.extend(predictions.cpu().data.numpy())
            all_labels.extend(labels.cpu().data.numpy())

    n_batches = len(iterator)
    return (running_loss / n_batches, running_acc / n_batches,
            roc_auc_score(all_labels, all_scores), all_labels, all_scores)


# Main

In [13]:
# Input files, all relative to the working directory set in the mount cell.
node_file_path = f'{input_folder}/interactom_nodes.txt'   # nodes of the largest connected component of the human interactome
train_file_path = f'{input_folder}/{dataset}/train_set.tsv'
test_file_path = f'{input_folder}/{dataset}/test_set.tsv'


# 1. get graph original nodes (node_gene_dict is the inverse mapping of node_idx_dict)
node_idx_dict = get_gene_idx_dict_from_file(node_file_path)
node_gene_dict = {v:k for k,v in node_idx_dict.items()}

# 2. get selected disease pairs, their labels and the gene list of every disease
train_dis_pairs, train_labels, train_disease_genes_dict = get_disease_sets(train_file_path)
test_dis_pairs, test_labels, test_disease_genes_dict = get_disease_sets(test_file_path)

# 3. get embedding and positional encoding value based on given PSE
emb_dim, pe_dim, emb, pe, combine_opt = getEmbeddingAndVars(PSE)

# 4. prep data and split sets
# NOTE(review): the train/valid split is random and no seed is set in this notebook,
# so the split (and the reported metrics) change from run to run.
train_origin_set = Dataset_torch(train_dis_pairs, train_labels, train_disease_genes_dict, node_idx_dict, emb, pe, combine_opt)
ratio = [0.9, 0.1] # [train, valid] fractions; alternative: [0.8, 0.2]
train_set, valid_set = split_train_valid(train_origin_set, ratio)
test_set = Dataset_torch(test_dis_pairs, test_labels, test_disease_genes_dict, node_idx_dict, emb,  pe, combine_opt)


## Set Transformer and Parameters

In [14]:
# Allocator options are only picked up when PyTorch initialises its CUDA caching
# allocator, so this must be in the environment before the first CUDA allocation
# (i.e. before `model.to(device)` below) to have any effect.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

params = {'batch_size': 20,'collate_fn': helper_collate}

# Width of the per-gene feature vectors produced by get_features:
# "concat" appends the pe_dim encoding columns, while "add" (or no encoding at all)
# keeps the embedding width.
if combine_opt == "concat":
  EMB_DIM = emb_dim + pe_dim  # (N2V + LPE) concat GPE
else:
  EMB_DIM = emb_dim  # N2V, or (N2V + LPE)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# get_device_name() raises when no GPU is present, so only query it on CUDA.
device_name = torch.cuda.get_device_name(0) if device.type == 'cuda' else 'CPU'
print("Device available: ", device, " ", device_name)

MODEL_SIZE = EMB_DIM
NR_HEADS = 8
NR_LAYERS = 3
DROPOUT = 0.2
SIZE_FF = 4 * MODEL_SIZE
LR = 0.0001

# An AssertionError here means MODEL_SIZE (= EMB_DIM) is not divisible by NR_HEADS;
# MultiHeadedAttention requires d_model % h == 0.
model = TransformerDis(MODEL_SIZE, NR_HEADS, NR_LAYERS, SIZE_FF, DROPOUT)

model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)
# Loss on raw logits: the sigmoid is applied inside BCEWithLogitsLoss.
criterion = nn.BCEWithLogitsLoss().to(device)

pytorch_total_params = sum(p.numel() for p in model.parameters())
print(pytorch_total_params)

# Release cached GPU memory left over from previous runs of this cell.
torch.cuda.empty_cache()

# Mixed-precision helpers.
# NOTE(review): train()/evaluate() in this notebook never use `scaler` or `autocast`,
# so training currently runs in full precision.
from torch.cuda.amp import GradScaler, autocast

scaler = GradScaler()


# Checkpoint file for the best model of this configuration.
model_name = f"{PSE}model{dataset}.pt"

print("Train set: ", len(train_set), '\n', "Valid set: ", len(valid_set), '\n', "Test set: ", len(test_set), '\n')
train_grt = data.DataLoader(train_set, **params, shuffle = True)
val_grt = data.DataLoader(valid_set, **params, shuffle = True)
test_grt = data.DataLoader(test_set, **params, shuffle = False)


Device available:  cuda   Tesla T4
350529
Train set:  8702 
 Valid set:  967 
 Test set:  1074 



  scaler = GradScaler()


## Run the Model

In [None]:
# Train for N_EPOCHS and checkpoint the weights with the best validation ROC AUC.
writer = SummaryWriter(flush_secs=14)
N_EPOCHS = 30
best_roc_val = float('-inf')

try:
    for epoch in range(N_EPOCHS):
        start_time = time.time()
        train_loss, train_acc, roc_train = train(model, train_grt, optimizer, criterion, torch_vers = False)
        valid_loss, valid_acc, roc_val, _, _ = evaluate(model, val_grt, criterion, torch_vers = False)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Keep only the checkpoint that generalises best so far.
        if best_roc_val < roc_val:
            best_roc_val = roc_val
            torch.save(model.state_dict(),  model_name)

        print_status(epoch, epoch_mins, epoch_secs, train_loss,\
                     train_acc, valid_loss, valid_acc, roc_train, roc_val, optimizer)
        write_scalars_tensorboard(writer, train_loss, valid_loss, train_acc, valid_acc, epoch)
finally:
    # Flush and close the event file even when training is interrupted, so the
    # scalars logged so far are not lost.
    writer.close()


100%|██████████| 436/436 [04:12<00:00,  1.72it/s]


Epoch: 01 | Epoch Time: 4m 30s 	Train Loss: 0.681 | Train Acc: 57.98% 	 Val. Loss: 0.670 |  Val. Acc: 60.32% 	 Roc Train: 0.509 	 Roc Valid: 0.551 ,   0.0001 --LR

100%|██████████| 436/436 [04:02<00:00,  1.80it/s]


Epoch: 02 | Epoch Time: 4m 19s 	Train Loss: 0.676 | Train Acc: 58.29% 	 Val. Loss: 0.666 |  Val. Acc: 63.00% 	 Roc Train: 0.551 	 Roc Valid: 0.624 ,   0.0001 --LR

100%|██████████| 436/436 [04:13<00:00,  1.72it/s]


Epoch: 03 | Epoch Time: 4m 33s 	Train Loss: 0.662 | Train Acc: 60.24% 	 Val. Loss: 0.650 |  Val. Acc: 61.90% 	 Roc Train: 0.615 	 Roc Valid: 0.652 ,   0.0001 --LR

100%|██████████| 436/436 [04:03<00:00,  1.79it/s]


Epoch: 04 | Epoch Time: 4m 23s 	Train Loss: 0.643 | Train Acc: 63.22% 	 Val. Loss: 0.629 |  Val. Acc: 65.45% 	 Roc Train: 0.660 	 Roc Valid: 0.682 ,   0.0001 --LR

100%|██████████| 436/436 [04:08<00:00,  1.76it/s]


Epoch: 05 | Epoch Time: 4m 29s 	Train Loss: 0.629 | Train Acc: 66.03% 	 Val. Loss: 0.612 |  Val. Acc: 66.65% 	 Roc Train: 0.693 	 Roc Valid: 0.703 ,   0.0001 --LR

100%|██████████| 436/436 [04:14<00:00,  1.71it/s]


Epoch: 06 | Epoch Time: 4m 33s 	Train Loss: 0.615 | Train Acc: 66.59% 	 Val. Loss: 0.611 |  Val. Acc: 65.04% 	 Roc Train: 0.709 	 Roc Valid: 0.718 ,   0.0001 --LR

100%|██████████| 436/436 [04:14<00:00,  1.71it/s]


Epoch: 07 | Epoch Time: 4m 35s 	Train Loss: 0.599 | Train Acc: 68.12% 	 Val. Loss: 0.593 |  Val. Acc: 69.21% 	 Roc Train: 0.731 	 Roc Valid: 0.741 ,   0.0001 --LR

100%|██████████| 436/436 [04:16<00:00,  1.70it/s]


Epoch: 08 | Epoch Time: 4m 35s 	Train Loss: 0.587 | Train Acc: 69.59% 	 Val. Loss: 0.583 |  Val. Acc: 69.83% 	 Roc Train: 0.746 	 Roc Valid: 0.744 ,   0.0001 --LR

100%|██████████| 436/436 [04:15<00:00,  1.71it/s]


Epoch: 09 | Epoch Time: 4m 35s 	Train Loss: 0.581 | Train Acc: 70.23% 	 Val. Loss: 0.561 |  Val. Acc: 70.64% 	 Roc Train: 0.754 	 Roc Valid: 0.768 ,   0.0001 --LR

100%|██████████| 436/436 [04:07<00:00,  1.76it/s]


Epoch: 10 | Epoch Time: 4m 26s 	Train Loss: 0.571 | Train Acc: 71.34% 	 Val. Loss: 0.568 |  Val. Acc: 70.74% 	 Roc Train: 0.764 	 Roc Valid: 0.767 ,   0.0001 --LR

100%|██████████| 436/436 [04:05<00:00,  1.78it/s]


Epoch: 11 | Epoch Time: 4m 25s 	Train Loss: 0.562 | Train Acc: 72.02% 	 Val. Loss: 0.555 |  Val. Acc: 71.17% 	 Roc Train: 0.772 	 Roc Valid: 0.778 ,   0.0001 --LR

100%|██████████| 436/436 [04:13<00:00,  1.72it/s]


Epoch: 12 | Epoch Time: 4m 32s 	Train Loss: 0.559 | Train Acc: 71.74% 	 Val. Loss: 0.551 |  Val. Acc: 71.66% 	 Roc Train: 0.776 	 Roc Valid: 0.777 ,   0.0001 --LR

100%|██████████| 436/436 [04:10<00:00,  1.74it/s]


Epoch: 13 | Epoch Time: 4m 28s 	Train Loss: 0.551 | Train Acc: 72.17% 	 Val. Loss: 0.569 |  Val. Acc: 70.25% 	 Roc Train: 0.783 	 Roc Valid: 0.774 ,   0.0001 --LR

100%|██████████| 436/436 [04:12<00:00,  1.73it/s]


Epoch: 14 | Epoch Time: 4m 33s 	Train Loss: 0.549 | Train Acc: 72.87% 	 Val. Loss: 0.564 |  Val. Acc: 70.20% 	 Roc Train: 0.786 	 Roc Valid: 0.777 ,   0.0001 --LR

100%|██████████| 436/436 [04:09<00:00,  1.75it/s]


Epoch: 15 | Epoch Time: 4m 25s 	Train Loss: 0.545 | Train Acc: 72.78% 	 Val. Loss: 0.542 |  Val. Acc: 70.64% 	 Roc Train: 0.789 	 Roc Valid: 0.787 ,   0.0001 --LR

100%|██████████| 436/436 [04:10<00:00,  1.74it/s]


Epoch: 16 | Epoch Time: 4m 27s 	Train Loss: 0.538 | Train Acc: 72.82% 	 Val. Loss: 0.542 |  Val. Acc: 72.97% 	 Roc Train: 0.797 	 Roc Valid: 0.789 ,   0.0001 --LR

100%|██████████| 436/436 [04:09<00:00,  1.74it/s]


Epoch: 17 | Epoch Time: 4m 27s 	Train Loss: 0.534 | Train Acc: 73.22% 	 Val. Loss: 0.528 |  Val. Acc: 72.27% 	 Roc Train: 0.800 	 Roc Valid: 0.798 ,   0.0001 --LR

100%|██████████| 436/436 [04:05<00:00,  1.77it/s]


Epoch: 18 | Epoch Time: 4m 25s 	Train Loss: 0.531 | Train Acc: 73.21% 	 Val. Loss: 0.536 |  Val. Acc: 72.35% 	 Roc Train: 0.802 	 Roc Valid: 0.788 ,   0.0001 --LR

100%|██████████| 436/436 [04:11<00:00,  1.73it/s]


Epoch: 19 | Epoch Time: 4m 30s 	Train Loss: 0.530 | Train Acc: 73.00% 	 Val. Loss: 0.529 |  Val. Acc: 73.09% 	 Roc Train: 0.805 	 Roc Valid: 0.799 ,   0.0001 --LR

100%|██████████| 436/436 [04:10<00:00,  1.74it/s]


Epoch: 20 | Epoch Time: 4m 29s 	Train Loss: 0.525 | Train Acc: 73.67% 	 Val. Loss: 0.544 |  Val. Acc: 72.71% 	 Roc Train: 0.807 	 Roc Valid: 0.794 ,   0.0001 --LR

100%|██████████| 436/436 [04:09<00:00,  1.75it/s]


Epoch: 21 | Epoch Time: 4m 29s 	Train Loss: 0.523 | Train Acc: 73.60% 	 Val. Loss: 0.531 |  Val. Acc: 73.79% 	 Roc Train: 0.809 	 Roc Valid: 0.797 ,   0.0001 --LR

100%|██████████| 436/436 [04:13<00:00,  1.72it/s]


Epoch: 22 | Epoch Time: 4m 33s 	Train Loss: 0.522 | Train Acc: 73.61% 	 Val. Loss: 0.519 |  Val. Acc: 72.29% 	 Roc Train: 0.812 	 Roc Valid: 0.806 ,   0.0001 --LR

100%|██████████| 436/436 [04:15<00:00,  1.71it/s]


Epoch: 23 | Epoch Time: 4m 35s 	Train Loss: 0.518 | Train Acc: 74.06% 	 Val. Loss: 0.540 |  Val. Acc: 71.85% 	 Roc Train: 0.814 	 Roc Valid: 0.797 ,   0.0001 --LR

100%|██████████| 436/436 [04:05<00:00,  1.77it/s]


Epoch: 24 | Epoch Time: 4m 24s 	Train Loss: 0.513 | Train Acc: 73.94% 	 Val. Loss: 0.532 |  Val. Acc: 72.80% 	 Roc Train: 0.818 	 Roc Valid: 0.809 ,   0.0001 --LR

100%|██████████| 436/436 [04:10<00:00,  1.74it/s]


Epoch: 25 | Epoch Time: 4m 29s 	Train Loss: 0.511 | Train Acc: 74.06% 	 Val. Loss: 0.541 |  Val. Acc: 72.07% 	 Roc Train: 0.818 	 Roc Valid: 0.800 ,   0.0001 --LR

100%|██████████| 436/436 [04:08<00:00,  1.76it/s]


Epoch: 26 | Epoch Time: 4m 27s 	Train Loss: 0.511 | Train Acc: 74.44% 	 Val. Loss: 0.521 |  Val. Acc: 73.50% 	 Roc Train: 0.819 	 Roc Valid: 0.807 ,   0.0001 --LR

100%|██████████| 436/436 [04:04<00:00,  1.78it/s]


Epoch: 27 | Epoch Time: 4m 21s 	Train Loss: 0.506 | Train Acc: 74.45% 	 Val. Loss: 0.521 |  Val. Acc: 73.29% 	 Roc Train: 0.824 	 Roc Valid: 0.807 ,   0.0001 --LR

100%|██████████| 436/436 [04:15<00:00,  1.70it/s]


Epoch: 28 | Epoch Time: 4m 35s 	Train Loss: 0.506 | Train Acc: 74.92% 	 Val. Loss: 0.528 |  Val. Acc: 73.29% 	 Roc Train: 0.826 	 Roc Valid: 0.804 ,   0.0001 --LR

100%|██████████| 436/436 [04:29<00:00,  1.62it/s]


Epoch: 29 | Epoch Time: 4m 48s 	Train Loss: 0.496 | Train Acc: 75.48% 	 Val. Loss: 0.509 |  Val. Acc: 73.99% 	 Roc Train: 0.832 	 Roc Valid: 0.814 ,   0.0001 --LR

100%|██████████| 436/436 [04:26<00:00,  1.64it/s]


Epoch: 30 | Epoch Time: 4m 45s 	Train Loss: 0.496 | Train Acc: 74.93% 	 Val. Loss: 0.517 |  Val. Acc: 72.78% 	 Roc Train: 0.830 	 Roc Valid: 0.820 ,   0.0001 --LR

In [None]:
# Evaluate the best checkpoint on the held-out test set and append the metrics to a results file.

model = TransformerDis(MODEL_SIZE, NR_HEADS, NR_LAYERS, SIZE_FF, DROPOUT)

# map_location lets a checkpoint saved on GPU be loaded on whatever device is in use now.
model.load_state_dict(torch.load(model_name, map_location=device))
model = model.to(device)

EMB_method = 'N2V'

# Evaluate before opening the results file so a failure here leaves no partial record.
# These are TEST metrics: they get their own names (the validation metrics of the last
# epoch are not overwritten) and are labelled as such in the file.
test_loss, test_acc, roc_test, lab, pred = evaluate(model, test_grt, criterion, torch_vers = False)

with open(f"{dataset}_{EMB_method}_{PSE}_training-results.txt", "a") as myfile:
    myfile.write(f"\n ### {model_name} ### \n")
    myfile.write(f"\n ### EMB: {EMB_method}; DIM: {EMB_DIM} ### \n")
    myfile.write(f"Train set: {len(train_set)}, Valid set: {len(valid_set)}, Test set: {len(test_set)} \n")
    myfile.write(f" \n test_loss: {test_loss}, test_acc: {test_acc}, roc_test: {roc_test} \n")