In [None]:
# Install required packages.
import os
import torch
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

1.13.0+cu116
[K     |████████████████████████████████| 9.4 MB 6.8 MB/s 
[K     |████████████████████████████████| 4.6 MB 7.3 MB/s 
[K     |████████████████████████████████| 512 kB 9.6 MB/s 
[K     |████████████████████████████████| 280 kB 63.6 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone


In [1]:
%cd drive/MyDrive/altegrad

/content/drive/MyDrive/altegrad


## load graph features

In [2]:
import csv
import time
import numpy as np
import scipy.sparse as sp
from sklearn.metrics import accuracy_score, log_loss

from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

def load_data(dir_name, load_edge_feature=False): 
    """
    Function that loads graphs
    """  
    graph_indicator = np.loadtxt(dir_name+"graph_indicator.txt", dtype=np.int64)
    _,graph_size = np.unique(graph_indicator, return_counts=True)
    
    edges = np.loadtxt(dir_name+"edgelist.txt", dtype=np.int64, delimiter=",")
    A = sp.csr_matrix((np.ones(edges.shape[0]), (edges[:,0], edges[:,1])), shape=(graph_indicator.size, graph_indicator.size))
    A += A.T
    
    x = np.loadtxt(dir_name+"node_attributes.txt", delimiter=",")
    if load_edge_feature:
      edge_attr = np.loadtxt(dir_name+"edge_attributes.txt", delimiter=",")
    
    adj = []
    features = []
    edge_features = []
    idx_n = 0
    idx_m = 0
    for i in range(graph_size.size):
        adj.append(A[idx_n:idx_n+graph_size[i],idx_n:idx_n+graph_size[i]])
        if load_edge_feature:
          edge_features.append(edge_attr[idx_m:idx_m+adj[i].nnz,:])
        features.append(x[idx_n:idx_n+graph_size[i],:])
        idx_n += graph_size[i]
        idx_m += adj[i].nnz

    return adj, features, edge_features, graph_size

In [9]:
# Load graphs
adj, features, edge_features, graphsize = load_data('./data/raw/', True) 

In [10]:
len(adj),len(features),len(edge_features)

(6111, 6111, 6111)

In [None]:
adj[0]

<327x327 sparse matrix of type '<class 'numpy.float64'>'
	with 6233 stored elements in Compressed Sparse Row format>

In [None]:
# Normalize adjacency matrices
adj = [normalize_adjacency(A) for A in adj]

# Split data into training and test sets
adj_train = list()
features_train = list()
y_train = list()
adj_test = list()
features_test = list()
proteins_test = list()
with open('./data/graph_labels.txt', 'r') as f:
    for i,line in enumerate(f):
        t = line.split(',')
        if len(t[1][:-1]) == 0:
            proteins_test.append(t[0])
            adj_test.append(adj[i])
            features_test.append(features[i])
        else:
            adj_train.append(adj[i])
            features_train.append(features[i])
            y_train.append(int(t[1][:-1]))

# Initialize device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


In [None]:
# Hyperparameters
epochs = 100
batch_size = 64
n_hidden = 64
n_input = 86
dropout = 0.2
learning_rate = 0.001
n_class = 18

# Compute number of training and test samples

N_train = int(len(adj_train) * 0.9)
N_valid = len(adj_train) - N_train
N_test = len(adj_test)

# Initializes model and optimizer
model = GNN(n_input, n_hidden, dropout, n_class).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_function = nn.CrossEntropyLoss()


In [None]:
def batch_loader()

In [None]:
PATH = 'model.pt'
# checkpoint = torch.load(PATH)
# if checkpoint:
#   model.load_state_dict(checkpoint['model_state_dict'])
#   optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# Train model
for epoch in range(epochs):
    t = time.time()
    model.train()
    train_loss = 0
    correct = 0
    count = 0
    # Iterate over the batches
    for i in range(0, N_train, batch_size):
        adj_batch = list()
        features_batch = list()
        idx_batch = list()
        y_batch = list()
        
        # Create tensors
        for j in range(i, min(N_train, i+batch_size)):
            n = adj_train[j].shape[0]
            adj_batch.append(adj_train[j]+sp.identity(n))
            features_batch.append(features_train[j])
            idx_batch.extend([j-i]*n)
            y_batch.append(y_train[j])
            
        adj_batch = sp.block_diag(adj_batch)
        features_batch = np.vstack(features_batch)

        adj_batch = sparse_mx_to_torch_sparse_tensor(adj_batch).to(device)
        features_batch = torch.FloatTensor(features_batch).to(device)
        idx_batch = torch.LongTensor(idx_batch).to(device)
        y_batch = torch.LongTensor(y_batch).to(device)
        
        optimizer.zero_grad()
        output,_ = model(features_batch, adj_batch, idx_batch)
        loss = loss_function(output, y_batch)
        train_loss += loss.item() * output.size(0)
        count += output.size(0)
        preds = output.max(1)[1].type_as(y_batch)
        correct += torch.sum(preds.eq(y_batch).double())
        loss.backward()
        optimizer.step()
    output,_ = model(features_batch, adj_batch, idx_batch)
    loss = loss_function(output, y_batch)
    # validation phase
    with torch.no_grad():
      valid_loss = 0
      for k in range(N_train, N_valid+N_train, batch_size):
        adj_batch = list()
        idx_batch = list()
        features_batch = list()
        y_batch = list()
        
        # Create tensors
        for j in range(k, min(N_valid+N_train, k+batch_size)):
            n = adj_train[j].shape[0]
            adj_batch.append(adj_train[j]+sp.identity(n))
            features_batch.append(features_train[j])
            idx_batch.extend([j-k]*n)
            y_batch.append(y_train[j])
            
        adj_batch = sp.block_diag(adj_batch)
        features_batch = np.vstack(features_batch)
        adj_batch = sparse_mx_to_torch_sparse_tensor(adj_batch).to(device)

        features_batch = torch.FloatTensor(features_batch).to(device)
        idx_batch = torch.LongTensor(idx_batch).to(device)
        y_batch = torch.LongTensor(y_batch).to(device)

        output,_ = model(features_batch, adj_batch, idx_batch)
        loss = loss_function(output, y_batch)
        valid_loss += loss.item() * output.size(0)
    if epoch % 1 == 0:
        print('Epoch: {:03d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(train_loss / count),
              'acc_train: {:.4f}'.format(correct / count),
              'time: {:.4f}s'.format(time.time() - t),
              'loss_valid: {:.4f}'.format(valid_loss / N_valid),
              )

Epoch: 001 loss_train: 2.0624 acc_train: 0.3674 time: 6.6509s loss_valid: 2.2012
Epoch: 002 loss_train: 2.0213 acc_train: 0.3844 time: 6.5583s loss_valid: 2.1661
Epoch: 003 loss_train: 2.0096 acc_train: 0.3860 time: 6.2304s loss_valid: 2.1446
Epoch: 004 loss_train: 1.9793 acc_train: 0.3921 time: 6.3277s loss_valid: 2.1116
Epoch: 005 loss_train: 1.9531 acc_train: 0.3942 time: 6.3016s loss_valid: 2.0956
Epoch: 006 loss_train: 1.9310 acc_train: 0.3992 time: 6.3660s loss_valid: 2.0915
Epoch: 007 loss_train: 1.9106 acc_train: 0.4058 time: 6.5121s loss_valid: 2.0810
Epoch: 008 loss_train: 1.8946 acc_train: 0.4185 time: 6.5006s loss_valid: 2.0547
Epoch: 009 loss_train: 1.8994 acc_train: 0.4062 time: 6.5079s loss_valid: 2.0639
Epoch: 010 loss_train: 1.8767 acc_train: 0.4130 time: 7.5507s loss_valid: 2.0507
Epoch: 011 loss_train: 1.8660 acc_train: 0.4162 time: 6.5605s loss_valid: 2.0451
Epoch: 012 loss_train: 1.8369 acc_train: 0.4262 time: 6.5438s loss_valid: 2.0312
Epoch: 013 loss_train: 1.837

KeyboardInterrupt: ignored

In [None]:
adj_batch

[]

In [None]:
PATH = 'model.pt'
torch.save({
            'epoch': epochs,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss,
            }, PATH)

In [None]:
# Evaluate model
model.eval()
y_pred_proba = list()
train_embeddings= []
test_embeddings = []

# Iterate over the batches
for i in range(0, N_train, batch_size):
    adj_batch = list()
    features_batch = list()
    idx_batch = list()
    y_batch = list()
    
    # Create tensors
    for j in range(i, min(N_train, i+batch_size)):
        n = adj_train[j].shape[0]
        adj_batch.append(adj_train[j]+sp.identity(n))
        features_batch.append(features_train[j])
        idx_batch.extend([j-i]*n)
        y_batch.append(y_train[j])
        
    adj_batch = sp.block_diag(adj_batch)
    features_batch = np.vstack(features_batch)

    adj_batch = sparse_mx_to_torch_sparse_tensor(adj_batch).to(device)
    features_batch = torch.FloatTensor(features_batch).to(device)
    idx_batch = torch.LongTensor(idx_batch).to(device)
    y_batch = torch.LongTensor(y_batch).to(device)
    
    _, embedding = model(features_batch, adj_batch, idx_batch)
    train_embeddings.append(embedding)

y_pred_proba = list()
# Iterate over the batches
for i in range(0, N_test, batch_size):
    adj_batch = list()
    idx_batch = list()
    features_batch = list()
    y_batch = list()
    
    # Create tensors
    for j in range(i, min(N_test, i+batch_size)):
        n = adj_test[j].shape[0]
        adj_batch.append(adj_test[j]+sp.identity(n))
        features_batch.append(features_test[j])
        idx_batch.extend([j-i]*n)
        
    adj_batch = sp.block_diag(adj_batch)
    features_batch = np.vstack(features_batch)

    adj_batch = sparse_mx_to_torch_sparse_tensor(adj_batch).to(device)
    features_batch = torch.FloatTensor(features_batch).to(device)
    idx_batch = torch.LongTensor(idx_batch).to(device)

    output, embedding = model(features_batch, adj_batch, idx_batch)
    test_embeddings.append(embedding)
    y_pred_proba.append(output)
    
y_pred_proba = torch.cat(y_pred_proba, dim=0)
y_pred_proba = torch.exp(y_pred_proba)
y_pred_proba = y_pred_proba.detach().cpu().numpy()
    

In [None]:
# Write predictions to a file
with open('structural_naive_700_epochs.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    lst = list()
    for i in range(18):
        lst.append('class'+str(i))
    lst.insert(0, "name")
    writer.writerow(lst)
    for i, protein in enumerate(proteins_test):
        lst = y_pred_proba[i,:].tolist()
        lst.insert(0, protein)
        writer.writerow(lst) 

In [None]:
np.save('./embedding/gnn_train_embedding', train_embeddings)
np.save('./embedding/gnn_test_embedding', test_embeddings)