In [1]:
import numpy as np

import torch
from torch import nn
import torch.nn.functional as F

from torch_geometric.datasets import TUDataset
import torch_geometric.transforms as T
from torch_geometric.data import DataLoader, NeighborSampler

from sklearn.metrics import accuracy_score

In [2]:
# Load the PROTEINS graph-classification benchmark; files are stored under
# ./data/dense_prot (the cell output shows them being downloaded on first use).
dataset = TUDataset(root='./data/dense_prot', name='PROTEINS')

# Number of graphs in the dataset.
print(len(dataset))

Downloading https://www.chrsmrrs.com/graphkerneldatasets/PROTEINS.zip
Extracting data/dense_prot/PROTEINS/PROTEINS.zip
Processing...
Done!
1113


In [3]:
# Fix the RNG state before shuffling so that the train/validation split is the
# same after every "Restart Kernel -> Run All" (the shuffle draws its
# permutation from PyTorch's global generator -- see the PyG docs).
SEED = 0
VAL_SIZE = 300  # number of graphs held out for validation

torch.manual_seed(SEED)

# NOTE: `dataset` is rebound to its shuffled version, so re-running only this
# cell shuffles an already shuffled dataset; restart and run all for the
# canonical split.
dataset = dataset.shuffle()

val_dataset = dataset[:VAL_SIZE]
train_dataset = dataset[VAL_SIZE:]

# The two slices must partition the dataset without overlap or loss.
assert len(val_dataset) + len(train_dataset) == len(dataset)

In [5]:
# Sanity check: shuffling reorders the graphs but must not change their count.
len(dataset)

1113

In [6]:
BATCH_SIZE = 100  # graphs per mini-batch

# Reshuffle the training graphs at every epoch; without `shuffle=True` each
# epoch would replay exactly the same mini-batches in exactly the same order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Accuracy does not depend on batch order, so validation stays unshuffled.
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE)

In [7]:
# Pull a single mini-batch from the training loader so its fields can be
# inspected in the following cells.
batch = next(iter(train_loader))

In [10]:
# `batch.batch` holds, for every node, the index of the graph (within this
# mini-batch) that the node belongs to; show it for the first 1500 nodes.
batch.batch[:1500]

tensor([ 0,  0,  0,  ..., 26, 26, 26])

In [11]:
# Edge list of the whole mini-batch as a two-row tensor (one row per edge
# endpoint); node ids run over all nodes of the batch, not per graph.
batch.edge_index

tensor([[   0,    0,    1,  ..., 3975, 3975, 3975],
        [   1,    7,    0,  ..., 3968, 3969, 3974]])

In [14]:
# Graph-level targets: one class label per graph in the mini-batch.
batch.y

tensor([0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
        1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 0, 0, 1])

In [12]:
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_max_pool

In [15]:
class ProteinClassifier(nn.Module):
    """Graph classifier: two GCN layers, global max pooling, linear head.

    Args:
        in_channels: Size of each input node feature vector. Defaults to
            ``dataset.num_features`` of the notebook-level ``dataset``.
        hidden_channels: Output size of the first GCN layer.
        out_channels: Output size of the second GCN layer, which is also the
            size of the pooled per-graph embedding.
        num_classes: Number of output logits per graph. Defaults to
            ``dataset.num_classes`` of the notebook-level ``dataset``.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=64,
                 num_classes=None):
        super().__init__()

        # Fall back to the notebook-level dataset so that the original
        # zero-argument construction `ProteinClassifier()` keeps working.
        if in_channels is None:
            in_channels = dataset.num_features
        if num_classes is None:
            num_classes = dataset.num_classes

        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.fc = torch.nn.Linear(out_channels, num_classes)

    def forward(self, data):
        """Return unnormalised class scores for every graph in ``data``.

        Args:
            data: Mini-batch object exposing ``x`` (node features),
                ``edge_index`` (edge list) and ``batch`` (node-to-graph
                assignment).

        Returns:
            Tensor of logits with shape ``[num_graphs, num_classes]``.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch

        h = self.conv1(x, edge_index)  # [num_nodes_in_batch, hidden_channels]
        h = F.relu(h)

        h = self.conv2(h, edge_index)  # [num_nodes_in_batch, out_channels]
        h = F.relu(h)

        # Collapse the node embeddings of each graph into one vector.
        h = global_max_pool(h, batch)  # [num_graphs, out_channels]

        return self.fc(h)  # [num_graphs, num_classes]

In [16]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [17]:
# Build the classifier, then move its parameters to the selected device.
model = ProteinClassifier()
model = model.to(device)

In [18]:
# Adam over all model parameters; no hyperparameters are passed, so the
# library defaults (learning rate etc.) apply.
opt = torch.optim.Adam(model.parameters())

In [19]:
def train_one_epoch(dataloader=train_loader):
    """Run one optimisation pass over ``dataloader``.

    Uses the notebook-level ``model``, ``opt`` and ``device``.

    Args:
        dataloader: Iterable of mini-batches. The default is bound when this
            cell is executed, so re-run the cell after rebuilding
            ``train_loader`` (or pass the loader explicitly).

    Returns:
        The unweighted mean of the per-batch cross-entropy losses, or ``nan``
        when ``dataloader`` yields no batches.
    """
    # Be explicit about the mode so layers such as dropout or batch norm
    # behave correctly if they are ever added to the model.
    model.train()

    losses = []

    for data in dataloader:
        data = data.to(device)

        # Clear gradients *before* the backward pass: stale gradients left by
        # an interrupted cell can then never leak into this update.
        opt.zero_grad()

        out = model(data)  # [num_graphs, num_classes]
        loss = F.cross_entropy(out, data.y)

        loss.backward()
        opt.step()

        losses.append(loss.item())

    if not losses:
        # Same value numpy would produce, without its "mean of empty slice"
        # runtime warning.
        return float('nan')

    return np.array(losses).mean()

In [20]:
@torch.no_grad()
def evaluate(dataloader=val_loader):
    """Compute classification accuracy of the notebook-level ``model``.

    Args:
        dataloader: Iterable of mini-batches. The default is bound when this
            cell is executed, so re-run the cell after rebuilding
            ``val_loader`` (or pass the loader explicitly).

    Returns:
        The value of ``accuracy_score`` over all graphs yielded by
        ``dataloader``.
    """
    true_y = []
    pred_y = []

    # Evaluate in inference mode, then restore whatever mode the model was in
    # so that callers which never call `model.train()` are unaffected.
    was_training = model.training
    model.eval()
    try:
        for data in dataloader:
            data = data.to(device)

            out = model(data)  # [num_graphs, num_classes]

            # Predicted class = index of the largest logit.
            pred_y.extend(out.argmax(dim=1).cpu().tolist())
            true_y.extend(data.y.cpu().tolist())
    finally:
        model.train(was_training)

    return accuracy_score(true_y, pred_y)

In [21]:
# Alternate one training pass with one validation pass and log both metrics
# after every epoch.
NUM_EPOCHS = 100

for epoch in range(NUM_EPOCHS):
    loss = train_one_epoch(train_loader)
    val_acc = evaluate(val_loader)
    print(f'train_loss = {loss: .2}\t val_acc={val_acc: .2}')

train_loss =  0.7	 val_acc= 0.66
train_loss =  0.67	 val_acc= 0.59
train_loss =  0.66	 val_acc= 0.59
train_loss =  0.65	 val_acc= 0.59
train_loss =  0.65	 val_acc= 0.59
train_loss =  0.65	 val_acc= 0.59
train_loss =  0.64	 val_acc= 0.59
train_loss =  0.64	 val_acc= 0.59
train_loss =  0.63	 val_acc= 0.59
train_loss =  0.63	 val_acc= 0.6
train_loss =  0.63	 val_acc= 0.61
train_loss =  0.62	 val_acc= 0.62
train_loss =  0.62	 val_acc= 0.63
train_loss =  0.62	 val_acc= 0.65
train_loss =  0.61	 val_acc= 0.66
train_loss =  0.61	 val_acc= 0.64
train_loss =  0.6	 val_acc= 0.65
train_loss =  0.6	 val_acc= 0.65
train_loss =  0.6	 val_acc= 0.66
train_loss =  0.59	 val_acc= 0.67
train_loss =  0.58	 val_acc= 0.68
train_loss =  0.58	 val_acc= 0.7
train_loss =  0.57	 val_acc= 0.71
train_loss =  0.57	 val_acc= 0.71
train_loss =  0.56	 val_acc= 0.71
train_loss =  0.56	 val_acc= 0.72
train_loss =  0.55	 val_acc= 0.72
train_loss =  0.55	 val_acc= 0.73
train_loss =  0.55	 val_acc= 0.73
train_loss =  0.54	 