In [1]:
import numpy as np
import random
from tqdm.auto import tqdm
import warnings; warnings.filterwarnings("ignore")

In [2]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv
from torch_geometric.data import Data
from torch_geometric.loader import NeighborLoader
from torch_geometric.datasets import CoraFull

#### Construct data(sample PyG data)

In [3]:
data = CoraFull('../tmp')

In [4]:
g = data[0]

In [5]:
g

Data(x=[19793, 8710], edge_index=[2, 126842], y=[19793])

In [6]:
print(g.x.shape)
g.x

torch.Size([19793, 8710])


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [7]:
print(g.y.shape)
g.y

torch.Size([19793])


tensor([ 0,  0,  0,  ..., 52, 59, 55])

In [8]:
num_classes = len(np.unique(g.y))
num_classes

70

In [9]:
# g.edge_weight = torch.tensor([1. for i in range(g.num_edges)]); g.edge_weight

In [10]:
g.edge_index

tensor([[    0,     0,     0,  ..., 19791, 19791, 19792],
        [ 1227,  4021,  4056,  ...,  5100, 10850,  2947]])

#### Construct data(custom data)

In [11]:
# data = Data(x=x, edge_index=edge_index, y=y, edge_weight=edge_weight)

In [12]:
# g = data[0]

#### Data split

In [13]:
def data_split(gdata, val_ratio=0.2, test_ratio=0.2):
    
    # gdata = data[0]
    
    gidx = list(range(gdata.num_nodes))
    num_val = round(gdata.num_nodes * val_ratio)
    num_test = round(gdata.num_nodes * test_ratio)
    num_train = gdata.num_nodes - num_val - num_test
    print(f'num_train: {num_train}\nnum_val: {num_val}\nnum_test: {num_test}')
    if num_val + num_test + num_train == gdata.num_nodes: 
        print('Sum of splited data is equal to original data: True')
    else:
        print('Error occured in data split')
        
    random.shuffle(gidx)
    val_idx = gidx[:num_val]
    test_idx = gidx[num_val:num_val+num_test]
    train_idx = gidx[num_val+num_test:]
    
    train_mask = torch.zeros(gdata.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(gdata.num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(gdata.num_nodes, dtype=torch.bool)
    
    train_mask[train_idx] = True
    val_mask[val_idx] = True
    test_mask[test_idx] = True
    
    gdata.train_mask = train_mask
    gdata.val_mask = val_mask
    gdata.test_mask = test_mask
    gdata.num_classes = len(np.unique(gdata.y))
    
    return gdata

In [14]:
data_ = data_split(data[0], val_ratio=0.2, test_ratio=0.2)

num_train: 11875
num_val: 3959
num_test: 3959
Sum of splited data is equal to original data: True


#### Define model

In [25]:
dim_h= 128
lr = 0.005
weight_decay = 5e-4
dropout = 0.5
epochs = 100
batch_size = 128

- If edge weight exists

In [26]:
# class GCN(torch.nn.Module):
#   def __init__(self, dim_in, dim_h, dim_out):
#     super().__init__()
#     self.gcn1 = GCNConv(dim_in, dim_h)
#     self.gcn2 = GCNConv(dim_h, dim_out)

#   def forward(self, x, edge_index, edge_weight):
#     h = self.gcn1(x, edge_index, edge_weight).relu()
#     h = F.dropout(h, p=dropout, training=self.training)
#     h = self.gcn2(h, edge_index, edge_weight)
#     return h, F.log_softmax(h, dim=1)

- If edge weight not exists

In [27]:
# You can choose GNN model from: https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html
class GCN(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out):
        super().__init__()
        self.gcn1 = GCNConv(dim_in, dim_h)
        self.gcn2 = GCNConv(dim_h, dim_out)
    
    def forward(self, x, edge_index):
        h = self.gcn1(x, edge_index).relu()
        h = F.dropout(h, p=dropout, training=self.training)
        h = self.gcn2(h, edge_index)
        return h, F.log_softmax(h, dim=1)

In [28]:
# class SAGE(torch.nn.Module):
#     def __init__(self, dim_in, dim_h, dim_out):
#         super().__init__()
#         self.sage1 = SAGEConv(dim_in, dim_h)
#         self.sage2 = SAGEConv(dim_h, dim_out)
    
#     def forward(self, x, edge_index):
#         h = self.sage1(x, edge_index).relu()
#         h = F.dropout(h, p=dropout, training=self.training)
#         h = self.sage2(h, edge_index)
#         return h, F.log_softmax(h, dim=1)

In [29]:
trn_loader = NeighborLoader(data_, batch_size=batch_size, num_neighbors=[8, 4], shuffle=True, input_nodes=data_.train_mask)
val_loader = NeighborLoader(data_, batch_size=batch_size, num_neighbors=[8, 4], shuffle=True, input_nodes=data_.val_mask)

In [30]:
def accuracy(pred_y, y):
    return ((pred_y == y).sum() / len(y)).item()

def train(model, data):
    
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in tqdm(range(epochs+1)):
        
        train_loss = 0
        train_acc = 0
        val_loss = 0
        val_acc = 0
        model.train()
        for batch in trn_loader:
            # _, out = model(batch.x, batch.edge_index, batch.edge_weight)
            _, out = model(batch.x, batch.edge_index)
            train_loss += criterion(out, batch.y)
            train_acc += accuracy(out.argmax(dim=1), batch.y)
            
        train_loss = train_loss / len(trn_loader)
        train_acc = train_acc / len(trn_loader)
        
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        
        model.eval()
        for batch in val_loader:
            _, out = model(batch.x, batch.edge_index)
            val_loss += criterion(out, batch.y)
            val_acc += accuracy(out.argmax(dim=1), batch.y)
        
        val_loss = val_loss / len(val_loader)
        val_acc = val_acc / len(val_loader)

        if(epoch % 10 == 0) and (epoch !=0):
            print(f'Epoch {epoch:>3} | Train Loss: {train_loss:.3f} | Train Acc: '
                  f'{train_acc*100:>6.2f}% | Val Loss: {val_loss:.2f} | '
                  f'Val Acc: {val_acc*100:.2f}%')
    print('Training finished!')
    
def test(model, data):
    model.eval()
    _, out = model(data.x, data.edge_index)
    # _, out = model(data.x, data.edge_index, data.edge_weight) # If edge weight exists
    test_acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
    return test_acc

#### Training

In [31]:
model = GCN(data_.num_features, dim_h, data_.num_classes)

In [32]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')
device

device(type='cpu')

In [33]:
model = model.to(device)
data_ = data_.to(device)

In [34]:
print(model)
train(model, data_)

GCN(
  (gcn1): GCNConv(8710, 128)
  (gcn2): GCNConv(128, 70)
)


  0%|          | 0/101 [00:00<?, ?it/s]

Epoch  10 | Train Loss: 1.808 | Train Acc:  63.41% | Val Loss: 1.53 | Val Acc: 71.76%
Epoch  20 | Train Loss: 1.134 | Train Acc:  75.23% | Val Loss: 0.95 | Val Acc: 80.61%
Epoch  30 | Train Loss: 0.933 | Train Acc:  79.81% | Val Loss: 0.78 | Val Acc: 84.23%
Epoch  40 | Train Loss: 0.858 | Train Acc:  81.77% | Val Loss: 0.71 | Val Acc: 85.76%
Epoch  50 | Train Loss: 0.812 | Train Acc:  82.93% | Val Loss: 0.67 | Val Acc: 86.79%
Epoch  60 | Train Loss: 0.770 | Train Acc:  83.77% | Val Loss: 0.63 | Val Acc: 87.11%
Epoch  70 | Train Loss: 0.738 | Train Acc:  84.34% | Val Loss: 0.61 | Val Acc: 87.47%
Epoch  80 | Train Loss: 0.716 | Train Acc:  84.82% | Val Loss: 0.59 | Val Acc: 87.69%
Epoch  90 | Train Loss: 0.699 | Train Acc:  85.17% | Val Loss: 0.58 | Val Acc: 87.94%
Epoch 100 | Train Loss: 0.679 | Train Acc:  85.56% | Val Loss: 0.56 | Val Acc: 88.17%
Training finished!


#### Prediction

In [35]:
acc = test(model, data_)
print(f'GCN test accuracy: {acc*100:.2f}%\n')

GCN test accuracy: 77.19%



#### Model Save & load

In [None]:
# torch.save(model.state_dict(), PATH)

In [29]:
# model = GCN(g_data.num_features, dim_h, g_data.num_classes)
# model.load_state_dict(torch.load(PATH))
# model.eval()
# acc = test(model, data_)
# print(f'GCN test accuracy: {acc*100:.2f}%\n')