In [1]:
import numpy as np
import random
from tqdm.auto import tqdm
import warnings; warnings.filterwarnings("ignore")

In [25]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
from torch_geometric.transforms import RandomNodeSplit
from torch_geometric.datasets import CoraFull

#### Construct data(sample PyG data)

In [3]:
data = CoraFull('../tmp')

In [4]:
g = data[0]

In [5]:
g

Data(x=[19793, 8710], edge_index=[2, 126842], y=[19793])

In [6]:
print(g.x.shape)
g.x

torch.Size([19793, 8710])


tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [7]:
print(g.y.shape)
g.y

torch.Size([19793])


tensor([ 0,  0,  0,  ..., 52, 59, 55])

In [8]:
num_classes = len(np.unique(g.y))
num_classes

70

In [9]:
# g.edge_weight = torch.tensor([1. for i in range(g.num_edges)]); g.edge_weight

tensor([1., 1., 1.,  ..., 1., 1., 1.])

In [10]:
g.edge_index

tensor([[    0,     0,     0,  ..., 19791, 19791, 19792],
        [ 1227,  4021,  4056,  ...,  5100, 10850,  2947]])

#### Construct data(custom data)

In [11]:
# data = Data(x=x, edge_index=edge_index, y=y, edge_weight=edge_weight)

In [12]:
# g = data[0]

#### Data split

In [13]:
def data_split(gdata, val_ratio=0.2, test_ratio=0.2):
    gidx = list(range(gdata.num_nodes))
    num_val = round(gdata.num_nodes * val_ratio)
    num_test = round(gdata.num_nodes * test_ratio)
    num_train = gdata.num_nodes - num_val - num_test
    print(f'num_train: {num_train}\nnum_val: {num_val}\nnum_test: {num_test}')
    if num_val + num_test + num_train == gdata.num_nodes: 
        print('Sum of splited data is equal to original data: True')
    else:
        print('Error occured in data split')
        
    random.shuffle(gidx)
    val_idx = gidx[:num_val]
    test_idx = gidx[num_val:num_val+num_test]
    train_idx = gidx[num_val+num_test:]
    
    train_mask = torch.zeros(gdata.num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(gdata.num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(gdata.num_nodes, dtype=torch.bool)
    
    train_mask[train_idx] = True
    val_mask[val_idx] = True
    test_mask[test_idx] = True
    
    gdata.train_mask = train_mask
    gdata.val_mask = val_mask
    gdata.test_mask = test_mask
    gdata.num_classes = len(np.unique(gdata.y))
    return gdata

In [14]:
g_ = data_split(g, val_ratio=0.2, test_ratio=0.2)

num_train: 11875
num_val: 3959
num_test: 3959
Sum of splited data is equal to original data: True


#### Define model

In [15]:
dim_h= 256
lr = 0.005
weight_decay = 5e-4
dropout = 0.5
epochs = 100

In [16]:
# You can choose GNN model from: https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html
class GCN(torch.nn.Module):
  def __init__(self, dim_in, dim_h, dim_out):
    super().__init__()
    self.gcn1 = GCNConv(dim_in, dim_h)
    self.gcn2 = GCNConv(dim_h, dim_out)
    self.optimizer = torch.optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)

  def forward(self, x, edge_index, edge_weight):
    h = self.gcn1(x, edge_index, edge_weight).relu()
    h = F.dropout(h, p=dropout, training=self.training)
    h = self.gcn2(h, edge_index, edge_weight)
    return h, F.log_softmax(h, dim=1)

In [None]:
# class SAGE(torch.nn.Module):
#     def __init__(self, dim_in, dim_h, dim_out):
#         super().__init__()
#         self.sage1 = SAGEConv(dim_in, dim_h)
#         self.sage2 = SAGEConv(dim_h, dim_out)
    
#     def forward(self, x, edge_index):
#         h = self.sage1(x, edge_index).relu()
#         h = F.dropout(h, p=dropout, training=self.training)
#         h = self.sage2(h, edge_index)
#         return h, F.log_softmax(h, dim=1)

In [17]:
def accuracy(pred_y, y):
    return ((pred_y == y).sum() / len(y)).item()

def train(model, data):
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = model.optimizer

    model.train()
    for epoch in tqdm(range(epochs+1)):
        
        optimizer.zero_grad()
        _, out = model(data.x, data.edge_index, data.edge_weight)
        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        acc = accuracy(out[data.train_mask].argmax(dim=1), data.y[data.train_mask])
        loss.backward()
        optimizer.step()

        val_loss = criterion(out[data.val_mask], data.y[data.val_mask])
        val_acc = accuracy(out[data.val_mask].argmax(dim=1), data.y[data.val_mask])

        if(epoch % 10 == 0) and (epoch !=0):
            print(f'Epoch {epoch:>3} | Train Loss: {loss:.3f} | Train Acc: '
                  f'{acc*100:>6.2f}% | Val Loss: {val_loss:.2f} | '
                  f'Val Acc: {val_acc*100:.2f}%')
    print('Training finished!')
    
def test(model, data):
    model.eval()
    _, out = model(data.x, data.edge_index, data.edge_weight)
    acc = accuracy(out.argmax(dim=1)[data.test_mask], data.y[data.test_mask])
    return acc

#### Training

In [18]:
model = GCN(g_.num_features, dim_h, g_.num_classes)

In [26]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

In [27]:
model = model.to(device)
g_ = g_.to(device)

In [28]:
print(model)
train(model, g_)

GCN(
  (gcn1): GCNConv(8710, 256)
  (gcn2): GCNConv(256, 70)
)


  0%|          | 0/101 [00:00<?, ?it/s]

Epoch  10 | Train Loss: 0.429 | Train Acc:  88.43% | Val Loss: 1.11 | Val Acc: 70.37%
Epoch  20 | Train Loss: 0.423 | Train Acc:  88.44% | Val Loss: 1.11 | Val Acc: 70.07%
Epoch  30 | Train Loss: 0.423 | Train Acc:  88.51% | Val Loss: 1.10 | Val Acc: 70.32%
Epoch  40 | Train Loss: 0.420 | Train Acc:  88.50% | Val Loss: 1.11 | Val Acc: 70.37%
Epoch  50 | Train Loss: 0.419 | Train Acc:  88.38% | Val Loss: 1.12 | Val Acc: 69.69%
Epoch  60 | Train Loss: 0.413 | Train Acc:  89.11% | Val Loss: 1.09 | Val Acc: 70.35%
Epoch  70 | Train Loss: 0.416 | Train Acc:  88.60% | Val Loss: 1.12 | Val Acc: 70.45%
Epoch  80 | Train Loss: 0.414 | Train Acc:  89.05% | Val Loss: 1.11 | Val Acc: 70.70%
Epoch  90 | Train Loss: 0.416 | Train Acc:  88.29% | Val Loss: 1.11 | Val Acc: 70.42%
Epoch 100 | Train Loss: 0.410 | Train Acc:  89.27% | Val Loss: 1.12 | Val Acc: 70.30%
Training finished!


#### Prediction

In [24]:
acc = test(model, g_)
print(f'GCN test accuracy: {acc*100:.2f}%\n')

GCN test accuracy: 71.00%



#### Model Save & load

In [None]:
# torch.save(model.state_dict(), PATH)https://tutorials.pytorch.kr/beginner/saving_loading_models.html

In [None]:
# model = GCN(g_data.num_features, dim_h, g_data.num_classes)
# model.load_state_dict(torch.load(PATH))
# model.eval()
# acc = test(model, g_)
# print(f'GCN test accuracy: {acc*100:.2f}%\n')