https://github.com/zhulf0804/GCN.PyTorch/tree/master

https://relational.fit.cvut.cz/dataset/CORA

https://m.blog.naver.com/winddori2002/222183504185

https://chioni.github.io/posts/gnn/

https://pytorch-geometric.readthedocs.io/en/latest/generated/torch_geometric.datasets.Planetoid.html#torch_geometric.datasets.Planetoid

https://baeseongsu.github.io/posts/pytorch-geometric-introduction/

 data_load 함수는 citation network인 cora 데이터셋을 load하는 함수입니다. 데이터의 구성은 노드(논문) features(논문에서 사용하는 단어사전 1433개), label(문서 분류), edge(인용 관계)입니다. 여기서 features 1433개는 one-hot vector로 표현됩니다.

위의 함수에서 핵심은 각 노드와 엣지 데이터를 기반으로 symmertric adjacency matrix를 만들고 indentity matrix와 더해줍니다. 또한, train, val, test에 index를 지정하여 semi-supervised transductive를 사용할 수 있도록 합니다. 이 때의 index들은 이후 loss를 계산할 때 train index만 적용하기 위함입니다.

In [1]:
# DataSet
import torch
import numpy as np
from torch_geometric.datasets import Planetoid # The citation network datasets "Cora", "CiteSeer" and "PubMed" from the "Revisiting Semi-Supervised Learning with Graph Embeddings" paper.
from torch_geometric.utils import to_dense_adj
from datasets import load_data
import torch.nn.functional as F


# 데이터들을 로드. loader_dataset은 torch_geometric.datasets 모듈로 로드
loader_dataset = 'cora'
loader_dataset = Planetoid(root='../../datasets/Cora', name='Cora')
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data("../../datasets/Cora/Cora/raw", "cora")

# loader_dataset에서 load_data에서 로드한 데이터와 같은 형식으로 편집
loader_adj = to_dense_adj(loader_dataset[0].edge_index).squeeze(0).double()
loader_features = loader_dataset[0].x.double()
# y는 one hot encoding되어 있음
loader_y_train = F.one_hot(loader_dataset[0].y).double()
loader_y_val = loader_dataset[0].y[loader_dataset[0].val_mask].double()
loader_y_test = loader_dataset[0].y[loader_dataset[0].test_mask].double()
loader_train_mask = loader_dataset[0].train_mask
loader_val_mask = loader_dataset[0].val_mask
loader_test_mask = loader_dataset[0].test_mask
#########################################

# print("adj is same?: {}".format(np.allclose(adj, loader_adj)))

print("<<features>>")
print("features is same?: {}, but dtype and shape is same. ({}, {})".format(np.allclose(features, loader_features), features.dtype, loader_features.dtype))
print("-"*50)
print("<<y>>")
print("y_train, test, val is same?: {}".format(torch.eq(y_train, loader_y_train).all()))
t = [0, 0]
f = [0, 0]
for i in range(len(y_train)):
    if torch.eq(y_train[i], loader_y_train[i]).all(): 
        t[0] += 1
        if t[0] == 1: t[1] = i
    else:
        f[0] += 1
        if f[0] == 1: f[1] = i
print("True node is {}, flase node is {}. because train set is 140개".format(t[0], f[0]))
print("False node is y_train: {}, loader_y_train: {}".format(y_train[f[1]], loader_y_train[f[1]]))
print("즉, 각 y_**들은 각자 val, train, test 데이터셋에 대한 부분만 라벨이 블라인드 되어 있는 label들이다. 따라서 전체 길이는 모두 같다.")
print("-"*50)
print("<<mask>>")
print("train, test, val_mask is same?: {}".format(torch.eq(train_mask, loader_train_mask).all()))

<<features>>
features is same?: False, but dtype and shape is same. (torch.float64, torch.float64)
--------------------------------------------------
<<y>>
y_train, test, val is same?: False
True node is 140, flase node is 2568. because train set is 140개
False node is y_train: tensor([0., 0., 0., 0., 0., 0., 0.], dtype=torch.float64), loader_y_train: tensor([0., 0., 0., 0., 1., 0., 0.], dtype=torch.float64)
즉, 각 y_**들은 각자 val, train, test 데이터셋에 대한 부분만 라벨이 블라인드 되어 있는 label들이다. 따라서 전체 길이는 모두 같다.
--------------------------------------------------
<<mask>>
train, test, val_mask is same?: True


In [2]:
print("<<Cora Dataset Info>>\n", "-"*50)
print("Cora Dataset은 여러 그래프가 모인 데이터셋이 아닌, 통채로 큰 그래프인 데이터셋이므로(for node level gnn) Cora[0] 그래프 하나만 가지고 있다.")
print("- Cora: {}, len = {}".format(loader_dataset, len(loader_dataset)))
print("- num_classes: {}".format(loader_dataset.num_classes))
print("-"*50)
print("-"*50)
print("- dataset[0].keys: {}".format(loader_dataset[0].keys))
print("-"*50)
print("- node: {}".format(loader_dataset[0].x.shape))
print("- num_node: {}, {}".format(loader_dataset[0].num_nodes, len(loader_dataset[0].x)))
print("- num_node_features: {}, {}".format(loader_dataset[0].num_node_features, len(loader_dataset[0].x[0])))
print("-"*50)
print("- edges: {}".format(loader_dataset[0].edge_index.shape))
print("- num_edges: {}, {}".format(loader_dataset[0].num_edges, len(loader_dataset[0].edge_index[1])))
print("-"*50)
print("- class: {}".format(loader_dataset[0].y))
print("- class_set: {}".format(torch.unique(loader_dataset[0].y)))
print("-"*50)
print("- num_train_mask: {}".format(loader_dataset[0].train_mask.sum().item()))
print("- num_val_mask: {}".format(loader_dataset[0].val_mask.sum().item()))
print("- num_test_mask: {}".format(loader_dataset[0].test_mask.sum().item()))
print("-"*50)

<<Cora Dataset Info>>
 --------------------------------------------------
Cora Dataset은 여러 그래프가 모인 데이터셋이 아닌, 통채로 큰 그래프인 데이터셋이므로(for node level gnn) Cora[0] 그래프 하나만 가지고 있다.
- Cora: Cora(), len = 1
- num_classes: 7
--------------------------------------------------
--------------------------------------------------
- dataset[0].keys: ['edge_index', 'x', 'test_mask', 'y', 'val_mask', 'train_mask']
--------------------------------------------------
- node: torch.Size([2708, 1433])
- num_node: 2708, 2708
- num_node_features: 1433, 1433
--------------------------------------------------
- edges: torch.Size([2, 10556])
- num_edges: 10556, 10556
--------------------------------------------------
- class: tensor([3, 4, 4,  ..., 3, 3, 3])
- class_set: tensor([0, 1, 2, 3, 4, 5, 6])
--------------------------------------------------
- num_train_mask: 140
- num_val_mask: 500
- num_test_mask: 1000
--------------------------------------------------


In [3]:
from models.gcn import GCN
from models.utils import build_optimizer, get_loss, get_accuracy

In [4]:
hidden_dim = 16
dropout = 0.5
init_lr = 0.01
weight_decay = 5e-4
epoches = 200
log_interval = 10
checkpoint_interval = 20

In [5]:
print("Input of GCN model: input_dim = {}, hidden_dim = {}, num_classes = {}, dropout = {}".format(loader_dataset.num_node_features, hidden_dim, loader_dataset.num_classes, dropout))

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = GCN(loader_dataset.num_node_features, hidden_dim, loader_dataset.num_classes, dropout)
optimizer = build_optimizer(model, init_lr, weight_decay)

Input of GCN model: input_dim = 1433, hidden_dim = 16, num_classes = 7, dropout = 0.5


In [11]:
def train():
    for epoch in range(epoches + 1):
        outputs = model(adj, features)
        loss = get_loss(outputs, y_train, train_mask)
        val_loss = get_loss(outputs, y_val, val_mask).detach().numpy()
        model.eval()
        outputs = model(adj, features)
        train_accuracy = get_accuracy(outputs, y_train, train_mask)
        val_accuracy = get_accuracy(outputs, y_val, val_mask)
        model.train()
        # print('loss', {'train_loss': loss.detach().numpy(), 'val_loss': val_loss}, epoch)
        # print('accuracy', {'train_ac': train_accuracy, 'val_ac': val_accuracy}, epoch)
        if epoch % log_interval == 0:
            print("Epoch: %d, train loss: %f, val loss: %f, train ac: %f, val ac: %f"
                    %(epoch, loss.detach().numpy(), val_loss, train_accuracy, val_accuracy))
        optimizer.zero_grad()  # Important
        loss.backward()
        optimizer.step()

In [12]:
train()

Epoch: 0, train loss: 0.356938, val loss: 1.006602, train ac: 0.992857, val ac: 0.794000
Epoch: 10, train loss: 0.389274, val loss: 0.944934, train ac: 1.000000, val ac: 0.808000
Epoch: 20, train loss: 0.341875, val loss: 0.957859, train ac: 0.992857, val ac: 0.782000
Epoch: 30, train loss: 0.321970, val loss: 0.974770, train ac: 0.992857, val ac: 0.814000
Epoch: 40, train loss: 0.308768, val loss: 0.994999, train ac: 1.000000, val ac: 0.794000
Epoch: 50, train loss: 0.319464, val loss: 0.940084, train ac: 0.992857, val ac: 0.800000
Epoch: 60, train loss: 0.339615, val loss: 0.918505, train ac: 1.000000, val ac: 0.798000
Epoch: 70, train loss: 0.297739, val loss: 0.914063, train ac: 1.000000, val ac: 0.796000
Epoch: 80, train loss: 0.302142, val loss: 0.914630, train ac: 1.000000, val ac: 0.804000
Epoch: 90, train loss: 0.268684, val loss: 0.904370, train ac: 1.000000, val ac: 0.798000
Epoch: 100, train loss: 0.277776, val loss: 0.978261, train ac: 1.000000, val ac: 0.796000
Epoch: 110