# In [None]:
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.functional as F

def MLP(channels: list):
    """Build a point-wise multi-layer perceptron from kernel-size-1 convolutions.

    Consecutive entries of ``channels`` give the input/output width of each
    ``Conv1d`` layer. Every layer except the last one is followed by
    ``BatchNorm1d`` and ``ReLU``.

    Args:
        channels: Channel widths, e.g. ``[in, hidden, out]``.

    Returns:
        An ``nn.Sequential`` holding the layers in order (empty when fewer
        than two widths are given).
    """
    last = len(channels) - 1
    modules = []
    for idx, (c_in, c_out) in enumerate(zip(channels, channels[1:]), start=1):
        modules.append(nn.Conv1d(c_in, c_out, kernel_size=1, bias=True))
        if idx == last:
            # The final projection stays linear: no normalisation/activation.
            continue
        modules.extend((nn.BatchNorm1d(c_out), nn.ReLU()))

    return nn.Sequential(*modules)

class Attention(nn.Module):
    """Multi-head scaled dot-product attention over ``[B, C, N]`` feature maps.

    Query, key and value are each projected by their own kernel-size-1
    ``Conv1d``, split into ``num_heads`` heads of ``feature_dim // num_heads``
    channels, attended, and merged back by a final kernel-size-1 ``Conv1d``.

    Args:
        num_heads: Number of attention heads; must divide ``feature_dim``.
        feature_dim: Channel dimension of the inputs and of the output.

    Raises:
        AssertionError: If ``feature_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, num_heads, feature_dim):
        super().__init__()
        assert feature_dim % num_heads == 0, (
            f"feature_dim ({feature_dim}) must be divisible by num_heads ({num_heads})"
        )
        self.dim = feature_dim // num_heads
        self.num_heads = num_heads
        self.merge = nn.Conv1d(feature_dim, feature_dim, kernel_size=1)
        # Three independent projections (query, key, value), each starting
        # as a copy of the merge layer.
        self.proj = nn.ModuleList([deepcopy(self.merge) for _ in range(3)])

    def forward(self, query, key, value):
        """Attend ``query`` over ``key``/``value``.

        Args:
            query: Tensor of shape ``[B, feature_dim, N]``.
            key: Tensor of shape ``[B, feature_dim, M]``.
            value: Tensor of shape ``[B, feature_dim, M]``.

        Returns:
            Tuple ``(message, prob)`` where ``message`` is
            ``[B, feature_dim, N]`` and ``prob`` is the attention matrix of
            shape ``[B, num_heads, N, M]``.
        """
        batch_dim = query.size(0)
        # Project with the kernel-size-1 convs, then split the channels
        # into (dim, num_heads).
        query, key, value = [
            layer(x).view(batch_dim, self.dim, self.num_heads, -1)
            for layer, x in zip(self.proj, (query, key, value))
        ]
        x, prob = self.attention(query, key, value)
        # Fold the heads back into the channel axis before the output projection.
        merged = self.merge(
            x.contiguous().view(batch_dim, self.dim * self.num_heads, -1)
        )
        return merged, prob

    def attention(self, query, key, value):
        """Scaled dot-product attention on ``[B, dim, heads, len]`` tensors.

        Returns:
            Tuple of the attended values ``[B, dim, heads, N]`` and the
            softmax-normalised scores ``[B, heads, N, M]``.
        """
        dim = query.shape[1]
        # Batched matrix multiplication, scaled by sqrt(per-head dim).
        scores = torch.einsum('bdhn,bdhm->bhnm', query, key) / dim ** .5
        prob = F.softmax(scores, dim=-1)
        return torch.einsum('bhnm,bdhm->bdhn', prob, value), prob

class MessagePassing(nn.Module):
    """Self-attention message passing followed by an MLP update.

    The features attend over themselves; the resulting message is
    concatenated with the input features along the channel axis and passed
    through a two-layer ``MLP``.

    Args:
        feature_dim: Channel dimension of the input features.
        num_heads: Number of attention heads passed to ``Attention``.
        out_dim: Channel dimension of the MLP output. Defaults to
            ``feature_dim`` when omitted.
    """

    def __init__(self, feature_dim: int, num_heads, out_dim=None):
        super().__init__()
        if out_dim is None:
            out_dim = feature_dim
        self.attn = Attention(num_heads, feature_dim)
        # Input is [features ; message], hence 2 * feature_dim channels.
        self.mlp = MLP([feature_dim * 2, feature_dim * 2, out_dim])

    def forward(self, features):
        """Return ``(updated_features, attention_prob)`` for ``features``.

        ``features`` is concatenated with its attention message on ``dim=1``
        before the MLP; the second element is whatever probability tensor
        ``self.attn`` returns.
        """
        message, prob = self.attn(features, features, features)
        return self.mlp(torch.cat([features, message], dim=1)), prob

class GAT(nn.Module):
    """Attention layer with a residual connection.

    Computes ``features + message`` where ``message`` comes from a
    ``MessagePassing`` block. When the input and output widths differ, the
    residual branch is projected with a single-layer ``MLP`` and the head
    count is lowered to the largest value ``<= num_heads`` that divides
    ``input_dim``.

    NOTE: a later class in this file is also named ``GAT`` and rebinds the
    name at module level.

    Args:
        input_dim: Channel dimension of the input features.
        output_dim: Channel dimension of the output features.
        num_heads: Requested number of attention heads.
    """

    def __init__(self, input_dim: int, output_dim: int, num_heads=4):
        super().__init__()

        num_heads_in = num_heads
        self.reshape = None
        if input_dim != output_dim:
            # Largest head count not exceeding num_heads that divides input_dim.
            for num_heads_in in range(num_heads, 0, -1):
                if input_dim % num_heads_in == 0:
                    break
            self.reshape = MLP([input_dim, output_dim])

        self.attention = MessagePassing(input_dim, num_heads_in, out_dim=output_dim)

    def forward(self, features):
        """Return ``(output, prob)`` for ``features``.

        ``output`` is the (optionally projected) input plus the attention
        message; ``prob`` is the probability tensor returned by the
        message-passing block.
        """
        message, prob = self.attention(features)
        # Explicit None check: do not rely on nn.Sequential truthiness.
        if self.reshape is not None:
            features = self.reshape(features)
        output = features + message
        return output, prob


# torch geometric
from torch_geometric.nn import GATConv

class GAT(nn.Module):
    """Two-layer graph attention network built from ``GATConv`` layers.

    Args:
        in_channels: Number of input node features.
        out_channels: Number of output channels (classes).
        hidden_channels: Per-head width of the first layer.
        heads: Number of attention heads in the first layer.
        dropout: Dropout probability used on the layer inputs and passed to
            both ``GATConv`` layers.
    """

    def __init__(self, in_channels, out_channels, hidden_channels, heads=1, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        self.conv1 = GATConv(
            in_channels,
            hidden_channels,
            heads=heads,
            dropout=dropout,
        )
        # The second layer consumes the concatenated heads of the first one
        # and uses a single, non-concatenated head.
        self.conv2 = GATConv(
            hidden_channels * heads,
            out_channels,
            heads=1,
            concat=False,
            dropout=dropout,
        )

    def forward(self, x, edge_index):
        """Return per-node log-probabilities of shape (num_nodes, out_channels).

        ``x`` is (num_nodes, num_node_features); ``edge_index`` is forwarded
        unchanged to both convolution layers.
        """

        def drop(tensor):
            # Dropout is only active while the module is in training mode.
            return F.dropout(tensor, p=self.dropout, training=self.training)

        # First layer output: (num_nodes, hidden_channels * heads)
        hidden = F.elu(self.conv1(drop(x), edge_index))
        # Second layer output: (num_nodes, out_channels)
        scores = self.conv2(drop(hidden), edge_index)
        return F.log_softmax(scores, dim=1)

from torch_geometric.datasets import Planetoid
# Load the Cora Planetoid dataset (stored under /tmp/Cora) and take its
# first graph object.
dataset = Planetoid(root='/tmp/Cora', name='Cora')
data = dataset[0]


# import debugpy

# debugpy.listen(('0.0.0.0', 5678))

# print("Waiting for debugger attach")
# debugpy.wait_for_client()

# Use the GPU when one is available; otherwise fall back to the CPU so the
# script also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GAT(
    in_channels=dataset.num_node_features,
    out_channels=dataset.num_classes,
    hidden_channels=8,
    heads=8,
).to(device)
data = data.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.005, weight_decay=5e-4)

def train():
    """Run one optimisation step on the module-level ``model`` and ``data``.

    Performs a forward pass over the whole graph, computes the negative
    log-likelihood loss on the nodes selected by ``data.train_mask``,
    back-propagates and steps the module-level ``optimizer``.

    Returns:
        The loss of this step as a Python float.
    """
    model.train()
    optimizer.zero_grad()

    train_mask = data.train_mask
    log_probs = model(data.x, data.edge_index)
    step_loss = F.nll_loss(log_probs[train_mask], data.y[train_mask])

    step_loss.backward()
    optimizer.step()
    return step_loss.item()

def test():
    """Evaluate the module-level ``model`` on the train/val/test node splits.

    The forward pass runs in eval mode with gradient tracking disabled, since
    no backward pass follows.

    Returns:
        A list ``[train_acc, val_acc, test_acc]`` of accuracies, each the
        fraction of nodes in the corresponding mask whose predicted class
        equals ``data.y``.
    """
    model.eval()
    # No gradients are needed for evaluation; skip building the autograd graph.
    with torch.no_grad():
        logits = model(data.x, data.edge_index)

    accs = []
    for mask in [data.train_mask, data.val_mask, data.test_mask]:
        # Predicted class = index of the largest logit along dim=1.
        pred = logits[mask].argmax(dim=1)
        acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
        accs.append(acc)
    return accs

# Train for 100 epochs; after every optimisation step, evaluate and log the
# loss together with the accuracies returned by test().
for epoch in range(100):
    loss = train()
    # accs is ordered [train, val, test], matching the mask order in test().
    accs = test()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {accs[0]:.4f}, Val: {accs[1]:.4f}, Test: {accs[2]:.4f}')