In [150]:
# Entity
import os
from collections import Counter

import gzip
# rdfはWeb上のリンクをグラフ構造で表してWeb自体を
# 体系的な知識構造にしようと言うところででてきたやつ
import rdflib as rdf
import pandas as pd
import numpy as np
import torch
from torch_scatter import scatter_add

from torch_geometric.data import (InMemoryDataset, Data, 
                                  download_url, extract_tar)
from torch_geometric.utils import one_hot


# rgcn
import math

import torch
import torch.nn.functional as F
from torch.nn import Parameter as Param
from torch_geometric.nn.conv import MessagePassing

In [151]:
class Entities(InMemoryDataset):
    url = 'https://s3.us-east-2.amazonaws.com/dgl.ai/dataset/{}.tgz'
    
    def __init__(self, root, name, transform=None, pre_transform=None):
        assert name in ['AIFB', 'AM', 'MUTAG', 'BGS']
        self.name = name.lower()
        super(Entities, self).__init__(root, transform, pre_transform)
        # processed_path[0]は処理された後のデータで，process methodで定義される
        self.data, self.slices = torch.load(self.processed_paths[0])
        
    """
    ここで定義されている@propertyはclass_name.propertyとして
    アクセスできるようにしているデコレータで，計算してからself.~~と
    今まで書いていたものを，このようにスマートに定義することが可能らしい．いいね．
    知らなかったのは多分やばい．．．
    """
    @property
    def num_relations(self):
        return self.data.edge_type.max().item() + 1

    @property
    def num_classes(self):
        return self.data.train_y.max().item() + 1

    @property
    def raw_file_names(self):
        return [
            '{}_stripped.nt.gz'.format(self.name),
            'completeDataset.tsv',
            'trainingSet.tsv',
            'testSet.tsv'
        ]

    @property
    def processed_file_names(self):
        return 'data.pt'
    
    
    def download(self):
        path = download_url(self.url.format(self.name), self.root)
        extract_tar(path, self.raw_dir)
        os.unlink(path)
        
    def triples(self, graph, relation=None):
        for s, p, o in graph.triples((None, relation, None)):
            yield s, p, o
            
    def process(self):
        graph_file, task_file, train_file, test_file = self.raw_paths
        print(self.raw_paths)
        
        g = rdf.Graph()
        with gzip.open(graph_file, 'rg') as f:
            g.parse(file=f, format='nt')
            
        relations = sorted(set(g.predicates()), key=lambda rel: -freq(rel))
        subjects = set(g.subjects())
        objects = set(g.objects())
        nodes = list(subjects.union(objects))
        
        relations_dict = {rel: i for i in enumerate(list(relations))}
        nodes_dict = {node: i for i, node in enumerate(nodes)}
        
        edge_list = []
        for s, p, o in g.triples((None, None, None)):
            src, dst, rel = nodes_dict[s], nodes_dict[o], relations_dict[p]
            edge_list.append([src, dst, 2 * rel])
            edge_list.append([dst, src, 2 * rel + 1])
            
        edge_list = sorted(edge_list, key=lambda x: (x[0], x[1], x[2]))
        edge = torch.tensor(edge_list, dtype=torch.long).t().contiguous()
        edge_index, edge_type = edge[:2], edge[2]
        
        oh = one_hot(edge_type, 2 * len(relations), dtype=torch.float)
        deg = scatter_add(oh, edge_index[0], dim=0, dim_size=len(nodes))
        index = edge_type + torch.arange(len(edge_list)) * 2 * len(relations)
        edge_norm = 1 / deg[edge_index[0]].view(-1)[index]
        
        if self.name == 'am':
            label_header = 'label_category'
            nodes_header = 'proxy'
        elif self.name == 'aifb':
            label_header = 'label_affiliation'
            nodes_header = 'person'
        elif self.name == 'mutag':
            label_header = 'label_mutagenic'
            nodes_header = 'bond'
        elif self.name == 'bgs':
            label_header = 'label_lithogenesis'
            nodes_header = 'rock'
            
            
        labels_df = pd.read_csv(task_file, sep='\t')
        labels_set = set(labels_df[label_header].values.tolist())
        labels_dict = {lab: i for i, lab in enumerate(list(labels_set))}
        nodes_dict = {np.unicode(key): val for key, val in nodes_dict.items()}
        
        train_labels_df = pd.read_csv(train_file, sep='\t')
        train_indices, train_lables = [], []
        for nod, lab in zip(train_labels_df[nodes_header].values,
                           train_labels_df[label_header].values):
            train_indices.append(nodes_dict[nod])
            train_labels.append(labels_dict[lab])
            
        train_idx = torch.tensor(train_indices, dtype=torch.long)
        train_y = torch.tensor(train_labels, dtype=torch.long)
        
        test_labels_df = pd.read_csv(test_file, sep='\t')
        test_indices, test_labels = [], []
        for nod, lab in zip(test_labels_df[nodes_header].values, 
                            test_labels_df[label_header].values):
            test_indices.append(nodes_dict[nod])
            test_labels.append(labels_dict[lab])
            
        test_idx = torch.tensor(test_indices, dytpe=torch.long)
        test_y = torch.tensor(test_labels, dtype=torch.long)
        
        data = Data(edge_index=edge_index)
        data.edge_type = edge_type
        data.edge_norm = edge_norm
        data.train_idx = train_idx
        data.train_y = train_y
        data.test_idx = test_idx
        data.test_y = test_y
        
        data, slices = self.collate([data])
        torch.save((data, slices), self.processed_paths[0])
        
    def __repr__(self):
        return '{}{}()'.format(self.name.upper(), self.__class__.__name__)
        

In [165]:
"""
The MUTAG dataset contains graphs representing 188 chemical compounds which are either mutagenic or not mutagenic.
So here the task of the classifier is to predict the mutagenicity of the chemical compounds, which is a two class classification problem.
"""
name = 'MUTAG'
path = './data/MUTAG'
dataset = Entities(path, name)
data = dataset[0]
print(dataset[1])
print(dataset.raw_paths)
print(data)
print(data.edge_index.shape)

IndexError: index 2 is out of bounds for dimension 0 with size 2

In [14]:
def uniform(size, tensor):
    stdv = 1.0 / math.sqrt(size)
    if tensor is not None:
        tensor.data.uniform_(-stdv, stdv)

In [15]:
class RGCNConv(MessagePassing):
    r"""The relational graph convolutional operator from the `"Modeling
    Relational Data with Graph Convolutional Networks"
    <https://arxiv.org/abs/1703.06103>`_ paper

    .. math::
        \mathbf{x}^{\prime}_i = \mathbf{\Theta}_0 \cdot \mathbf{x}_i +
        \sum_{r \in \mathcal{R}} \sum_{j \in \mathcal{N}_r(i)}
        \frac{1}{|\mathcal{N}_r(i)|} \mathbf{\Theta}_r \cdot \mathbf{x}_j,

    where :math:`\mathcal{R}` denotes the set of relations, *i.e.* edge types.

    Args:
        in_channels (int): Size of each input sample.
        out_channels (int): Size of each output sample.
        num_relations (int): Number of relations.
        num_bases (int): Number of bases used for basis-decomposition.
        bias (bool, optional): If set to :obj:`False`, the layer will not learn
            an additive bias. (default: :obj:`True`)
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 num_relations,
                 num_bases,
                 bias=True):
        super(RGCNConv, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.num_relations = num_relations
        self.num_bases = num_bases

        self.basis = Param(torch.Tensor(num_bases, in_channels, out_channels))
        self.att = Param(torch.Tensor(num_relations, num_bases))
        self.root = Param(torch.Tensor(in_channels, out_channels))

        if bias:
            self.bias = Param(torch.Tensor(out_channels))
        else:
            self.register_parameter('bias', None)

        self.reset_parameters()

    def reset_parameters(self):
        size = self.num_bases * self.in_channels
        uniform(size, self.basis)
        uniform(size, self.att)
        uniform(size, self.root)
        uniform(size, self.bias)


    def forward(self, x, edge_index, edge_type, edge_norm=None):
        print('in_channels: ', self.in_channels)
        print('out_channels: ', self.out_channels)
        print('num_relations: ', self.num_relations)
        print('num_bases: ', self.num_bases)
        
        print('basis: ', self.basis.shape)
        print('att: ', self.att.shape)
        print('root: ', self.root.shape)

        """"""
        if x is None:
            x = torch.arange(
                edge_index.max().item() + 1,
                dtype=torch.long,
                device=edge_index.device)

        print('x: ', x.shape)
        print('edge_index: ', edge_index.shape)
        
        return self.propagate(
            'add', edge_index, x=x, edge_type=edge_type, edge_norm=edge_norm)


    def message(self, x_j, edge_type, edge_norm):
        w = torch.matmul(self.att, self.basis.view(self.num_bases, -1))
        print('w1 ', w.shape)
        print('x_j: ', x_j.shape, x_j.min(), x_j.max())
        print('edge_type: ', edge_type.shape, edge_type.min(), edge_type.max())
        print('edge_norm: ', edge_norm)

        # ネットワークの最初の段階で，one-hot vectorを入力した場合
        if x_j.dtype == torch.long:
            print('torch is long')
            w = w.view(-1, self.out_channels)
            print('w2: ', w.shape)
            index = edge_type * self.in_channels + x_j
            print('index: ', index.shape)
            out = w[index]
            print('out: ', out.shape)
            return out if edge_norm is None else out * edge_norm.view(-1, 1)
        
        # ネットワークの中間層の段階で，中間特徴量を入力した場合
        else:
            print('torch is not long')
            w = w.view(self.num_relations, self.in_channels, self.out_channels)
            w = w[edge_type]
            out = torch.bmm(x_j.unsqueeze(1), w).squeeze(-2)
            return out if edge_norm is None else out * edge_norm.view(-1, 1)

    def update(self, aggr_out, x):
        # propagateで指定されたaggregateが行われた結果が，aggr_outとしてくる．
        print('aggr_out: ', aggr_out.shape, aggr_out.min(), aggr_out.max())
        print('x: ', x.shape, x.min(), x.max())
        print('root: ', self.root.shape, self.root.min(), self.root.max())
        # ネットワークの最初の段階で，one-hot vectorを入力した場合    
        if x.dtype == torch.long:
            print('self.root[x]: ', self.root[x].shape)
            # root[x]を足して，self-loopを別の重みであることを実現している．
            # つまり，rootがself-loopのweightを示している．
            out = aggr_out + self.root[x]
            
        # ネットワークの中間層の段階で，中間特徴量を入力した場合
        else:
            # rootとxをかけて，Wx(self-loop)を行なっている
            out = aggr_out + torch.matmul(x, self.root)

        if self.bias is not None:
            out = out + self.bias
        return out

    def __repr__(self):
        return '{}({}, {}, num_relations={})'.format(
            self.__class__.__name__, self.in_channels, self.out_channels,
            self.num_relations)


In [16]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = RGCNConv(
            data.num_nodes, 16, dataset.num_relations, num_bases=30)
        self.conv2 = RGCNConv(
            16, dataset.num_classes, dataset.num_relations, num_bases=30)

    def forward(self, edge_index, edge_type, edge_norm):
        x = F.relu(self.conv1(None, edge_index, edge_type))
        print(' ')
        x = self.conv2(x, edge_index, edge_type)
        return F.log_softmax(x, dim=1)

In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, data = Net().to(device), data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0005)

In [24]:
out = model(data.edge_index, data.edge_type, data.edge_norm)
print('')
print(data.train_idx.shape)
print(data.test_idx.shape)
print(out.shape)

in_channels:  23644
out_channels:  16
num_relations:  46
num_bases:  30
basis:  torch.Size([30, 23644, 16])
att:  torch.Size([46, 30])
root:  torch.Size([23644, 16])
x:  torch.Size([23644])
edge_index:  torch.Size([2, 148454])
w1  torch.Size([46, 378304])
x_j:  torch.Size([148454]) tensor(0) tensor(23643)
edge_type:  torch.Size([148454]) tensor(0) tensor(45)
edge_norm:  None
torch is long
w2:  torch.Size([1087624, 16])
index:  torch.Size([148454])
out:  torch.Size([148454, 16])
aggr_out:  torch.Size([23644, 16]) tensor(-0.0004, grad_fn=<MinBackward1>) tensor(0.0002, grad_fn=<MaxBackward1>)
x:  torch.Size([23644]) tensor(0) tensor(23643)
root:  torch.Size([23644, 16]) tensor(-0.0012, grad_fn=<MinBackward1>) tensor(0.0012, grad_fn=<MaxBackward1>)
self.root[x]:  torch.Size([23644, 16])
 
in_channels:  16
out_channels:  2
num_relations:  46
num_bases:  30
basis:  torch.Size([30, 16, 2])
att:  torch.Size([46, 30])
root:  torch.Size([16, 2])
x:  torch.Size([23644, 16])
edge_index:  torch.Siz

In [8]:
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data.edge_index, data.edge_type, data.edge_norm)
    # train_maskとtest_maskっぽいのが使用されているのは，ここ
    F.nll_loss(out[data.train_idx], data.train_y).backward()
    optimizer.step()


def test():
    model.eval()
    out = model(data.edge_index, data.edge_type, data.edge_norm)
    pred = out[data.test_idx].max(1)[1]
    acc = pred.eq(data.test_y).sum().item() / data.test_y.size(0)
    return acc

In [9]:
for epoch in range(1):
    train()
#     test_acc = test()
#     print('Epoch: {:02d}, Accuracy: {:.4f}'.format(epoch, test_acc))

in_channels:  23644
out_channels:  16
num_relations:  46
num_bases:  30
basis:  torch.Size([30, 23644, 16])
att:  torch.Size([46, 30])
root:  torch.Size([23644, 16])
x:  torch.Size([23644])
edge_index:  torch.Size([2, 148454])
w1  torch.Size([46, 378304])
x_j:  torch.Size([148454]) tensor(0) tensor(23643)
edge_type:  torch.Size([148454]) tensor(0) tensor(45)
edge_norm:  None
torch is long
w2:  torch.Size([1087624, 16])
index:  torch.Size([148454])
out:  torch.Size([148454, 16])
aggr_out:  torch.Size([23644, 16]) tensor(-0.0003, grad_fn=<MinBackward1>) tensor(0.0005, grad_fn=<MaxBackward1>)
x:  torch.Size([23644]) tensor(0) tensor(23643)
root:  torch.Size([23644, 16]) tensor(-0.0012, grad_fn=<MinBackward1>) tensor(0.0012, grad_fn=<MaxBackward1>)
self.root[x]:  torch.Size([23644, 16])
 
in_channels:  16
out_channels:  2
num_relations:  46
num_bases:  30
basis:  torch.Size([30, 16, 2])
att:  torch.Size([46, 30])
root:  torch.Size([16, 2])
x:  torch.Size([23644, 16])
edge_index:  torch.Siz

# Experiment

In [223]:
in_c = 100
out_c = 10
num_relations = 5

ord_basis = [Param(torch.Tensor(1, in_c, out_c)) for _ in range(num_relations)]

In [224]:
len(ord_basis)

5

In [225]:
tmp, w = 0, 0
for relation in range(num_relations):
    tmp = ord_basis[relation]
    if relation == 0:
        w = tmp
    else:
        w = torch.cat((w, tmp), 0)

In [226]:
w.requires_grad

True

In [227]:
ord_basis[0].requires_grad

True

In [228]:
data.edge_index.shape

torch.Size([2, 148454])

In [229]:
data.edge_type.shape

torch.Size([148454])

In [230]:
dataset.num_relations

46

In [27]:
num_node = data.edge_index.max() + 1

In [32]:
edge = torch.where(data.edge_index == 23000, 
                   torch.ones(data.edge_index.shape, 
                              dtype=torch.int64)*dataset.num_relations, 
                   torch.zeros(data.edge_index.shape, dtype=torch.int64))


print(data.edge_index.dtype, edge.dtype)
print(torch.sum(edge[0]))

relation = data.edge_type + edge[0]

import numpy as np
print(set(list(np.array(relation))))

print(dataset.num_relations)

relation = torch.where(relation >= dataset.num_relations, 
#                        torch.ones(relation.shape, dtype=relation.dtype),
                       relation - dataset.num_relations, 
                       torch.zeros(relation.shape, dtype=relation.dtype))

print(set(list(np.array(relation))))

# edge_normは最終的に，edges x 1のshapeになる必要がある．
# つまり，target_edgeのもつedgeの本数がわかればよく，
# target_edges x 1のshapeで要素にnum_edgesが入る．
# そこからそれの逆行列を取り，それをoutに通常の乗算を行う．

node_norm = torch.zeros(num_node, dtype=data.edge_index.dtype)
print(node_norm.size(0))

import time

start = time.time()
for idx in range(node_norm.size(0)):
    edge = torch.where(
                        data.edge_index == idx,
                        torch.ones(data.edge_index.shape,
                                   dtype=data.edge_index.dtype),
                        torch.zeros(data.edge_index.shape, 
                                    dtype=data.edge_index.dtype)
                        )
    
    node_norm[idx] = int(torch.sum(edge) / 2)
    
print('Elapsed: ', time.time() - start)

torch.int64 torch.int64
tensor(230)
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 49, 53, 54}
46
{0, 8, 3, 7}
23644
Elapsed:  88.96163487434387


In [33]:
print(node_norm.mean())

RuntimeError: Can only calculate the mean of floating types. Got Long instead.

In [119]:
a = torch.ones(10, 3)
b = torch.arange(10, dtype=a.dtype)
print(torch.diag(b).shape)
c = torch.matmul(torch.diag(b), a)
print(c)
d = a * b.unsqueeze(1)
print(d)

torch.Size([10, 10])
tensor([[0., 0., 0.],
        [1., 1., 1.],
        [2., 2., 2.],
        [3., 3., 3.],
        [4., 4., 4.],
        [5., 5., 5.],
        [6., 6., 6.],
        [7., 7., 7.],
        [8., 8., 8.],
        [9., 9., 9.]])
tensor([[0., 0., 0.],
        [1., 1., 1.],
        [2., 2., 2.],
        [3., 3., 3.],
        [4., 4., 4.],
        [5., 5., 5.],
        [6., 6., 6.],
        [7., 7., 7.],
        [8., 8., 8.],
        [9., 9., 9.]])


## Read the dataset

In [48]:
import pandas as pd

In [101]:
train_path = '../../data/ml-100k/u1.base'
col_names = ['user_id', 'item_id', 'relation', 'ts']
train_raw_data = pd.read_csv(train_path, sep='\t', names=col_names)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [108]:
num_users = 943
num_items = 1682
num_nodes = num_users + num_items
num_edges = 100000
num_train_edges = 80000
num_test_edges = 20000

In [109]:
train_data = train_raw_data.drop('ts', axis=1)
train_data['user_id'] = train_data['user_id'] - 1
train_data['item_id'] = train_data['item_id'] + num_users - 1

In [169]:
print(train_data.max()['user_id'])
print(len(train_data))

942
80000


In [114]:
x = torch.arange(num_nodes, dtype=torch.long, device=device)

In [110]:
edge_user = torch.tensor(train_data['user_id'].values)
edge_item = torch.tensor(train_data['item_id'].values)
edge_index = torch.stack((torch.cat((edge_user, edge_item), 0),
                          torch.cat((edge_item, edge_user), 0)), 0)
edge_index = edge_index.to(device).to(torch.long)

In [116]:
edge_type = torch.tensor(train_data['relation'])
edge_type = torch.cat((edge_type, edge_type), 0)

In [118]:
print(x.shape)
print(edge_index.shape)
print(edge_type.shape)

torch.Size([2625])
torch.Size([2, 160000])
torch.Size([160000])


In [164]:
import copy
import time

edge_norm = copy.deepcopy(edge_index[1])
print(edge_norm)

start = time.time()
for idx in range(num_nodes):
    count = (train_data == idx).values.sum()
#     count = torch.sum(torch.where(edge_index == idx,
#                                   torch.ones(edge_index.shape,
#                                              dtype=torch.long),
#                                   torch.zeros(edge_index.shape, 
#                                               dtype=torch.long)))
    
    edge_norm = torch.where(edge_norm==idx,
                            torch.tensor(count),
                            edge_norm)

print('Elapsed: ', time.time() - start)

print(1 / edge_norm.to(torch.double), edge_norm.shape)

tensor([943, 944, 945,  ..., 942, 942, 942])
Elapsed:  6.865419149398804
tensor([0.0026, 0.0095, 0.0133,  ..., 0.0060, 0.0060, 0.0060],
       dtype=torch.float64) torch.Size([160000])


In [141]:
count = (train_data == 942).values.sum()
print(count)

168


In [72]:
train_data.head(10)

Unnamed: 0,user_id,item_id,relation
0,0,943,5
1,0,944,3
2,0,945,4
3,0,946,3
4,0,947,3
5,0,949,4
6,0,950,1
7,0,951,5
8,0,953,2
9,0,955,5
