[Metapath2vec paper](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf)


In [9]:
from typing import Optional, Callable, List

import os
import os.path as osp
import shutil

import torch
import pandas
from torch_sparse import coalesce, transpose
from torch_geometric.data import (InMemoryDataset, Data, download_url,
                                  extract_zip)

import numpy as np
from torch_geometric.datasets import AMiner
from torch_geometric.nn import MetaPath2Vec

In [None]:
# Get author labels.
# `id_author.txt` is read as tab-separated (idx, name) pairs and indexed by
# name, so the label table can be joined against it on its `name` column.
path = osp.join('net_aminer', 'id_author.txt')
author = pandas.read_csv(path, sep='\t', names=['idx', 'name'], index_col=1)

# The label file is read as space-separated (name, y) pairs.
path = osp.join('label', 'googlescholar.8area.author.label.txt')
df = pandas.read_csv(path, sep=' ', names=['name', 'y'])
# Attach the `idx` of every labelled author by looking its name up in `author`.
df = df.join(author, on='name')

# Labels are shifted down by one (presumably 1-based in the file -- TODO confirm).
author_y = torch.from_numpy(df['y'].values) - 1
# `idx` of each labelled author, aligned row-for-row with `author_y`.
author_y_index = torch.from_numpy(df['idx'].values)

In [31]:
!ls label/

googlescholar.8area.author.label.txt  googlescholar.8area.venue.label.txt


In [46]:
# Get venue labels.
# `id_conf.txt` is read as tab-separated (idx, name) pairs and indexed by
# name, so the label table can be joined against it on its `name` column.
path = osp.join('net_aminer', 'id_conf.txt')
venue = pandas.read_csv(path, sep='\t', names=['idx', 'name'], index_col=1)

# The label file is read as space-separated (name, y) pairs.
path = osp.join('label', 'googlescholar.8area.venue.label.txt')
df = pandas.read_csv(path, sep=' ', names=['name', 'y'])
# Attach the `idx` of every labelled venue by looking its name up in `venue`.
df = df.join(venue, on='name')

# Labels are shifted down by one (presumably 1-based in the file -- TODO confirm).
venue_y = torch.from_numpy(df['y'].values) - 1
# `idx` of each labelled venue, aligned row-for-row with `venue_y`.
venue_y_index = torch.from_numpy(df['idx'].values)

In [35]:
# Get paper <-> author connectivity.
path = osp.join('net_aminer', 'paper_author.txt')
paper_author = pandas.read_csv(path, sep='\t', header=None)
paper_author = torch.from_numpy(paper_author.values)
# `.t()` transposes the 2-D tensor, so the rows read from the file become
# columns: row 0 then holds the paper ids and row 1 the author ids.
# `.contiguous()` materialises the transposed view as its own memory block.
paper_author = paper_author.t().contiguous()
# Sparse-matrix dimensions are taken from the largest id on each side.
M, N = int(paper_author[0].max() + 1), int(paper_author[1].max() + 1)
# Sort the edges and merge duplicates, then derive the reverse edge set.
paper_author, _ = coalesce(paper_author, None, M, N)
author_paper, _ = transpose(paper_author, None, M, N)

_coalesce function_


torch_sparse.coalesce(index, value, m, n, op="add") -> (torch.LongTensor, torch.Tensor)

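Row-wise sorts `index` and removes duplicate entries. The values of duplicate entries are combined with `op` (summed by default, as the example below shows).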

In [40]:
# Example: a 3 x 2 sparse matrix given as six (row, col) entries, each carrying
# a 2-element value. The entries (1, 0) and (0, 1) both appear twice.
index = torch.tensor([
    [1, 0, 1, 0, 2, 1],
    [0, 1, 1, 1, 0, 0],
])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])

# After coalescing, the entries are sorted and the duplicates are summed.
index, value = coalesce(index, value, m=3, n=2)

print(index, value, sep='\n')

tensor([[0, 1, 1, 2],
        [1, 0, 1, 0]])
tensor([[6., 8.],
        [7., 9.],
        [3., 4.],
        [5., 6.]])


_transpose function_

torch_sparse.transpose(index, value, m, n) -> (torch.LongTensor, torch.Tensor)

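Transposes dimensions 0 and 1 of a sparse matrix. The result is returned coalesced (sorted, with duplicate entries merged), as the example below shows.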

In [43]:
# Example: the same six-entry 3 x 2 sparse matrix as in the coalesce example.
index = torch.tensor([
    [1, 0, 1, 0, 2, 1],
    [0, 1, 1, 1, 0, 0],
])
value = torch.Tensor([[1, 2], [2, 3], [3, 4], [4, 5], [5, 6], [6, 7]])

# Swap the row and column roles of every entry (m=3 rows, n=2 columns).
index, value = transpose(index, value, 3, 2)

print(index, value, sep='\n')

tensor([[0, 0, 1, 1],
        [1, 2, 0, 1]])
tensor([[7., 9.],
        [5., 6.],
        [6., 8.],
        [3., 4.]])


In [44]:
# Get paper <-> venue connectivity.
path = osp.join('net_aminer', 'paper_conf.txt')
paper_venue = pandas.read_csv(path, sep='\t', header=None)
paper_venue = torch.from_numpy(paper_venue.values)
# Transpose so that row 0 holds the paper ids and row 1 the venue ids, then
# materialise the transposed view as its own memory block.
paper_venue = paper_venue.t().contiguous()
# Sparse-matrix dimensions are taken from the largest id on each side.
M, N = int(paper_venue[0].max() + 1), int(paper_venue[1].max() + 1)
# Sort the edges and merge duplicates, then derive the reverse edge set.
paper_venue, _ = coalesce(paper_venue, None, M, N)
venue_paper, _ = transpose(paper_venue, None, M, N)

In [47]:
# Bundle everything into one graph object: four directed edge sets (each
# relation together with its reverse), the labels and labelled-node indices
# for authors and venues, and the node count of every node type.
data = Data(
    edge_index_dict={
        ('paper', 'written by', 'author'): paper_author,
        ('author', 'wrote', 'paper'): author_paper,
        ('paper', 'published in', 'venue'): paper_venue,
        ('venue', 'published', 'paper'): venue_paper,
    },
    y_dict={
        'author': author_y,
        'venue': venue_y,
    },
    y_index_dict={
        'author': author_y_index,
        'venue': venue_y_index,
    },
    num_nodes_dict={
        # The paper count is derived from the largest paper id in the edges;
        # author and venue counts are the row counts of their id tables.
        'paper': int(paper_author[0].max()) + 1,
        'author': author.shape[0],
        'venue': venue.shape[0],
    },
)

In [62]:
# Display the assembled graph object to inspect the size of each entry.
data

Data(
  edge_index_dict={
    ('paper', 'written by', 'author')=[2, 9323605],
    ('author', 'wrote', 'paper')=[2, 9323605],
    ('paper', 'published in', 'venue')=[2, 3194405],
    ('venue', 'published', 'paper')=[2, 3194405]
  },
  num_nodes_dict={
    paper=3194405,
    author=1693531,
    venue=3883
  },
  y_dict={
    author=[246678],
    venue=[134]
  },
  y_index_dict={
    author=[246678],
    venue=[134]
  }
)

In [2]:
# Reference note only: the MetaPath2Vec constructor arguments, kept as a string
# literal so nothing is executed (the cell merely echoes the string).
# NOTE(review): the constructor call later in this notebook passes
# `walks_per_node`, so `walks_pernode` below looks like a typo.
'''Class MetaPath2vec(
edge_index_dict,
embedding_dim,
metapath,
walk_length,
context_size,
walks_pernode,
num_negative_samples,
num_nodes_dict,
sparse)
'''

'Class MetaPath2vec(\nedge_index_dict,\nembedding_dim,\nmetapath,\nwalk_length,\ncontext_size,\nwalks_pernode,\nnum_negative_samples,\nnum_nodes_dict,\nsparse)\n'

Metapath2vec


- Given a specific Schema P


스키마 p 를 manually building 해주어야함.

- Sample random walks from the input graph that follow the meta-path scheme


random walk 를 heterogeneous 하게 적용했다고 생각함.


- Use the skip-gram model

In [63]:
# Load the dataset (disabled: the code is wrapped in a string literal, so it is
# only echoed as the cell output and never executed).
'''
path = osp.join('..', 'data', 'AMiner')
dataset = AMiner(path)
data = dataset[0]
'''

"\npath = osp.join('..', 'data', 'AMiner')\ndataset = AMiner(path)\ndata = dataset[0]\n"

In [68]:
# Sanity check: container type of the edge sets, followed by the
# paper -> author edge index itself.
print(
    type(data.edge_index_dict),
    data.edge_index_dict[('paper', 'written by', 'author')],
    sep='\n',
)

<class 'dict'>
tensor([[      0,       1,       2,  ..., 3194404, 3194404, 3194404],
        [      0,       1,       2,  ...,    4393,   21681,  317436]])


In [112]:
# cpu -> GPU
# The automatic CUDA selection below is commented out and the device is pinned
# to the CPU (the closing note of this notebook reports GPU out-of-memory
# errors even at batch size 32).
#device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

In [113]:
# modeling

# Meta-path scheme author -> paper -> venue -> paper -> author; it starts and
# ends on the same node type.
metapath = [
    ('author', 'wrote', 'paper'),
    ('paper', 'published in', 'venue'),
    ('venue', 'published', 'paper'),
    ('paper', 'written by', 'author'),
]

# sparse=True is paired with the SparseAdam optimizer created further down.
model = MetaPath2Vec(
    data.edge_index_dict,
    embedding_dim=128,
    metapath=metapath,
    walk_length=5,
    context_size=3,
    walks_per_node=3,
    num_negative_samples=1,
    sparse=True,
)
model = model.to(device)

In [114]:
# loader
# Batch loader created by the model; iterating it yields
# (positive, negative) random-walk sample pairs.
loader = model.loader(batch_size=32, shuffle=True, num_workers=4)

In [115]:
# Peek at the first ten batches and print the shapes of their positive and
# negative samples. The batch with idx 10 is still fetched before the loop
# stops, so `pos_rw` / `neg_rw` hold that batch afterwards.
for idx, (pos_rw, neg_rw) in enumerate(loader):
    if idx >= 10:
        break
    print(idx, pos_rw.shape, neg_rw.shape)

0 torch.Size([384, 3]) torch.Size([384, 3])
1 torch.Size([384, 3]) torch.Size([384, 3])
2 torch.Size([384, 3]) torch.Size([384, 3])
3 torch.Size([384, 3]) torch.Size([384, 3])
4 torch.Size([384, 3]) torch.Size([384, 3])
5 torch.Size([384, 3]) torch.Size([384, 3])
6 torch.Size([384, 3]) torch.Size([384, 3])
7 torch.Size([384, 3]) torch.Size([384, 3])
8 torch.Size([384, 3]) torch.Size([384, 3])
9 torch.Size([384, 3]) torch.Size([384, 3])


In [116]:
# Show the first positive and the first negative sample of the batch left over
# from the loop above.
print(pos_rw[0], neg_rw[0])

tensor([1489937, 4185789, 4890570]) tensor([1489937, 4138251, 4890223])


In [117]:
import itertools

In [118]:
# Initialize the optimizer (SparseAdam, matching the sparse=True model above).
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

In [119]:
def train(epoch, log_steps=500, eval_steps=1000):
    """Run one pass over ``loader`` and update the model after every batch.

    Args:
        epoch: Epoch number; only used to label the printed progress lines.
        log_steps: Every ``log_steps`` batches, print the mean loss of the
            batches seen since the previous loss report.
        eval_steps: Every ``eval_steps`` batches, call ``test()`` and print
            the value it returns as accuracy.
    """
    model.train()
    num_steps = len(loader)

    total_loss = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if (i + 1) % log_steps == 0:
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{num_steps}, '
                   f'Loss: {total_loss / log_steps:.4f}'))
            total_loss = 0

        if (i + 1) % eval_steps == 0:
            acc = test()
            # test() switches the model to eval mode; switch back so the
            # remaining batches of this epoch are trained in train mode.
            model.train()
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{num_steps}, '
                   f'Acc: {acc:.4f}'))

# Evaluation

@torch.no_grad()
def test(train_ratio=0.1):
    """Score the author embeddings on the labelled authors.

    The labelled authors are shuffled and split in two: the first
    ``train_ratio`` share is handed to ``model.test`` as the training part and
    the remainder as the test part.

    Args:
        train_ratio: Fraction of the labelled authors put in the training part.

    Returns:
        Whatever ``model.test`` returns (the callers print it as accuracy).
    """
    model.eval()

    labels = data.y_dict['author']
    embeddings = model('author', batch=data.y_index_dict['author'])

    # torch.randperm(n) returns a random permutation of the integers 0..n-1.
    num_labeled = embeddings.size(0)
    num_train = int(num_labeled * train_ratio)
    shuffled = torch.randperm(num_labeled)
    train_idx, test_idx = shuffled[:num_train], shuffled[num_train:]

    return model.test(embeddings[train_idx], labels[train_idx],
                      embeddings[test_idx], labels[test_idx], max_iter=150)

In [120]:
# Reference note only: the argument list of torch.randperm plus a usage
# example, kept as a string literal so nothing is executed.
'''torch.randperm
argument
n, 
*, 
generator=None, 
out=None, 
dtype=torch.int64, 
layout=torch.strided, 
device=None, 
requires_grad=False, 
pin_memory=False) → Tensor

>>> torch.randperm(4)
tensor([2, 1, 0, 3])
'''

'torch.randperm\nargument\nn, \n*, \ngenerator=None, \nout=None, \ndtype=torch.int64, \nlayout=torch.strided, \ndevice=None, \nrequires_grad=False, \npin_memory=False) → Tensor\n\n>>> torch.randperm(4)\ntensor([2, 1, 0, 3])\n'

In [121]:
# Train for a single epoch (numbered 1), then evaluate once more and report
# the accuracy.
for epoch in range(1, 2):
    train(epoch)
    acc = test()
    print(f'Epoch: {epoch}, Accuracy: {acc}')

Epoch: 1, Step: 00500/52923, Loss: 9.7243
Epoch: 1, Step: 01000/52923, Loss: 9.4063
Epoch: 1, Step: 01000/52923, Acc: 0.2763
Epoch: 1, Step: 01500/52923, Loss: 9.0478
Epoch: 1, Step: 02000/52923, Loss: 8.7355
Epoch: 1, Step: 02000/52923, Acc: 0.2768
Epoch: 1, Step: 02500/52923, Loss: 8.4120
Epoch: 1, Step: 03000/52923, Loss: 8.1381
Epoch: 1, Step: 03000/52923, Acc: 0.2753
Epoch: 1, Step: 03500/52923, Loss: 7.9349
Epoch: 1, Step: 04000/52923, Loss: 7.7319
Epoch: 1, Step: 04000/52923, Acc: 0.2737
Epoch: 1, Step: 04500/52923, Loss: 7.5585
Epoch: 1, Step: 05000/52923, Loss: 7.3996
Epoch: 1, Step: 05000/52923, Acc: 0.2761
Epoch: 1, Step: 05500/52923, Loss: 7.2330
Epoch: 1, Step: 06000/52923, Loss: 7.0731
Epoch: 1, Step: 06000/52923, Acc: 0.2756
Epoch: 1, Step: 06500/52923, Loss: 6.9587
Epoch: 1, Step: 07000/52923, Loss: 6.8318
Epoch: 1, Step: 07000/52923, Acc: 0.2760
Epoch: 1, Step: 07500/52923, Loss: 6.7207
Epoch: 1, Step: 08000/52923, Loss: 6.6283
Epoch: 1, Step: 08000/52923, Acc: 0.2755


batch size 를 128에서 32까지 줄여봤음에도 불구하고 gpu oom 이 발생함...
부득이하게 device 를 cpu로 전환하였습니다...