**Metapath2vec**

Metapath2vec is an algorithm for performing random walks on heterogeneous networks.

In real life, many networks are **heterogeneous**. However, earlier algorithms mainly focus on learning embeddings of homogeneous networks, so this paper proposes a new way to perform random walks on heterogeneous networks.

What is a heterogeneous network? It is a network containing **different types** of nodes and edges. For example, in an academic network the nodes could be authors, papers, venues, etc., and the edges could be co-author (author-author), publish (author-paper), belong (paper-venue), etc. In such a network, simply performing an ordinary random walk would lose the information carried by these node and edge types.

The authors first define some **metapaths** such as APA, APVPA, etc. APA means two authors co-authored a paper, and APVPA means two authors each published a paper in the same venue. In this way, the heterogeneous information of the nodes and edges is maintained.

For each metapath, they perform random walks that follow the node-type pattern of the path to generate the walks. They then train a model similar to LINE on these walks.

In [None]:
import os
import re
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from collections import defaultdict
import numpy as np
from sklearn.cluster import KMeans
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Synthetic academic network. Node ids are contiguous integers: authors first,
# then papers, then venues, so the three id ranges never overlap.
authornum = 100
papernum = 5000
venuenum = 20
authorset = set(range(0, authornum))
paperset = set(range(authornum, authornum + papernum))
venueset = set(range(authornum + papernum, authornum + papernum + venuenum))
author2paper = {}
paper2venue = {}
edges = {}

# random.sample / random.choice need a sequence; build each list once instead
# of converting the set again on every loop iteration.
author_list = list(authorset)
paper_list = list(paperset)
venue_list = list(venueset)


def link(typed_edges, left, right):
  """Record an edge in `typed_edges` and, in both directions, in `edges`."""
  typed_edges[(left, right)] = 1
  edges[(left, right)] = 1
  edges[(right, left)] = 1


# Each author writes a random number of papers, between 1 and
# papernum / authornum * 5.
max_papers = int(papernum / authornum * 5)
tmp = set()
for author in authorset:
  n = random.randint(1, max_papers)
  papers = random.sample(paper_list, n)
  for paper in papers:
    link(author2paper, author, paper)
  tmp.update(papers)

# Papers that no author picked are assigned one random author so that every
# paper has at least one.
tmp = paperset - tmp
for paper in tmp:
  link(author2paper, random.choice(author_list), paper)

# Every paper is published in one random venue.
tmp = set()
for paper in paperset:
  venue = random.choice(venue_list)
  tmp.add(venue)
  link(paper2venue, paper, venue)

# Venues that received no paper get one random paper so none is isolated.
tmp = venueset - tmp
for venue in tmp:
  link(paper2venue, random.choice(paper_list), venue)

# Single-letter tags for the three node types.
A = 'a'
P = 'p'
V = 'v'

nodes = {A: list(authorset), P: list(paperset), V: list(venueset)}
metapaths = [[A, P, V, P, A], [A, P, A]]

print("num of authors: {0}, num of papers: {1}, num of venues: {2}".format(len(nodes[A]), len(nodes[P]), len(nodes[V])))
print("edge of author-paper: {0}".format(len(author2paper)))
print("edge of venue-paper: {0}".format(len(paper2venue)))

num of authors: 100, num of papers: 5000, num of venues: 20
edge of author-paper: 12244
edge of venue-paper: 5000


In [None]:
# node2neighbors[node][neighbor_type][neighbor] holds the edge weight divided
# by the number of neighbors of that type the node has.
node2neighbors = defaultdict(lambda: defaultdict(dict))

# Pass 1: copy the raw edge weights in both directions, bucketed by the type
# of the node at the far end of the edge.
for (author, paper), weight in author2paper.items():
  node2neighbors[author][P][paper] = weight
  node2neighbors[paper][A][author] = weight

for (paper, venue), weight in paper2venue.items():
  node2neighbors[paper][V][venue] = weight
  node2neighbors[venue][P][paper] = weight

# Pass 2: every bucket is complete now, so divide each weight by its bucket
# size. Only values are reassigned, so iterating the dicts here is safe.
for typed_neighbors in node2neighbors.values():
  for neighbors in typed_neighbors.values():
    degree = len(neighbors)
    for neighbor in neighbors:
      neighbors[neighbor] /= degree

In [None]:
class alias():
  """Alias-method sampler over the keys of a probability dict.

  `probs` maps each item to its probability. After construction, `table`
  holds for each item the chance of keeping it when its slot is drawn, and
  `aliastable` holds the item returned otherwise.
  """

  def __init__(self, probs):
    self.n = len(probs)
    self.scaledprobs = {}
    self.table = {}
    self.aliastable = {}
    self.small = []
    self.big = []
    self.keys = list(probs.keys())

    # Scale every probability by n so that an "average" slot has mass 1,
    # then sort the items into under-full and over-full work lists.
    for key, prob in probs.items():
      self.scaledprobs[key] = prob * self.n
      self._bucket(key)

    # Pair one under-full slot with one over-full slot: the over-full item
    # donates exactly the mass needed to top the under-full slot up to 1.
    while self.small and self.big:
      under = self.small.pop()
      over = self.big.pop()
      remainder = self.scaledprobs[over] - (1 - self.scaledprobs[under])
      self.table[under] = self.scaledprobs[under]
      self.aliastable[under] = over
      self.scaledprobs[over] = remainder
      self._bucket(over)

    # Anything left on either list has no partner; treat its slot as full.
    for leftovers in (self.small, self.big):
      while leftovers:
        self.table[leftovers.pop()] = 1

  def _bucket(self, key):
    """File `key` as over-full, under-full or exactly full by its scaled mass."""
    mass = self.scaledprobs[key]
    if mass > 1:
      self.big.append(key)
    elif mass < 1:
      self.small.append(key)
    else:
      self.table[key] = 1

  def sampling_one(self):
    """Draw one item: pick a slot uniformly, then keep it or take its alias."""
    slot = random.choice(self.keys)
    keep_threshold = self.table[slot]
    if random.uniform(0, 1) <= keep_threshold:
      return slot
    return self.aliastable[slot]

  def sampling_n(self, n):
    """Return a list of `n` independent draws."""
    return [self.sampling_one() for _ in range(n)]

In [None]:
# nodeprobs[node][neighbor_type] is an alias sampler over that node's
# neighbors of the given type.
# Build each sampler exactly once by walking the (node, type) buckets, rather
# than once per edge, which rebuilt the same table degree-many times.
nodeprobs = defaultdict(dict)
for node, typed_neighbors in node2neighbors.items():
  for node_type, neighbors in typed_neighbors.items():
    # Skip empty buckets (e.g. created by an accidental defaultdict lookup);
    # a sampler with no keys could never produce a sample.
    if neighbors:
      nodeprobs[node][node_type] = alias(neighbors)

In [None]:
# Every author starts `walk_num` walks, each containing `walk_len` nodes.
walk_len = 10
walk_num = 10
walks = []
for node in range(authornum):
  for _ in range(walk_num):
    walk = [node]
    # Choose the metapath for this walk: 0 follows A-P-V-P-A, 1 follows A-P-A.
    t = random.randint(0, 1)
    while len(walk) < walk_len:
      prev = walk[-1]
      if prev in paperset:
        # On A-P-V-P-A a paper reached from an author continues to a venue;
        # in every other case the walk goes from the paper to an author.
        heading_to_venue = t == 0 and walk[-2] in authorset
        step_type = V if heading_to_venue else A
      else:
        # Authors and venues are only linked to papers.
        step_type = P
      walk.append(nodeprobs[prev][step_type].sampling_one())
    walks.append(walk)

In [None]:
# Sanity check: report how many walks were generated and show the first one.
print("number of walks: {0}".format(len(walks)))
print(walks[0])

number of walks: 1000
[0, 2159, 5115, 1350, 87, 3749, 5109, 331, 93, 1042]


In [None]:
# Total number of nodes over all three types; used as the one-hot vector length.
wordsz = authornum + papernum + venuenum
def sampling_negedge(dataset, batchsz):
  """Sample a training batch of positive pairs plus same-type negatives.

  Draws `batchsz` distinct [source, target] pairs from `dataset` and labels
  them 1. For each one it then tries up to `batchsz` times to find a random
  node of the same type as `target` that is not connected to `source` in
  `edges`; the first such node is added as a [source, node, -1] row.

  Returns a list of [source, target, label] rows. A positive pair has no
  negative row if every attempt hit an existing edge, or if `target` belongs
  to none of the three node sets.

  Raises ValueError (from random.sample) if `dataset` has fewer than
  `batchsz` entries.
  """
  # Build the candidate lists once per call; converting a set to a list for
  # every single draw made each negative sample cost O(|set|).
  pools = [(nodeset, list(nodeset)) for nodeset in (authorset, paperset, venueset)]

  trainset = []
  for source, target in random.sample(dataset, batchsz):
    trainset.append([source, target, 1])
    candidates = next((pool for nodeset, pool in pools if target in nodeset), None)
    if candidates is None:
      # Unknown node type: there is no pool to draw a negative from.
      continue
    for _ in range(batchsz):
      new = random.choice(candidates)
      if (source, new) not in edges:
        trainset.append([source, new, -1])
        break
  return trainset

def one_hot(node):
  """Return a list of `wordsz` zeros with a single 1 at index `node`."""
  encoding = [0 for _ in range(wordsz)]
  encoding[node] = 1
  return encoding

def gen_data(walks, window=None):
  """Turn walks into skip-gram style [source, target] pairs.

  Each node in a walk is paired with every other position at most `window`
  steps before or after it in the same walk.

  Args:
    walks: iterable of walks, each a sequence of node ids.
    window: context radius. When omitted, the module-level `windowsz` is
      used, which keeps existing `gen_data(walks)` calls working.

  Returns:
    A list of [source, target] pairs, in walk order.
  """
  if window is None:
    window = windowsz
  dataset = []
  for walk in walks:
    for i, source in enumerate(walk):
      # Clamp the window to the walk so no index needs a bounds check.
      first = max(0, i - window)
      last = min(len(walk), i + window + 1)
      for j in range(first, last):
        if j != i:
          dataset.append([source, walk[j]])
  return dataset

def tensor_trainset(trainset):
  """Convert [source, target, label] rows into three tensors.

  Returns (vi, vj, labels): vi and vj stack the one-hot encodings of the
  source and target nodes, and labels holds the third element of each row.
  """
  vi = torch.Tensor([one_hot(row[0]) for row in trainset])
  vj = torch.Tensor([one_hot(row[1]) for row in trainset])
  labels = torch.tensor([row[2] for row in trainset])
  return vi, vj, labels

In [None]:
# Context radius: a node is paired with walk positions up to this many steps away.
windowsz = 2
# Flatten the random walks into [source, target] training pairs.
dataset = gen_data(walks)

In [None]:
epochs = 20  # number of training epochs
lr = 0.1  # SGD learning rate
batchsz = 64  # positive pairs drawn per batch
# NOTE(review): batches per epoch is derived from the number of graph edges,
# not from len(dataset) — confirm this is intended.
batchnum = len(edges) // batchsz
featuresz = 32  # embedding dimension

In [None]:
class Metapath2vec(nn.Module):
  """Embedding model trained on (source, target, label) node pairs.

  Nodes are fed in as one-hot rows, so the bias-free linear layer acts as an
  embedding lookup of width `featuresz`.
  """

  def __init__(self, nodenum, featuresz):
    super().__init__()
    self.node_embeddings = nn.Linear(nodenum, featuresz, bias=False)
    # Despite the attribute name, this is the log-sigmoid used by the loss.
    self.sigmoid = nn.LogSigmoid()
    # Re-initialise the weights uniformly in [-0.5, 0.5] / featuresz.
    initial = self.node_embeddings.weight.data.uniform_(-.5, .5)
    self.node_embeddings.weight.data = initial / featuresz

  def forward(self, vi, vj, labels):
    """Return the summed negative log-sigmoid loss for a batch of pairs.

    `labels` multiplies each pair's inner product before the log-sigmoid,
    so the sign of a label decides whether the pair is pulled together or
    pushed apart.
    """
    source_vecs = self.node_embeddings(vi)
    target_vecs = self.node_embeddings(vj)
    scores = (source_vecs * target_vecs).sum(dim=1)
    return -self.sigmoid(scores * labels).sum()

In [None]:
# Instantiate the model over all nodes and a plain SGD optimizer for its parameters.
model = Metapath2vec(wordsz, featuresz)
optimier = optim.SGD(model.parameters(), lr=lr)

In [None]:
def train():
  """Train the module-level `model` for `epochs` epochs of `batchnum` batches.

  Each batch is freshly sampled with `sampling_negedge`, converted to
  tensors, and used for one SGD step. After every epoch the loss summed over
  all batches, divided by `batchnum * batchsz`, is printed.
  """
  model.train()
  for epoch in range(epochs):
    total_loss = 0.0
    for _ in range(batchnum):
      trainset = sampling_negedge(dataset, batchsz)
      vi, vj, labels = tensor_trainset(trainset)
      model.zero_grad()
      loss = model(vi, vj, labels)
      loss.backward()
      optimier.step()
      # Accumulate a plain float: adding the loss tensor itself would keep
      # every batch's autograd graph alive until the end of the epoch.
      total_loss += loss.item()
    avgloss = total_loss / (batchnum * batchsz)
    print("epoch: {0}, loss: {1}".format(epoch, avgloss))

In [None]:
# Run the full training loop; it prints the average loss after every epoch.
train()

epoch: 0, loss: 1.374491572380066
epoch: 1, loss: 1.2628422975540161
epoch: 2, loss: 1.125109314918518
epoch: 3, loss: 1.040333867073059
epoch: 4, loss: 0.986588716506958
epoch: 5, loss: 0.960662841796875
epoch: 6, loss: 0.937824547290802
epoch: 7, loss: 0.9209144711494446
epoch: 8, loss: 0.9173742532730103
epoch: 9, loss: 0.9110962748527527
epoch: 10, loss: 0.9011550545692444
epoch: 11, loss: 0.9040653705596924
epoch: 12, loss: 0.8976414203643799
epoch: 13, loss: 0.9029335379600525
epoch: 14, loss: 0.8958593606948853
epoch: 15, loss: 0.8934189081192017
epoch: 16, loss: 0.8911520838737488
epoch: 17, loss: 0.8945470452308655
epoch: 18, loss: 0.8927327990531921
epoch: 19, loss: 0.8952814340591431


In [None]:
# List the k most similar nodes, by cosine similarity, for every author.
# The weight matrix is (featuresz, nodenum); transpose to one row per node.
# detach() keeps this analysis out of the autograd graph.
embedding = model.node_embeddings.weight.detach().T
# Normalise each row once, then a single matrix product yields all pairwise
# cosine similarities. Broadcasting (N, 1, D) against (1, N, D) instead would
# materialise an N x N x D intermediate tensor.
normalized = F.normalize(embedding, dim=1)
similaity = normalized @ normalized.T
a, idx = torch.sort(similaity, descending=True)
k = 4
# Column 0 is expected to be the node itself (similarity 1), so skip it.
lists = idx[:, 1:k+1]
# Author ids are 0 .. authornum - 1.
for i in range(authornum):
  print("[{0}] is similar to ".format(i), end="")
  for j in range(k):
    print("[{0}]".format(int(lists[i][j])), end=" ")
  print()

[0] is similar to [2989] [1585] [3020] [5043] 
[1] is similar to [3524] [339] [1360] [522] 
[2] is similar to [369] [1938] [3732] [4720] 
[3] is similar to [1051] [3426] [627] [4741] 
[4] is similar to [2613] [165] [1476] [4823] 
[5] is similar to [4490] [303] [4319] [3759] 
[6] is similar to [2589] [4282] [4074] [3698] 
[7] is similar to [1402] [2803] [2163] [4030] 
[8] is similar to [4468] [635] [2506] [371] 
[9] is similar to [3757] [1139] [5020] [3998] 
[10] is similar to [2682] [4390] [4812] [3131] 
[11] is similar to [2013] [4974] [4982] [1223] 
[12] is similar to [1394] [1930] [4006] [488] 
[13] is similar to [3246] [2592] [1961] [1880] 
[14] is similar to [3513] [2540] [1560] [2425] 
[15] is similar to [891] [2311] [2868] [2941] 
[16] is similar to [2509] [4108] [2695] [1765] 
[17] is similar to [2094] [2173] [2674] [3614] 
[18] is similar to [3550] [3048] [1391] [1911] 
[19] is similar to [1712] [3746] [4426] [395] 
[20] is similar to [4813] [3440] [1826] [334] 
[21] is simila