# Metapath2Vec 示例

针对异构图，我们使用Metapath2Vec的方法来进行无监督的学习。[Metapath2Vec](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf)是比较经典的异构图上表征学习的方法，它使用特定的metapath进行随机游走，然后对于metapath使用skip-gram模型进行表征学习。

之前我们通过DGL的randomwalk接口了解到了如何在DGL异构图上进行随机游走，获取了每个公司节点的随机游走的迹(trace)。Metapath2Vec使用了负采样的方法来计算没有边的领居点。这里我们使用自定义的负采样的方法，即从源节点没有连接的目标节点里选择节点。

In [29]:
import torch
import dgl
import pandas as pd
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
from torch import optim
from dgl.sampling import random_walk

### 导入保存的样例图数据

In [30]:
graphs, _ = dgl.load_graphs('./hetero_dgl.bin')
graph = graphs[0]
graph

Graph(num_nodes={'company': 3, 'product': 8},
      num_edges={('company', 'produce', 'product'): 8, ('company', 'suppliedby', 'company'): 3, ('company', 'supply', 'company'): 3, ('product', 'downstream', 'product'): 7, ('product', 'producedby', 'company'): 8, ('product', 'upstream', 'product'): 7},
      metagraph=[('company', 'product', 'produce'), ('company', 'company', 'suppliedby'), ('company', 'company', 'supply'), ('product', 'product', 'downstream'), ('product', 'product', 'upstream'), ('product', 'company', 'producedby')])

### 导入节点的原始名称并形成ID

In [31]:
df_company = pd.read_csv('raw_data/companies.csv')
df_product = pd.read_csv('raw_data/products.csv')

comp2id = {comp: i for i, comp in enumerate(df_company['CID'].unique())}
id2comp = {i: comp for i, comp in enumerate(df_company['CID'].unique())}

prod2id = {prod: i for i, prod in enumerate(df_product['PID'].unique())}
id2prod = {i: prod for i, prod in enumerate(df_product['PID'].unique())}

### 样例数据随机游走的数据处理

对所有的公司节点按照“公司-〈生产〉-产品-〈下游〉-产品-〈被生产〉-公司”这种metapath随机游走得到的trace,这里我们对所有的公司节点都进行随机游走。为了给skip_gram模型使用，我们需要对traces进行处理，得到 (target,neighbor,negative_neighbor)三元组。这里使用的负采样方法是非常naive的，只是从整个corpus里面随机选择其他的节点。这种方法会有一定概率的collision，即随机选择的其他点可能和target点有连接。但是对于比较大的图，这个影响可以忽略不计。

In [32]:
# define some hyperparameters
num_walks = 4
compId = [v for k,v in comp2id.items()]
window_size = 2
num_negative_samples = 2

In [33]:
traces, _ = random_walk(graph, compId * num_walks, metapath=['produce', 'downstream', 'producedby'])
print(traces)

# trace has type [company, product, product, company]. We map the ids back to their original name
name_traces = []
for trace in traces:
    name_trace = [(id2comp if i % 3 == 0 else id2prod)[idx] for i, idx in enumerate(trace.numpy()) 
                  if idx != -1]
    name_traces.append(name_trace)
    
name_traces

tensor([[ 0,  1, -1, -1],
        [ 1,  5,  4,  1],
        [ 2,  6,  7,  2],
        [ 0,  2,  3,  1],
        [ 1,  3,  4,  1],
        [ 2,  6,  7,  2],
        [ 0,  0,  1,  0],
        [ 1,  4,  1,  0],
        [ 2,  6,  7,  2],
        [ 0,  2,  3,  1],
        [ 1,  3,  4,  1],
        [ 2,  6,  7,  2]])


[['comp_1', 'prod1_2'],
 ['comp_2', 'prod2_3', 'prod2_2', 'comp_2'],
 ['comp_3', 'prod3_1', 'prod3_2', 'comp_3'],
 ['comp_1', 'prod1_3', 'prod2_1', 'comp_2'],
 ['comp_2', 'prod2_1', 'prod2_2', 'comp_2'],
 ['comp_3', 'prod3_1', 'prod3_2', 'comp_3'],
 ['comp_1', 'prod1_1', 'prod1_2', 'comp_1'],
 ['comp_2', 'prod2_2', 'prod1_2', 'comp_1'],
 ['comp_3', 'prod3_1', 'prod3_2', 'comp_3'],
 ['comp_1', 'prod1_3', 'prod2_1', 'comp_2'],
 ['comp_2', 'prod2_1', 'prod2_2', 'comp_2'],
 ['comp_3', 'prod3_1', 'prod3_2', 'comp_3']]

In [74]:
def naive_neg_sampling(total_num, u, v, num_neg_samples):
    if u < v:
        sample_space = np.concatenate([np.arange(0, u), np.arange(u + 1, v), np.arange(v, total_num)])
        neg_samples = np.random.choice(sample_space, 2)
    else:
        sample_space = np.concatenate([np.arange(0, v), np.arange(v + 1, u), np.arange(v, total_num)])
        neg_samples = np.random.choice(sample_space, 2)
    return neg_samples.tolist()

把游走路径转化成3元组。

In [82]:
# collect all tokens
tokenset = set()
num_tokens = 0
for trace in name_traces:
    for token in trace:
        tokenset.add(token)
num_tokens = len(tokenset)
print(num_tokens)

token2id = {token: i for i, token in enumerate(tokenset)}
id2token = {i: token for i, token in enumerate(tokenset)}

# process all traces
triple_cache = []
for trace in name_traces:
    for i, u in enumerate(trace):
        for j, v in enumerate(trace[max(i - window_size, 0): i + window_size]):
            if (i == j) or (u == v):
                continue
            uid = token2id[u]
            vid = token2id[v]
            triple_cache.append([uid] + [vid] + naive_neg_sampling(num_tokens, uid, vid, num_negative_samples))

triple_cache = np.array(triple_cache)
triple_cache[:, 1:]

11


array([[ 1,  6,  2],
       [ 3,  5,  4],
       [ 4,  6,  7],
       [ 8,  5,  5],
       [ 2,  4,  8],
       [ 8,  4,  0],
       [ 4,  0,  4],
       [ 8,  9,  4],
       [ 4,  3,  5],
       [ 2,  5,  2],
       [10,  5,  8],
       [ 0,  7,  4],
       [ 5,  4,  7],
       [ 0,  4,  3],
       [10,  6,  3],
       [ 0,  3,  9],
       [10,  3,  3],
       [ 5,  5,  2],
       [ 9,  9,  2],
       [ 3,  8,  7],
       [ 6,  7,  5],
       [ 3,  6,  5],
       [ 9,  4,  9],
       [ 8,  7,  4],
       [ 9,  5,  5],
       [ 6,  8,  2],
       [ 6,  7,  6],
       [ 8,  4,  0],
       [ 2,  7,  3],
       [ 8,  0,  4],
       [ 6,  1,  1],
       [ 8, 10, 10],
       [ 6,  9,  7],
       [ 2,  8,  2],
       [10,  5, 10],
       [ 0,  1, 10],
       [ 5,  0,  8],
       [ 0,  3,  2],
       [10,  3,  2],
       [ 0,  1, 10],
       [10,  9,  6],
       [ 5,  7,  4],
       [ 7,  4,  1],
       [ 3,  6, 10],
       [ 1,  9,  2],
       [ 3, 10,  5],
       [ 7,  9,  2],
       [ 3,  

### 模型：Skip_gram

In [77]:
class SkipGramModel(nn.Module):
    """
        u_embedding: Embedding for center word.
        v_embedding: Embedding for neighbor words.
    """
    def __init__(self, emb_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_size = emb_size
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension, sparse=True)

        initrange = 1.0 / self.emb_dimension
        init.uniform_(self.u_embeddings.weight.data, -initrange, initrange)
        init.constant_(self.v_embeddings.weight.data, 0)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.u_embeddings(pos_u)
        emb_v = self.v_embeddings(pos_v)
        emb_neg_v = self.v_embeddings(neg_v)

        score = torch.sum(torch.mul(emb_u, emb_v), dim=1)
        score = torch.clamp(score, max=10, min=-10)
        score = -F.logsigmoid(score)

        neg_score = torch.bmm(emb_neg_v, emb_u.unsqueeze(2)).squeeze()
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)

        return torch.mean(score + neg_score)

    def save_embedding(self, id2word, file_name):
        embedding = self.u_embeddings.weight.cpu().data.numpy()
        with open(file_name, 'w') as f:
            f.write('%d %d\n' % (len(id2word), self.emb_dimension))
            for wid, w in id2word.items():
                e = ' '.join(map(lambda x: str(x), embedding[wid]))
                f.write('%s %s\n' % (w, e))

### 主训练函数

In [89]:
def train(skip_gram_model,
          initial_lr,
          triples,
          epochs,
          batch_size,
          device,
         ):
    num_triples = len(triples)
    optimizer = optim.SparseAdam(list(skip_gram_model.parameters()), lr=initial_lr)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, num_triples)

    for epoch in range(epochs):
        # print("\n\n\nEpoch: " + str(epoch + 1))
        running_loss = 0.0
        
        for i in range(0, num_triples, batch_size):
            triple_batch = triples[i: i + batch_size]
            
            pos_u = torch.from_numpy(triple_batch[:, 0]).to(device)
            pos_v = torch.from_numpy(triple_batch[:, 1]).to(device)
            neg_v = torch.from_numpy(triple_batch[:, 2:]).to(device)

            scheduler.step()
            optimizer.zero_grad()
            loss = skip_gram_model.forward(pos_u, pos_v, neg_v)
            loss.backward()
            optimizer.step()

            running_loss = running_loss * 0.9 + loss.item() * 0.1
            
        print("Epoch:{}, Loss: {:06f}".format(epoch + 1, running_loss))

    skip_gram_model.save_embedding(id2token, './embeddding.csv')

### 训练

In [90]:
# define model and training hyperparameters
emb_size = num_tokens
emb_dimension = 8
batch_size = 4
epochs = 100
initial_lr = 1e-3
skip_gram_model = SkipGramModel(emb_size, emb_dimension)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
if use_cuda:
    skip_gram_model.cuda()

train(skip_gram_model, initial_lr, triple_cache, epochs, batch_size, device)

Epoch:1, Loss: 1.894531
Epoch:2, Loss: 1.893107
Epoch:3, Loss: 1.892189
Epoch:4, Loss: 1.891924
Epoch:5, Loss: 1.891845
Epoch:6, Loss: 1.891230
Epoch:7, Loss: 1.889468
Epoch:8, Loss: 1.886198
Epoch:9, Loss: 1.881736
Epoch:10, Loss: 1.877505
Epoch:11, Loss: 1.875108
Epoch:12, Loss: 1.874575
Epoch:13, Loss: 1.874240
Epoch:14, Loss: 1.872174
Epoch:15, Loss: 1.867163
Epoch:16, Loss: 1.859132
Epoch:17, Loss: 1.849731
Epoch:18, Loss: 1.842076
Epoch:19, Loss: 1.838397
Epoch:20, Loss: 1.837768
Epoch:21, Loss: 1.836858
Epoch:22, Loss: 1.832464
Epoch:23, Loss: 1.823120
Epoch:24, Loss: 1.809769
Epoch:25, Loss: 1.795894
Epoch:26, Loss: 1.785906
Epoch:27, Loss: 1.781785
Epoch:28, Loss: 1.781222
Epoch:29, Loss: 1.779425
Epoch:30, Loss: 1.772401
Epoch:31, Loss: 1.759014
Epoch:32, Loss: 1.741598
Epoch:33, Loss: 1.725156
Epoch:34, Loss: 1.714505
Epoch:35, Loss: 1.710734
Epoch:36, Loss: 1.710249
Epoch:37, Loss: 1.707377
Epoch:38, Loss: 1.697999
Epoch:39, Loss: 1.681747
Epoch:40, Loss: 1.662245
Epoch:41,