# 导包

In [1]:
import torch
import os,re
import pickle
from collections import defaultdict
from transformers import AutoTokenizer
import transformers
from tqdm import tqdm
import torch
import torch.nn as nn
import math
from transformers import AutoModel
# Load the tokenizer from a local checkpoint directory.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
tokenizer=AutoTokenizer.from_pretrained("/home/xhsun/Desktop/huggingfaceModels/chinese-roberta-wwm/")

# 读取知识图谱

In [2]:
# Load the knowledge graph: entity vocabulary, relation vocabulary and triples.
# All three files live in the same directory, so the path is stated once.
KG_DIR = '/home/xhsun/Desktop/KG/nlpcc2018/knowledge/small_knowledge'

# entities.dict: the first tab-separated field of each line is the entity name.
# Ids are assigned in order of first appearance. setdefault keeps the first id
# if a name is repeated, so two different entities can never share an id.
ent2id = {}
with open(os.path.join(KG_DIR, 'entities.dict'), encoding='utf-8') as f:
    lines = f.readlines()
for line in tqdm(lines):
    fields = line.strip().split('\t')
    ent2id.setdefault(fields[0].strip(), len(ent2id))

# relations.dict: "<relation name>\t<relation id>"; the id stored in the file is used.
rel2id = {}
with open(os.path.join(KG_DIR, 'relations.dict'), encoding='utf-8') as f:
    lines = f.readlines()
for line in tqdm(lines):
    fields = line.strip().split('\t')
    rel2id[fields[0].strip()] = int(fields[1])

# small_kb: one triple per line as "head|||relation|||tail" (names, not ids).
# Each triple is stored as (head id, relation id, tail id).
triples = []
with open(os.path.join(KG_DIR, 'small_kb'), encoding='utf-8') as f:
    lines = f.readlines()
for line in tqdm(lines):
    fields = line.strip().split('|||')
    s = ent2id[fields[0].strip()]
    p = rel2id[fields[1].strip()]
    o = ent2id[fields[2].strip()]
    triples.append((s, p, o))

100%|██████████████████████████████████████████████████████████████████████████████████████████| 190912/190912 [00:00<00:00, 1825708.49it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████| 3906/3906 [00:00<00:00, 1802304.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 369812/369812 [00:00<00:00, 909696.24it/s]


In [3]:
# Report the size of the loaded knowledge graph: number of triples, entities and relations.
print('知识图谱中一共有{}个三元组，{}个实体，{}个关系'.format(*map(len,(triples,ent2id,rel2id))))

知识图谱中一共有369812个三元组，190912个实体，3906个关系


- ent2id指的是每一个实体与对应的id之间的映射
- rel2id指的是每一个关系与对应的id之间的映射
- triples代表所有三元组的集合，每一个元素是一个三元组，其中头实体、尾实体和关系均用对应的id表示

In [4]:
# Print the ids of the head entity, the relation and the tail entity of one example triple.
print(ent2id['笑傲江湖'],rel2id['作者'],ent2id['金庸'])

68286 826 142061


In [5]:
# Find the position of the example triple in `triples`.
# The ids are looked up rather than hard-coded, so the cell stays correct if the
# entity/relation numbering changes.
triples.index((ent2id['笑傲江湖'],rel2id['作者'],ent2id['金庸']))

253454

可以看出，(笑傲江湖，作者，金庸)这个三元组是整个triples集合中的第253455个三元组

In [6]:
# Read back the triple stored at position 253454, the index reported by the
# triples.index(...) lookup in the previous cell.
triples[253454]

(68286, 826, 142061)

# 获取模型输入

从源代码中可以看出，模型有4个输入参数：
1. heads
2. questions
3. answers
4. entity_range

以如下句子为例：
{"question": "谁知道<解构金庸>的人号称什么啊？", "topic_entity": "解构金庸", "answer": "武林百晓生”", "relation": "号称"}

In [23]:
# Build the model inputs for one example question.
question="谁知道<解构金庸>的人号称什么啊？"
head='解构金庸'
# Replace the "<topic entity>" mention with the placeholder "NE".
question=question.replace('<'+head+'>','NE')
print(question)
# Kept exactly as in the example record, including the trailing ” character.
answer='武林百晓生”'

# Entity ids come from the knowledge-graph vocabulary (ent2id), not from the tokenizer vocabulary.
head_id=ent2id[head]
answer_id=ent2id[answer]
# Plain-list encoding, used only for display below.
question_encoding=tokenizer(question)
print('问题中的topic entity是"{}"，它在实体集合中的id是{}'.format(head,head_id))
print('该问题的答案是"{}"，它在实体集合中的id是{}'.format(answer,answer_id))
print('将问题中的每一个单词转换为对应的id：\n',question_encoding)

# Batched tensors (batch size 1) for the encoder. The tokenizer builds them directly,
# so the list encoding is not overwritten in place with values of a different type.
question_ids=tokenizer(question,return_tensors="pt")

谁知道NE的人号称什么啊？
问题中的topic entity是"解构金庸"，它在实体集合中的id是12882
该问题的答案是"武林百晓生”"，它在实体集合中的id是51665
将问题中的每一个单词转换为对应的id：
 {'input_ids': [101, 6443, 4761, 6887, 10564, 4638, 782, 1384, 4917, 784, 720, 1557, 8043, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


**问题中单词的id是vocab.txt中的id，与知识图谱中实体的id无关**

# 模型的前向传播

## 模型的参数

- Encoder 就是BERT，维度是768
- $f^t$ 代表第$t$个step的projection function；论文中总步数$T$设置为2，所以共有两个映射层，每一个映射层由一个线性层(Linear)+Tanh组成
- 关系分类器 就是一个普通的线性分类器，用来预测KG中哪些关系与问题最相关

In [39]:
# Encoder: pretrained checkpoint loaded from a local directory.
encoder=AutoModel.from_pretrained("/home/xhsun/Desktop/huggingfaceModels/chinese-roberta-wwm/",return_dict=True)
# Derive the layer sizes from the loaded objects instead of hard-coding 768 / 3906.
hidden_size=encoder.config.hidden_size
# One output score per relation; assumes relation ids are 0..len(rel2id)-1 — TODO confirm.
num_relations=len(rel2id)
# One projection (Linear + Tanh) per reasoning step.
step_1_projection=nn.Sequential(nn.Linear(hidden_size,hidden_size),nn.Tanh())
step_2_projection=nn.Sequential(nn.Linear(hidden_size,hidden_size),nn.Tanh())
# Linear classifier that maps a question vector to one score per KG relation.
relation_classifier=nn.Linear(hidden_size,num_relations)

Some weights of the model checkpoint at /home/xhsun/Desktop/huggingfaceModels/chinese-roberta-wwm/ were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## 计算关于问题的attention向量$q^t$
![image.png](attachment:image.png)

In [31]:
# Run the encoder once on the tokenized question.
bert_output=encoder(**question_ids)
# Pooled sentence-level output of the encoder.
q_embeddings=bert_output.pooler_output
# Per-token hidden states of the last layer.
context_embeddings=bert_output.last_hidden_state

In [33]:
# Show the shapes of the pooled sentence embedding and of the per-token embeddings.
print("句子嵌入：",q_embeddings.size(),"上下文嵌入：",context_embeddings.size())

句子嵌入： torch.Size([1, 768]) 上下文嵌入： torch.Size([1, 14, 768])


In [37]:
# Step-1 query: project the pooled question embedding.
qk_1=step_1_projection(q_embeddings)
# Dot product between the query and every token embedding (sum over the hidden dimension).
attention_logits_1=(qk_1.unsqueeze(1)*context_embeddings).sum(dim=2)
# Normalise over the token dimension to get the attention distribution.
b_1=torch.softmax(attention_logits_1,dim=1)
# Attention-weighted sum of the token embeddings gives the step-1 question vector.
q_1=(b_1.unsqueeze(2)*context_embeddings).sum(dim=1)

**$q^t$是问题感知(question-aware)的语义向量，它关注了问题的不同部分，即蕴含着问题所要询问的信息**

## 根据$q^t$计算关系分数(也就是计算KG中哪一个关系与该问题最相关)

![image.png](attachment:image.png)
![image-2.png](attachment:image-2.png)
![image-3.png](attachment:image-3.png)

In [40]:
# Score every relation independently: linear classifier followed by an element-wise sigmoid.
p_1=relation_classifier(q_1).sigmoid()

## 利用p_1实现hop

### TransferNet infers the answer by transferring entity scores along relation scores of multiple steps.
1. It **starts from the topic entity of the question** and maintains an entity score vector, whose elements indicate the probability of an entity being activated.
2. At each step, it **attends to some question words and computes scores for the relations** in the graph.
3. We formulate these relation scores into an adjacency matrix, where **each entry indicates the transfer probability of an entity pair**.
4. By **multiplying the entity score vector with the relation score matrix**, we can “hop” along relations.

In [49]:
print("初始时刻，从问题中的topic entity出发，head_id = {}".format(head_id))
# Initial entity score vector: 1 at the topic entity, 0 everywhere else.
# one_hot returns an int64 tensor. It is cast to float because the vector holds
# activation probabilities and is combined with float relation scores when hopping.
last_e=torch.nn.functional.one_hot(torch.LongTensor([head_id]),num_classes=len(ent2id)).float()

初始时刻，从问题中的topic entity出发，head_id = 12882


In [43]:
# Number of entities; this is the num_classes used for the one-hot entity vector.
len(ent2id)

190912

3906

In [48]:
# Small illustration of one_hot: index 2 out of 10 classes.
torch.nn.functional.one_hot(torch.LongTensor([2]),num_classes=10)

tensor([[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

In [20]:
# Inspect the tokenizer output that is passed to the encoder.
question_ids

{'input_ids': tensor([[  101,  6443,  4761,  6887, 10564,  4638,   782,  1384,  4917,   784,
           720,  1557,  8043,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [21]:
# Inspect the per-token embeddings.
# NOTE(review): the recorded output of this cell is the string 'pooler_output', not a tensor;
# it looks stale (out-of-order execution). Re-run after a kernel restart to confirm.
context_embeddings

'pooler_output'

In [20]:
# Look up the answer without the trailing ” character.
# In the recorded run this raised KeyError, while the spelling with the trailing ” resolved to an id.
ent2id['武林百晓生']

KeyError: '武林百晓生'