## 题目：利用卷积神经网络抽取实体间关系
* 参考论文：Zeng et al., "Relation Classification via Convolutional Deep Neural Network", COLING 2014
* 参考框架、工具：Python 3.x、PyTorch 1.x
* 作业难度较大
* 使用技术手段：word2vec、CNN、MaxPooling
* 推荐数据集：关系抽取数据集 SemEval-2010 Task 8（也可以使用任意其他关系抽取数据集完成）
* 推荐词向量：GloVe
* 本项目包含：1.数据集构造（预处理）、2. 模型搭建和训练、3. 模型测试和展示

## 参考代码实现

### 导入依赖包和设置需求路径

In [1]:
import torch.utils.data as data
from torch import nn, optim
from tqdm import tqdm
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import os, json
import torch
import sys

In [2]:
# Locations of the dataset, vocabulary, word vectors and checkpoint.
root_path = "datasets"
# NOTE: despite its name, `rel2id` is the *path* of the relation -> id JSON
# file; REDataset loads the mapping itself.
rel2id = os.path.join(root_path, 'data/semeval/semeval_rel2id.json')
train_path = os.path.join(root_path, 'data/semeval/semeval_train.txt')  # training split
val_path = os.path.join(root_path, 'data/semeval/semeval_val.txt')  # validation split
test_path = os.path.join(root_path, 'data/semeval/semeval_test.txt')  # test split
# word -> id vocabulary; opened with a context manager so the handle is closed.
with open(os.path.join(root_path, 'glove/glove_word2id.json')) as _word2id_file:
    word2id = json.load(_word2id_file)
word2vec = os.path.join(root_path, 'glove/glove_mat.npy')  # path of the word-vector matrix (.npy)
ckpt = 'best_acc.pth.tar'  # file the best model parameters are saved to

### 数据集预处理

In [3]:
# Hyperparameter settings
batch_size = 32  # samples per batch for all three DataLoaders
epochs = 20  # number of passes over the training set in train()
lr = 0.1  # learning rate passed to optim.SGD
weight_decay = 1e-5  # weight decay passed to optim.SGD

In [4]:
class REDataset(data.Dataset):
    """Relation-extraction dataset producing word ids and relative-position ids.

    Every non-empty line of the data file is parsed as one Python literal
    (a dict). The code reads the keys ``'token'``, ``'relation'``,
    ``['h']['pos']`` and ``['t']['pos']`` from each parsed item.
    """

    def __init__(self, path, rel2id_path, word2id, max_length=40):
        """Load the relation mapping and all samples into memory.

        Args:
            path: Data file with one sample per line (UTF-8).
            rel2id_path: JSON file mapping relation name -> integer id.
            word2id: Mapping token -> id; must contain ``'[UNK]'`` and
                ``'[PAD]'`` whenever an unknown token or padding is needed.
            max_length: Fixed sequence length every sample is padded or
                truncated to.

        Raises:
            ValueError, SyntaxError: If a data line is not a valid Python
                literal.
        """
        # Local import keeps this block self-contained.
        import ast

        super().__init__()
        self.path = path
        # Context managers guarantee both files are closed, even on error.
        with open(rel2id_path) as rel_file:
            self.rel2id = json.load(rel_file)
        self.id2rel = {v: k for k, v in self.rel2id.items()}
        with open(path, encoding='utf-8') as f:
            stripped = (line.rstrip() for line in f)
            # SECURITY: the original used eval() here, which executes arbitrary
            # code found in the data file. literal_eval accepts the same
            # literal dicts but refuses anything that is not a plain literal.
            self.data = [ast.literal_eval(line) for line in stripped if line]
        self.word2id = word2id
        self.max_length = max_length

    def __len__(self):
        """Return the number of loaded samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``[label_id, token_ids, pos1, pos2]`` for one sample."""
        item = self.data[index]
        seq = list(self.tokenize(item['token'], item['h']['pos'], item['t']['pos']))
        return [self.rel2id[item['relation']]] + seq  # label, seq1, seq2, ...

    def tokenize(self, raw_tokens, pos_head, pos_tail):
        """Convert tokens to ids and build relative-position ids.

        Args:
            raw_tokens: Sequence of token strings.
            pos_head: Indexable whose first element is the head entity's
                start token index.
            pos_tail: Same, for the tail entity.

        Returns:
            Tuple ``(token_ids, pos1, pos2)`` of int64 tensors, each of
            length ``max_length``. Position ids are
            ``i - entity_start + max_length``, which keeps them non-negative.
        """
        # token -> id, lower-cased; unknown tokens map to '[UNK]'.
        indexed_tokens = [
            self.word2id[tok] if tok in self.word2id else self.word2id['[UNK]']
            for tok in (raw.lower() for raw in raw_tokens)
        ]

        # Pad up to max_length (looking up '[PAD]' only when needed), then
        # truncate anything longer.
        missing = self.max_length - len(indexed_tokens)
        if missing > 0:
            indexed_tokens.extend([self.word2id['[PAD]']] * missing)
        indexed_tokens = indexed_tokens[:self.max_length]

        # Entity starts are clamped to max_length so the shifted offsets stay
        # inside [0, 2 * max_length - 1].
        head_start = min(self.max_length, pos_head[0])
        tail_start = min(self.max_length, pos_tail[0])
        offsets = torch.arange(self.max_length, dtype=torch.long)
        pos1 = offsets - head_start + self.max_length
        pos2 = offsets - tail_start + self.max_length
        indexed_tokens = torch.tensor(indexed_tokens, dtype=torch.long)

        return indexed_tokens, pos1, pos2

    @staticmethod
    def collate_fn(data):
        """Batch samples produced by ``__getitem__``.

        Args:
            data: List of ``[label, seq1, seq2, ...]`` samples.

        Returns:
            ``[labels, seq1_batch, seq2_batch, ...]`` where ``labels`` is an
            int64 tensor and each sequence batch is padded with 0 to a common
            length (batch first).
        """
        labels, *seqs = zip(*data)
        batch_labels = torch.tensor(labels).long()
        batch_seqs = [
            pad_sequence(seq, batch_first=True, padding_value=0) for seq in seqs
        ]
        return [batch_labels] + batch_seqs

In [5]:
# Build the datasets and the DataLoaders needed for training and evaluation.
train_dataset, val_dataset, test_dataset = (
    REDataset(path=split_path, rel2id_path=rel2id, word2id=word2id)
    for split_path in (train_path, val_path, test_path)
)


def _make_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader using the shared batching settings."""
    return data.DataLoader(dataset=dataset,
                           batch_size=batch_size,
                           shuffle=shuffle,
                           num_workers=0,
                           collate_fn=REDataset.collate_fn)


# Only the training split is shuffled.
train_loader = _make_loader(train_dataset, shuffle=True)
val_loader = _make_loader(val_dataset, shuffle=False)
test_loader = _make_loader(test_dataset, shuffle=False)

### 创建模型

In [6]:
# Word + position embedding module
class Embedding(nn.Module):
    """Concatenate word vectors with two position embeddings per token."""

    def __init__(self, word_vec_mat, max_length, word_embedding_dim=50, pos_embedding_dim=5):
        """Create the three lookup tables.

        Args:
            word_vec_mat: Path of a ``.npy`` file holding the word-vector
                matrix; its first dimension is used as the vocabulary size.
            max_length: Sequence length; each position table has
                ``2 * max_length`` rows.
            word_embedding_dim: Width of a word vector.
            pos_embedding_dim: Width of each position vector.
        """
        super().__init__()

        self.max_length = max_length
        self.word_embedding_dim = word_embedding_dim
        self.pos_embedding_dim = pos_embedding_dim

        # Word embedding: initialised from the matrix on disk. The last row is
        # declared the padding index; copy_ then overwrites every row,
        # including that one, with the loaded values.
        loaded_vectors = torch.from_numpy(np.load(word_vec_mat))
        vocab_size = loaded_vectors.shape[0]
        self.word_embedding = nn.Embedding(vocab_size, self.word_embedding_dim,
                                           padding_idx=vocab_size - 1)
        self.word_embedding.weight.data.copy_(loaded_vectors)

        # Position embeddings: one table per entity, index 0 is padding.
        num_positions = 2 * max_length
        self.pos1_embedding = nn.Embedding(num_positions, pos_embedding_dim, padding_idx=0)
        self.pos2_embedding = nn.Embedding(num_positions, pos_embedding_dim, padding_idx=0)

    def forward(self, word, pos1, pos2):
        """Look up all three embeddings and join them along dimension 2."""
        parts = [
            self.word_embedding(word),
            self.pos1_embedding(pos1),
            self.pos2_embedding(pos2),
        ]
        return torch.cat(parts, 2)

In [7]:
# CNN context-encoding module
class Encoder(nn.Module):
    """Encode an embedded sentence with one 1-D convolution and max pooling."""

    def __init__(self, max_length, word_embedding_dim=50, pos_embedding_dim=5, hidden_size=230):
        """Set up the convolution and pooling layers.

        Args:
            max_length: Sequence length; also the max-pool window size.
            word_embedding_dim: Width of the word part of each input vector.
            pos_embedding_dim: Width of each of the two position parts.
            hidden_size: Number of convolution output channels.
        """
        super().__init__()

        self.max_length = max_length
        self.hidden_size = hidden_size
        # Input width = word vector plus two position vectors.
        self.embedding_dim = word_embedding_dim + 2 * pos_embedding_dim
        # Kernel 3 with padding 1 keeps the sequence length unchanged.
        self.conv = nn.Conv1d(in_channels=self.embedding_dim,
                              out_channels=self.hidden_size,
                              kernel_size=3,
                              padding=1)
        self.pool = nn.MaxPool1d(kernel_size=max_length)

    def forward(self, inputs):
        """Delegate to :meth:`cnn`."""
        return self.cnn(inputs)

    def cnn(self, inputs):
        """Apply conv -> ReLU -> max-pool and drop the pooled dimension.

        The input's dimensions 1 and 2 are swapped before the convolution.
        """
        channels_first = inputs.transpose(1, 2)
        activated = F.relu(self.conv(channels_first))
        pooled = self.pool(activated)
        return pooled.squeeze(2)  # n x hidden_size

In [8]:
# Main model
class CNNRC(nn.Module):
    """CNN relation classifier: embedding -> CNN encoder -> linear layer."""

    def __init__(self, word_vec_mat, max_length, num_classes=19):
        """Build the sub-modules.

        Args:
            word_vec_mat: Path of the word-vector ``.npy`` file, forwarded to
                ``Embedding``.
            max_length: Fixed sequence length, forwarded to ``Embedding`` and
                ``Encoder``.
            num_classes: Number of relation classes. Defaults to 19, the value
                that was previously hard-coded.
        """
        super().__init__()

        self.max_length = max_length
        self.num_classes = num_classes
        self.embed = Embedding(word_vec_mat, max_length)  # word + position embedding
        self.encoder = Encoder(max_length)  # convolutional encoder
        self.fc = nn.Linear(self.encoder.hidden_size, num_classes)  # classification layer

    def forward(self, word, pos1, pos2):
        """Return ``(logits, pred)``.

        ``logits`` is the output of the linear layer; ``pred`` holds the index
        of the largest logit in each row.
        """
        x = self.embed(word, pos1, pos2)
        x = self.encoder(x)
        x = self.fc(x)
        _, pred = torch.max(x.view(-1, self.num_classes), 1)
        return x, pred

### 模型训练

In [9]:
# Instantiate the model.
model = CNNRC(word_vec_mat=word2vec, max_length=40)
# SGD optimizer over the parameters that require gradients.
_trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = optim.SGD(_trainable_params, lr=lr, weight_decay=weight_decay)
# Cross-entropy loss.
loss_func = nn.CrossEntropyLoss()

In [10]:
def epoch(t, train=False):
    """Run one pass over the loader ``t`` and return its accuracy.

    Uses the module-level ``model``, ``loss_func`` and ``optimizer``.

    Args:
        t: Iterable of batches shaped ``[label, *model_inputs]``.
        train: When true, the model is put in training mode and the optimizer
            is stepped after every batch. Otherwise the model is put in eval
            mode and gradient tracking is disabled.

    Returns:
        Fraction of correctly classified samples over the whole pass
        (``0.0`` if the loader yields no samples).
    """
    if train:
        model.train()
    else:
        model.eval()
    loss_log = 0.0
    correct = 0
    seen = 0
    # Gradients are only tracked while training, so evaluation does not build
    # autograd graphs.
    with torch.set_grad_enabled(bool(train)):
        for iters, batch in enumerate(t):
            label = batch[0]
            args = batch[1:]
            logits, pred = model(*args)
            # loss
            loss = loss_func(logits, label)
            # Count samples rather than averaging per-batch accuracies, so a
            # shorter final batch is not over-weighted.
            correct += int((pred == label).long().sum())
            seen += label.size(0)
            # Log: running mean of batch losses and running sample accuracy.
            loss_log += loss.item()
            sys.stdout.write( "iters : {}, loss : {:.4f}, acc:{:.4f}".format(iters+1, loss_log/(iters+1), correct/max(seen, 1))+'\r')
            sys.stdout.flush()
            # Optimize
            if train:
                # Clear stale gradients before computing new ones.
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
    return correct / seen if seen else 0.0

In [11]:
def train():
    """Train for ``epochs`` epochs, keep the best checkpoint, then test it.

    After every epoch the model is evaluated on ``val_loader``; the weights
    are saved to ``ckpt`` on the first epoch and whenever validation accuracy
    improves. Before the final evaluation on ``test_loader`` the best saved
    weights are loaded back, so the reported test result belongs to the
    checkpoint that is kept on disk rather than to the last epoch.
    """
    best_acc = 0.0
    saved = False
    for i in range(epochs):
        print("\nepoch {} tarin".format(i))
        epoch(train_loader, train=True)
        print("\nepoch {} val".format(i))
        acc = epoch(val_loader)
        print("\nval result: {:.4f}".format(acc))
        sys.stdout.flush()
        # `not saved` guarantees a checkpoint exists even if accuracy is 0.0.
        if not saved or acc > best_acc:
            print("\nBest ckpt and saved.")
            torch.save({'state_dict': model.state_dict()}, ckpt)
            best_acc = acc
            saved = True
            sys.stdout.flush()

    if saved:
        # Restore the best validation weights before testing.
        model.load_state_dict(torch.load(ckpt)['state_dict'])
    print("test")
    acc = epoch(test_loader)
    print("test result: {:.4f}".format(acc))  # result on the test set

In [18]:
# Run model training followed by the test-set evaluation
train()

epoch 0 tarin
epoch 0 val, loss : 0.2430, acc:0.9366
iters : 47, loss : 1.0002, acc:0.7152
val result: 0.7152
Best ckpt and saved.
epoch 1 tarin
epoch 1 val, loss : 0.2433, acc:0.9369
iters : 47, loss : 1.0002, acc:0.7152
val result: 0.7152
epoch 2 tarin
epoch 2 val, loss : 0.2429, acc:0.9369
iters : 47, loss : 1.0002, acc:0.7152
val result: 0.7152
epoch 3 tarin
epoch 3 val, loss : 0.2428, acc:0.9369
iters : 47, loss : 1.0002, acc:0.7152
val result: 0.7152
epoch 4 tarin
epoch 4 val, loss : 0.2428, acc:0.9369
iters : 47, loss : 1.0002, acc:0.7152
val result: 0.7152
epoch 5 tarin
epoch 5 val, loss : 0.2429, acc:0.9369
iters : 47, loss : 1.0002, acc:0.7152
val result: 0.7152
epoch 6 tarin
epoch 6 val, loss : 0.2432, acc:0.9369
iters : 47, loss : 1.0002, acc:0.7152
val result: 0.7152
epoch 7 tarin
epoch 7 val, loss : 0.2446, acc:0.9363
iters : 47, loss : 1.0002, acc:0.7152
val result: 0.7152
epoch 8 tarin
epoch 8 val, loss : 0.2435, acc:0.9366
iters : 47, loss : 1.0002, acc:0.7152
val resu

### 利用训练好的模型展示

In [12]:
# Rebuild the model and restore the trained parameters.
model = CNNRC(word_vec_mat= word2vec, max_length=40)
# map_location lets a checkpoint written on another device load on CPU,
# which is where this freshly built model lives.
checkpoint = torch.load(ckpt, map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])
# The restored model is only used for inference below.
model.eval()
def demo(x, h, t):
    """Predict and print the relation between two entities in a sentence.

    Uses the module-level ``model`` and ``test_dataset`` (for tokenisation and
    the id -> relation mapping).

    Args:
        x: Sentence string; it is split on whitespace into tokens.
        h: Head entity span; ``h[0]`` and ``h[-1]`` are used as the start and
            exclusive end token indices.
        t: Tail entity span, same convention as ``h``.
    """
    ori = x
    tokens = x.strip().split()
    head, tail = tokens[h[0]:h[-1]], tokens[t[0]:t[-1]]
    token_ids, pos1, pos2 = test_dataset.tokenize(tokens, h, t)
    # Add a leading batch dimension of size 1 to every input.
    args = [token_ids.unsqueeze(0), pos1.unsqueeze(0), pos2.unsqueeze(0)]
    # Inference only: eval mode, and no autograd graph is needed.
    model.eval()
    with torch.no_grad():
        _, pred = model(*args)
    pred = int(pred[0].item())
    print("This sentense is:")
    print(ori)
    print("head(e1) :{}, tail(e2) : {}".format(head, tail))
    print("predict is:")
    print(test_dataset.id2rel[pred])

In [13]:
# Show the model's prediction on one example sentence
x = "Like magic , a covert of coots arrives to feed on whatever there is in the lake bottom ."
h = [6, 7]  # head entity span: demo() slices tokens[6:7]
t = [4, 5]  # tail entity span: demo() slices tokens[4:5]
demo(x, h, t)

This sentense is:
Like magic , a covert of coots arrives to feed on whatever there is in the lake bottom .
head(e1) :['coots'], tail(e2) : ['covert']
predict is:
Product-Producer(e2,e1)
