# 中文三元组联合抽取

## 介绍

在这个notebook中我们将使用openue库代码来训练我们自己的三元组联合抽取，使用的基础模型是`bert-base-chinese`，训练分为两步，首先训练关系分类模型，其次训练实体抽取模型。之后联合验证。

## 数据集

在这个数据集中，使用ske数据集，具体例子如下。我们使用代码来读取`train.json`来分析一下数据。

In [1]:
import json
with open("../dataset/ske/train.json", "r") as file:
    for line in file.readlines():
        example = json.loads(line)
        break
for k, v in example.items():
    print(f"{k}: {v}")

postag: [{'word': '如何', 'pos': 'r'}, {'word': '演', 'pos': 'v'}, {'word': '好', 'pos': 'a'}, {'word': '自己', 'pos': 'r'}, {'word': '的', 'pos': 'u'}, {'word': '角色', 'pos': 'n'}, {'word': '，', 'pos': 'w'}, {'word': '请', 'pos': 'v'}, {'word': '读', 'pos': 'v'}, {'word': '《', 'pos': 'w'}, {'word': '演员自我修养', 'pos': 'nw'}, {'word': '》', 'pos': 'w'}, {'word': '《', 'pos': 'w'}, {'word': '喜剧之王', 'pos': 'nw'}, {'word': '》', 'pos': 'w'}, {'word': '周星驰', 'pos': 'nr'}, {'word': '崛起', 'pos': 'v'}, {'word': '于', 'pos': 'p'}, {'word': '穷困潦倒', 'pos': 'a'}, {'word': '之中', 'pos': 'f'}, {'word': '的', 'pos': 'u'}, {'word': '独门', 'pos': 'n'}, {'word': '秘笈', 'pos': 'n'}]
text: 如何演好自己的角色，请读《演员自我修养》《喜剧之王》周星驰崛起于穷困潦倒之中的独门秘笈
spo_list: [{'predicate': '主演', 'object_type': '人物', 'subject_type': '影视作品', 'object': '周星驰', 'subject': '喜剧之王'}]


# 训练

## `seq model`关系分类模型

如我们的模型图所示，我们需要先训练一个关系分类模型，识别出句子中实体的属性。

<div  align="center">
    <img src="./imgs/architecture.png" width = "600" height = "400" alt="图片名称" align=center />
</div>


In [2]:
import argparse
import importlib

import numpy as np
import torch
import pytorch_lightning as pl
import openue.lit_models as lit_models
import yaml
import time
from transformers import AutoConfig
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

error: Error -5 while decompressing data: incomplete or truncated stream

In [None]:
# 设置一些参数和动态调用包
def _import_class(module_and_class_name: str) -> type:
    module_name, class_name = module_and_class_name.rsplit(".", 1)
    module = importlib.import_module(module_name)
    class_ = getattr(module, class_name)
	
    return class_


def _setup_parser():
    """Set up Python's ArgumentParser with data, model, trainer, and other arguments."""
    parser = argparse.ArgumentParser(add_help=False)

    # Add Trainer specific arguments, such as --max_epochs, --gpus, --precision
    # trainer_parser = pl.Trainer.add_argparse_args(parser)
    # trainer_parser._action_groups[1].title = "Trainer Args"  # pylint: disable=protected-access
    # parser = argparse.ArgumentParser(add_help=False, parents=[trainer_parser])

    # Basic arguments
    parser.add_argument("--wandb", action="store_true", default=False)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--litmodel_class", type=str, default="SEQLitModel")
    parser.add_argument("--data_class", type=str, default="REDataset")
    parser.add_argument("--model_class", type=str, default="BertForRelationClassification")
    parser.add_argument("--load_checkpoint", type=str, default=None)

    # Get the data and model classes, so that we can add their specific arguments
    temp_args, _ = parser.parse_known_args()
    data_class = _import_class(f"openue.data.{temp_args.data_class}")
    model_class = _import_class(f"openue.models.{temp_args.model_class}")

    # Get data, model, and LitModel specific arguments
    data_group = parser.add_argument_group("Data Args")
    data_class.add_to_argparse(data_group)

    model_group = parser.add_argument_group("Model Args")
    model_class.add_to_argparse(model_group)

    lit_model_group = parser.add_argument_group("LitModel Args")
    lit_models.BaseLitModel.add_to_argparse(lit_model_group)

    parser.add_argument("--help", "-h", action="help")
    return parser

In [None]:
parser = _setup_parser()
args = parser.parse_args(args=[])

path = "../config/2021-09-09/20:28:29.yaml"
# 使用config.yaml 载入超参设置
opt = vars(args)
args = yaml.load(open(path))
opt.update(args)
args = opt
class Config(dict):
    def __getattr__(self, name):
        return self.get(name)

    def __setattr__(self, name, val):
        self[name] = val

args = Config(args)


np.random.seed(args.seed)
torch.manual_seed(args.seed)
data_class = _import_class(f"openue.data.{args.data_class}")
model_class = _import_class(f"openue.models.{args.model_class}")
litmodel_class = _import_class(f"openue.lit_models.{args.litmodel_class}")

data = data_class(args)

lit_model = litmodel_class(args=args, data_config=data.get_config())



logger = pl.loggers.TensorBoardLogger("training/logs")
if args.wandb:
    logger = pl.loggers.WandbLogger(project="dialogue_pl")
    logger.log_hyperparams(vars(args))

early_callback = pl.callbacks.EarlyStopping(monitor="Eval/f1", mode="max", patience=5)
model_checkpoint = pl.callbacks.ModelCheckpoint(monitor="Eval/f1", mode="max",
    filename='{epoch}-{Eval/f1:.2f}',
    dirpath="output",
    save_weights_only=True
)


callbacks = [early_callback, model_checkpoint]

trainer = pl.Trainer.from_argparse_args(args, callbacks=callbacks, logger=logger, default_root_dir="training/logs")

trainer.fit(lit_model, datamodule=data)

trainer.test(lit_model, datamodule=data)

  args = yaml.load(open(path))
404 Client Error: Not Found for url: https://huggingface.co/None/resolve/main/config.json


OSError: Can't load config for 'None'. Make sure that:

- 'None' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'None' is the correct path to a directory containing a config.json file



In [None]:
args.model_name_or_path

'bert-base-chinese'

In [None]:
type(args.model_name_or_path)

str