# CLUE-CLUENER 细粒度命名实体识别

本数据是在清华大学开源的文本分类数据集THUCTC基础上，选出部分数据进行细粒度命名实体标注，原数据来源于Sina News RSS.

训练集：10748 验证集：1343

标签类别：
数据分为10个标签类别，分别为: 地址（address），书名（book），公司（company），游戏（game），政府（government），电影（movie），姓名（name），组织机构（organization），职位（position），景点（scene）

数据下载地址：https://github.com/CLUEbenchmark/CLUENER2020

排行榜地址：https://cluebenchmarks.com/ner.html

span 79.530 (seed=42,no_kd,no_augs) 80.207 (seed=8864, roberta-wwm-large-ext)
42/2augs: 77.771

enable_kd: 79.254

roberta-wwm-large-ext-chinese: 80.293 focalloss1.5: 79.824 / 2augs|CrossEntropyLoss: 79.590

pn: 78.751

2020-11-16 13:39:54.678 | INFO     | theta.modeling.ner_utils:ner_evaluate:1083 - total_right: 2499, total_preds: 2908, total_targets: 2982

2020-11-16 13:39:54.680 | WARNING  | theta.modeling.ner_utils:ner_evaluate:1090 - Micro: P: 0.853501, R: 0.842357, F1: 0.835674

2020-11-16 13:39:54.681 | WARNING  | theta.modeling.ner_utils:ner_evaluate:1096 - Macro: P: 0.859354, R: 0.838028, F1: 0.848557


|模型|线上效果f1|
|------|------:|
|Bert-base|78.82|
|RoBERTa-wwm-large-ext|80.42|
|Bi-Lstm + CRF|70.00|

In [1]:
import sys
sys.path.append('../../..')

In [2]:
import os, json, re
import numpy as np
import pandas as pd
from loguru import logger
from tqdm import tqdm

## 模型IO定义

通常将本节代码写入cluener.py文件中，这也是开发者主要要编写的代码。

In [3]:
# Entity categories for the CLUENER task; handed to the NER app as its label set.
ner_labels = [
    'address',
    'book',
    'company',
    'game',
    'government',
    'movie',
    'name',
    'organization',
    'position',
    'scene',
]
# No entity-to-entity connections are declared for this task.
ner_connections = []

In [4]:
def clean_text(text):
    """Return ``text`` with leading and trailing whitespace removed.

    Falsy inputs (``None``, the empty string) are handed back unchanged.
    """
    return text.strip() if text else text

def cluener_data_generator(train_file, desc=""):
    """Yield one labelled sample per line of a CLUENER JSON-lines file.

    Args:
        train_file: Path to a file holding one JSON object per line. Each
            object carries a ``text`` string and a ``label`` mapping shaped
            like ``{category: {mention: [[start, end], ...]}}`` where ``end``
            is inclusive.
        desc: Caption shown on the tqdm progress bar.

    Yields:
        ``(guid, text, None, tags)`` where ``guid`` is the zero-based line
        index rendered as a string and ``tags`` is a list of
        ``{'category', 'start', 'mention'}`` dicts.
    """
    # Read inside a context manager so the handle is closed promptly, and pin
    # the encoding so Chinese text does not depend on the platform default.
    with open(train_file, encoding='utf-8') as reader:
        lines = reader.readlines()

    for i, line in enumerate(tqdm(lines, desc=desc)):
        line = line.strip()
        if not line:
            # Blank lines carry no sample; skip them instead of failing in
            # json.loads.
            continue
        json_data = json.loads(line)
        # NOTE(review): offsets in ``label`` are applied to the stripped text,
        # which assumes the raw text has no leading whitespace — confirm.
        text = clean_text(json_data['text'])

        tags = []
        for category, mentions in json_data['label'].items():
            for spans in mentions.values():
                # A mention may occur several times in one sentence; emit a
                # tag for every occurrence rather than only the first one.
                for x0, x1 in spans:
                    tags.append({
                        'category': category,
                        'start': x0,
                        'mention': text[x0:x1 + 1],
                    })
        yield f"{i}", text, None, tags

### 定义训练集/验证集数据生成器
要点是每条样本返回(yield)(guid, text, None, tags)元组。
验证集数据生成器不是必须的，当未提供时，Theta自动从train_data_generator生成的数据集中切分训练集和验证集。

In [5]:
def train_data_generator(train_file):
    """Yield training samples: the training file first, then ``data/dev.json``.

    Args:
        train_file: Path of the training file; ``None`` selects
            ``data/train.json``.

    Yields:
        ``(guid, text, None, tags)`` tuples as produced by
        ``cluener_data_generator``.
    """
    primary = 'data/train.json' if train_file is None else train_file

    # The dev split is always appended after the training split.
    # NOTE(review): guids are per-file line indices, so they repeat between
    # the two sources — confirm downstream code does not need unique guids.
    sources = ((primary, "Train data"), ('data/dev.json', "Eval data"))
    for path, caption in sources:
        for sample in cluener_data_generator(path, desc=caption):
            guid, text, _, tags = sample
            yield guid, text, None, tags
        
def eval_data_generator(eval_file):
    """Yield validation samples from ``eval_file``.

    Args:
        eval_file: Path of the validation file; ``None`` selects
            ``data/dev.json``.

    Yields:
        ``(guid, text, None, tags)`` tuples as produced by
        ``cluener_data_generator``.
    """
    path = 'data/dev.json' if eval_file is None else eval_file
    for sample in cluener_data_generator(path, desc="Eval data"):
        guid, text, _, tags = sample
        yield guid, text, None, tags

### 定义测试集数据生成器
要点是每条样本返回(guid, text, None, None)元组。

In [6]:
def test_data_generator(test_file):
    """Yield unlabelled samples from a CLUENER JSON-lines test file.

    Args:
        test_file: Path of the test file; ``None`` selects ``data/test.json``.
            Each line is a JSON object with a ``text`` field.

    Yields:
        ``(guid, text, None, None)`` where ``guid`` is the zero-based line
        index rendered as a string.
    """
    if test_file is None:
        test_file = 'data/test.json'

    # Close the handle deterministically and read with an explicit encoding
    # so the Chinese text does not depend on the platform default.
    with open(test_file, encoding='utf-8') as reader:
        lines = reader.readlines()

    for i, line in enumerate(tqdm(lines, desc="Test data: ")):
        line = line.strip()
        if not line:
            # Blank lines carry no sample; skip them instead of failing in
            # json.loads.
            continue
        json_data = json.loads(line)
        yield f"{i}", clean_text(json_data['text']), None, None

### 定义提交结果文件生成函数
完成训练、推理后生成reviews_file标准格式输出文件，在此处转换成需要的输出文件格式。

In [7]:
def generate_submission(args):
    """Convert the reviews file into a line-delimited JSON submission file.

    Args:
        args: Object exposing ``reviews_file`` (path of the JSON reviews
            file, a mapping of guid to ``{'text', 'tags'}``) and
            ``dataset_name`` (used to build the output file name).

    Side effects:
        Writes ``./submissions/{dataset_name}_predict.json`` with one JSON
        object per line: ``{'id', 'text', 'label'}``, where ``label`` maps
        category -> mention -> list of ``[start, end]`` pairs (``end``
        inclusive). The output directory is created when missing.
    """
    with open(args.reviews_file, 'r', encoding='utf-8') as rd:
        reviews = json.load(rd)

    submission_file = f"./submissions/{args.dataset_name}_predict.json"
    # Make sure the target directory exists before opening the file.
    os.makedirs(os.path.dirname(submission_file), exist_ok=True)

    test_results = []
    for guid, json_data in tqdm(reviews.items()):
        text = json_data['text']

        classes = {}
        for json_entity in json_data['tags']:
            s = json_entity['start']
            m = json_entity['mention']
            # Group spans by category, then by mention text.
            spans = classes.setdefault(json_entity['category'],
                                       {}).setdefault(m, [])
            # End offset is inclusive, hence the -1.
            spans.append([s, s + len(m) - 1])
        test_results.append({'id': guid, 'text': text, 'label': classes})

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
    # pinned instead of relying on the platform default.
    with open(submission_file, 'w', encoding='utf-8') as wt:
        for line in test_results:
            wt.write(f"{json.dumps(line, ensure_ascii=False)}\n")

    logger.info(f"Saved {len(reviews)} lines in {submission_file}")

## 定义主应用程序
此处通常无需修改。

In [8]:
# -------------------- NerApp --------------------
from theta.modeling.app import NerApp


class MyApp(NerApp):
    """NerApp subclass whose ``run`` can stop early in data-preparation mode."""

    def __init__(self,
                 experiment_params,
                 ner_labels: list,
                 ner_connections: list,
                 add_special_args=None):
        """Forward every construction argument to ``NerApp`` unchanged."""
        super().__init__(experiment_params, ner_labels, ner_connections,
                         add_special_args)

    def run(
        self,
        train_data_generator,
        test_data_generator,
        generate_submission=None,
        eval_data_generator=None,
    ):
        """Log and stop when data preparation is requested, else run NerApp.

        The four arguments are passed positionally, in the same order, to
        ``NerApp.run`` when the data-preparation flag is not set.
        """
        # The attribute spelling mirrors the ``--preapre_data`` command-line
        # flag, so it must stay as is.
        if self.args.preapre_data:
            logger.info("Prepare data.")
            return

        super().run(train_data_generator, test_data_generator,
                    generate_submission, eval_data_generator)

## 主入口函数
提供缺省的主入口函数，通常无需修改即可正常运行。

其中add_special_args()函数可以加入自己需要的命令行参数定义，实现自定义的控制。

In [9]:
# -------------------- Main --------------------
def main():
    """Build the NER application and run it with this file's generators."""

    # -------- Customized arguments --------
    def add_special_args(parser):
        """Register task-specific command-line flags on ``parser``."""
        # The flag keeps its historical spelling: MyApp.run reads
        # ``args.preapre_data``, so renaming it would break that lookup.
        # Only the user-facing help text is corrected.
        parser.add_argument("--preapre_data",
                            action='store_true',
                            help="Prepare data.")
        return parser

    app = MyApp(experiment_params,
                ner_labels=ner_labels,
                ner_connections=ner_connections,
                add_special_args=add_special_args)

    # No dedicated eval generator is supplied here. train_data_generator
    # already yields data/dev.json after the training file.
    # NOTE(review): passing eval_data_generator=eval_data_generator instead
    # would evaluate on samples that are also in the training stream unless
    # train_data_generator is changed — confirm before switching.
    app.run(train_data_generator,
            test_data_generator,
            generate_submission=generate_submission,
            eval_data_generator=None)

## 模型参数定义

In [12]:
# -------------------- Params --------------------
from theta.modeling import NerAppParams
# Parameter container; the constants below are copied onto it further down.
experiment_params = NerAppParams()

# Edit the parameters in this section
# ----------------------------------------
# 8438bcf6
# a094ef08 ENABLE_KD=True
# NOTE(review): the two hex strings above look like experiment ids — confirm.
LR = 2e-5  # stored as 'learning_rate'
ADAM_EPS = 1e-8  # stored as 'adam_epsilon'
N_AUGS = 2  # stored as 'num_augments'
N_EPOCHS = 6  # stored as 'num_train_epochs'
MAX_SEQ_LENGTH = 64  # used for both train and eval max sequence length
BATCH_SIZE = 32  # used for the train, eval and predict per-GPU batch sizes
# presumably leaves room for two special tokens — TODO confirm
SEG_LEN = MAX_SEQ_LENGTH - 2
SEG_BACKOFF = 0
ENABLE_KD = False
# Active pretrained model; the commented line below is the alternative path.
MODEL_PATH = "/opt/share/pretrained/pytorch/bert-base-chinese"
#MODEL_PATH = "/opt/share/pretrained/pytorch/roberta-wwm-large-ext-chinese"
CONFIDENCE = 0.0
LOSS_TYPE = "CrossEntropyLoss"
FOCALLOSS_GAMMA = 1.5  # stored as 'focalloss_gamma'
ALLOW_OVERLAP = False
NER_TYPE = "span"  # stored in the NER-specific params as 'ner_type'
SOFT_LABEL = False  # stored in the NER-specific params as 'soft_label'
ENABLE_NESTED_ENTITIES = False
FP16 = True
CC = None

SEED = 42
FOLD = 0

# ----------------------------------------
# 9488ff16
# LR = 1e-4
# ADAM_EPS = 1e-6
# N_AUGS = 0
# N_EPOCHS = 6
# MAX_SEQ_LENGTH = 64
# BATCH_SIZE = 32
# SEG_LEN = MAX_SEQ_LENGTH - 2
# SEG_BACKOFF = 0
# ENABLE_KD = False
# MODEL_PATH = "/opt/share/pretrained/pytorch/bert-base-chinese"
# CONFIDENCE = 0.35
# LOSS_TYPE = "CrossEntropyLoss"
# FOCALLOSS_GAMMA = 2.0
# ALLOW_OVERLAP = False
# NER_TYPE = "pn"
# SOFT_LABEL = False
# ENABLE_NESTED_ENTITIES = False
# FP16 = True
# CC = None

# SEED = 8864
# FOLD = 0

# ----------------------------------------
# 以下无需修改

# Values copied onto experiment_params.common_params.
conf_common_params = dict(
    dataset_name="cluener",
    experiment_name="CLUE",
    learning_rate=LR,
    adam_epsilon=ADAM_EPS,
    fold=FOLD,
    num_augments=N_AUGS,
    enable_kd=ENABLE_KD,
    num_train_epochs=N_EPOCHS,
    train_max_seq_length=MAX_SEQ_LENGTH,
    eval_max_seq_length=MAX_SEQ_LENGTH,
    per_gpu_train_batch_size=BATCH_SIZE,
    per_gpu_eval_batch_size=BATCH_SIZE,
    per_gpu_predict_batch_size=BATCH_SIZE,
    seg_len=SEG_LEN,
    seg_backoff=SEG_BACKOFF,
    model_path=MODEL_PATH,
    confidence=CONFIDENCE,
    loss_type=LOSS_TYPE,
    focalloss_gamma=FOCALLOSS_GAMMA,
    allow_overlap=ALLOW_OVERLAP,
    enable_nested_entities=ENABLE_NESTED_ENTITIES,
    fp16=FP16,
    cc=CC,
    seed=SEED,
)
# Values copied onto experiment_params.ner_params.
conf_ner_params = dict(ner_type=NER_TYPE, soft_label=SOFT_LABEL)

# Apply each override table to its parameter section, common section first.
for section, overrides in (
        ('common_params', conf_common_params),
        ('ner_params', conf_ner_params),
):
    for k, v in overrides.items():
        setattr(getattr(experiment_params, section), k, v)

experiment_params.debug()

2020-12-17 03:08:31.845 | DEBUG    | theta.modeling.utils:debug:26 - adam_epsilon: 1e-08
2020-12-17 03:08:31.848 | DEBUG    | theta.modeling.utils:debug:26 - allow_overlap: False
2020-12-17 03:08:31.849 | DEBUG    | theta.modeling.utils:debug:26 - artifact_path: None
2020-12-17 03:08:31.850 | DEBUG    | theta.modeling.utils:debug:26 - aug_train_only: False
2020-12-17 03:08:31.851 | DEBUG    | theta.modeling.utils:debug:26 - best_index: f1
2020-12-17 03:08:31.852 | DEBUG    | theta.modeling.utils:debug:26 - brat_data_dir: None
2020-12-17 03:08:31.853 | DEBUG    | theta.modeling.utils:debug:26 - cc: None
2020-12-17 03:08:31.854 | DEBUG    | theta.modeling.utils:debug:26 - confidence: 0.0
2020-12-17 03:08:31.855 | DEBUG    | theta.modeling.utils:debug:26 - dataset_name: cluener
2020-12-17 03:08:31.856 | DEBUG    | theta.modeling.utils:debug:26 - diceloss_weight: None
2020-12-17 03:08:31.857 | DEBUG    | theta.modeling.utils:debug:26 - emotion_words_file: None
2020-12-17 03:08:31.858 | DEB

## 新版本实验

In [None]:
!pip install -i https://pypi.tuna.tsinghua.edu.cn/simple msgpack msgpack_numpy

In [18]:
from theta.nlp.dataflow import EntityDataFlow
from theta.nlp.taggers import PointerSequenceTagger

KeyError: 'base'

## 启动实验

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting msgpack
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/8e/e8/a4b64266a1fb99190a3a03c999395108ef2fde5d912343b2a6cf435d59af/msgpack-1.0.1-cp37-cp37m-manylinux2010_x86_64.whl (273 kB)
[K     |████████████████████████████████| 273 kB 118 kB/s eta 0:00:01
[?25hCollecting msgpack_numpy
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/19/05/05b8d7c69c6abb36a34325cc3150089bdafc359f0a81fb998d93c5d5c737/msgpack_numpy-0.4.7.1-py2.py3-none-any.whl (6.7 kB)
Installing collected packages: msgpack, msgpack-numpy
Successfully installed msgpack-1.0.1 msgpack-numpy-0.4.7.1
You should consider upgrading via the '/usr/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [17]:
import sys
# Replace the process arguments with a single --do_experiment flag (keeping
# the program name), then launch the application.
sys.argv = [sys.argv[0], '--do_experiment']
main()

2020-12-17 03:13:42.232 | INFO     | theta.modeling.app:__init__:278 - args: Namespace(adam_epsilon=1e-08, allow_overlap=False, app_type='ner', artifact_path=None, aug_train_only=False, best_index='f1', best_model_path='./outputs/latest/best', brat_data_dir=None, cache_dir=None, cache_features=False, cc=None, confidence=0.0, data_dir=None, dataset_file=None, dataset_name='cluener', diceloss_weight=None, do_eda=False, do_eval=False, do_experiment=True, do_lower_case=False, do_new=False, do_predict=False, do_submit=False, do_train=False, emotion_words_file=None, enable_kd=False, enable_nested_entities=False, enable_sda=False, eval_all_checkpoints=False, eval_file=None, eval_max_seq_length=64, evaluate_during_training=False, experiment_id=None, experiment_name='CLUE', experiments_dir='./experiments', focalloss_alpha=None, focalloss_gamma=1.5, fold=0, fp16=True, fp16_opt_level='O1', generate_submission=False, gradient_accumulation_steps=1, ignore_categories=None, is_english=False, kd_coeff

KeyboardInterrupt: 

In [None]:
from theta.modeling import ner_evaluate
# Score a saved reviews file using the samples from eval_data_generator.
dev_file = None
reviews_file = "outputs/latest/cluener_reviews_a094ef08.json"
metrics = ner_evaluate(dev_file, reviews_file, eval_data_generator)
# Six values are returned: three macro figures followed by three micro ones.
(macro_acc, macro_recall, macro_f1,
 micro_acc, micro_recall, micro_f1) = metrics
