# lstm_crf_study

## 导包

In [2]:
import json
import torch
from pathlib import Path
import torch.nn as nn
from torch import optim
from model import NERModel
from dataset_loader import DatasetLoader
from progressbar import ProgressBar
from ner_metrics import SeqEntityScore
from data_processor import CluenerProcessor
from lr_scheduler import ReduceLROnPlateau
from utils_ner import get_entities
from common import (init_logger,
                    logger,
                    json_to_text,
                    load_model,
                    AverageMeter,
                    seed_everything)

## 配置类

In [6]:
class Config:
    """Plain settings container for the BiLSTM-CRF NER notebook.

    Every value is hard-coded in ``__init__``; the notebook creates a single
    module-level instance named ``args`` right after the class.
    """

    def __init__(self):
        # Pipeline stage switches.
        self.do_train = True
        self.do_eval = True
        self.do_predict = True
        # Tagging scheme name; choices are 'bios' and 'bio'.
        self.markup = "bios"
        self.arch = "bilstm_crf"
        self.learning_rate = 0.001
        self.seed = 1234
        # Empty string selects the CPU; any other value is used as "cuda:<gpu>".
        self.gpu = ""
        self.epochs = 50
        self.batch_size = 32
        self.embedding_size = 128
        self.hidden_size = 384
        # Maximum gradient norm used for clipping during training.
        self.grad_norm = 5.0
        self.task_name = "ner"
        # Dataset files all live directly under data_dir.
        self.data_dir = Path("./dataset/cluener")
        self.train_path = self.data_dir / 'train.json'
        self.dev_path = self.data_dir / 'dev.json'
        self.test_path = self.data_dir / 'test.json'
        self.output_dir = Path("./outputs")

        # Label table: "O" first, then the B-, I- and S- tags of every entity
        # type (in that prefix order), then the two sentinel labels.
        # Ids are simply the positions in that sequence (0..32).
        entity_types = (
            "address", "book", "company", "game", "government",
            "movie", "name", "organization", "position", "scene",
        )
        labels = ["O"]
        for prefix in ("B", "I", "S"):
            labels.extend(f"{prefix}-{entity}" for entity in entity_types)
        labels.extend(["<START>", "<STOP>"])
        self.label2id = {label: index for index, label in enumerate(labels)}


args = Config()

## 运行

### 文件夹创建

In [7]:
# Create the root output directory. `exist_ok=True` replaces the separate
# exists() check (no race between check and creation) and `parents=True`
# also creates any missing parent directories.
args.output_dir.mkdir(parents=True, exist_ok=True)
# From here on, results go into a per-architecture sub-directory.
# NOTE: re-running this cell appends the architecture name again.
args.output_dir = args.output_dir / str(args.arch)

In [8]:
# Create the (per-architecture) output directory if it is not there yet.
# `exist_ok=True` replaces the exists() check; `parents=True` makes this cell
# work even when the parent directory has not been created.
args.output_dir.mkdir(parents=True, exist_ok=True)
# Log to "<arch>-<task_name>.log" inside the output directory.
init_logger(log_file=str(args.output_dir / f'{args.arch}-{args.task_name}.log'))
# Seed with the configured value before any model/data object is created.
seed_everything(args.seed)

In [9]:
# An empty `gpu` setting selects the CPU; any other value is interpolated
# into a "cuda:<gpu>" device string.
args.device = torch.device("cpu" if args.gpu == '' else f"cuda:{args.gpu}")

### id -> label 的映射表构建（由 label2id 反转得到）

In [10]:
# Invert label2id so that tag ids can be mapped back to label strings.
# The ids stored in label2id are used directly (rather than enumerate()
# positions), so the result stays correct even if the table is ever reordered.
# The former `args.label2id = args.label2id` self-assignment was a no-op and
# has been dropped.
args.id2label = {index: label for label, index in args.label2id.items()}

### 数据处理类 定义

In [11]:
# Build the data processor from the configured dataset directory.
# The settings object in this notebook is `args`; the name `config` is never
# defined, so the previous `config.data_dir` raised NameError on a fresh run.
processor = CluenerProcessor(data_dir=args.data_dir)
# Build the vocabulary (per the original note, it is also saved to vocab.pkl).
processor.get_vocab()

### 模型定义

In [12]:
# Instantiate the tagger: the vocabulary size comes from the processor built
# above, every other hyper-parameter comes from the config object.
model = NERModel(
    label2id=args.label2id,
    device=args.device,
    hidden_size=args.hidden_size,
    embedding_size=args.embedding_size,
    vocab_size=len(processor.vocab),
)

In [13]:
# Move the model onto the device selected earlier. As the last expression of
# the cell, the returned module is echoed (see the printed structure below).
model.to(args.device)

NERModel(
  (embedding): Embedding(3821, 128)
  (bilstm): LSTM(128, 384, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
  (dropout): SpatialDropout(p=0.1, inplace=False)
  (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (classifier): Linear(in_features=768, out_features=33, bias=True)
  (crf): CRF()
)

### 加载 训练 和 验证 数据

```python
[
    {
        'id': 'train_0',
        'context': '浙 商 银 行 企 业 信 贷 部 叶 老 桂 博 士 ...',
        'tag': 'B-company I-company I-company I-company O O O O O B-name I-name I-name ...',
         'raw_context': '浙商银行企业信贷部叶老桂博士...'
     }, ...
 ]
```

In [16]:
# Load the train or dev examples from the cache file, or build them from the
# dataset files and cache them.
def load_and_cache_examples(args, processor, data_type='train'):
    """Return the examples of one data split, using an on-disk cache.

    The cache file is ``args.data_dir / 'cached_crf-<data_type>_<arch>_<task_name>'``.
    If it exists it is loaded with ``torch.load``; otherwise the examples are
    obtained from ``processor`` and written to that file with ``torch.save``.

    Args:
        args: settings object; ``data_dir``, ``arch`` and ``task_name`` are read.
        processor: object providing ``get_train_examples()`` and
            ``get_dev_examples()``.
        data_type: ``'train'`` or ``'dev'``.

    Returns:
        The examples as returned by the processor (or as stored in the cache).

    Raises:
        ValueError: if no cache file exists and ``data_type`` is neither
            ``'train'`` nor ``'dev'``. Previously this case fell through and
            failed later with an ``UnboundLocalError``.
    """
    cached_examples_file = args.data_dir / 'cached_crf-{}_{}_{}'.format(
        data_type,
        args.arch,
        str(args.task_name))
    if cached_examples_file.exists():
        logger.info("Loading features from cached file %s", cached_examples_file)
        # NOTE: torch.load unpickles the file; only load cache files that were
        # written by this notebook, never files from an untrusted source.
        examples = torch.load(cached_examples_file)
    else:
        # Validate before doing any work so the error names the real problem.
        if data_type not in ('train', 'dev'):
            raise ValueError(
                f"Unsupported data_type {data_type!r}; expected 'train' or 'dev'")
        logger.info("Creating features from dataset file at %s", args.data_dir)
        if data_type == 'train':
            examples = processor.get_train_examples()
        else:
            examples = processor.get_dev_examples()
        logger.info("Saving features into cached file %s", cached_examples_file)
        torch.save(examples, str(cached_examples_file))
    return examples

### 模型验证

In [26]:
def evaluate(args, model, processor):
    """Evaluate ``model`` on the dev split.

    Args:
        args: settings object; ``batch_size``, ``seed``, ``label2id``,
            ``id2label``, ``markup`` and ``device`` are read here.
        model: the tagger; ``forward_loss`` and ``crf._obtain_labels`` are used.
        processor: data processor providing ``vocab`` (and the dev examples
            through ``load_and_cache_examples``).

    Returns:
        A ``(result, class_info)`` tuple. ``result`` holds ``'eval_loss'``
        (the running average loss) plus every overall metric returned by
        ``SeqEntityScore.result()`` under an ``eval_``-prefixed key;
        ``class_info`` is the second value returned by ``metric.result()``.
    """
    # Step 1: dev examples, from the cache when available.
    dev_examples = load_and_cache_examples(args, processor, data_type='dev')
    # Step 2: wrap them in a DatasetLoader, which receives the vocab and
    # label table and the batch size.
    eval_dataloader = DatasetLoader(
        data=dev_examples,
        batch_size=args.batch_size,
        shuffle=False,
        seed=args.seed,
        sort=False,
        vocab=processor.vocab,
        label2id=args.label2id,
    )
    # Progress bar over the evaluation batches.
    progress = ProgressBar(n_total=len(eval_dataloader), desc="Evaluating")
    # Entity-level metric accumulator.
    metric = SeqEntityScore(args.id2label, markup=args.markup)
    # Running average of the loss, weighted by batch size.
    loss_meter = AverageMeter()

    model.eval()
    with torch.no_grad():
        for step, (input_ids, input_mask, input_tags, input_lens) in enumerate(eval_dataloader):
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            input_tags = input_tags.to(args.device)
            features, loss = model.forward_loss(input_ids, input_mask, input_lens, input_tags)
            loss_meter.update(val=loss.item(), n=input_ids.size(0))
            # Decode the tag sequences with the CRF layer.
            predicted, _ = model.crf._obtain_labels(features, args.id2label, input_lens)
            # Trim every gold tag row to its sequence length.
            gold_rows = input_tags.cpu().numpy()
            gold = [row[:length] for row, length in zip(gold_rows, input_lens)]
            metric.update(pred_paths=predicted, label_paths=gold)
            progress(step=step)
    print(" ")

    overall, class_info = metric.result()
    result = {
        'eval_loss': loss_meter.avg,
        **{f'eval_{name}': score for name, score in overall.items()},
    }
    return result, class_info


### 模型训练

In [27]:
def train(args, model, processor):
    """Train ``model`` for ``args.epochs`` epochs, keeping the best checkpoint.

    After every epoch the model is scored with ``evaluate``; whenever
    ``eval_f1`` exceeds the best value seen so far, the state dict is written
    to ``args.output_dir / 'best-model.bin'`` and the per-class scores are
    logged.

    Args:
        args: settings object; ``batch_size``, ``seed``, ``label2id``,
            ``learning_rate``, ``epochs``, ``device``, ``grad_norm``, ``arch``
            and ``output_dir`` are read here.
        model: the tagger to optimise (its ``forward_loss`` yields the loss).
        processor: data processor providing ``vocab``.
    """
    # Step 1: training examples, from the cache when available.
    train_examples = load_and_cache_examples(args, processor, data_type='train')
    # Step 2: wrap them in a DatasetLoader, which receives the vocab and
    # label table and the batch size.
    train_loader = DatasetLoader(
        data=train_examples,
        batch_size=args.batch_size,
        shuffle=False,
        seed=args.seed,
        sort=True,
        vocab=processor.vocab,
        label2id=args.label2id,
    )
    # Step 3: Adam over the parameters that require gradients.
    trainable = [param for param in model.parameters() if param.requires_grad]
    optimizer = optim.Adam(trainable, lr=args.learning_rate)
    # Plateau scheduler, fed with eval F1 after each epoch (mode='max'); it is
    # configured with factor=0.5 and patience=3, i.e. it is meant to lower the
    # learning rate once the metric stops improving.
    scheduler = ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=0.5,
        patience=3,
        verbose=1,
        epsilon=1e-4,
        cooldown=0,
        min_lr=0,
        eps=1e-8,
    )
    # Step 4: training loop.
    best_f1 = 0
    for epoch in range(1, args.epochs + 1):
        print(f"Epoch {epoch}/{args.epochs}")
        progress = ProgressBar(n_total=len(train_loader), desc='Training')
        # Running average of the per-batch loss for this epoch.
        loss_meter = AverageMeter()
        model.train()
        assert model.training
        for step, (input_ids, input_mask, input_tags, input_lens) in enumerate(train_loader):
            input_ids = input_ids.to(args.device)
            input_mask = input_mask.to(args.device)
            input_tags = input_tags.to(args.device)
            _, loss = model.forward_loss(input_ids, input_mask, input_lens, input_tags)
            loss.backward()
            # Clip the gradient norm before the optimizer update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            batch_loss = loss.item()
            progress(step=step, info={'loss': batch_loss})
            loss_meter.update(batch_loss, n=1)
        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()

        # Step 5: score the epoch on the dev split.
        eval_log, class_info = evaluate(args, model, processor)
        logs = {'loss': loss_meter.avg, **eval_log}
        summary = "-".join([f' {name}: {score:.4f} ' for name, score in logs.items()])
        show_info = f'\nEpoch: {epoch} - ' + summary
        logger.info(show_info)
        scheduler.epoch_step(logs['eval_f1'], epoch)

        # Step 6: checkpoint only when eval F1 improved.
        if not (logs['eval_f1'] > best_f1):
            continue
        logger.info(f"\nEpoch {epoch}: eval_f1 improved from {best_f1} to {logs['eval_f1']}")
        logger.info("save model to disk.")
        best_f1 = logs['eval_f1']
        # Unwrap DataParallel so the saved keys carry no "module." prefix.
        model_stat_dict = (
            model.module.state_dict()
            if isinstance(model, nn.DataParallel)
            else model.state_dict()
        )
        checkpoint = {'epoch': epoch, 'arch': args.arch, 'state_dict': model_stat_dict}
        torch.save(checkpoint, str(args.output_dir / 'best-model.bin'))
        print("Eval Entity Score: ")
        for subject, scores in class_info.items():
            logger.info(
                f"Subject: {subject} - Acc: {scores['acc']} - Recall: {scores['recall']} - F1: {scores['f1']}"
            )

In [28]:
# Start training only when the `do_train` switch of the config is set.
if args.do_train:
    train(args,model,processor)

12/27/2020 15:04:08 - INFO - root -   Loading features from cached file dataset\cluener\cached_crf-train_bilstm_crf_ner


336 batches created
Epoch 1/50

12/27/2020 15:09:17 - INFO - root -   Loading features from cached file dataset\cluener\cached_crf-dev_bilstm_crf_ner


42 batches created

12/27/2020 15:09:41 - INFO - root -   
Epoch: 1 -  loss: 5.8060 - eval_loss: 10.4961 - eval_acc: 0.5807 - eval_recall: 0.6263 - eval_f1: 0.6027 
12/27/2020 15:09:41 - INFO - root -   
Epoch 1: eval_f1 improved from 0 to 0.602662490211433
12/27/2020 15:09:41 - INFO - root -   save model to disk.
12/27/2020 15:09:41 - INFO - root -   Subject: name - Acc: 0.5699 - Recall: 0.6925 - F1: 0.6252


Eval Entity Score: 


12/27/2020 15:09:41 - INFO - root -   Subject: address - Acc: 0.4749 - Recall: 0.3298 - F1: 0.3892
12/27/2020 15:09:41 - INFO - root -   Subject: movie - Acc: 0.604 - Recall: 0.596 - F1: 0.6
12/27/2020 15:09:41 - INFO - root -   Subject: position - Acc: 0.6932 - Recall: 0.7252 - F1: 0.7088
12/27/2020 15:09:41 - INFO - root -   Subject: organization - Acc: 0.6078 - Recall: 0.7221 - F1: 0.66
12/27/2020 15:09:41 - INFO - root -   Subject: company - Acc: 0.6385 - Recall: 0.6402 - F1: 0.6394
12/27/2020 15:09:41 - INFO - root -   Subject: scene - Acc: 0.3861 - Recall: 0.5598 - F1: 0.457
12/27/2020 15:09:41 - INFO - root -   Subject: government - Acc: 0.6726 - Recall: 0.6073 - F1: 0.6383
12/27/2020 15:09:41 - INFO - root -   Subject: book - Acc: 0.7128 - Recall: 0.4351 - F1: 0.5403
12/27/2020 15:09:41 - INFO - root -   Subject: game - Acc: 0.5177 - Recall: 0.7932 - F1: 0.6265


Epoch 2/50

KeyboardInterrupt: 

In [19]:
# Load the training split directly, outside of train(); the log line below
# shows it being served from the cache file.
train_dataset = load_and_cache_examples(args, processor, data_type='train')

12/27/2020 14:18:19 - INFO - root -   Loading features from cached file dataset\cluener\cached_crf-train_bilstm_crf_ner
