In [1]:
import os
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.optim as optim

from data import read_corpus, build_dict, TAG_MAP, NER_DataSet, condtraints
from model import BiLSTM_CRF
from trainer import train, evaluate, load_model


# Corpus locations, relative to the notebook's working directory.
DATA_DIR = './datasets'
train_corpus_path = DATA_DIR + '/train_data'
test_corpus_path = DATA_DIR + '/test_data'

### 准备数据

In [2]:
# Read the training corpus and build `dct` from all of it.
corpus = read_corpus(train_corpus_path)
dct = build_dict(corpus)

# Number of samples held out for validation, and the seed that fixes the split.
VAL_SIZE = 5000
SPLIT_SEED = 42
assert len(corpus) > VAL_SIZE, "corpus too small: the training split would be empty"

# Shuffle in place with a dedicated, seeded generator so the train/validation
# split is the same on every Restart & Run All; the global NumPy RNG is untouched.
np.random.RandomState(SPLIT_SEED).shuffle(corpus)
train_ds = NER_DataSet(corpus[:-VAL_SIZE], dct)
val_ds = NER_DataSet(corpus[-VAL_SIZE:], dct)

# Only the training loader reshuffles; validation keeps a fixed order.
BATCH_SIZE = 32
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

### 创建模型


In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

class Config:
    """Settings object passed to ``BiLSTM_CRF`` (instantiated as ``Config()``)."""

    # Run label; presumably embedded in checkpoint file names -- TODO confirm in trainer.
    name = "hidden_256_embed_150"

    # Sizes.
    embed_size = len(dct)        # number of entries in `dct`
    embed_dim = 150
    hidden_size = 256
    num_tags = len(TAG_MAP)      # number of entries in TAG_MAP

    # Regularisation.
    dropout = 0.5

    # Objects taken from the notebook namespace.
    device = device
    condtraints = condtraints    # (sic) spelled as exported by the data module

# Seed PyTorch's global RNG before the model is built so that any torch-based
# random initialisation and the torch-driven randomness that follows (e.g.
# shuffled DataLoader batches) start from the same state on a fresh run.
# Note: this helps reproducibility but does not by itself guarantee bit-identical GPU results.
MODEL_SEED = 42
torch.manual_seed(MODEL_SEED)

# Build the model from the Config above and move it to the selected device.
model = BiLSTM_CRF(Config()).to(device)

# Adam over all model parameters with a fixed learning rate.
lr = 0.001
optimizer = optim.Adam(model.parameters(), lr=lr)

### 训练模型

In [7]:
from trainer import train, evaluate

In [19]:
# Launch training (epochs=20, early stopping enabled, save_every_n_epochs=3);
# whatever train() returns is kept in `history`.
history = train(
    model,
    optimizer,
    train_dl,
    val_dl,
    device=device,
    epochs=20,
    early_stop=True,
    save_every_n_epochs=3,
)

2019-06-01 20:25:27,658 - epoch 1 - loss: 6.30 acc: 0.72 - val_acc: 0.69


2019-06-01 20:28:05,706 - epoch 2 - loss: 2.04 acc: 0.82 - val_acc: 0.77


2019-06-01 20:30:53,383 - epoch 3 - loss: 1.30 acc: 0.88 - val_acc: 0.82


2019-06-01 20:33:30,144 - epoch 4 - loss: 0.95 acc: 0.91 - val_acc: 0.84


2019-06-01 20:36:18,832 - epoch 5 - loss: 0.74 acc: 0.92 - val_acc: 0.84


2019-06-01 20:38:55,712 - epoch 6 - loss: 0.60 acc: 0.94 - val_acc: 0.85


2019-06-01 20:41:42,535 - epoch 7 - loss: 0.50 acc: 0.95 - val_acc: 0.85


2019-06-01 20:44:16,465 - epoch 8 - loss: 0.42 acc: 0.96 - val_acc: 0.86


2019-06-01 20:47:03,501 - epoch 9 - loss: 0.37 acc: 0.97 - val_acc: 0.86


2019-06-01 20:49:37,020 - epoch 10 - loss: 0.34 acc: 0.97 - val_acc: 0.86
2019-06-01 20:49:37,023 - early stop


### 评估模型

In [6]:
# Read the held-out test corpus.
test_corpus = read_corpus(test_corpus_path)

# Wrap it with the same `dct` that was built from the training corpus.
test_ds = NER_DataSet(test_corpus, dct)
# Evaluation gains nothing from shuffling; a fixed batch order keeps repeated
# evaluations of the same model on identical batches.
test_dl = DataLoader(test_ds, batch_size=64, shuffle=False)

In [7]:
# Evaluate the current in-memory model on the test loader; the result is kept in `metric`.
metric = evaluate(model, test_dl, device=device)

In [8]:
# Print what metric.report() returns (per-entity and overall precision / recall / f1, as shown below).
print(metric.report())

            PER         LOC         ORG         
precision   0.75        0.85        0.78        
recall      0.83        0.90        0.82        
f1          0.79        0.87        0.80        
------------------------------------------------
precision   0.80
recall      0.86
f1          0.83


### 预测

In [5]:
from predict import predict_sentence_tags, get_entity

# Sample Chinese passage used to demonstrate prediction.
sentence = '在周恩来总理的领导下，由当时中共中央主管科学工作的陈毅、国务院副总理兼国家计委主任李富春具体领导，在北京召开了包括中央各部门、各有关高等学校和中国科学院的科学技术工作人员大会，动员制定十二年科学发展远景规划。来自全国23个单位的787名科技人员提出了发展远景规划的初步内容，体现出全国“重点发展，迎头赶上”的方针。在规划制定过程中，深切感到某些新技术是现代科学技术发展的关键。为了更快地发展这些新学科，使其在短时间内接近国际水平，把计算技术、自动化、电子学和半导体这四个学科的研究和发展列为“四大紧急措施”，经周恩来总理同意，确定由中国科学院负责采取紧急措施，尽快筹建相应的四个学科研究机构。'

# Tag the passage with the model, using the same dictionary and device as above.
tags = predict_sentence_tags(model, sentence, dct, device)

In [6]:
# Bare last expression, so the notebook displays the value returned by get_entity
# (in the recorded output: a dict keyed by 'PER' / 'ORG' / 'LOC' holding sets of strings).
get_entity(sentence, tags)

{'PER': {'周恩来', '李富春', '陈毅'},
 'ORG': {'中共中央', '中国科学院', '国务院', '国家计委'},
 'LOC': {'北京'}}

### 加载模型

In [4]:
from trainer import load_model

# Load a saved checkpoint into `model`; the path is relative to the working directory.
# Presumably updates `model` in place, since the return value is not used -- confirm in trainer.
# NOTE(review): this file name (epoch 8, acc 0.89) matches no line of the training log
# recorded above (val_acc peaked at 0.86, save_every_n_epochs=3), so it presumably comes
# from a different run -- confirm which checkpoint is intended.
load_model(model, 'model_hidden_256_embed_150_epoch_8_acc_0.89.tar')

In [9]:
# Repeat the test-set evaluation, this time on the model state left by load_model above,
# and print the resulting report.
metric = evaluate(model, test_dl, device=device)
print(metric.report())

            PER         LOC         ORG         
precision   0.75        0.85        0.78        
recall      0.83        0.89        0.82        
f1          0.79        0.87        0.80        
------------------------------------------------
precision   0.80
recall      0.86
f1          0.83
