# 数据处理

In [1]:
import codecs
import numpy as np

In [3]:
tag_type = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']
# B-ORG I-ORG 机构的开始位置和中间位置
# B-PER I-PER 人物名字的开始位置和中间位置
# B-LOC I-LOC 位置的开始位置和中间位置

train_lines = codecs.open('datasets/named_entity_recognition/msra/train/sentences.txt').readlines()
train_lines = [x.replace(' ', '').strip() for x in train_lines]

train_tags = codecs.open('datasets/named_entity_recognition/msra/train/tags.txt').readlines()
train_tags = [x.strip().split(' ') for x in train_tags]
train_tags = [[tag_type.index(x) for x in tag] for tag in train_tags]

In [4]:
train_lines, train_tags = train_lines[:20000], train_tags[:20000] 

In [5]:
val_lines = codecs.open('datasets/named_entity_recognition/msra/val/sentences.txt').readlines()
val_lines = [x.replace(' ', '').strip() for x in val_lines]

val_tags = codecs.open('datasets/named_entity_recognition/msra/val/tags.txt').readlines()
val_tags = [x.strip().split(' ') for x in val_tags]
val_tags = [[tag_type.index(x) for x in tag] for tag in val_tags]

In [6]:
train_lines[0], train_tags[0]

('如何解决足球界长期存在的诸多矛盾，重振昔日津门足球的雄风，成为天津足坛上下内外到处议论的话题。',
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  6,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0])

# 模型训练

## tokenizer

In [8]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
train_encoding = tokenizer(list(train_lines), truncation=True, padding=True, max_length=64)
val_encoding = tokenizer(list(val_lines), truncation=True, padding=True, max_length=64)

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx])[:64] for key, val in self.encodings.items()}
        # 字级别的标注
        item['labels'] = torch.tensor([0] + self.labels[idx] + [0] * (63-len(self.labels[idx])))[:64]
        return item
    
    def __len__(self):
        return len(self.labels)

train_dataset = TextDataset(train_encoding, train_tags[:])
test_dataset = TextDataset(val_encoding, val_tags[:])

In [10]:
train_dataset[0]

{'input_ids': tensor([ 101, 1963,  862, 6237, 1104, 6639, 4413, 4518, 7270, 3309, 2100, 1762,
         4638, 6436, 1914, 4757, 4688, 8024, 7028, 2920, 3212, 3189, 3823, 7305,
         6639, 4413, 4638, 7413, 7599, 8024, 2768,  711, 1921, 3823, 6639, 1781,
          677,  678, 1079, 1912, 1168, 1905, 6379, 6389, 4638, 6413, 7579,  511,
          102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'labels': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [11]:
tokenizer.decode(train_dataset[0]['input_ids'])

'[CLS] 如 何 解 决 足 球 界 长 期 存 在 的 诸 多 矛 盾 ， 重 振 昔 日 津 门 足 球 的 雄 风 ， 成 为 天 津 足 坛 上 下 内 外 到 处 议 论 的 话 题 。 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [12]:
for idx in range(len(train_dataset)):
    item = train_dataset[idx]
    for key in item:
        if item[key].shape[0] != 64:
            print(key, item[key].shape)

In [13]:
for idx in range(len(test_dataset)):
    item = test_dataset[idx]
    for key in item:
        if item[key].shape[0] != 64:
            print(key, item[key].shape)

## 模型训练

In [14]:
import torch
from transformers import BertForTokenClassification, AdamW, get_linear_schedule_with_warmup
model = BertForTokenClassification.from_pretrained('bert-base-chinese', num_labels=7)

device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
model.to(device)

# BertForTokenClassification只是一个字级别的分类器
# 想要使用bert+crf可以参考这个实现
# https://github.com/Louis-udm/NER-BERT-CRF

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-c

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [15]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 1
scheduler = get_linear_schedule_with_warmup(optim, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [16]:
from tqdm import tqdm

def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for idx, batch in enumerate(train_loader):
        optim.zero_grad()
        
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            # loss = outputs[0]

        loss = outputs.loss
        
        if idx % 20 == 0:
            with torch.no_grad():
                # 64 * 7
                print((outputs[1].argmax(2).data == labels.data).float().mean().item(), loss.item())
        
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()
        scheduler.step()

        iter_num += 1
        if(iter_num % 100==0):
            print("epoth: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num/total_iter*100))
        
    print("Epoch: %d, Average training loss: %.4f"%(epoch, total_train_loss/len(train_loader)))
    
def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    for batch in test_dataloader:
        with torch.no_grad():
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        logits = outputs[1]

        total_eval_loss += loss.item()
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        total_eval_accuracy += (outputs[1].argmax(2).data == labels.data).float().mean().item()
        
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f"%(total_eval_loss/len(test_dataloader)))
    print("-------------------------------")
    

for epoch in range(5):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()

------------Epoch: 0 ----------------
0.17626953125 1.965804100036621
0.966796875 0.13332447409629822
0.9814453125 0.09005818516016006
0.970703125 0.09516040235757828
0.97021484375 0.14700622856616974
epoth: 0, iter_num: 100, loss: 0.1206, 16.00%
0.97412109375 0.12301269173622131
0.966796875 0.13751883804798126
0.96923828125 0.12724775075912476
0.97265625 0.11036788672208786
0.98388671875 0.07470635324716568
epoth: 0, iter_num: 200, loss: 0.0808, 32.00%
0.99658203125 0.02434217929840088
0.9775390625 0.07938043028116226
0.97998046875 0.07676959782838821
0.98779296875 0.06868980824947357
0.98583984375 0.05229904130101204
epoth: 0, iter_num: 300, loss: 0.0468, 48.00%
0.9873046875 0.05262850970029831
0.98876953125 0.06584114581346512
0.98583984375 0.03852091357111931
0.98779296875 0.051144976168870926
0.97998046875 0.0400601401925087
epoth: 0, iter_num: 400, loss: 0.0643, 64.00%
0.98583984375 0.054247722029685974
0.9892578125 0.032272450625896454
0.9853515625 0.06257447600364685
0.98339843

In [12]:
torch.save(model, 'bert-ner.pt')

# 模型预测

In [17]:
tag_type = ['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']

def predcit(s):
    item = tokenizer([s], truncation=True, padding='longest', max_length=64)
    with torch.no_grad():
        input_ids = torch.tensor(item['input_ids']).to(device).reshape(1, -1)
        attention_mask = torch.tensor(item['attention_mask']).to(device).reshape(1, -1)
        labels = torch.tensor([0] * attention_mask.shape[1]).to(device).reshape(1, -1)
        
        outputs = model(input_ids, attention_mask, labels)
        outputs = outputs[0].data.cpu().numpy()
        
    outputs = outputs[0].argmax(1)[1:-1]
    ner_result = ''
    ner_flag = ''
    
    for o, c in zip(outputs,s):
        # 0 就是 O，没有含义
        if o == 0 and ner_result == '':
            continue
        
        # 
        elif o == 0 and ner_result != '':
            if ner_flag == 'O':
                print('机构：', ner_result)
            if ner_flag == 'P':
                print('人名：', ner_result)
            if ner_flag == 'L':
                print('位置：', ner_result)
                
            ner_result = ''
        
        elif o != 0:
            ner_flag = tag_type[o][2]
            ner_result += c
    return outputs

In [24]:
s = '整个华盛顿已笼罩在一片夜色之中，一个电话从美国总统府白宫打到了菲律宾总统府马拉卡南宫。'
data = predcit(s)

位置： 华盛顿
位置： 美国总统府白宫
位置： 菲律宾总统府马拉卡南宫


In [19]:
s = '人工智能是未来的希望，也是中国和美国的冲突点。'
data = predcit(s)

位置： 中国
位置： 美国


In [20]:
s = '明天我们一起在海淀吃个饭吧，把叫刘涛和王华也叫上。'
data = predcit(s)

位置： 海淀
人名： 刘涛
人名： 王华


In [21]:
s = '同煤集团同生安平煤业公司发生井下安全事故 19名矿工遇难'
data = predcit(s)

机构： 同煤集团同生安平煤业公司


In [22]:
s = '山东省政府办公厅就平邑县玉荣商贸有限公司石膏矿坍塌事故发出通报'
data = predcit(s)

机构： 山东省政府办公厅
机构： 平邑县玉荣商贸有限公司


In [23]:
s = '[新闻直播间]黑龙江:龙煤集团一煤矿发生火灾事故'
data = predcit(s)

位置： 黑龙江
机构： 龙煤集团


# 参考链接

- https://mp.weixin.qq.com/s?__biz=MzIwNDA5NDYzNA==&mid=2247490973&idx=1&sn=d5283ca0889d813d8a32d4829e833fa6&chksm=96c43058a1b3b94e93cbe185cdfbcda3638fb3ef9d3dd25953f9b2e1ecd703e7199e608201e6&scene=178&cur_album_id=1364202321906941952#rd