<a href="https://colab.research.google.com/github/zhangxs131/NER/blob/main/bert_conll2003_ner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#NER using Bert

本文使用pytorch-pretrained-bert训练一个NER模型,数据集使用了conll2003,可以从 https://deepai.org/dataset/conll-2003-english
下载

模型就是bert-base-uncased

经过本文复习了一些训练验证的基本流程，对NER预训练模型数据处理流程。

本文改进，对token和label对齐只是简单对齐，由于tokenizer不同，使得分词后的标签数目，于label的标签无法对齐，这也是NER的一个重要问题。模型使用pytorch-pretrained-bert，这个库已经不更新了，还是应该使用huggingfacae的transformers进行更方便，快捷。

In [1]:
#安装相关包
!pip install pytorch-pretrained-bert==0.4.0
!pip install seqeval==0.0.12

Collecting pytorch-pretrained-bert==0.4.0
  Downloading pytorch_pretrained_bert-0.4.0-py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 1.5 MB/s 
Collecting boto3
  Downloading boto3-1.21.18-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 6.5 MB/s 
Collecting botocore<1.25.0,>=1.24.18
  Downloading botocore-1.24.18-py3-none-any.whl (8.6 MB)
[K     |████████████████████████████████| 8.6 MB 42.6 MB/s 
[?25hCollecting s3transfer<0.6.0,>=0.5.0
  Downloading s3transfer-0.5.2-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 9.5 MB/s 
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.8-py2.py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 71.3 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
[K     |████████████████████████████████| 127 kB 72.3 MB/s 
Installing col

In [2]:
import string 
import pandas as pd
import numpy as np
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset ,DataLoader,Dataset,RandomSampler,SequentialSampler
from pytorch_pretrained_bert import BertTokenizer,BertConfig
from pytorch_pretrained_bert import BertForTokenClassification 

#帮助处理数据的包
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

#计算F1值
from seqeval.metrics import f1_score

In [3]:
#上传数据文件 conll2003
from google.colab import files
uploaded=files.upload()

Saving conll2003.zip to conll2003.zip


In [4]:
!unzip conll2003.zip

Archive:  conll2003.zip
  inflating: metadata                
  inflating: test.txt                
  inflating: train.txt               
  inflating: valid.txt               


#数据处理

这里只需要data中 words和NERtags两列

特征选择word和word_pos_of_speech 使用

In [21]:
"""
Load the training/testing data. 
input: conll format data, but with only 2 tab separated colums - words and NEtags.
output: A list where each item is 2 lists.  sentence as a list of tokens, NER tags as a list for each token.
"""
#functions for preparing the data in the *.txt files
def load__data_conll(file_path):
    myoutput,words,poses,tags = [],[],[],[]
    fh = open(file_path)
    for line in fh:
        line = line.strip()
        if " " not in line:
            #Sentence ended.
            myoutput.append([words,poses,tags])
            words,poses,tags =[],[],[]
        else:
            word,pos,_, tag = line.split(" ")
            words.append(word)
            poses.append(pos)
            tags.append(tag)
    fh.close()
    return myoutput


In [22]:
train_path = 'train.txt'
test_path = 'test.txt' 

conll_train = load__data_conll(train_path)
conll_test = load__data_conll(test_path)  


In [26]:
#预处理句子，将一些特殊符号进行转换，
import re
def untokenize(words):
    """
    Untokenizing a text undoes the tokenizing operation, restoring
    punctuation and spaces to the places that people expect them to be.
    Ideally, `untokenize(tokenize(text))` should be identical to `text`,
    except for line breaks.
    """
    text = ' '.join(words)
    step1 = text.replace("`` ", '"').replace(" ''", '"').replace('. . .',  '...')
    step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
    step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
    step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3)
    step5 = step4.replace(" '", "'").replace(" n't", "n't").replace(
         "can not", "cannot")
    step6 = step5.replace(" ` ", " '")
    return step6.strip()

print(untokenize(conll_test[3][0]))

AL-AIN, United Arab Emirates 1996-12-06


In [27]:
#lets convert them to dataframs for easier handling
df_train = pd.DataFrame(conll_train,columns=["sentence","pos","labels"])
df_test = pd.DataFrame(conll_test,columns=["sentence","pos","labels"])

In [None]:
#整合所有数据

sentences = list(df_train['sentence'])+list(df_test['sentence'])
sentences = [untokenize(sent) for sent in sentences]

labels = list(df_train['labels'])+list(df_test['labels']) 
print(len(sentences),len(labels))

# 使用tokenizer处理数据

现在我们得到word和labels，使用berttokenizer处理数据


In [39]:
#GPU

device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu=torch.cuda.device_count()
print('num of gpu is ',n_gpu)

#设置超参数

max_length=75
batch_size=32

#不区分大小写的bert base tokenizer
tokenizer=BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
tokenized_texts=list(map(lambda x:['[CLS]']+tokenizer.tokenize(x)+['[SEP]'],sentences))

print(tokenized_texts[4])
print(len(tokenized_texts[4]))

num of gpu is  1
['[CLS]', 'the', 'european', 'commission', 'said', 'on', 'thursday', 'it', 'disagreed', 'with', 'german', 'advice', 'to', 'consumers', 'to', 'shu', '##n', 'british', 'lamb', 'until', 'scientists', 'determine', 'whether', 'mad', 'cow', 'disease', 'can', 'be', 'transmitted', 'to', 'sheep', '.', '[SEP]']
33


In [37]:
print(labels[4])
print(len(labels[4]))

['O', 'B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
30


In [43]:
#标签进行数值化
lab=[]
for label in labels[:1000]:
  lab=lab+label
tag_vals=list(set(lab))
tag2idx={t:i for i,t in enumerate(tag_vals)}
tag2idx

{'B-LOC': 6,
 'B-MISC': 0,
 'B-ORG': 1,
 'B-PER': 4,
 'I-LOC': 8,
 'I-MISC': 5,
 'I-ORG': 7,
 'I-PER': 3,
 'O': 2}

In [45]:
#将input和label转为tensor，并进行padding和truncaiton

input_ids=pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                        maxlen=max_length,dtype='long',truncating='post',padding="post")

tags=pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                   maxlen=max_length,value=tag2idx['O'],padding='post',
                   dtype='long',truncating='post')

attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [46]:
#划分训练集和验证集

tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, random_state=2022, test_size=0.2)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,  random_state=2022, test_size=0.2)

tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [51]:
#创建dataloader

train_data=TensorDataset(tr_inputs,tr_masks,tr_tags)
valid_data=TensorDataset(val_inputs, val_masks, val_tags)

train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)

train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)

In [50]:
#加载模型

model=BertForTokenClassification.from_pretrained('bert-base-uncased',num_labels=len(tag_vals))

full_finetuning=True

if full_finetuning:
  param_optimizer=list(model.named_parameters())
  no_decay=['bias','gamma','beta']
  optimizer_grouped_parameters=[
    {'params':[p for n,p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay_rate':0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay_rate': 0.0}
  ]
else:
  param_optimizer = list(model.classifier.named_parameters()) 
  optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [49]:
# 计算accuracy 
def flat_accuracy(pred,labels):
  pred_flat =np.argmax(pred,axis=2).flatten()
  label_flat=labels.flatten()
  return np.sum(pred_flat==label_flat) /len(label_flat)
  

#Train 训练模型

In [61]:
epochs=2
max_grad_norm=1.0

step=0
train_loss_set=[]
model.cuda()
for i in range(epochs):
  print('Epoch {}________________'.format(i+1))
  model.train()
  tr_loss=0
  tr_examples,tr_steps=0,0

  for batch in tqdm(train_dataloader):
    batch=tuple(t.to(device) for t in batch)
    b_input_ids,b_input_mask,b_labels=batch

    loss=model(b_input_ids,attention_mask=b_input_mask,token_type_ids=None,labels=b_labels)
    train_loss_set.append(loss)

    loss.backward()

    tr_loss+=loss.item()
    tr_examples+=b_input_ids.size(0)
    tr_steps+=1
    step+=1
    if step%20==0:
      print('Epoch {} ,Step {} Train Loss {}'.format(i+1,step,loss.item()))

    # gradient clipping
    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

    #更新
    optimizer.step()
    model.zero_grad()
  print('Epoch {} ,Train Loss {} '.format(i+1,tr_loss/tr_steps))

  #验证集
  model.eval()
  eval_loss,eval_accuracy=0,0
  eval_example,eval_steps=0,0
  prediction,true_labels=[],[]

  for batch in valid_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
      logits = model(b_input_ids, token_type_ids=None,attention_mask=b_input_mask)

      logits_ids=logits.detach().cpu().numpy()
      label_ids=b_labels.to('cpu').numpy()
      prediction.extend([list(p) for p in np.argmax(logits_ids, axis=2)])
      true_labels.append(label_ids)

      tmp_eval_accuracy = flat_accuracy(logits_ids, label_ids)
      eval_accuracy += tmp_eval_accuracy

      eval_example += b_input_ids.size(0)
      eval_steps += 1
  print("Epoch {} Validation Accuracy: {}".format(i+1,eval_accuracy/eval_steps))

  #F1值计算
  pred_tags = [tag_vals[p_i] for p in prediction for p_i in p]
  valid_tags = [tag_vals[l_ii] for l in true_labels  for l_i in l for l_ii in l_i]
  print("Epoch {} F1-Score: {}".format(i+1,f1_score(pred_tags, valid_tags)))

Epoch 1 F1-Score: 0.7518685983205684
Epoch 2 F1-Score: 0.7518685983205684


In [None]:
#可视化损失
import matplotlib.pyplot as plt
plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.ylim(0,0.25)
plt.plot(train_loss_set)
plt.show()

In [66]:
#验证集
model.eval()
eval_loss,eval_accuracy=0,0
eval_example,eval_steps=0,0
prediction,true_labels=[],[]

for batch in valid_dataloader:
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch

  with torch.no_grad():
    logits = model(b_input_ids, token_type_ids=None,attention_mask=b_input_mask)

    logits_ids=logits.detach().cpu().numpy()
    label_ids=b_labels.to('cpu').numpy()
    prediction.extend([list(p) for p in np.argmax(logits_ids, axis=2)])
    true_labels.append(label_ids)

    tmp_eval_accuracy = flat_accuracy(logits_ids, label_ids)
    eval_accuracy += tmp_eval_accuracy

    eval_example += b_input_ids.size(0)
    eval_steps += 1
print("Validation Accuracy: {}".format(eval_accuracy/eval_steps))

#F1值计算
pred_tags = [tag_vals[p_i] for p in prediction for p_i in p]
valid_tags = [tag_vals[l_ii] for l in true_labels  for l_i in l for l_ii in l_i]
print("F1-Score: {}".format(f1_score(pred_tags,valid_tags)))

Validation Accuracy: 0.989743589743589
F1-Score: 0.7518685983205684


In [67]:
#保存模型
torch.save(model.state_dict(),'bert_for_conll.pth')
torch.save(optimizer.state_dict(),'optimizer_for_conll.pth')