In [1]:
from transformers import AutoModelForTokenClassification, AutoTokenizer,DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer
import evaluate  
from datasets import load_dataset
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
ds = load_dataset('doushabao4766/msra_ner_k_V3')
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 45001
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags', 'knowledge'],
        num_rows: 3443
    })
})

In [4]:
for row in ds['train']:
    print(row['tokens'])
    print(row['ner_tags'])
    
    break

['当', '希', '望', '工', '程', '救', '助', '的', '百', '万', '儿', '童', '成', '长', '起', '来', '，', '科', '教', '兴', '国', '蔚', '然', '成', '风', '时', '，', '今', '天', '有', '收', '藏', '价', '值', '的', '书', '你', '没', '买', '，', '明', '日', '就', '叫', '你', '悔', '不', '当', '初', '！']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [5]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

## 实体映射字典
'O':0
'B-PER':1
'I-PER':2
'B-LOC':3
'I-LOC':4
'B-ORG':5
'I-ORG':6

In [6]:
entites = ['O'] + list({'PER', "LOC", "ORG"})
entity_index = {entity:i for i, entity in enumerate(entites)}
tags = ['O']
for entity in entites[1:]:
    tags.append('B-' + entity)
    tags.append('I-' + entity)

In [7]:
entity_index

{'O': 0, 'ORG': 1, 'PER': 2, 'LOC': 3}

In [8]:
tags

['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']

In [9]:
def data_input_proc(item):
    # 文本已经分为字符，且tag索引也已经提供
    # 所以数据预处理反而简单
    # 导入已拆分为字符的文本列表，需要设置参数is_split_into_words=True
    input_data = tokenizer(item['tokens'], 
                           truncation=True,
                           add_special_tokens=False, 
                           max_length=512, 
                           is_split_into_words=True)
    
    labels = [lbl[:512] for lbl in item['ner_tags']]
    input_data['labels'] = labels
    return input_data

ds1 = ds.map(data_input_proc, batched=True)  # batched 每次传入自定义方法样本数量多个

Map: 100%|██████████| 3443/3443 [00:00<00:00, 3817.36 examples/s]


In [11]:
ds1.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [25]:
args = TrainingArguments(
    output_dir="ner_train",  # 模型训练工作目录（tensorboard，临时模型存盘文件，日志）
    num_train_epochs = 3,    # 训练 epoch
    save_safetensors=False,  # 设置False保存文件可以通过torch.load加载
    per_device_train_batch_size=4,  # 训练批次
    per_device_eval_batch_size=4,
    report_to='tensorboard',  # 训练输出记录
    eval_strategy="epoch",
)

In [14]:
tags

['O', 'B-ORG', 'I-ORG', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC']

In [None]:
id2lb = {i:tag for i,tag in enumerate(tags)}
lb2id = {tag:i for i,tag in enumerate(tags)}
model = AutoModelForTokenClassification.from_pretrained('google-bert/bert-base-chinese', num_labels=7, id2label=id2lb, label2id=lb2id)

In [16]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [26]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, padding=True)

trainer = Trainer(
    model,
    args,
    train_dataset=ds1['train'],
    eval_dataset=ds1['test'],
    data_collator=data_collator
)

In [28]:
trainer.train()

  0%|          | 0/16878 [00:34<?, ?it/s]
  0%|          | 0/33753 [00:13<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 2.00 GiB of which 0 bytes is free. Of the allocated memory 1.63 GiB is allocated by PyTorch, and 31.55 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)