In [None]:
# Environment setup commands, kept inside a string literal so that running this
# cell does not execute them.
# NOTE(review): presumably meant to be run once as shell commands (e.g. on
# Colab); the later cells read files from the cloned `NER_dataset` directory.
'''
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install seqeval
!git clone https://github.com/16692281376/NER_dataset.git
'''

In [None]:
import json
# Read the annotated NER data from the cloned NER_dataset repository into memory.
with open(r"NER_dataset/medicines_res.json", mode="r", encoding="utf-8") as json_file:
    dataset = json.loads(json_file.read())


In [None]:
from datasets import Dataset,DatasetDict
import pandas as pd

# Build the Hugging Face dataset directly from the dict loaded from JSON.
# (The previous pandas DataFrame conversion was overwritten on the very next
# line and never used, so that dead step has been dropped.)
train_dataset = Dataset.from_dict(dataset)

# NOTE(review): "validation" is the very same object as "train", so the metrics
# printed during training are measured on data the model has already seen.
# Consider a held-out split (e.g. `train_dataset.train_test_split(...)`).
dataset = DatasetDict({"train": train_dataset, "validation": train_dataset})
dataset

In [None]:
from transformers import AutoTokenizer
import os
# Cache directory for downloaded checkpoints. `HF_HOME` is optional: when it is
# not set, `None` is passed as `cache_dir` so `from_pretrained` uses its default
# cache location instead of this cell failing with a KeyError.
dirpath = os.environ.get("HF_HOME")
checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, cache_dir=dirpath)

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per token.

    Args:
        labels: label ids, indexed by word position.
        word_ids: for every token, the index of the word it belongs to, or
            ``None`` for a special token.

    Returns:
        A list with one entry per element of ``word_ids``: ``-100`` for
        special tokens, the word's label for the first token of a word, and
        for the following tokens of the same word the word's label with odd
        ids (B-XXX) bumped by one (to I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: ignored by the loss.
            tag = -100
        elif word_id == previous_word:
            # Continuation of the previous word: a B-XXX id becomes I-XXX.
            tag = labels[word_id]
            if tag % 2 == 1:
                tag += 1
        else:
            # First token of a new word: take that word's label unchanged.
            tag = labels[word_id]
        previous_word = word_id
        aligned.append(tag)

    return aligned

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: batch mapping with a ``"tokens"`` column (pre-split words)
            and a ``"ner_tags"`` column (one label-id sequence per example).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding, per example, the labels aligned to the produced tokens.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # Align each example's word-level tags with the word ids of its tokens.
    encoded["labels"] = [
        align_labels_with_tokens(tags, encoded.word_ids(index))
        for index, tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Tokenize every split in batches; the original columns are removed so only
# what `tokenize_and_align_labels` returns is kept.
tokenized_datasets = dataset.map(
    function=tokenize_and_align_labels,
    remove_columns=dataset["train"].column_names,
    batched=True,
)


In [None]:
from transformers import DataCollatorForTokenClassification
# Batch collator built from the tokenizer; it is passed as `collate_fn` to both
# DataLoaders below. (Per the transformers docs it pads inputs and labels per
# batch — confirm against the installed version.)
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
from torch.utils.data import DataLoader

# Training batches are reshuffled every epoch; evaluation keeps dataset order.
# Both loaders use batches of 8 examples assembled by `data_collator`.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [None]:
# BIO label inventory; the position in this list is the class id used by the
# model. After "O", every entity type occupies an (odd = B-, even = I-) pair of
# ids, which is what `align_labels_with_tokens` relies on when it turns a B-
# label into the matching I- label.
# Entity codes: PD = invoice code, PH = invoice number, JY = check code,
# PR = issue date, SR = payee, FR = reviewer,
# JD = total including tax (in words), JX = total (in figures).
medicines_label = [
    "O",
    "B-PD", "I-PD",
    "B-PH", "I-PH",
    "B-JY", "I-JY",
    "B-PR", "I-PR",
    "B-SR", "I-SR",
    "B-FR", "I-FR",
    "B-JD", "I-JD",
    "B-JX", "I-JX",
]

# Mapping from the annotation names used for medicine-purchase invoices to the
# BIO tags above.
token2label = {
    "无": "O",
    "票据代码B": "B-PD", "票据代码I": "I-PD",
    "票据号码B": "B-PH", "票据号码I": "I-PH",
    "校验码B": "B-JY", "校验码I": "I-JY",
    "开票日期B": "B-PR", "开票日期I": "I-PR",
    "收款人B": "B-SR", "收款人I": "I-SR",
    "复核人B": "B-FR", "复核人I": "I-FR",
    "价税合计（大写）B": "B-JD", "价税合计（大写）I": "I-JD",
    "（小写）B": "B-JX", "（小写）I": "I-JX",
}

# Class ids are integers: `id2label` maps int -> tag and `label2id` maps
# tag -> int. (Keying by `str(i)` made `label2id` return string ids such as
# "1", which is not what the model config expects.)
id2label = {index: label for index, label in enumerate(medicines_label)}
label2id = {label: index for index, label in id2label.items()}

In [None]:
from transformers import AutoModelForTokenClassification
# Load the pretrained checkpoint for token classification, using the same cache
# directory as the tokenizer and the label maps defined in the previous cell.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    cache_dir=dirpath,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
from transformers import get_scheduler

# Total number of optimizer steps = epochs x batches per epoch.
# NOTE(review): the batch count is taken before `accelerator.prepare` runs in a
# later cell; under multi-process training the prepared dataloader may be
# shorter — confirm if this notebook is ever run distributed.
num_train_epochs = 96
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_update_steps_per_epoch * num_train_epochs

# "linear" schedule over the whole run, with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

In [None]:
def postprocess(predictions, labels):
    """Convert batched id tensors into lists of label strings.

    Positions whose label id is ``-100`` are dropped from both outputs.

    Args:
        predictions: tensor of predicted label ids, one row per example.
        labels: tensor of reference label ids, laid out like ``predictions``.

    Returns:
        ``(true_labels, true_predictions)`` — note the order — each a list of
        per-example lists of label names taken from ``medicines_label``.
    """
    pred_rows = predictions.detach().cpu().clone().numpy()
    label_rows = labels.detach().cpu().clone().numpy()

    # Reference tags: every non-ignored position, mapped to its label name.
    true_labels = []
    for label_row in label_rows:
        true_labels.append([medicines_label[tag] for tag in label_row if tag != -100])

    # Predicted tags: kept only where the reference at that position is not ignored.
    true_predictions = []
    for pred_row, label_row in zip(pred_rows, label_rows):
        kept = [
            medicines_label[pred]
            for pred, tag in zip(pred_row, label_row)
            if tag != -100
        ]
        true_predictions.append(kept)

    return true_labels, true_predictions

In [None]:
from accelerate import Accelerator

# Hand the model, optimizer and both dataloaders to Accelerate; the returned
# objects replace the originals under the same names.
# NOTE(review): `lr_scheduler` is not passed to `prepare` here — confirm that
# is intended.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; presumably required by the
# `push_to_hub` calls in the last cell.
notebook_login()

In [None]:
from tqdm.auto import tqdm
import torch
from datasets import load_metric

# NOTE(review): `datasets.load_metric` is deprecated in recent `datasets`
# releases in favour of the separate `evaluate` package — confirm the installed
# version still provides it.
metric = load_metric("seqeval")
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training: one optimizer and scheduler step per batch.
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation: accumulate seqeval statistics over the validation batches.
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # `postprocess` returns (labels, predictions) in that order. Unpacking
        # them the other way round swapped predictions and references, which
        # exchanges the reported precision and recall.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    # `compute()` returns the scores for everything added during this epoch.
    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

In [None]:
# Make sure every process has finished before uploading, and upload only once.
accelerator.wait_for_everyone()
if accelerator.is_main_process:
    tokenizer.push_to_hub("med-ner")
    # `accelerator.prepare` may have wrapped the model (e.g. for distributed
    # training); upload the underlying transformers model.
    accelerator.unwrap_model(model).push_to_hub("med-ner")