In [None]:
# Dependency install commands, disabled by wrapping them in a string literal.
# Nothing is installed when this cell runs; the string is only echoed as output.
'''
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install seqeval
'''

In [None]:
import json
# Read the annotated NER corpus (UTF-8 JSON) into a plain Python object.
with open(r"NER_dataset/fp_res.json", "r", encoding="utf-8") as corpus_file:
    dataset = json.loads(corpus_file.read())


In [None]:
from datasets import Dataset,DatasetDict
import pandas as pd

# Build the Hugging Face dataset directly from the loaded JSON object
# (presumably a mapping of column name -> list of values; verify against the file).
# The former intermediate pandas DataFrame was overwritten immediately and never used.
train_dataset = Dataset.from_dict(dataset)

# NOTE(review): "validation" is the very same data as "train", so the metrics
# printed during training measure fit on the training set, not generalisation.
# NOTE(review): `dataset` is rebound here from the raw JSON object to a
# DatasetDict, so this cell must be re-run together with the loading cell.
dataset = DatasetDict({"train": train_dataset, "validation": train_dataset})
dataset

In [None]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; reused later when the model is loaded.
checkpoint="bert-base-chinese"
# Tokenizer matching that checkpoint.
tokenizer=AutoTokenizer.from_pretrained(checkpoint)

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per token.

    Args:
        labels: label ids indexed by word position.
        word_ids: for each token, the index of the word it came from, or
            ``None`` for tokens that belong to no word (special tokens).

    Returns:
        A list with one entry per element of ``word_ids``:
        ``-100`` for ``None`` entries, the word's label for the first token of
        a word, and for continuation tokens the word's label with odd ids
        bumped by one (in this file's tag list odd ids are ``B-`` tags and the
        following even id is the matching ``I-`` tag).
    """
    aligned = []
    previous_word = None
    for word_index in word_ids:
        if word_index is None:
            # Token without a source word: mark it to be ignored.
            aligned.append(-100)
        elif word_index != previous_word:
            # First token of a new word keeps the word's own label.
            aligned.append(labels[word_index])
        else:
            # Continuation of the same word: a B-XXX id becomes I-XXX.
            tag = labels[word_index]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word_index
    return aligned

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level labels.

    Args:
        examples: batch mapping with a ``"tokens"`` entry (passed to the
            tokenizer with ``is_split_into_words=True``) and a ``"ner_tags"``
            entry holding one label sequence per example.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry,
        one aligned label list per example.
    """
    encoding = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )
    # word_ids(i) gives the token -> word mapping for the i-th example.
    encoding["labels"] = [
        align_labels_with_tokens(word_tags, encoding.word_ids(example_index))
        for example_index, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoding

In [None]:
# Tokenize every split in batches. All original columns of the train split are
# removed, leaving what tokenize_and_align_labels returns (tokenizer outputs
# plus the aligned "labels").
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)


In [None]:
from transformers import DataCollatorForTokenClassification
# Batch collator built from the tokenizer; passed as collate_fn to both DataLoaders.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
from torch.utils.data import DataLoader

# Training batches are drawn in shuffled order, 8 examples at a time.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    collate_fn=data_collator,
    shuffle=True,
)

# Evaluation batches use the same collator and batch size, without shuffling.
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [None]:
# BIO tag inventory for hospitalization invoices. A tag's position in this list
# is its integer class id. Order matters: after "O", every B-XXX tag sits at an
# odd index and is immediately followed by its I-XXX tag, which is what
# align_labels_with_tokens relies on (odd id + 1 == matching I- tag).
hospitalization_label = ["O", "B-G", "I-G", "B-D", "I-D", "B-PD", "I-PD", "B-PH", "I-PH","B-IN","I-IN","B-OUT","I-OUT",
                   "B-JY", "I-JY", "B-PR", "I-PR", "B-SD", "I-SD", "B-SR", "I-SR", "B-FR", "I-FR", "B-L", "I-L", "B-YL",
                   "I-YL", "B-ZC", "I-ZC", "B-JC", "I-JC", "B-HY", "I-HY", "B-ZL", "I-ZL", "B-S", "I-S", "B-WC", "I-WC",
                   "B-WY", "I-WY", "B-ZY", "I-ZY", "B-ZCY", "I-ZCY", "B-YZL", "I-YZL", "B-CW", "I-CW", "B-HL", "I-HL",
                   "B-GH", "I-GH", "B-ELE", "I-ELE", "B-HJD", "I-HJD", "B-HJX", "I-HJX", "B-YTC", "I-YTC", "B-TC",
                   "I-TC", "B-PX", "I-PX", "B-PP", "I-PP", "B-PO", "I-PO", "B-ELP", "I-ELP", "B-POO", "I-POO", "B-PW",
                   "I-PW"]
# Mapping from hospitalization-invoice annotation names (field name plus a
# trailing "B"/"I" marker) to the corresponding BIO tag.
token2label = {"无": "O", "性别B": "B-G", "性别I": "I-G", "入院日期B":"B-IN","入院日期I":"I-IN","出院日期B":"B-OUT","出院日期I":"I-OUT",
               "住院天数B": "B-D", "住院天数I": "I-D", "票据代码B": "B-PD", "票据代码I": "I-PD",
               "票据号码B": "B-PH", "票据号码I": "I-PH", "校验码B": "B-JY", "校验码I": "I-JY", "开票日期B": "B-PR", "开票日期I": "I-PR",
               "收款单位B": "B-SD", "收款单位I": "I-SD", "收款人B": "B-SR", "收款人I": "I-SR", "复核人B": "B-FR", "复核人I": "I-FR",
               "医疗机构类型B": "B-L", "医疗机构类型I": "I-L", "医保类型B": "B-YL", "医保类型I": "I-YL", "诊查费B": "B-ZC", "诊查费I": "I-ZC",
               "检查费B": "B-JC", "检查费I": "I-JC", "化验费B": "B-HY", "化验费I": "I-HY", "治疗费B": "B-ZL", "治疗费I": "I-ZL",
               "手术费B": "B-S", "手术费I": "I-S", "卫生材料费B": "B-WC", "卫生材料费I": "I-WC", "西药费B": "B-WY", "西药费I": "I-WY",
               "中药饮片B": "B-ZY", "中药饮片I": "I-ZY", "中成药费B": "B-ZCY", "中成药费I": "I-ZCY", "一般诊疗费B": "B-YZL",
               "一般诊疗费I": "I-YZL",
               "床位费B": "B-CW", "床位费I": "I-CW", "护理费B": "B-HL", "护理费I": "I-HL", "挂号费B": "B-GH", "挂号费I": "I-GH",
               "其他收费项目B": "B-ELE", "其他收费项目I": "I-ELE", "合计金额（大写）B": "B-HJD", "合计金额（大写）I": "I-HJD",
               "（小写）B": "B-HJX", "（小写）I": "I-HJX", "医保统筹基金支付B": "B-YTC", "医保统筹基金支付I": "I-YTC", "统筹支付B": "B-TC",
               "统筹支付I": "I-TC", "个人现金支付B": "B-PX", "个人现金支付I": "I-PX", "个人账户支付B": "B-PP", "个人账户支付I": "I-PP",
               "个人自付B": "B-PO", "个人自付I": "I-PO", "其他支付B": "B-ELP", "其他支付I": "I-ELP", "自付一B": "B-POO", "自付一I": "I-POO",
               "自付二B": "B-PW", "自付二I": "I-PW"
               }

# Class-id <-> tag lookups handed to the model. Ids are kept as integers on
# both sides: building id2label with string keys made label2id map every tag
# to a *string* id (e.g. "1"), which is not the Dict[str, int] a model config
# is documented to hold.
id2label = dict(enumerate(hospitalization_label))
label2id = {label: idx for idx, label in id2label.items()}
'''客户信息字段（属性名共 4个）：性别-G、入院日期-IN、出院日期-OUT、住院天数-D
   发票信息字段（属性名共 7个）：票据代码-PD、票据号码-PH、校验码-JY、开票日期-PR、收款单位-SD、收款人-SR、复核人-FR
   医保信息字段（属性名共 2个）：医疗机构类型-L、医保类型-YL
   项目信息字段（属性名共 14个）：诊查费-ZC、检查费-JC、化验费-HY、治疗费-ZL、手术费-S、卫生材料费-WC、西药费-WY、中药饮片-ZY、中成药费-ZCY、一般诊疗费-YZL、床位费-CW、护理费-HL、挂号费-GH、其他收费项目-ELE
   支付信息字段（属性名共 10个）：合计金额（大写）-HJD、（小写）-HJX、医保统筹基金支付-YTC、统筹支付-TC、个人现金支付-PX、个人账户支付-PP、个人自付-PO、其他支付-ELP、自付一-POO、自付二-PW'''


In [None]:
from transformers import AutoModelForTokenClassification
# Token-classification model initialised from the same checkpoint as the
# tokenizer, given the id <-> tag lookups built from hospitalization_label.
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
from torch.optim import AdamW

# AdamW over all model parameters with a fixed base learning rate of 2e-5
# (the scheduler created later decays it).
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
from transformers import get_scheduler

# Total number of optimizer steps = epochs * batches per epoch.
# NOTE(review): len(train_dataloader) is taken here before the dataloader is
# passed to accelerator.prepare; presumably its length changes when sharded
# across several processes — confirm before running distributed.
num_train_epochs = 96
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" schedule with no warm-up steps, spanning the whole training run.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
def postprocess(predictions, labels):
    """Convert batched id tensors into lists of tag strings, skipping ignored positions.

    Args:
        predictions: tensor of predicted class ids, one row per sequence.
        labels: tensor of reference class ids of matching layout; positions
            equal to ``-100`` are ignored.

    Returns:
        ``(true_labels, true_predictions)`` — note the order: references
        first, predictions second. Each is a list (one entry per sequence) of
        tag strings taken from ``hospitalization_label``; a position is kept
        only when its reference id is not ``-100``.
    """
    predicted_ids = predictions.detach().cpu().clone().numpy()
    reference_ids = labels.detach().cpu().clone().numpy()

    # References: keep every non-ignored position.
    true_labels = []
    for reference_row in reference_ids:
        true_labels.append(
            [hospitalization_label[tag] for tag in reference_row if tag != -100]
        )

    # Predictions: keep exactly the positions whose reference is not ignored.
    true_predictions = []
    for predicted_row, reference_row in zip(predicted_ids, reference_ids):
        kept = [
            hospitalization_label[guess]
            for guess, tag in zip(predicted_row, reference_row)
            if tag != -100
        ]
        true_predictions.append(kept)

    return true_labels, true_predictions

In [None]:
from accelerate import Accelerator

# Hand model, optimizer and both dataloaders to Accelerate; the returned objects
# replace the originals. lr_scheduler is not passed through prepare here.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login; presumably required for the
# push_to_hub calls at the end of the notebook.
notebook_login()

In [None]:
from tqdm.auto import tqdm
import torch
from datasets import load_metric

# Sequence-labelling metric fed with the tag strings produced by postprocess.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package — confirm against the installed version.
metric = load_metric("seqeval")
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training: one optimizer + scheduler step per batch.
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation: accumulate predictions/references batch by batch.
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        # postprocess returns (true_labels, true_predictions) in that order.
        # Unpacking them the other way round fed the references to the metric
        # as predictions, which swaps the reported precision and recall.
        true_labels, true_predictions = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    # compute() returns the scores accumulated through add_batch for this epoch.
    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

In [None]:
# Publish the tokenizer and the fine-tuned model to the same Hub repository.
tokenizer.push_to_hub("fp-ner")
# `model` went through accelerator.prepare, which may return a wrapped module;
# unwrap it so push_to_hub is called on the underlying transformers model.
accelerator.unwrap_model(model).push_to_hub("fp-ner")