## SUNDAN商品命名实体识别（基于BERT微调）

In [36]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForTokenClassification
import os

In [37]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


#### **数据处理**


In [38]:
data = pd.read_csv('data_process/data.csv', usecols=['sentence', 'word_labels'], encoding='ansi')
data['word_labels'] = data['word_labels'].apply(lambda x: x.strip(","))
data['len_sen'] = data['sentence'].apply(lambda x: len(x.split(" ")))
data.sort_values(by='len_sen', ascending=False).head(10)  # 确定最大长度为21

Unnamed: 0,sentence,word_labels,len_sen
704,东 芝 多 功 能 彩 色 复 合 机 激 光 双 面 打 印 复 印 扫 描 含 双 纸 ...,"B-BRA,I-BRA,O,O,O,O,O,B-PRO,I-PRO,I-PRO,O,O,O,...",31
756,柯 尼 卡 美 能 达 黑 白 多 功 能 复 合 机 主 机 双 面 器 双 面 自 动 ...,"O,O,O,O,O,O,O,O,B-PRO,I-PRO,I-PRO,I-PRO,I-PRO,...",27
707,长 城 黑 白 激 光 复 印 机 适 配 国 产 平 台 自 动 双 面 打 印 、 复 印,"B-BRA,I-BRA,O,O,O,O,B-PRO,I-PRO,I-PRO,O,O,O,O,...",24
699,联 想 激 光 多 功 能 一 体 机 四 合 一 黑 白 页 双 面 网 络 电 话,"B-BRA,I-BRA,O,O,B-PRO,I-PRO,I-PRO,I-PRO,I-PRO,...",22
689,惠 普 喷 墨 一 体 机 幅 面 彩 色 页 分 钟 打 印 扫 描 复 印 传 真,"B-BRA,I-BRA,B-PRO,I-PRO,I-PRO,I-PRO,I-PRO,O,O,...",22
705,奔 图 黑 白 激 光 打 印 机 适 配 国 产 平 台 自 动 双 面 打 印,"B-BRA,I-BRA,O,O,O,O,B-PRO,I-PRO,I-PRO,O,O,O,O,...",21
706,奔 图 彩 色 激 光 打 印 机 适 配 国 产 平 台 自 动 双 面 打 印,"B-BRA,I-BRA,O,O,O,O,B-PRO,I-PRO,I-PRO,O,O,O,O,...",21
754,柯 尼 卡 美 能 达 黑 白 复 印 机 配 双 面 器 双 面 自 动 输 稿,"O,O,O,O,O,O,O,O,B-PRO,I-PRO,I-PRO,O,O,O,O,O,O,...",21
682,柯 尼 卡 美 能 达 双 面 自 动 进 稿 器 黑 白 多 功 能 复 合 机,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-PRO,I-PRO,I-PR...",21
694,鸿 合 会 议 平 板 锐 系 列 交 互 电 子 白 板 教 学 一 体 机 英,"B-BRA,I-BRA,B-PRO,I-PRO,I-PRO,I-PRO,O,O,O,O,O,...",21


In [39]:
tags = ['O', 'B-BRA', 'I-BRA', 'B-PRO', 'I-PRO']  # 定义标签种类
labels_to_ids = {k: v for v, k in enumerate(tags)}  # 将标签转化为id
ids_to_labels = {v: k for v, k in enumerate(tags)}
labels_to_ids

{'O': 0, 'B-BRA': 1, 'I-BRA': 2, 'B-PRO': 3, 'I-PRO': 4}

#### **构建DataLoader**

In [40]:
MAX_LEN = 21  # 在数据处理阶段得到的结论
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 4  # 训练轮次
LEARNING_RATE = 1e-05  # 设置学习率
MAX_GRAD_NORM = 5  # 用于梯度剪切
MODEL_NAME=r'D:\Asoftware\Pycharm_pro\venv\NLP\model\BERT'  # 填写从hugging face上下载的模型存放位置
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [42]:
# 模型保存参数
save_dir = "./model"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [43]:
# example
data.iloc[0]

sentence                               卡 萨 帝 波 轮 洗 衣 机
word_labels    B-BRA,I-BRA,I-BRA,O,O,B-PRO,I-PRO,I-PRO
len_sen                                              8
Name: 0, dtype: object

In [44]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):  # 将相应位置上的词/句转化为tensor向量
        # 步骤 1: 对每个句子分词
        sentence = self.data.sentence[index]  
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = sentence.split(" "), word_labels.split(",")  # 简单用split分词即可
        
        # 步骤 2: 添加特殊token并添加对应的标签
        tokenized_sentence = ["[CLS]"] + tokenized_sentence + ["[SEP]"] # add special tokens
        labels.insert(0, "O") # 给[CLS] token添加O标签
        labels.insert(-1, "O") # 给[SEP] token添加O标签

        # 步骤 3: 截断/填充
        maxlen = self.max_len

        if len(tokenized_sentence) > maxlen:
          # 截断
          tokenized_sentence = tokenized_sentence[:maxlen]
          labels = labels[:maxlen]
        else:
          # 填充
          tokenized_sentence = tokenized_sentence + ['[PAD]'for _ in range(maxlen - len(tokenized_sentence))]
          labels = labels + ["O" for _ in range(maxlen - len(labels))]

        # 步骤 4: 构建attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_sentence]
        
        # 步骤 5: 将分词结果转为词表的id表示
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [labels_to_ids[label] for label in labels]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [45]:
# 数据集划分
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (1032, 3)
TRAIN Dataset: (826, 3)
TEST Dataset: (206, 3)


In [46]:
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

# 创建Dataloader
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [47]:
# example
training_set[0]

{'ids': tensor([ 101, 5401, 4638, 5892, 4171,  671,  860, 3322,  102,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0]),
 'mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'targets': tensor([0, 1, 2, 3, 4, 4, 4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}

In [48]:
# example
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"]), training_set[0]["targets"]):
  print('{0:10}  {1}   {2}'.format(token, label,ids_to_labels[label.numpy().tolist()]))

[CLS]       0   O
美           1   B-BRA
的           2   I-BRA
蒸           3   B-PRO
烤           4   I-PRO
一           4   I-PRO
体           4   I-PRO
机           0   O
[SEP]       4   I-PRO
[PAD]       0   O
[PAD]       0   O
[PAD]       0   O
[PAD]       0   O
[PAD]       0   O
[PAD]       0   O
[PAD]       0   O
[PAD]       0   O
[PAD]       0   O
[PAD]       0   O
[PAD]       0   O
[PAD]       0   O


#### **载入预训练模型**

In [49]:
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(labels_to_ids))  # num_labels在此处为5
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at D:\Asoftware\Pycharm_pro\venv\NLP\model\BERT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

#### **模型训练**



In [50]:
# 设置优化器Adam
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [51]:
# 训练函数
def train():
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # 将model设置为train模式
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['ids'].to(device, dtype = torch.long)
        mask = batch['mask'].to(device, dtype = torch.long)
        targets = batch['targets'].to(device, dtype = torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs[0], outputs[1]
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += targets.size(0)
        
        if idx % 50==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 50 training steps: {loss_step}")
           
        # 计算准确率
        flattened_targets = targets.view(-1) # 真实标签 大小 (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # 模型输出shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # 取出每个token对应概率最大的标签索引 shape (batch_size * seq_len,)
        # MASK：PAD
        active_accuracy = mask.view(-1) == 1 # shape (batch_size * seq_len,)
        targets = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_preds.extend(predictions)
        tr_labels.extend(targets)
        
        tmp_tr_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # 梯度剪切
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # loss反向求导
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

# 开始训练
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train()

Training epoch: 1
Training loss per 50 training steps: 1.692384958267212
Training loss per 50 training steps: 0.6092148031674179
Training loss per 50 training steps: 0.4033998487756984
Training loss per 50 training steps: 0.3096090164282267
Training loss per 50 training steps: 0.25658349212796533
Training loss epoch: 0.2507974176267207
Training accuracy epoch: 0.8417194424272623
Training epoch: 2
Training loss per 50 training steps: 0.08762291073799133
Training loss per 50 training steps: 0.07374300638798113
Training loss per 50 training steps: 0.07248822658202879
Training loss per 50 training steps: 0.06767170027897551
Training loss per 50 training steps: 0.06692384234606404
Training loss epoch: 0.06578759119310529
Training accuracy epoch: 0.9594467327489915
Training epoch: 3
Training loss per 50 training steps: 0.08238206803798676
Training loss per 50 training steps: 0.03588311563191168
Training loss per 50 training steps: 0.03649087980767657
Training loss per 50 training steps: 0.03

#### **模型评估**

In [52]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)
            
            # loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=targets)
            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs[0],outputs[1]
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += targets.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # 计算准确率
            flattened_targets = targets.view(-1) # 大小 (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # 大小 (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # 大小 (batch_size * seq_len,)
            active_accuracy = mask.view(-1) == 1 # 大小 (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(targets)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

# 开始评估
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.0574503056704998
Validation loss per 100 evaluation steps: 0.055126491601528174
Validation Loss: 0.05764250751941926
Validation Accuracy: 0.9728624175047498


#### **模型保存**

In [None]:
# # 保存tokenizer
# tokenizer.save_vocabulary(save_dir)
# # 保存权重和配置文件
# model.save_pretrained(save_dir)
# print('Finished')