In [1]:
# Imports
import pandas as pd
import torch
from transformers import BertTokenizer, BertForTokenClassification

In [2]:
from torch import cuda

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
# BIO tag set: 'O' plus begin/inside tags for the BRA (brand) and PRO (product) entities.
tags = ['O', 'B-BRA', 'I-BRA', 'B-PRO', 'I-PRO']
# Two-way lookup tables; a tag's id is its position in `tags`.
ids_to_labels = dict(enumerate(tags))  # id -> tag
labels_to_ids = {tag: idx for idx, tag in ids_to_labels.items()}  # tag -> id
labels_to_ids

{'O': 0, 'B-BRA': 1, 'I-BRA': 2, 'B-PRO': 3, 'I-PRO': 4}

In [4]:
MAX_LEN = 21  # Maximum sentence length; data_pred passes it to the tokenizer as max_length (pad/truncate)

In [5]:
# Load the trained model parameters and the matching tokenizer from MODEL_NAME.
# NOTE(review): 'model' is presumably a local directory holding the fine-tuned checkpoint — confirm.
MODEL_NAME = 'model'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# The classification head is sized to one output per entry in labels_to_ids.
model = BertForTokenClassification.from_pretrained(MODEL_NAME, num_labels=len(labels_to_ids))
# Move the model to the selected device; as the cell's last expression this also displays the module.
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [59]:
# Extract the tagged entities and assemble them into the required product string.
def extract_ner(pred: list) -> str:
    """Build a "brand + product" string from token-level NER predictions.

    Tokens are grouped into entities following the BIO tags: a ``B-`` tag
    opens a new entity and the ``I-`` tags of the same type that directly
    follow extend it. Repeated entities (for example a brand mentioned twice)
    are kept once, in order of first appearance, and the entity texts are
    concatenated.

    Deduplication is done per entity, not per character, so names that
    contain a repeated character are preserved intact.

    Args:
        pred: Sequence of pairs whose first item is the token and whose
            second item is its predicted tag ('O', 'B-BRA', 'I-BRA',
            'B-PRO' or 'I-PRO').

    Returns:
        The concatenated entity texts, or an empty string when nothing was
        tagged.
    """
    entity_tags = {'B-BRA', 'I-BRA', 'B-PRO', 'I-PRO'}  # everything except 'O'
    special_tokens = {'[CLS]', '[SEP]', '[PAD]'}

    entities = []         # [kind, [tokens]] records, in order of appearance
    current = None        # entity still being extended, or None
    product_seen = False  # True once a product entity has been opened

    for pair in pred:
        token, tag = pair[0], pair[1]

        # Only the first product entity is extracted: stop at the second one.
        if tag == 'B-PRO' and product_seen:
            break

        # Untagged tokens and tokenizer specials never contribute text, and
        # they terminate whatever entity was being built.
        if tag not in entity_tags or token in special_tokens:
            current = None
            continue

        kind = tag[2:]  # 'BRA' or 'PRO'
        if tag.startswith('B-') or current is None or current[0] != kind:
            # A 'B-' tag always opens an entity. An 'I-' tag with no matching
            # open entity is kept as its own fragment rather than dropped.
            current = [kind, [token]]
            entities.append(current)
            if tag == 'B-PRO':
                product_seen = True
        else:
            current[1].append(token)

    # Drop repeated entities while preserving first-appearance order.
    seen = set()
    parts = []
    for kind, tokens in entities:
        text = "".join(tokens)
        key = (kind, text)
        if key not in seen:
            seen.add(key)
            parts.append(text)
    return "".join(parts)

In [60]:
# Model prediction
def data_pred(sentence):
    """Predict NER tags for one sentence and return the extracted product string.

    The literal suffix "END" is appended to the sentence before tokenizing
    (NOTE(review): presumably this mirrors how the training sentences were
    built — confirm). The input is padded/truncated to MAX_LEN tokens, so
    anything beyond that length is dropped before the model sees it.

    Args:
        sentence: Text to label, as a string.

    Returns:
        The string produced by extract_ner from the (wordpiece, tag) pairs.
    """
    sentence += "END"
    inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

    # Move the inputs to the same device (GPU or CPU) as the model.
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # Inference only: skip autograd bookkeeping so no graph is kept per call.
    with torch.no_grad():
        outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels)  # (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, dim=1)  # (batch_size * seq_len,)

    # Flatten the ids the same way as the logits so tokens and tags line up.
    tokens = tokenizer.convert_ids_to_tokens(ids.view(-1).tolist())
    token_predictions = [ids_to_labels[i] for i in flattened_predictions.tolist()]
    wp_preds = list(zip(tokens, token_predictions))  # (wordpiece, prediction) pairs
    return extract_ner(wp_preds)


In [7]:
# Load the data to be processed.
# 'ansi' is a Windows-only codec alias (the machine's ANSI code page) and raises
# LookupError on other platforms, so the code page is named explicitly.
# NOTE(review): the file holds Simplified-Chinese text, for which the Windows ANSI
# code page is cp936 (GBK) — confirm if the CSV came from a different locale.
data = pd.read_csv('data_process/data.csv', encoding='gbk')
data.head()

Unnamed: 0,sentence,word_labels,品牌,原描述
0,卡 萨 帝 波 轮 洗 衣 机,"B-BRA,I-BRA,I-BRA,O,O,B-PRO,I-PRO,I-PRO,",KSD,☆*C802 100U1(香槟金)卡萨帝10公斤波轮洗衣机
1,海 尔 波 轮 洗 衣 机,"B-BRA,I-BRA,O,O,B-PRO,I-PRO,I-PRO,",HL,☆*XQS120-BZ866(雅晶银) 海尔12公斤波轮洗衣机
2,卡 萨 帝 迷 你 波 轮 洗 衣 机,"B-BRA,I-BRA,I-BRA,O,O,O,O,B-PRO,I-PRO,I-PRO,",KSD,☆*C601 30RPU1卡萨帝3公斤迷你波轮洗衣机
3,整 理 台,"B-PRO,I-PRO,I-PRO,",AK,HDB1153W（白色）ASKO整理台
4,博 世 两 门 嵌 入 式 冰 箱,"B-BRA,I-BRA,O,O,O,O,O,B-PRO,I-PRO,",BOS,☆*KIS87AF32C（银色色）博世两门嵌入式冰箱


In [62]:
# Add the prediction column: one extracted product string per sentence.
data['pred'] = data['sentence'].apply(data_pred)

In [63]:
# Save the data.
# 'ansi' is a Windows-only codec alias and raises LookupError on other platforms;
# 'gbk' names the Simplified-Chinese code page explicitly.
# NOTE(review): assumes the Windows ANSI code page in use was cp936 (GBK) — confirm.
data.to_csv('pred.csv', index=False, encoding='gbk')

In [61]:
# Example: run the prediction on a single space-separated sentence.
data_pred('史 密 斯 智 慧 互 联 橱 下 冷 热 一 体 净 水 器')

'史密斯净水器'

In [None]:
# TODO:
# 1. Compare the named-entity-recognition quality of spaCy or NLTK against the baseline (the fine-tuned BERT model above).