In [4]:
import re
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Load the masked-LM checkpoint and its tokenizer from the same local directory.
model = AutoModelForMaskedLM.from_pretrained("./model/bert_base_chinese")
tokenizer = AutoTokenizer.from_pretrained("./model/bert_base_chinese")

sentences = [
    'null',
    'Input-Asr-None',
    '(side)你看他一来就行我们那么的不会用搞来搞去的不知道是去哪个码头(unknown)',
    '这也是一句话',
    '这是最后一句话',
]
# Collapse every run of ASCII letters, digits, hyphens, parentheses and
# whitespace into a single '.' before tokenizing.
pattern = re.compile(r'[a-zA-Z0-9-\(\)\s]+')
sentences = [pattern.sub('.', sentence) for sentence in sentences]

inputs = tokenizer(sentences, padding=True, return_tensors='pt')
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
print(input_ids.shape, attention_mask.shape)
# Inference only: skip building the autograd graph.
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True).hidden_states[-1]
print(outputs.shape)

# Print the word piece behind every token id, one padded sentence per line.
# The loop variable is deliberately not called `input`: at notebook top level
# that name would replace the builtin for the rest of the kernel session.
for token_ids in input_ids:
    for token in token_ids:
        print(tokenizer.decode([token.item()]), end=' ')
    print()

Some weights of the model checkpoint at ./model/bert_base_chinese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([5, 33]) torch.Size([5, 33])
torch.Size([5, 33, 768])
[CLS] . [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 
[CLS] . [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 
[CLS] . 你 看 他 一 来 就 行 我 们 那 么 的 不 会 用 搞 来 搞 去 的 不 知 道 是 去 哪 个 码 头 . [SEP] 
[CLS] 这 也 是 一 句 话 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 
[CLS] 这 是 最 后 一 句 话 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 


In [None]:
import torch
from torch import nn
from transformers import AutoTokenizer, AutoModelForMaskedLM


class BertDecoder(nn.Module):
    """Linear tagging head applied to per-token hidden states.

    Each hidden vector is projected to ``num_tags`` logits, the logits of
    masked-out positions are shifted down by a large constant, and a softmax
    over the tag dimension is returned. When gold labels are given, a
    cross-entropy loss is returned as well.
    """

    def __init__(self, input_size, num_tags, pad_id):
        """Build the projection layer, softmax and loss.

        Args:
            input_size: Size of the last dimension of the incoming hidden states.
            num_tags: Number of tag classes to predict.
            pad_id: Label id that the cross-entropy loss ignores.
        """
        super().__init__()
        self.num_tags = num_tags
        self.output_layer = nn.Linear(input_size, num_tags)
        self.softmax_fn = nn.Softmax(dim=-1)
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id)

    def forward(self, hiddens, masks, labels=None):
        """Score every position and optionally compute the loss.

        Args:
            hiddens: Tensor of shape (batch, seq, input_size).
            masks: Tensor of shape (batch, seq) holding 1 for positions to
                keep and 0 for positions to suppress.
            labels: Optional tensor of gold tag ids with one entry per
                position. ``None`` skips the loss.

        Returns:
            ``(prob,)`` when ``labels`` is ``None``, otherwise ``(prob, loss)``,
            where ``prob`` has shape (batch, seq, num_tags).
        """
        logits = self.output_layer(hiddens)
        # A trailing singleton dim lets the penalty broadcast over all tags.
        logits += -1e32 * (1 - masks).unsqueeze(-1)
        prob = self.softmax_fn(logits)
        # Compare against None explicitly: truth-testing a tensor with more
        # than one element raises a RuntimeError.
        if labels is not None:
            loss = self.loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
            return prob, loss
        return (prob, )


class SLUBert(nn.Module):
    """Pretrained masked-LM encoder followed by a ``BertDecoder`` tagging head.

    The tokenizer and encoder are both loaded from ``config.bert_path``; the
    encoder's last hidden layer feeds the decoder.
    """

    def __init__(self, config):
        """Load tokenizer and encoder and build the tagging head.

        Args:
            config: Object providing ``encoder_cell``, ``bert_path``,
                ``hidden_size``, ``num_tags`` and ``tag_pad_idx``.
        """
        super().__init__()
        self.config = config
        self.cell = config.encoder_cell
        self.tokenizer = AutoTokenizer.from_pretrained(config.bert_path)
        self.bert = AutoModelForMaskedLM.from_pretrained(config.bert_path)
        self.decoder = BertDecoder(config.hidden_size, config.num_tags, config.tag_pad_idx)

    def forward(self, batch):
        """Encode ``batch.utt`` and run the tagging head.

        Args:
            batch: Object providing ``utt`` (list of utterance strings),
                ``tag_mask`` and ``tag_ids`` (the latter two are passed to the
                decoder as its mask and labels).

        Returns:
            The decoder's output tuple: ``(prob,)`` or ``(prob, loss)``.
        """
        encoded = self.tokenizer(batch.utt, padding=True, truncation=True, return_tensors='pt')
        # The tokenizer produces CPU tensors; put them where the encoder lives.
        encoded = encoded.to(self.bert.device)
        hidden = self.bert(**encoded, output_hidden_states=True).hidden_states[-1]
        # Drop the first ([CLS]) and last position. For utterances shorter
        # than the longest one in the batch the last position is padding and
        # their [SEP] stays inside the slice.
        # NOTE(review): presumably batch.tag_mask zeroes those positions and
        # the tokenizer yields one token per character -- confirm.
        hidden = hidden[:, 1:-1, :]
        return self.decoder(hidden, batch.tag_mask, batch.tag_ids)

    @staticmethod
    def _extract_slots(label_vocab, utt, tag_ids):
        """Turn one utterance's predicted tag ids into ``'slot-value'`` strings.

        A span is opened by a tag starting with ``B`` or ``I`` and closed by
        the next ``O`` or ``B`` tag, or by the end of the sequence. The slot
        name is everything after the first ``-`` of the span's first tag; the
        value is the characters of ``utt`` at the span's indices.
        """
        slots = []
        idx_buff, tag_buff = [], []

        def flush():
            slot = '-'.join(tag_buff[0].split('-')[1:])
            value = ''.join(utt[j] for j in idx_buff)
            slots.append(f'{slot}-{value}')
            idx_buff.clear()
            tag_buff.clear()

        for idx, tid in enumerate(tag_ids):
            tag = label_vocab.convert_idx_to_tag(tid)
            if (tag == 'O' or tag.startswith('B')) and tag_buff:
                flush()
                if tag.startswith('B'):
                    idx_buff.append(idx)
                    tag_buff.append(tag)
            elif tag.startswith('I') or tag.startswith('B'):
                idx_buff.append(idx)
                tag_buff.append(tag)
        if tag_buff:
            flush()
        return slots

    def decode(self, label_vocab, batch):
        """Predict slot-value strings for every utterance in ``batch``.

        Args:
            label_vocab: Object whose ``convert_idx_to_tag(idx)`` maps a tag
                id to its tag string.
            batch: Object supporting ``len()`` and providing ``utt`` and
                ``labels`` plus whatever ``forward`` reads.

        Returns:
            ``predictions`` (one list of ``'slot-value'`` strings per
            utterance) when ``forward`` yields no loss, otherwise
            ``(predictions, batch.labels, loss)`` with ``loss`` as a float.
        """
        batch_size = len(batch)
        labels = batch.labels
        # Only Python lists and floats leave this method, so no graph is needed.
        with torch.no_grad():
            output = self.forward(batch)
        prob = output[0]
        predictions = []
        for i in range(batch_size):
            utt = batch.utt[i]
            # Keep only as many predictions as the utterance has characters.
            tag_ids = torch.argmax(prob[i], dim=-1).cpu().tolist()[:len(utt)]
            predictions.append(self._extract_slots(label_vocab, utt, tag_ids))
        if len(output) == 1:
            return predictions
        return predictions, labels, output[1].cpu().item()


# Three short sample utterances; this rebinds the `sentences` name that the
# earlier cell also assigns.
sentences = ["这是第一句话", "这也是一句话", "这是最后一句话"]