In [24]:
# Chapter 8 / load the tokenizer
from transformers import BertTokenizer

# Load the tokenizer stored under the 'bert-base-chinese' checkpoint name.
# The name `token` is reused by the later cells of this notebook.
token = BertTokenizer.from_pretrained('bert-base-chinese')

# Bare last expression so the notebook displays the tokenizer's repr.
token

PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [25]:
# Chapter 8 / try encoding a couple of sentences
sample_sentences = [
    '轻轻的我走了，正如我轻轻地来。',
    '我轻轻的招手，作别西天的云彩。',
]

# Truncate / pad every sentence to exactly 18 ids and return PyTorch tensors,
# together with a per-sentence `length` field.
out = token.batch_encode_plus(batch_text_or_text_pairs=sample_sentences,
                              truncation=True,
                              padding='max_length',
                              max_length=18,
                              return_tensors='pt',
                              return_length=True)

# Inspect the encoding: one line per returned field with its tensor shape
for field_name, field_value in out.items():
    print(field_name, field_value.shape)

# Decode the ids of the first sentence back into text
first_sentence_ids = out['input_ids'][0]
print(token.decode(first_sentence_ids))

input_ids torch.Size([2, 18])
token_type_ids torch.Size([2, 18])
length torch.Size([2])
attention_mask torch.Size([2, 18])
[CLS] 轻 轻 的 我 走 了 ， 正 如 我 轻 轻 地 来 。 [SEP] [PAD]


In [26]:
# Chapter 8 / load the dataset
from datasets import load_from_disk

# Read the dataset previously saved under ./data/ChnSentiCorp (relative path).
dataset = load_from_disk('./data/ChnSentiCorp')

# Bare last expression so the notebook displays the splits and their columns.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 0
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})

In [27]:
# Chapter 8 / encode the data and drop the columns that are no longer needed
def f(data):
    """Tokenize the 'text' column of one batch of rows.

    Args:
        data: batch mapping that provides the texts under the 'text' key.

    Returns:
        The result of ``token.batch_encode_plus``: every text truncated or
        padded to 30 ids, with the extra ``length`` field requested through
        ``return_length=True``.
    """
    encode_options = dict(truncation=True,
                          padding='max_length',
                          max_length=30,
                          return_length=True)
    texts = data['text']
    return token.batch_encode_plus(batch_text_or_text_pairs=texts,
                                   **encode_options)


# Apply f in batches of 1000 rows using 4 worker processes. remove_columns
# drops the original 'text' and 'label' columns, so only the fields returned
# by f remain.
dataset = dataset.map(function=f,
                      batched=True,
                      batch_size=1000,
                      num_proc=4,
                      remove_columns=['text', 'label'])

# Bare last expression so the notebook displays the resulting columns.
dataset

 

Loading cached processed dataset at data/ChnSentiCorp/train/cache-d3283c8583b24ed8.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/train/cache-27dc37d5bd6706e4.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/train/cache-826f6ca6e106aacc.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/train/cache-835f419ddabc745f.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/test/cache-682897291cf43603.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/test/cache-9d6e5e25ce6b17d4.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/test/cache-a0072e02f56e2af6.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/test/cache-f0333c24f02eb083.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'length', 'attention_mask'],
        num_rows: 9600
    })
    validation: Dataset({
        features: [],
        num_rows: 0
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'length', 'attention_mask'],
        num_rows: 1200
    })
})

In [28]:
# Chapter 8 / filter out sentences that are too short
def f(data):
    """Build the keep/drop mask for one batch of rows.

    Args:
        data: batch mapping that provides the per-row lengths under 'length'.

    Returns:
        A list of booleans, one per row: True when the row's length is at
        least 30, False otherwise.
    """
    keep_mask = []
    for row_length in data['length']:
        keep_mask.append(row_length >= 30)
    return keep_mask


# Keep only the rows for which f returns True, evaluated in batches of 1000
# rows using 4 worker processes.
dataset = dataset.filter(function=f, batched=True, batch_size=1000, num_proc=4)

# Bare last expression so the notebook displays the remaining row counts.
dataset

 

Loading cached processed dataset at data/ChnSentiCorp/train/cache-431e5ba849eb7708_00000_of_00004.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/train/cache-431e5ba849eb7708_00001_of_00004.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/train/cache-431e5ba849eb7708_00002_of_00004.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/train/cache-431e5ba849eb7708_00003_of_00004.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/test/cache-f5af1b4bd662e257_00000_of_00004.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/test/cache-f5af1b4bd662e257_00001_of_00004.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/test/cache-f5af1b4bd662e257_00002_of_00004.arrow


 

Loading cached processed dataset at data/ChnSentiCorp/test/cache-f5af1b4bd662e257_00003_of_00004.arrow


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'length', 'attention_mask'],
        num_rows: 9286
    })
    validation: Dataset({
        features: [],
        num_rows: 0
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'length', 'attention_mask'],
        num_rows: 1157
    })
})

In [29]:
# Chapter 8 / choose the compute device
import torch

# Use the GPU whenever CUDA is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

device

'cuda'

In [30]:
# Chapter 8 / batch collation function
def collate_fn(data):
    """Collate encoded samples into one masked-token prediction batch.

    The token at index 15 of every sample is replaced by the tokenizer's
    [MASK] id; the id that was originally there becomes the label.

    Args:
        data: list of dicts, each providing 'input_ids', 'attention_mask' and
            'token_type_ids' as lists of ints. All samples must have the same
            length, and that length must be greater than 15.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, labels)`` of long
        tensors moved to ``device``. The first three have one row per sample;
        ``labels`` has one entry per sample.
    """
    # Gather each field across the batch and convert it to a long tensor
    input_ids = torch.tensor([i['input_ids'] for i in data],
                             dtype=torch.long)
    attention_mask = torch.tensor([i['attention_mask'] for i in data],
                                  dtype=torch.long)
    token_type_ids = torch.tensor([i['token_type_ids'] for i in data],
                                  dtype=torch.long)

    # Replace the token at index 15 with [MASK]. clone() is required because
    # the slice is a view that the assignment below would otherwise overwrite.
    labels = input_ids[:, 15].clone()
    # mask_token_id looks the id up directly instead of materialising the
    # whole vocabulary dict with get_vocab() on every batch.
    input_ids[:, 15] = token.mask_token_id

    # Move everything to the compute device
    # NOTE(review): because the tensors are moved to `device` here, this
    # function presumably cannot run inside DataLoader worker processes when
    # `device` is CUDA - confirm before setting num_workers > 0.
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    token_type_ids = token_type_ids.to(device)
    labels = labels.to(device)

    return input_ids, attention_mask, token_type_ids, labels

In [52]:
# Chapter 8 / trial run of the collation function
# Simulate a batch of two already-encoded samples, 30 ids each
data = [{
    'input_ids': [
        101, 2769, 3221, 3791, 6427, 1159, 2110, 5442, 117, 2110, 749, 8409,
        702, 6440, 3198, 4638, 1159, 5277, 4408, 119, 1728, 711, 2769, 3221,
        5439, 2399, 782, 117, 3791, 102
    ],
    'token_type_ids': [0] * 30,
    'attention_mask': [1] * 30
}, {
    'input_ids': [
        101, 679, 7231, 8024, 2376, 3301, 1351, 6848, 4638, 8024, 3301, 1351,
        3683, 6772, 4007, 2692, 8024, 2218, 3221, 100, 2970, 1366, 2208, 749,
        8024, 5445, 684, 1059, 3221, 102
    ],
    'token_type_ids': [0] * 30,
    'attention_mask': [1] * 30
}]

# Trial run
input_ids, attention_mask, token_type_ids, labels = collate_fn(data)

# Decode the first (masked) sample and its label back into text
print(token.decode(input_ids[0]))
print(token.decode(labels[0]))

# Bare last expression: the three tensor shapes and the label tensor itself
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

[CLS] 我 是 法 语 初 学 者, 学 了 78 个 课 时 [MASK] 初 级 班. 因 为 我 是 老 年 人, 法 [SEP]
的


(torch.Size([2, 30]),
 torch.Size([2, 30]),
 torch.Size([2, 30]),
 tensor([4638, 2692], device='cuda:0'))

In [32]:
# Chapter 8 / define the training data loader
# Shuffled batches of 16 samples from the train split, assembled by
# collate_fn; drop_last=True discards a final batch smaller than 16.
loader = torch.utils.data.DataLoader(dataset=dataset['train'],
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

# Number of batches per epoch
len(loader)

580

In [54]:
# Chapter 8 / look at one sample batch
# Take only the first batch from the loader, then stop iterating; the
# unpacked tensors stay available as notebook-level variables.
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

# Decode the first masked sample of the batch and its label back into text
print(token.decode(input_ids[0]))
print(token.decode(labels[0]))
# Bare last expression: the three tensor shapes and the label tensor itself
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

[CLS] 位 于 友 谊 路 金 融 街 ， 找 不 到 吃 饭 [MASK] 地 方 。 酒 店 刚 刚 装 修 好 ， 有 点 [SEP]
的


(torch.Size([16, 30]),
 torch.Size([16, 30]),
 torch.Size([16, 30]),
 tensor([4638, 6230,  511, 7313, 3221, 7315, 6820, 6858, 7564, 3211, 1690, 3315,
         3300,  172, 6821, 1126], device='cuda:0'))

In [55]:
# Chapter 8 / load the pretrained model
from transformers import BertModel

# Load the weights stored under the 'bert-base-chinese' checkpoint name.
pretrained = BertModel.from_pretrained('bert-base-chinese')

# Count the parameters, reported in units of 10,000
sum(i.numel() for i in pretrained.parameters()) / 10000

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


10226.7648

In [56]:
# Chapter 8 / the pretrained model is not trained, so it needs no gradients
# Switch off gradient tracking on every parameter of `pretrained`, in place.
for param in pretrained.parameters():
    param.requires_grad_(False)

In [57]:
# Chapter 8 / trial run of the pretrained model
# Move the model to the compute device
pretrained.to(device)

# Trial forward pass on the batch tensors currently bound to input_ids,
# attention_mask and token_type_ids
out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)

# Bare last expression so the notebook displays the hidden-state shape
out.last_hidden_state.shape

torch.Size([16, 30, 768])

In [62]:
# Chapter 8 / define the downstream-task model
class Model(torch.nn.Module):
    """Prediction head that guesses the token at index 15 of each sequence.

    The module-level ``pretrained`` model is called inside ``forward`` under
    ``torch.no_grad()``; it is not registered as a submodule of this class.
    """

    def __init__(self):
        super().__init__()
        vocab_size = token.vocab_size

        # Linear projection from the 768-wide hidden state to one score per
        # vocabulary entry, created without a bias of its own
        self.decoder = torch.nn.Linear(768, vocab_size, bias=False)
        # An all-zero bias is created separately and then attached to the
        # decoder, so the same parameter is reachable under both names
        self.bias = torch.nn.Parameter(torch.zeros(vocab_size))
        self.decoder.bias = self.bias

        # Dropout layer to reduce over-fitting
        self.dropout = torch.nn.Dropout(0.5)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return vocabulary scores for index 15 of every sequence.

        Args:
            input_ids, attention_mask, token_type_ids: batch tensors passed
                through unchanged to ``pretrained``.

        Returns:
            One row of ``token.vocab_size`` scores per input sequence.
        """
        # Extract features with the pretrained model, without gradients
        with torch.no_grad():
            encoded = pretrained(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids)

        # Project the features at index 15 onto the whole vocabulary
        masked_features = encoded.last_hidden_state[:, 15]
        return self.decoder(self.dropout(masked_features))


# Instantiate the downstream model; later cells refer to it as `model`.
model = Model()

# Move the model to the compute device
model.to(device)

# Trial run; the bare expression makes the notebook display the output shape
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape

torch.Size([16, 21128])

In [63]:
#第8章/训练
from transformers import AdamW
from transformers.optimization import get_scheduler


def train(epochs=5):
    """Train the downstream ``model`` on the batches produced by ``loader``.

    Every 50 batches one progress line is printed containing: epoch index,
    batch index, loss, current learning rate and the accuracy on that batch.

    Args:
        epochs: number of passes over ``loader``. The same value sizes the
            linear learning-rate schedule, so the two always agree.
            Defaults to 5.
    """
    # Optimizer. torch.optim.AdamW is used instead of the deprecated
    # transformers.AdamW; eps=1e-6 keeps the default of the class it replaces
    # (PyTorch's own default is 1e-8).
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=5e-4,
                                  weight_decay=1.0,
                                  eps=1e-6)

    # Loss function
    criterion = torch.nn.CrossEntropyLoss()

    # Learning-rate schedule: linear decay over all training steps, no warm-up
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader) * epochs,
                              optimizer=optimizer)

    # Switch the model to training mode
    model.train()

    for epoch in range(epochs):
        # Iterate over the training data batch by batch
        for i, (input_ids, attention_mask, token_type_ids,
                labels) in enumerate(loader):

            # Forward pass
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

            # Compute the loss and take one optimisation step
            loss = criterion(out, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Print progress figures every 50 batches
            if i % 50 == 0:
                predictions = out.argmax(dim=1)
                accuracy = (predictions == labels).sum().item() / len(labels)
                # Read the rate directly from the live param group rather
                # than building a full state_dict() just for one number
                lr = optimizer.param_groups[0]['lr']
                print(epoch, i, loss.item(), lr, accuracy)


train()

0 0 10.022448539733887 0.0004998275862068966 0.0
0 50 8.73752498626709 0.0004912068965517241 0.1875
0 100 7.15378475189209 0.0004825862068965518 0.25
0 150 6.036799907684326 0.0004739655172413793 0.25
0 200 6.4759111404418945 0.0004653448275862069 0.0625
0 250 3.800313949584961 0.00045672413793103453 0.4375
0 300 7.0236616134643555 0.00044810344827586206 0.25
0 350 5.194925785064697 0.00043948275862068964 0.3125
0 400 5.884705543518066 0.0004308620689655173 0.3125
0 450 4.168199062347412 0.00042224137931034486 0.4375
0 500 6.240730285644531 0.0004136206896551724 0.375
0 550 4.36335563659668 0.00040500000000000003 0.375
1 0 3.574946403503418 0.00039982758620689656 0.375
1 50 4.219260215759277 0.00039120689655172415 0.375
1 100 3.1496996879577637 0.00038258620689655173 0.625
1 150 3.0767054557800293 0.0003739655172413793 0.375
1 200 3.6137592792510986 0.0003653448275862069 0.5625
1 250 3.3886961936950684 0.0003567241379310345 0.5
1 300 5.3483662605285645 0.0003481034482758621 0.4375
1 35

In [64]:
# Chapter 8 / evaluation
def test(max_batches=15):
    """Measure the accuracy of ``model`` on batches of the test split.

    Prints the index of every evaluated batch, then the overall accuracy
    (correct predictions divided by evaluated samples).

    Args:
        max_batches: maximum number of batches to evaluate. Defaults to 15;
            pass None to iterate over the whole loader.
    """
    # Data loader for the test split. It is shuffled, so a capped run
    # evaluates a different subset each time; drop_last=True discards a
    # final batch smaller than 32.
    loader_test = torch.utils.data.DataLoader(dataset=dataset['test'],
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    # Switch the downstream model to evaluation mode
    model.eval()

    correct = 0
    total = 0
    # Iterate over the test data batch by batch
    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):

        # Stop once the requested number of batches has been evaluated
        if max_batches is not None and i >= max_batches:
            break

        print(i)

        # Forward pass without gradient tracking
        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        # Accumulate the number of correct predictions
        predictions = out.argmax(dim=1)
        correct += (predictions == labels).sum().item()
        total += len(labels)

    # Nothing evaluated (empty loader or max_batches == 0): report it
    # instead of failing with a ZeroDivisionError
    if total == 0:
        print('no test batches were evaluated')
        return

    print(correct / total)


test()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
0.5645833333333333
