In [2]:
#第7章/加载编码工具
from transformers import BertTokenizer

token = BertTokenizer.from_pretrained('bert-base-chinese')

token

PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [3]:
#第7章/试编码句子
out = token.batch_encode_plus(
    batch_text_or_text_pairs=['从明天起，做一个幸福的人。', '喂马，劈柴，周游世界。'],
    truncation=True,
    padding='max_length',
    max_length=17,
    return_tensors='pt',
    return_length=True)

#查看编码输出
for k, v in out.items():
    print(k, v.shape)

#把编码还原为句子
print(token.decode(out['input_ids'][0]))

input_ids torch.Size([2, 17])
token_type_ids torch.Size([2, 17])
length torch.Size([2])
attention_mask torch.Size([2, 17])
[CLS] 从 明 天 起 ， 做 一 个 幸 福 的 人 。 [SEP] [PAD] [PAD]


In [4]:
#第7章/定义数据集
import torch
from datasets import load_from_disk


class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        self.dataset = load_from_disk('./data/ChnSentiCorp')[split]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']

        return text, label


dataset = Dataset('train')

len(dataset), dataset[20]

(9600, ('非常不错，服务很好，位于市中心区，交通方便，不过价格也高！', 1))

In [5]:
#第7章/定义计算设备
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

device

'cuda'

In [6]:
#第7章/数据整理函数
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    #编码
    data = token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=500,
                                   return_tensors='pt',
                                   return_length=True)

    #input_ids:编码之后的数字
    #attention_mask:是补零的位置是0,其他位置是1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    #把数据移动到计算设备上
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    token_type_ids = token_type_ids.to(device)
    labels = labels.to(device)

    return input_ids, attention_mask, token_type_ids, labels

In [7]:
#第7章/数据整理函数试算
#模拟一批数据
data = [
    ('你站在桥上看风景', 1),
    ('看风景的人在楼上看你', 0),
    ('明月装饰了你的窗子', 1),
    ('你装饰了别人的梦', 0),
]

#试算
input_ids, attention_mask, token_type_ids, labels = collate_fn(data)

input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

(torch.Size([4, 500]),
 torch.Size([4, 500]),
 torch.Size([4, 500]),
 tensor([1, 0, 1, 0], device='cuda:0'))

In [7]:
#第7章/数据加载器
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

len(loader)

600

In [8]:
#第7章/查看数据样例
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

(torch.Size([16, 500]),
 torch.Size([16, 500]),
 torch.Size([16, 500]),
 tensor([0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1], device='cuda:0'))

In [9]:
#第7章/加载预训练模型
from transformers import BertModel

pretrained = BertModel.from_pretrained('bert-base-chinese')

#统计参数量
sum(i.numel() for i in pretrained.parameters()) / 10000

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


10226.7648

In [10]:
#第7章/不训练预训练模型,不需要计算梯度
for param in pretrained.parameters():
    param.requires_grad_(False)

In [11]:
#第7章/预训练模型试算
#设定计算设备
pretrained.to(device)

#模型试算
out = pretrained(input_ids=input_ids,
                 attention_mask=attention_mask,
                 token_type_ids=token_type_ids)

out.last_hidden_state.shape

torch.Size([16, 500, 768])

In [12]:
#第7章/定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(in_features=768, out_features=2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        #使用预训练模型抽取数据特征
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids)

        #对抽取的特征只取第一个字的结果做分类即可
        out = self.fc(out.last_hidden_state[:, 0])

        out = out.softmax(dim=1)

        return out


model = Model()

#设定计算设备
model.to(device)

#试算
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape

torch.Size([16, 2])

In [13]:
#第7章/训练
from transformers import AdamW
from transformers.optimization import get_scheduler


def train():
    #定义优化器
    optimizer = AdamW(model.parameters(), lr=5e-4)

    #定义loss函数
    criterion = torch.nn.CrossEntropyLoss()

    #定义学习率调节器
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    #模型切换到训练模式
    model.train()

    #按批次遍历训练集中的数据
    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader):

        #模型计算
        out = model(input_ids=input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids)

        #计算loss并使用梯度下降法优化模型参数
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        #输出各项数据的情况，便于观察
        if i % 10 == 0:
            out = out.argmax(dim=1)
            accuracy = (out == labels).sum().item() / len(labels)
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), lr, accuracy)


train()



0 0.688614547252655 0.0004991666666666666 0.625
10 0.6768248677253723 0.0004908333333333334 0.625
20 0.7436589002609253 0.0004825 0.375
30 0.5400702357292175 0.0004741666666666667 0.9375
40 0.6189451217651367 0.00046583333333333334 0.6875
50 0.5567441582679749 0.0004575 0.9375
60 0.5188874006271362 0.00044916666666666667 0.875
70 0.5120527148246765 0.0004408333333333334 0.875
80 0.5961946249008179 0.0004325 0.6875
90 0.5220378637313843 0.0004241666666666667 0.8125
100 0.5031461715698242 0.0004158333333333333 0.875
110 0.5246412754058838 0.0004075 0.75
120 0.5258313417434692 0.0003991666666666667 0.875
130 0.5714597105979919 0.0003908333333333333 0.75
140 0.5225942134857178 0.00038250000000000003 0.8125
150 0.4699893891811371 0.00037416666666666664 0.9375
160 0.470580130815506 0.00036583333333333335 0.9375
170 0.44881629943847656 0.0003575 0.9375
180 0.42978283762931824 0.0003491666666666667 0.9375
190 0.5476064682006836 0.00034083333333333334 0.875
200 0.441872775554657 0.0003325 0.937

In [14]:
#第7章/测试
def test():
    #定义测试数据集加载器
    loader_test = torch.utils.data.DataLoader(dataset=Dataset('test'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    #下游任务模型切换到运行模式
    model.eval()
    correct = 0
    total = 0

    #按批次遍历测试集中的数据
    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):

        #计算5个批次即可，不需要全部遍历
        if i == 5:
            break

        print(i)

        #计算
        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        #统计正确率
        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)


test()

0
1
2
3
4
0.89375
