In [1]:
# Chapter 12 / Load the tokenizer
from transformers import BertTokenizer

# Load the pretrained BERT tokenizer published under the name 'bert-base-chinese'.
token = BertTokenizer.from_pretrained('bert-base-chinese')

# Bare last expression so the notebook displays the tokenizer's repr.
token

PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [5]:
# Chapter 12 / Load the dataset
from datasets import load_from_disk

# Read the ChnSentiCorp dataset from a local directory (relative path).
dataset = load_from_disk('./data/ChnSentiCorp')

# Bare last expression so the notebook displays the dataset summary.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 0
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})

In [6]:
# Chapter 12 / Select the compute device
import torch

# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Show which device was picked.
device

'cpu'

In [37]:
# Chapter 12 / Batch collation function
def collate_fn(data):
    """Turn a list of raw samples into one encoded batch on the compute device.

    Args:
        data: iterable of samples, each indexable by 'text' and 'label'.

    Returns:
        The tokenizer's batch encoding (padded to the longest sentence in the
        batch, truncated at 512 tokens, as PyTorch tensors) with an extra
        'labels' entry holding the labels as a LongTensor. Every tensor has
        been moved to the global ``device``.
    """
    texts, targets = [], []
    for sample in data:
        texts.append(sample['text'])
        targets.append(sample['label'])

    # Tokenize the whole batch in one call.
    batch = token.batch_encode_plus(
        batch_text_or_text_pairs=texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt',
    )

    # Move every encoded tensor onto the compute device.
    for key in list(batch.keys()):
        batch[key] = batch[key].to(device)

    # Attach the labels under the key that model(**batch) consumes.
    batch['labels'] = torch.LongTensor(targets).to(device)

    return batch

In [38]:
# Chapter 12 / Data loader
# Training batches of 16 shuffled samples, assembled by collate_fn;
# drop_last=True discards a final incomplete batch.
loader = torch.utils.data.DataLoader(dataset=dataset['train'],
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

# Number of batches in one pass over the training split.
len(loader)

600

In [40]:
# Chapter 12 / Inspect one sample batch
# Pull just the first batch from the loader; `i` is its index (0).
i, data = next(enumerate(loader))

# Print every field of the batch together with its tensor shape.
for field, tensor in data.items():
    print(field, tensor.shape)

input_ids torch.Size([16, 235])
token_type_ids torch.Size([16, 235])
attention_mask torch.Size([16, 235])
labels torch.Size([16])


In [41]:
# Chapter 12 / Load the pretrained model
from transformers import AutoModelForSequenceClassification

# Build a 2-label sequence classifier from the 'bert-base-chinese' checkpoint
# and place it on the compute device in the same statement.
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-chinese', num_labels=2).to(device)

# Report the total parameter count, expressed in units of 10,000.
print(sum(param.numel() for param in model.parameters()) / 10000)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

10226.9186


In [43]:
# Bare expression so the notebook displays the model's module structure.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [42]:
# Trial run: feed the sample batch held in `data` through the model.
out = model(**data)

# Display the loss and the shape of the logits.
out['loss'], out['logits'].shape

(tensor(0.7723, grad_fn=<NllLossBackward0>), torch.Size([16, 2]))

In [13]:
#第12章/训练
from transformers import AdamW
from transformers.optimization import get_scheduler


def train():
    """Fine-tune the global ``model`` for one pass over the global ``loader``.

    Uses AdamW with lr=5e-4 and a linear learning-rate schedule without
    warm-up that spans ``len(loader)`` steps. Every 10th batch prints the
    batch index, the batch loss, the current learning rate, and the batch
    accuracy.
    """
    # Optimizer over all model parameters.
    optimizer = AdamW(model.parameters(), lr=5e-4)

    # Linear learning-rate schedule across the whole pass.
    scheduler = get_scheduler(name='linear',
                              num_warmup_steps=0,
                              num_training_steps=len(loader),
                              optimizer=optimizer)

    # Switch the model to training mode.
    model.train()

    # Iterate over the training set batch by batch.
    for i, data in enumerate(loader):

        # Forward pass; the batch carries 'labels', so the output has a loss.
        out = model(**data)
        loss = out['loss']

        # Backpropagate and update parameters and learning rate.
        loss.backward()
        optimizer.step()
        scheduler.step()
        # The optimizer holds every model parameter, so this clears all
        # gradients; a separate model.zero_grad() would be redundant.
        optimizer.zero_grad()

        # Periodic progress report.
        if i % 10 == 0:
            # Labels come from the batch itself, not from any global state.
            labels = data['labels']
            preds = out['logits'].argmax(dim=1)
            accuracy = (preds == labels).sum().item() / len(labels)
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), lr, accuracy)


train()



0 0.688614547252655 0.0004991666666666666 0.625
10 0.6768248677253723 0.0004908333333333334 0.625
20 0.7436589002609253 0.0004825 0.375
30 0.5400702357292175 0.0004741666666666667 0.9375
40 0.6189451217651367 0.00046583333333333334 0.6875
50 0.5567441582679749 0.0004575 0.9375
60 0.5188874006271362 0.00044916666666666667 0.875
70 0.5120527148246765 0.0004408333333333334 0.875
80 0.5961946249008179 0.0004325 0.6875
90 0.5220378637313843 0.0004241666666666667 0.8125
100 0.5031461715698242 0.0004158333333333333 0.875
110 0.5246412754058838 0.0004075 0.75
120 0.5258313417434692 0.0003991666666666667 0.875
130 0.5714597105979919 0.0003908333333333333 0.75
140 0.5225942134857178 0.00038250000000000003 0.8125
150 0.4699893891811371 0.00037416666666666664 0.9375
160 0.470580130815506 0.00036583333333333335 0.9375
170 0.44881629943847656 0.0003575 0.9375
180 0.42978283762931824 0.0003491666666666667 0.9375
190 0.5476064682006836 0.00034083333333333334 0.875
200 0.441872775554657 0.0003325 0.937

In [14]:
# Chapter 12 / Evaluation
def test():
    """Estimate accuracy of the global ``model`` on part of the test split.

    Builds a shuffled loader over ``dataset['test']`` with batch size 32,
    evaluates the first 5 batches without gradient tracking, printing each
    batch index as it goes, and finally prints the fraction of correct
    predictions (``nan`` if no batch was produced).
    """
    # Loader over the test split of the dataset loaded earlier.
    loader_test = torch.utils.data.DataLoader(dataset=dataset['test'],
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    # Switch the model to evaluation mode.
    model.eval()
    correct = 0
    total = 0

    # Iterate over the test set batch by batch.
    for i, data in enumerate(loader_test):

        # Five batches are enough for an estimate; skip the rest.
        if i == 5:
            break

        print(i)

        # Forward pass without building the autograd graph.
        with torch.no_grad():
            out = model(**data)

        # Compare predictions against the labels carried in the batch.
        labels = data['labels']
        preds = out['logits'].argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += len(labels)

    # Avoid ZeroDivisionError when the loader yielded no complete batch.
    print(correct / total if total else float('nan'))


test()

0
1
2
3
4
0.89375
