# 0. 环境
kaggle 里面创建的 Notebook， GPU 也是官方免费给的 Tesla P100 16GB 显存。
# 1. 定义数据集

In [1]:
import torch
from datasets import load_dataset

# 固定随机种子，实验可重复
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = True


#定义数据集
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        self.dataset = load_dataset(path='seamew/ChnSentiCorp', split=split)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']

        return text, label


dataset = Dataset('train')

len(dataset), dataset[0]

Downloading builder script:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading and preparing dataset chn_senti_corp/default to /root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85...


Downloading data:   0%|          | 0.00/3.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/376k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/371k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset chn_senti_corp downloaded and prepared to /root/.cache/huggingface/datasets/seamew___chn_senti_corp/default/0.0.0/1f242195a37831906957a11a2985a4329167e60657c07dc95ebe266c03fdfb85. Subsequent calls will reuse this data.


(9600,
 ('选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般',
  1))

训练集共 9600 句话，以第一个样本为例，评价为差评。

# 2. 加载 tokenizer

In [2]:
from transformers import BertTokenizer

#加载字典和分词工具
token = BertTokenizer.from_pretrained('bert-base-chinese')

token

Downloading:   0%|          | 0.00/107k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/624 [00:00<?, ?B/s]

PreTrainedTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

## 3. 定义批处理函数
该函数中进行分词和编码

In [3]:
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    #编码
    data = token.batch_encode_plus(batch_text_or_text_pairs=sents,
                                   truncation=True,
                                   padding='max_length',
                                   max_length=500,
                                   return_tensors='pt',
                                   return_length=True)

    #input_ids:编码之后的数字
    #attention_mask:是补零的位置是0,其他位置是1
    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    #print(data['length'], data['length'].max())

    return input_ids, attention_mask, token_type_ids, labels

# 4. 定义数据加载器并查看数据样例

In [4]:
#数据加载器
loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=16,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    break

print(len(loader))
input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

600


(torch.Size([16, 500]),
 torch.Size([16, 500]),
 torch.Size([16, 500]),
 tensor([0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1]))

# 5. 加载 BERT 中文预训练模型

In [5]:
from transformers import BertModel

#加载预训练模型
pretrained = BertModel.from_pretrained('bert-base-chinese')

#不冻结预训练模型的参数
for param in pretrained.parameters():
    param.requires_grad_(True)

#模型试算
out = pretrained(input_ids=input_ids,
           attention_mask=attention_mask,
           token_type_ids=token_type_ids)

out.last_hidden_state.shape

Downloading:   0%|          | 0.00/393M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([16, 500, 768])

16 表示 batch_size，表示 16 句话，

500 表示数据分词的一个长度，数据编码时，指定每一句话编码成 500 个词的长度，

768 是词编码的维度，把每一个词编码成一个 768 维的向量

# 6. 定义下游任务模型
只包含一个单全连接层，对应二分类

In [6]:
#定义下游任务模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        with torch.no_grad():
            out = pretrained(input_ids=input_ids,
                       attention_mask=attention_mask,
                       token_type_ids=token_type_ids)
        # 0 表示 cls 位，使用 cls 进行分类任务
        out = self.fc(out.last_hidden_state[:, 0])

        out = out.softmax(dim=1)

        return out


model = Model()
    
model(input_ids=input_ids,
      attention_mask=attention_mask,
      token_type_ids=token_type_ids).shape

torch.Size([16, 2])

# 7. 训练下游任务

In [7]:
from transformers import AdamW
import time

start = time.time()
#训练
optimizer = AdamW(model.parameters(), lr=5e-4)
criterion = torch.nn.CrossEntropyLoss()

if torch.cuda.is_available():
    pretrained = pretrained.cuda()
    model = model.cuda()   
    criterion = criterion.cuda()
    
model.train()
for i, (input_ids, attention_mask, token_type_ids,
        labels) in enumerate(loader):
    if torch.cuda.is_available():
        input_ids, attention_mask, token_type_ids, labels = input_ids.cuda(), attention_mask.cuda(), token_type_ids.cuda(), labels.cuda() 
        
    out = model(input_ids=input_ids,
                attention_mask=attention_mask, 
                token_type_ids=token_type_ids)
    
    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if i % 5 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)

        print(i, loss.item(), accuracy)

    if i == 300:
        break
end = time.time()
print('训练时间：', end - start)



0 0.6742366552352905 0.5625
5 0.7023376226425171 0.5
10 0.6331156492233276 0.625
15 0.7474691867828369 0.375
20 0.5797390937805176 0.6875
25 0.5568404793739319 0.9375
30 0.6012440323829651 0.875
35 0.6725104451179504 0.5625
40 0.539483904838562 0.875
45 0.5475109815597534 0.875
50 0.5602291822433472 0.875
55 0.5149505138397217 0.9375
60 0.5647078156471252 0.8125
65 0.538317859172821 0.75
70 0.5270556211471558 0.875
75 0.520484983921051 0.8125
80 0.46829140186309814 0.875
85 0.5209758877754211 0.8125
90 0.5698109865188599 0.6875
95 0.49698641896247864 0.875
100 0.6128743290901184 0.625
105 0.4816654920578003 0.875
110 0.47260743379592896 1.0
115 0.6126810908317566 0.6875
120 0.4164586067199707 1.0
125 0.5309047698974609 0.75
130 0.543803870677948 0.6875
135 0.4575423300266266 0.875
140 0.5235946774482727 0.8125
145 0.43342381715774536 0.9375
150 0.3835070729255676 1.0
155 0.4110313653945923 1.0
160 0.40721094608306885 0.9375
165 0.4661119282245636 0.9375
170 0.48450756072998047 0.875
17

# 8. 测试

In [8]:
#测试
def test():
    model.eval()
    correct = 0
    total = 0

    loader_test = torch.utils.data.DataLoader(dataset=Dataset('validation'),
                                              batch_size=32,
                                              collate_fn=collate_fn,
                                              shuffle=False,
                                              drop_last=True)

    for i, (input_ids, attention_mask, token_type_ids,
            labels) in enumerate(loader_test):
        if torch.cuda.is_available():
            input_ids, attention_mask, token_type_ids, labels = input_ids.cuda(), attention_mask.cuda(), token_type_ids.cuda(), labels.cuda()
        
#         if i == 5:
#             break

        print(i)

        with torch.no_grad():
            out = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)

        out = out.argmax(dim=1)
        correct += (out == labels).sum().item()
        total += len(labels)

    print(correct / total)

test()

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
0.8699324324324325


In [9]:
!nvidia-smi

Sun Jun 19 12:26:41 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P0    51W / 250W |   2893MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
torch.__version__

'1.11.0'