# 任务二. 中文填空
人再阅读一个句子时，删除一两个词，并不影响阅读，可以根据上下文猜出被挖的词，这被称为填空任务。

In [None]:
!pip install transformers
!pip install datasets
!nvcc -V

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m57.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m80.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m69.2 MB/s[0m eta [36m0:00:0

In [None]:
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


## 数据集
使用数据集：ChnSentiCorp  
数据集介绍：情感分类数据集，每条数据包括一个购物评价，和一个标识，评价的商品包括书籍、酒店、计算机等。  
数据处理过程中，会把每句地第十五字遮盖，换成特殊字符[MASK]，并且每句话会被截断成固定的30个字符长度，神经网络的任务内容就是根据上下文预测第十五个字符。

## 模型架构
与任务一相同，将预训练模型视为 backbone 不做参数调整，自定义下游任务方式进行测试。

### 准备数据集
#### 1. 使用编码工具

In [None]:
from transformers import BertTokenizer
token = BertTokenizer.from_pretrained("bert-base-chinese")
token

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [None]:
out = token.batch_encode_plus(
    batch_text_or_text_pairs = ['海贼王我当定了','路飞与索罗相聚罗格镇','海军的存在就是为了正义'],
    # retrun_tensors = "pt",
    return_length = True
)

for k,v in out.items():
  print(k,":",v)

print(token.decode(out['input_ids'][0]))

input_ids : [[101, 3862, 6592, 4374, 2769, 2496, 2137, 749, 102], [101, 6662, 7607, 680, 5164, 5384, 4685, 5471, 5384, 3419, 7252, 102], [101, 3862, 1092, 4638, 2100, 1762, 2218, 3221, 711, 749, 3633, 721, 102]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
length : [9, 12, 13]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
[CLS] 海 贼 王 我 当 定 了 [SEP]


#### 2. 定义数据集

In [None]:
from datasets import load_from_disk
dataset = load_from_disk("/content/drive/MyDrive/ChnSentiCorp")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})

In [None]:
def f(data):
  return token.batch_encode_plus(
      batch_text_or_text_pairs = data['text'],
      truncation = True,
      padding = 'max_length',
      max_length = 30,
      return_length = True
  )
dataset = dataset.map(f,
      batched =True,
      batch_size = 1000,
      num_proc = 4,
      remove_columns = ['text','label'],
  )

dataset

Map (num_proc=4):   0%|          | 0/9600 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1200 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'length', 'attention_mask'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'length', 'attention_mask'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'length', 'attention_mask'],
        num_rows: 1200
    })
})

过滤字符长度小于30的语句

In [None]:
def f_cut(data):
  return [ i >= 30 for i in data['length'] ]

dataset = dataset.filter(f_cut,
    batched = True,
    batch_size = 1000,
    num_proc = 4,
    )
dataset

Filter (num_proc=4):   0%|          | 0/9600 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/1200 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/1200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'length', 'attention_mask'],
        num_rows: 9286
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'length', 'attention_mask'],
        num_rows: 1158
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'length', 'attention_mask'],
        num_rows: 1157
    })
})

### 3. 定义计算设备



In [None]:
import torch
device = 'cpu'
if torch.cuda.is_available():
  device = 'cuda'
device

'cuda'

#### 4. 定义数据整理函数

In [None]:
def collate_fn(data):
  input_ids = [i['input_ids'] for i in data]
  token_type_ids = [i['token_type_ids'] for i in data]
  attention_mask = [i['attention_mask'] for i in data]
  # 转换成 tensor
  input_ids = torch.LongTensor(input_ids)
  token_type_ids = torch.LongTensor(token_type_ids)
  attention_mask = torch.LongTensor(attention_mask)
  # 第十五字符替换成 MASK
  labels = input_ids[:,15].reshape(-1).clone()
  input_ids[:,15] = token.get_vocab()[token.mask_token]

  input_ids = input_ids.to(device)
  token_type_ids = token_type_ids.to(device)
  attention_mask = attention_mask.to(device)
  labels = labels.to(device)
  return input_ids, attention_mask, token_type_ids, labels

test_data = dataset['train'].select([1,2])
test_data
var1, var2, var3, var4 = collate_fn(test_data)
print(token.decode(var1[1]))
print(token.decode(var4[1]))
var1.shape, var2.shape, var3.shape, var4

[CLS] 1. 接 电 源 没 有 几 分 钟, 电 源 适 [MASK] 器 热 的 不 行. 2. 摄 像 头 用 不 [SEP]
配


(torch.Size([2, 30]),
 torch.Size([2, 30]),
 torch.Size([2, 30]),
 tensor([1825, 6981], device='cuda:0'))

#### 5. 定义数据集加载器

In [None]:
loader = torch.utils.data.DataLoader(
    dataset = dataset['train'],
    batch_size = 20,
    collate_fn = collate_fn,
    shuffle = True,
    drop_last = True,
)
len(loader)
# show one batch data
for i,(var1, var2, var3, var4) in enumerate(loader):
  break
print(token.decode(var1[1]))
print(token.decode(var4[1]))
var1.shape, var2.shape, var3.shape, var4

[CLS] 服 务 总 体 很 好 ， 通 过 房 间 电 话 可 [MASK] 方 便 得 到 服 务 ， 价 格 有 一 定 竞 [SEP]
以


(torch.Size([20, 30]),
 torch.Size([20, 30]),
 torch.Size([20, 30]),
 tensor([1350,  809, 1139, 4415,  679, 6821, 1423, 8024, 2346, 2094,  100, 1469,
         4638,  679, 8024,  116, 1599, 6387,  749, 2399], device='cuda:0'))

### 定义模型
#### 1. 加载预训练模型

In [None]:
from transformers import BertModel
pretrained = BertModel.from_pretrained("bert-base-chinese")
sum(i.nelement() for i in pretrained.parameters()) / 10000

Downloading model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

10226.7648

模型参数超过 1个亿，本次只是将模型作为特征提取器，不对模型进行微调，需要冻结模型

In [None]:
for param in pretrained.parameters():
  param.requires_grad_(False)

对冻结模型进行试算，观察输入输出

In [None]:
pretrained.to(device)
out = pretrained(input_ids = var1,
        attention_mask = var2,
        token_type_ids = var3
)
print(out)
out.last_hidden_state.shape

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 6.5386e-01,  1.0758e-02, -1.5797e-01,  ...,  4.7415e-01,
          -7.3767e-01, -1.8951e-01],
         [ 1.0089e+00,  6.2436e-01, -8.8314e-01,  ...,  4.6767e-01,
          -8.0295e-01,  2.1339e-01],
         [ 8.4972e-01, -1.9226e-01, -4.8350e-01,  ...,  7.3792e-01,
          -2.9309e-01, -3.1109e-02],
         ...,
         [ 1.2513e+00, -5.0597e-01, -4.0739e-02,  ...,  1.2647e+00,
          -7.8422e-02,  3.8459e-01],
         [ 1.6605e+00,  2.6047e-01, -1.2258e-01,  ...,  4.0181e-01,
           3.0988e-01,  6.7763e-02],
         [ 7.9018e-01, -2.5067e-02, -5.1823e-01,  ...,  7.6467e-01,
          -6.8388e-01, -6.7498e-01]],

        [[ 1.4255e-01,  2.5426e-01,  4.0242e-01,  ..., -7.4451e-01,
          -1.3002e-01, -1.6328e-02],
         [ 8.0946e-01,  1.8544e-01, -2.8532e-01,  ..., -1.0212e+00,
          -3.2374e-02, -1.6824e-02],
         [ 6.9421e-01, -2.3051e-01, -6.4590e-02,  ...,  5.0293e-01,
           1.

torch.Size([20, 30, 768])

#### 2. 定义下游任务

In [None]:
import torch.nn as nn
class Model(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.decoder = torch.nn.Linear(
        in_features=768,
        out_features = token.vocab_size,
        bias = False,
    )
    self.bias = torch.nn.Parameter(data=torch.zeros(token.vocab_size))
    self.decoder.bias = self.bias
    self.DropOut = torch.nn.Dropout(p=0.5)
  def forward(self, input_ids, attention_mask, token_type_ids):
    # 使用预训练模型抽取特征
    with torch.no_grad():
      out = pretrained(input_ids = input_ids,
        attention_mask = attention_mask,
        token_type_ids = token_type_ids)
    # 把第十五个词的特征投影到字典范围内
    out = self.DropOut(out.last_hidden_state[:,15])
    out = self.decoder(out)
    return out

model = Model()
model.to(device)

# 试算
model(input_ids=var1, attention_mask=var2, token_type_ids=var3).shape

torch.Size([20, 21128])

训练

In [33]:
from transformers import AdamW
from transformers.optimization import get_scheduler
def train():
  optimizer = AdamW( model.parameters(), lr = 5e-4, weight_decay = 1.0 )
  criterion = torch.nn.CrossEntropyLoss()
  scheduler = get_scheduler(
      name = "linear",
      num_warmup_steps = 0 ,
      num_training_steps = len(loader) * 5,
      optimizer = optimizer)

  model.train()
  for epoch in range(8):
    for i,(input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
      out = model(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
      loss = criterion(out, labels)
      loss.backward()
      optimizer.step()
      scheduler.step()
      optimizer.zero_grad()
      if i % 50 == 0:
        out = out.argmax(dim=1)
        accuracy = (out == labels).sum().item() / len(labels)
        lr = optimizer.state_dict()['param_groups'][0]['lr']
        print(epoch, i, loss.item(), lr, accuracy)

train()


0 0 1.9384686946868896 0.0004997844827586207 0.75
0 50 2.847938299179077 0.0004890086206896551 0.45
0 100 3.2198944091796875 0.00047823275862068963 0.5
0 150 2.440917491912842 0.00046745689655172413 0.65
0 200 2.4985251426696777 0.0004566810344827586 0.6
0 250 3.3938515186309814 0.0004459051724137931 0.55
0 300 2.4503092765808105 0.0004351293103448276 0.55
0 350 2.573770046234131 0.0004243534482758621 0.6
0 400 3.1031997203826904 0.00041357758620689654 0.4
0 450 1.4600650072097778 0.00040280172413793104 0.8
1 0 1.7394129037857056 0.00039978448275862066 0.8
1 50 2.8840789794921875 0.0003890086206896552 0.5
1 100 2.314711093902588 0.0003782327586206897 0.6
1 150 3.3673222064971924 0.00036745689655172414 0.45
1 200 2.5796098709106445 0.00035668103448275863 0.5
1 250 1.7201449871063232 0.0003459051724137931 0.75
1 300 2.9165799617767334 0.00033512931034482756 0.5
1 350 2.0518202781677246 0.00032435344827586206 0.65
1 400 2.308948516845703 0.0003135775862068966 0.7
1 450 1.9466203451156616 

测试

In [36]:
def test():
  loader_test = torch.utils.data.DataLoader(
      dataset = dataset['test'],
      batch_size = 32,
      collate_fn = collate_fn,
      shuffle = True,
      drop_last = True,
  )
  model.eval()
  total_list = []
  correct = 0
  total = 0
  for i,(input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
    # print(i)
    with torch.no_grad():
      out = model(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
      out = out.argmax(dim=1)
      correct += (out == labels).sum().item()
      total += len(labels)
    total_list.append(correct / total)
  return sum(total_list)/len(loader_test)
test()

0.6130619605060155

验证集

In [37]:
def test():
  loader_test = torch.utils.data.DataLoader(
      dataset = dataset['validation'],
      batch_size = 32,
      collate_fn = collate_fn,
      shuffle = True,
      drop_last = True,
  )
  model.eval()
  total_list = []
  correct = 0
  total = 0
  for i,(input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
    # print(i)
    with torch.no_grad():
      out = model(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
      out = out.argmax(dim=1)
      correct += (out == labels).sum().item()
      total += len(labels)
    total_list.append(correct / total)
  return sum(total_list)/len(loader_test)
test()

0.6013643632470856