# 中文句子关系推断
## 任务简介：
判断两个句子是否有连接关系。
## 使用模型:
Bert作为backbone 使用预训练模型来抽取两个句子作为的文本特征，并在特征的基础上进行判断，得出两个句子是否是关联的。Bert 模型在自身训练的过程中有子任务就是判断两个句子之间的关系，所以使用 bert 完成这个子任务很合适。  
## 数据集：  
依然使用 ChnSentiCorp 数据集，删除 label 字段，只需要文本数据在后续的数据处理中，将把本文数据中立成需要的句子组成对话形式，并且每一对句子都有一个标识，表明句子之间是否有联系。
## 1. 准备数据集

In [1]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m49.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.4 MB/s[0m eta [36m0:00:0

In [2]:
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


### 1. 使用编码工具

In [3]:
from transformers import BertTokenizer
token = BertTokenizer.from_pretrained("bert-base-chinese")
token

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [4]:
out = token.batch_encode_plus(
    batch_text_or_text_pairs = ['海贼王我当定了','路飞与索罗相聚罗格镇','海军的存在就是为了正义'],
    # retrun_tensors = "pt",
    return_length = True
)

for k,v in out.items():
  print(k,":",v)

print(token.decode(out['input_ids'][0]))

input_ids : [[101, 3862, 6592, 4374, 2769, 2496, 2137, 749, 102], [101, 6662, 7607, 680, 5164, 5384, 4685, 5471, 5384, 3419, 7252, 102], [101, 3862, 1092, 4638, 2100, 1762, 2218, 3221, 711, 749, 3633, 721, 102]]
token_type_ids : [[0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
length : [9, 12, 13]
attention_mask : [[1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
[CLS] 海 贼 王 我 当 定 了 [SEP]


### 2. 定义数据集

In [5]:
from datasets import load_from_disk
dataset = load_from_disk("/content/drive/MyDrive/ChnSentiCorp")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})

In [32]:
import torch
import random
class Dataset(torch.utils.data.Dataset):
  def __init__(self, split):
    dataset = load_from_disk("/content/drive/MyDrive/ChnSentiCorp")[split].shuffle(seed=70)
    def f(data):
      return len(data['text']) > 40
    self.dataset = dataset.filter(f)
  def __len__(self):
    return len(self.dataset)
  def __getitem__(self, i):
    text = self.dataset[i]['text']
    sentence1 = text[:20]
    sentence2 = text[20:40]
    label = random.randint(0,1)
    if label == 1:
      j = random.randint(0, len(self.dataset) - 1)
      sentence2 = self.dataset[j]['text'][20:40]
    return sentence1, sentence2, label

dataset = Dataset('train')
var1, var2, var3 = dataset[7]
len(dataset), var1, var2, var3
# 关联数据是0，不关联的是1

Filter:   0%|          | 0/9600 [00:00<?, ? examples/s]

(8001, '房间装修比较旧，周边也没有休闲购物的地方', '，性价比不高。虽然说免费提供上网，但实际', 0)

## 3.定义计算设备

In [6]:
import torch
device = 'cpu'
if torch.cuda.is_available():
  device = 'cuda'
device

'cuda'

## 4. 定义数据整理函数

In [34]:
def collate_fn(data):
  sents = [i[:2] for i in data]
  labels = [i[2] for i in data]
  data = token.batch_encode_plus(
      batch_text_or_text_pairs=sents,
      truncation=True,
      padding = 'max_length',
      max_length = 45,
      return_tensors = 'pt',
      return_length = True,
      add_special_tokens = True,
  )
  input_ids = data['input_ids'].to(device)
  token_type_ids = data['token_type_ids'].to(device)
  attention_mask = data['attention_mask'].to(device)
  labels = torch.LongTensor(labels).to(device)
  return input_ids, attention_mask, token_type_ids, labels

test_data = Dataset('train')
test_data = [test_data[1],test_data[2],test_data[3]]
test_data
var1, var2, var3, var4 = collate_fn(test_data)
print(token.decode(var1[0]))
var1.shape, var2.shape, var3.shape, var4

[CLS] 性 价 比 高 ； 显 示 屏 不 错 ； 一 条 [UNK] 内 存 ； [SEP] thinkpad 专 用 ） ， 性 能 稳 定 ， 使 用 了 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


(torch.Size([3, 45]),
 torch.Size([3, 45]),
 torch.Size([3, 45]),
 tensor([1, 0, 0], device='cuda:0'))

### 5. 定义数据集加载器

In [38]:
loader = torch.utils.data.DataLoader(
    dataset = Dataset('train'),
    batch_size = 8,
    collate_fn = collate_fn,
    shuffle = True,
    drop_last = True,
)
len(loader)

1000

查看数据样例

In [39]:
for i,(var1, var2, var3, var4) in enumerate(loader):
  break
print(token.decode(var1[0]))
var1.shape, var2.shape, var3.shape, var4

[CLS] 装 机 要 改 [UNK] 散 热 不 怎 么 样 3dma [SEP] rk06 运 行 未 响 应 ， 可 能 是 amd 的 cpu [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]


(torch.Size([8, 45]),
 torch.Size([8, 45]),
 torch.Size([8, 45]),
 tensor([0, 1, 1, 0, 0, 0, 0, 0], device='cuda:0'))

## 2. 定义模型
### 1. 加载训练模型

In [40]:
from transformers import BertModel
pretrained = BertModel.from_pretrained("bert-base-chinese")
sum(i.nelement() for i in pretrained.parameters()) / 10000

Downloading model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

10226.7648

模型参数超过 1个亿，本次只是将模型作为特征提取器，不对模型进行微调，需要冻结模型

In [41]:
for param in pretrained.parameters():
  param.requires_grad_(False)

对冻结模型进行试算，观察输入输出

In [42]:
pretrained.to(device)
out = pretrained(input_ids = var1,
        attention_mask = var2,
        token_type_ids = var3
)
print(out)
out.last_hidden_state.shape

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.2676,  0.3493, -0.6113,  ...,  0.6170, -0.9017, -0.3040],
         [-0.1646,  0.0055, -0.2975,  ..., -0.4079, -1.0838,  0.2616],
         [-0.7438,  0.5934,  0.0745,  ...,  0.6769, -0.6124, -0.2094],
         ...,
         [ 0.1446,  0.5361, -0.0652,  ..., -0.0537, -0.5474, -0.1145],
         [ 0.1931,  0.2714, -0.4588,  ...,  0.2841, -0.5749, -0.1758],
         [ 0.1362,  0.5256, -0.0737,  ..., -0.0470, -0.5145, -0.0953]],

        [[-0.7078,  0.9596, -0.5822,  ...,  0.3859,  0.7320, -0.8654],
         [ 0.3418, -0.2985,  1.3014,  ..., -1.0366, -0.5203,  0.2162],
         [ 0.4088, -1.2087, -0.4192,  ...,  0.3708,  0.7221,  0.6695],
         ...,
         [-0.1997,  0.4299,  0.0129,  ..., -0.1211,  0.2582, -0.7055],
         [-0.0089, -0.0060, -0.1784,  ..., -0.2175, -0.0044, -0.5449],
         [ 0.0439,  0.0566, -0.3493,  ..., -0.1688,  0.1133, -0.5737]],

        [[-0.6150,  1.3054,  0.3551,  ...,  0.7875,  

torch.Size([8, 45, 768])

### 2. 定义下游任务

In [44]:
import torch.nn as nn
class Model(torch.nn.Module):
  def __init__(self):
    super().__init__()
    self.fc = torch.nn.Linear(768,2)
  def forward(self, input_ids, attention_mask, token_type_ids):
    # 使用预训练模型抽取特征
    with torch.no_grad():
      out = pretrained(input_ids = input_ids,
        attention_mask = attention_mask,
        token_type_ids = token_type_ids)
    out = self.fc(out.last_hidden_state[:,0])
    m = nn.Softmax(dim=1)
    # out = out.nn.Softmax(dim=1)
    return m(out)

model = Model()
model.to(device)

# 试算
model(input_ids=var1, attention_mask=var2, token_type_ids=var3).shape

torch.Size([8, 2])

训练

In [45]:
from transformers import AdamW
from transformers.optimization import get_scheduler
def train():
  optimizer = AdamW( model.parameters(), lr = 5e-4)
  criterion = torch.nn.CrossEntropyLoss()
  scheduler = get_scheduler(
      name = "linear",
      num_warmup_steps = 0 ,
      num_training_steps = len(loader),
      optimizer = optimizer)

  model.train()
  for i,(input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
    out = model(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
    loss = criterion(out, labels)
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    if i % 20 == 0:
      out = out.argmax(dim=1)
      accuracy = (out == labels).sum().item() / len(labels)
      lr = optimizer.state_dict()['param_groups'][0]['lr']
      print(i, loss.item(), lr, accuracy)

# 第一次训练
train()



0 0.7559349536895752 0.0004995 0.25
20 0.5014636516571045 0.0004895 0.75
40 0.5253674983978271 0.0004795 0.75
60 0.4530353546142578 0.0004695 0.875
80 0.34555456042289734 0.00045950000000000006 1.0
100 0.390194296836853 0.00044950000000000003 1.0
120 0.44828811287879944 0.0004395 0.875
140 0.5095226168632507 0.0004295 0.75
160 0.3925873339176178 0.0004195 0.875
180 0.4355214238166809 0.0004095 0.875
200 0.4150103032588959 0.0003995 0.875
220 0.4759352505207062 0.00038950000000000003 0.75
240 0.33236685395240784 0.0003795 1.0
260 0.47383907437324524 0.0003695 0.875
280 0.48334071040153503 0.0003595 0.875
300 0.4362844228744507 0.0003495 0.875
320 0.5351254343986511 0.0003395 0.75
340 0.5346762537956238 0.00032950000000000004 0.75
360 0.5716930031776428 0.0003195 0.75
380 0.3568860590457916 0.0003095 1.0
400 0.32434648275375366 0.0002995 1.0
420 0.324169784784317 0.0002895 1.0
440 0.36224740743637085 0.0002795 1.0
460 0.3798927366733551 0.00026950000000000005 1.0
480 0.4438630938529968 0

In [46]:
train()

0 0.3346230089664459 0.0004995 1.0
20 0.4089270234107971 0.0004895 0.875
40 0.5102227926254272 0.0004795 0.75
60 0.5053510665893555 0.0004695 0.75
80 0.33018988370895386 0.00045950000000000006 1.0
100 0.40845373272895813 0.00044950000000000003 1.0
120 0.5821674466133118 0.0004395 0.75
140 0.4352390468120575 0.0004295 0.875
160 0.4507412910461426 0.0004195 0.875
180 0.34920620918273926 0.0004095 1.0
200 0.43105408549308777 0.0003995 0.875
220 0.4286583662033081 0.00038950000000000003 0.875
240 0.32372772693634033 0.0003795 1.0
260 0.5788923501968384 0.0003695 0.75
280 0.4559512436389923 0.0003595 0.875
300 0.3342723250389099 0.0003495 1.0
320 0.3173181414604187 0.0003395 1.0
340 0.5278144478797913 0.00032950000000000004 0.75
360 0.49338361620903015 0.0003195 0.75
380 0.443149209022522 0.0003095 0.875
400 0.461862713098526 0.0002995 0.875
420 0.3364284336566925 0.0002895 1.0
440 0.3568921983242035 0.0002795 1.0
460 0.33134540915489197 0.00026950000000000005 1.0
480 0.3868561089038849 0.0

测试

In [47]:
def test():
  loader_test = torch.utils.data.DataLoader(
      dataset = Dataset('test'),
      batch_size = 32,
      collate_fn = collate_fn,
      shuffle = True,
      drop_last = True,
  )
  model.eval()
  total_list = []
  correct = 0
  total = 0
  for i,(input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
    # print(i)
    with torch.no_grad():
      out = model(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
      out = out.argmax(dim=1)
      correct += (out == labels).sum().item()
      total += len(labels)
    total_list.append(correct / total)
  return sum(total_list)/len(loader_test)
test()

Filter:   0%|          | 0/1200 [00:00<?, ? examples/s]

0.8654908126536178

验证

In [48]:
def validation():
  loader_test = torch.utils.data.DataLoader(
      dataset = Dataset('validation'),
      batch_size = 32,
      collate_fn = collate_fn,
      shuffle = True,
      drop_last = True,
  )
  model.eval()
  total_list = []
  correct = 0
  total = 0
  for i,(input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
    # print(i)
    with torch.no_grad():
      out = model(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
      out = out.argmax(dim=1)
      correct += (out == labels).sum().item()
      total += len(labels)
    total_list.append(correct / total)
  return sum(total_list)/len(loader_test)
validation()

Filter:   0%|          | 0/1200 [00:00<?, ? examples/s]

0.8937651371768007

第三次训练

In [49]:
train()

0 0.32430240511894226 0.0004995 1.0
20 0.40649083256721497 0.0004895 0.875
40 0.32790565490722656 0.0004795 1.0
60 0.5166630744934082 0.0004695 0.75
80 0.5471376180648804 0.00045950000000000006 0.75
100 0.32359176874160767 0.00044950000000000003 1.0
120 0.3599091172218323 0.0004395 1.0
140 0.3222920596599579 0.0004295 1.0
160 0.451789915561676 0.0004195 0.75
180 0.4244410991668701 0.0004095 0.875
200 0.33762747049331665 0.0003995 1.0
220 0.38297808170318604 0.00038950000000000003 1.0
240 0.3675689995288849 0.0003795 0.875
260 0.5219493508338928 0.0003695 0.75
280 0.3312818706035614 0.0003595 1.0
300 0.7730363607406616 0.0003495 0.5
320 0.38088253140449524 0.0003395 1.0
340 0.3394869267940521 0.00032950000000000004 1.0
360 0.4304693639278412 0.0003195 0.875
380 0.5871613621711731 0.0003095 0.625
400 0.40624570846557617 0.0002995 0.875
420 0.45045045018196106 0.0002895 0.875
440 0.4112741947174072 0.0002795 0.875
460 0.37088027596473694 0.00026950000000000005 1.0
480 0.44942471385002136 

测试集测试

In [50]:
def test():
  loader_test = torch.utils.data.DataLoader(
      dataset = Dataset('test'),
      batch_size = 32,
      collate_fn = collate_fn,
      shuffle = True,
      drop_last = True,
  )
  model.eval()
  total_list = []
  correct = 0
  total = 0
  for i,(input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
    # print(i)
    with torch.no_grad():
      out = model(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
      out = out.argmax(dim=1)
      correct += (out == labels).sum().item()
      total += len(labels)
    total_list.append(correct / total)
  return sum(total_list)/len(loader_test)
test()

0.8782414655112991

验证集验证

In [51]:
def validation():
  loader_test = torch.utils.data.DataLoader(
      dataset = Dataset('validation'),
      batch_size = 32,
      collate_fn = collate_fn,
      shuffle = True,
      drop_last = True,
  )
  model.eval()
  total_list = []
  correct = 0
  total = 0
  for i,(input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
    # print(i)
    with torch.no_grad():
      out = model(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)
      out = out.argmax(dim=1)
      correct += (out == labels).sum().item()
      total += len(labels)
    total_list.append(correct / total)
  return sum(total_list)/len(loader_test)
validation()

0.863771107307157