# 基于Transformers的多项选择

## Step1 导入相关包

In [1]:
import evaluate

# 本地一份数据集：Dataset
# 本地多份数据集：DatasetDict
from datasets import DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
)

# 会提前填充好，所以不需要collator

  from .autonotebook import tqdm as notebook_tqdm


## Step2 加载数据集

In [2]:
# Load the C3 dataset (part of the CLUE benchmark) from a local directory that was
# previously written with `save_to_disk`. The result is a DatasetDict, i.e. several
# named splits bundled together.
c3 = DatasetDict.load_from_disk("./c3/")
c3

DatasetDict({
    test: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 1625
    })
    train: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 11869
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 3816
    })
})

In [3]:
c3["train"][0]

{'id': 0,
 'context': ['男：你今天晚上有时间吗?我们一起去看电影吧?', '女：你喜欢恐怖片和爱情片，但是我喜欢喜剧片，科幻片一般。所以……'],
 'question': '女的最喜欢哪种电影?',
 'choice': ['恐怖片', '爱情片', '喜剧片', '科幻片'],
 'answer': '喜剧片'}

In [4]:
c3["test"][0]

{'id': 0,
 'context': ['老师把一个大玻璃瓶子带到学校，瓶子里装着满满的石头、玻璃碎片和沙子。之后，老师请学生把瓶子里的东西都倒出来，然后再装进去，先从沙子开始。每个学生都试了试，最后都发现没有足够的空间装所有的石头。老师指导学生重新装这个瓶子。这次，先从石头开始，最后再装沙子。石头装进去后，沙子就沉积在石头的周围，最后，所有东西都装进瓶子里了。老师说：“如果我们先从小的东西开始，把小东西装进去之后，大的石头就放不进去了。生活也是如此，如果你的生活先被不重要的事挤满了，那你就无法再装进更大、更重要的事了。”'],
 'question': '那个任务，学生刚开始完成得怎么样？',
 'choice': ['都没完成', '都装进去了', '完成得很好', '有一组没做完'],
 'answer': ''}

In [5]:
# The `answer` field of the test split is empty, so it cannot be used for supervised
# training or evaluation; drop it. The `None` default keeps this cell idempotent:
# re-running it after the split has already been removed no longer raises KeyError.
c3.pop("test", None)

Dataset({
    features: ['id', 'context', 'question', 'choice', 'answer'],
    num_rows: 1625
})

In [6]:
c3

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 11869
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 3816
    })
})

## Step3 数据集预处理

In [7]:
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")
tokenizer

BertTokenizerFast(name_or_path='hfl/chinese-macbert-base', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [8]:
def process_function(examples, num_choices=4, max_length=256, pad_choice="不知道"):
    """Tokenize a batch of examples into the nested layout used for multiple choice.

    Every example is expanded into ``num_choices`` (context, question + choice)
    pairs. Examples with fewer choices are topped up with ``pad_choice`` so that
    every example contributes exactly ``num_choices`` pairs.

    Args:
        examples: Batched mapping with the keys "context" (list of lists of
            strings), "question", "choice" (list of lists of strings) and
            "answer", as passed by ``Dataset.map(..., batched=True)``.
        num_choices: Number of candidate answers per example after padding.
        max_length: Sequence length every pair is padded/truncated to.
        pad_choice: Filler text used for the missing choices.

    Returns:
        A dict holding every field returned by the tokenizer, regrouped to
        ``[batch, num_choices, max_length]``, plus "labels" with the index of
        the correct choice for each example.

    Raises:
        ValueError: If an example has more than ``num_choices`` choices (the
            fixed-size regrouping below would otherwise silently misalign all
            following examples with their labels), or if the answer is not
            among the example's choices.
    """
    contexts = []
    question_choices = []
    labels = []
    for ctx_parts, question, choices, answer in zip(
        examples["context"],
        examples["question"],
        examples["choice"],
        examples["answer"],
    ):
        if len(choices) > num_choices:
            raise ValueError(
                f"example has {len(choices)} choices, expected at most {num_choices}"
            )
        # The context is stored as a list of passages/utterances; join it into one string.
        ctx = "\n".join(ctx_parts)
        # Pad short choice lists so every example yields exactly num_choices pairs.
        padded_choices = list(choices) + [pad_choice] * (num_choices - len(choices))
        # Each pair is encoded as: [CLS] ctx [SEP] question + " " + choice [SEP]
        for choice in padded_choices:
            contexts.append(ctx)
            question_choices.append(question + " " + choice)
        # The label is the position of the correct answer among the real choices.
        labels.append(choices.index(answer))
    tokenized_examples = tokenizer(
        contexts,
        question_choices,
        truncation="only_first",  # only the context is shortened, never question + choice
        max_length=max_length,
        padding="max_length",
    )  # flat: (batch * num_choices) x max_length
    # Regroup the flat rows per example so dimension 1 is the choice dimension,
    # which is where the multiple-choice model reads num_choices from.
    tokenized_examples = {
        key: [rows[i : i + num_choices] for i in range(0, len(rows), num_choices)]
        for key, rows in tokenized_examples.items()
    }  # batch x num_choices x max_length
    tokenized_examples["labels"] = labels
    return tokenized_examples

In [9]:
res = c3["train"].select(range(10)).map(process_function, batched=True)
res

Map: 100%|██████████| 10/10 [00:00<00:00, 673.73 examples/s]


Dataset({
    features: ['id', 'context', 'question', 'choice', 'answer', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 10
})

In [10]:
import numpy as np

np.array(res["input_ids"]).shape

(10, 4, 256)

In [11]:
tokenized_c3 = c3.map(process_function, batched=True)
tokenized_c3

Map: 100%|██████████| 11869/11869 [00:05<00:00, 2284.95 examples/s]
Map: 100%|██████████| 3816/3816 [00:01<00:00, 2342.13 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 11869
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 3816
    })
})

## Step4 创建模型

In [12]:
model = AutoModelForMultipleChoice.from_pretrained("hfl/chinese-macbert-base")

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at hfl/chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step5 创建评估函数

In [13]:
import numpy as np

accuracy = evaluate.load("accuracy")


def compute_metric(pred):
    """Score an evaluation pass with the module-level ``accuracy`` metric.

    Args:
        pred: A two-element iterable of (per-choice scores, reference labels).

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted indices.
    """
    scores, references = pred
    # The highest-scoring entry along the last axis is the predicted choice index.
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Step6 配置训练参数

In [14]:
# Training configuration: a single epoch, with evaluation and checkpointing at the
# end of each epoch so the best checkpoint can be reloaded when training finishes.
args = TrainingArguments(
    output_dir="./muliple_choice",
    per_device_train_batch_size=16,  # effectively 16 * 4 = 64 sequences, since each example expands to 4 choices
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",  # kept equal to eval_strategy, which load_best_model_at_end relies on
    load_best_model_at_end=True,
    fp16=True,  # NOTE(review): mixed precision presumably requires a CUDA device — confirm before running on CPU
)

## Step7 创建训练器

In [15]:
# Wire the model, training arguments, tokenized splits and metric function together.
# No data collator is passed: the inputs were already padded to a fixed length
# during preprocessing.
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=` in
# favour of `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=tokenized_c3["train"],
    eval_dataset=tokenized_c3["validation"],
    compute_metrics=compute_metric,
)

## Step8 模型训练

In [16]:
trainer.train()

  7%|▋         | 50/742 [00:26<05:37,  2.05it/s]

{'loss': 1.3376, 'grad_norm': 33.09033966064453, 'learning_rate': 4.690026954177898e-05, 'epoch': 0.07}


 13%|█▎        | 100/742 [00:51<05:24,  1.98it/s]

{'loss': 1.2429, 'grad_norm': 9.356889724731445, 'learning_rate': 4.359838274932615e-05, 'epoch': 0.13}


 20%|██        | 150/742 [01:16<05:00,  1.97it/s]

{'loss': 1.3565, 'grad_norm': 2.8682126998901367, 'learning_rate': 4.022911051212938e-05, 'epoch': 0.2}


 27%|██▋       | 200/742 [01:39<04:11,  2.15it/s]

{'loss': 1.255, 'grad_norm': 5.812619209289551, 'learning_rate': 3.685983827493262e-05, 'epoch': 0.27}


 34%|███▎      | 250/742 [02:02<03:48,  2.15it/s]

{'loss': 1.1413, 'grad_norm': 10.789732933044434, 'learning_rate': 3.349056603773585e-05, 'epoch': 0.34}


 40%|████      | 300/742 [02:27<03:28,  2.12it/s]

{'loss': 1.1225, 'grad_norm': 9.134963035583496, 'learning_rate': 3.0121293800539085e-05, 'epoch': 0.4}


 47%|████▋     | 350/742 [02:51<03:03,  2.13it/s]

{'loss': 1.0827, 'grad_norm': 7.456314563751221, 'learning_rate': 2.6752021563342316e-05, 'epoch': 0.47}


 54%|█████▍    | 400/742 [03:14<02:46,  2.06it/s]

{'loss': 1.0871, 'grad_norm': 7.24234676361084, 'learning_rate': 2.3382749326145553e-05, 'epoch': 0.54}


 61%|██████    | 450/742 [03:39<02:35,  1.88it/s]

{'loss': 1.0667, 'grad_norm': 9.89490795135498, 'learning_rate': 2.0013477088948788e-05, 'epoch': 0.61}


 67%|██████▋   | 500/742 [04:03<01:52,  2.14it/s]

{'loss': 1.0495, 'grad_norm': 9.587221145629883, 'learning_rate': 1.6644204851752022e-05, 'epoch': 0.67}


 74%|███████▍  | 550/742 [04:27<01:39,  1.94it/s]

{'loss': 0.9982, 'grad_norm': 10.686402320861816, 'learning_rate': 1.3274932614555258e-05, 'epoch': 0.74}


 81%|████████  | 600/742 [04:51<01:06,  2.13it/s]

{'loss': 1.007, 'grad_norm': 9.057299613952637, 'learning_rate': 9.905660377358492e-06, 'epoch': 0.81}


 88%|████████▊ | 650/742 [05:16<00:44,  2.06it/s]

{'loss': 0.9948, 'grad_norm': 15.068231582641602, 'learning_rate': 6.536388140161725e-06, 'epoch': 0.88}


 94%|█████████▍| 700/742 [05:40<00:19,  2.13it/s]

{'loss': 0.996, 'grad_norm': 10.008601188659668, 'learning_rate': 3.1671159029649594e-06, 'epoch': 0.94}


                                                 
100%|██████████| 742/742 [06:36<00:00,  2.18it/s]

{'eval_loss': 0.9323550462722778, 'eval_accuracy': 0.59958071278826, 'eval_runtime': 34.7396, 'eval_samples_per_second': 109.846, 'eval_steps_per_second': 6.88, 'epoch': 1.0}


100%|██████████| 742/742 [06:37<00:00,  1.87it/s]

{'train_runtime': 397.6088, 'train_samples_per_second': 29.851, 'train_steps_per_second': 1.866, 'train_loss': 1.114190566893215, 'epoch': 1.0}





TrainOutput(global_step=742, training_loss=1.114190566893215, metrics={'train_runtime': 397.6088, 'train_samples_per_second': 29.851, 'train_steps_per_second': 1.866, 'total_flos': 6245674154244096.0, 'train_loss': 1.114190566893215, 'epoch': 1.0})

## Step9 模型预测

In [17]:
from typing import Any
import torch


class MultipleChoicePipeline:
    """Minimal inference wrapper for a multiple-choice model.

    Calling the pipeline with a context, a question and a list of candidate
    answers returns the candidate that receives the highest logit.
    """

    def __init__(self, model, tokenizer) -> None:
        """Store the model and tokenizer and remember the model's device."""
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device

    def preprocess(self, context, quesiton, choices):
        """Encode one (context, question + choice) pair per candidate answer.

        Args:
            context: Passage the question refers to.
            quesiton: Question text. The misspelled parameter name is kept so
                existing keyword callers keep working.
            choices: Non-empty sequence of candidate answer strings.

        Returns:
            The tokenizer output as PyTorch tensors, one row per choice.

        Raises:
            ValueError: If ``choices`` is empty.
        """
        if len(choices) == 0:
            raise ValueError("choices must contain at least one candidate answer")
        contexts = [context] * len(choices)
        question_choices = [quesiton + " " + choice for choice in choices]
        # Use the tokenizer handed to this instance rather than a notebook global,
        # and pad to the longest pair so rows of different length can be stacked
        # into a single tensor.
        return self.tokenizer(
            contexts,
            question_choices,
            truncation="only_first",
            max_length=256,
            padding=True,
            return_tensors="pt",
        )

    def predict(self, inputs):
        """Run the model on the encoded choices and return its logits."""
        # unsqueeze(0) adds the batch dimension: [num_choices, seq] -> [1, num_choices, seq]
        inputs = {k: v.unsqueeze(0).to(self.device) for k, v in inputs.items()}
        # Inference only: switch off dropout and skip gradient tracking.
        self.model.eval()
        with torch.no_grad():
            return self.model(**inputs).logits  # [batch_size, num_choices]

    def postprocess(self, logits, choices):
        """Map the highest-scoring logit back to its choice text."""
        prediction = torch.argmax(logits, dim=-1).cpu().item()
        return choices[prediction]

    def __call__(self, context, question, choices) -> Any:
        """Return the choice the model considers the best answer."""
        inputs = self.preprocess(context, question, choices)
        logits = self.predict(inputs)
        return self.postprocess(logits, choices)

In [18]:
pipe = MultipleChoicePipeline(model, tokenizer)

In [19]:
pipe(
    "小明在北京上班",
    "小明在哪里上班？",
    ["北京", "上海", "河北", "海南", "河北", "海南"],
)

'北京'