# 安装datasets和transformers

In [1]:
# Install the Hugging Face `transformers` package into the notebook runtime.
# The run recorded below resolved to transformers 4.9.2.
# NOTE(review): the version is not pinned, so a rerun may resolve to a different
# release than the one the later cells were written against — consider pinning.
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 4.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 49.2 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 47.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 64.3 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [2]:
# Install the Hugging Face `datasets` package into the notebook runtime.
# The run recorded below resolved to datasets 1.11.0.
# NOTE(review): the version is not pinned, so a rerun may resolve to a different
# release than the one the later cells were written against — consider pinning.
!pip install datasets

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[?25l[K     |█▎                              | 10 kB 34.5 MB/s eta 0:00:01[K     |██▌                             | 20 kB 8.9 MB/s eta 0:00:01[K     |███▊                            | 30 kB 8.0 MB/s eta 0:00:01[K     |█████                           | 40 kB 7.4 MB/s eta 0:00:01[K     |██████▏                         | 51 kB 4.1 MB/s eta 0:00:01[K     |███████▍                        | 61 kB 4.3 MB/s eta 0:00:01[K     |████████▋                       | 71 kB 4.6 MB/s eta 0:00:01[K     |██████████                      | 81 kB 5.1 MB/s eta 0:00:01[K     |███████████▏                    | 92 kB 5.2 MB/s eta 0:00:01[K     |████████████▍                   | 102 kB 4.3 MB/s eta 0:00:01[K     |█████████████▋                  | 112 kB 4.3 MB/s eta 0:00:01[K     |██████████████▉                 | 122 kB 4.3 MB/s eta 0:00:01[K     |████████████████                | 133 kB 4.3 MB/s eta 0:00:01[

# 3.4.1 Overview

In [10]:
# 对数据集进行预处理，之前已经尝试了很多遍的过程
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint name used to load the tokenizer here (and the model further down).
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# MRPC configuration of the GLUE benchmark, all splits.
raw_datasets = load_dataset("glue", "mrpc")

def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" fields of `example` as a pair.

    Uses the notebook-level `tokenizer` with truncation enabled and returns
    its output unchanged. When passed to `Dataset.map`, the returned
    key/value pairs are added to the dataset as new columns.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Tokenize every split in batched mode, then reshape the columns.
# For plain PyTorch, the columns that cannot be converted to tensors
# (sentence1, sentence2, idx) have to be dropped by hand, and the target
# column is renamed from "label" to "labels".
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer)

# The preprocessing above is essentially the same as in the Trainer API
# workflow; everything from here on is new.
from torch.utils.data import DataLoader

# Only the training loader shuffles; both use batches of 8 and the collator above.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)
# Quick inspection of one batch (disabled):
# for batch in train_dataloader:
#   print(batch)
#   print({k: v.shape for k, v in batch.items()})
#   break

# Create the model; the disabled snippet below feeds it a single batch.
from transformers import AutoModelForSequenceClassification

checkpoint = "bert-base-cased"
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
# Quick inspection of one forward pass (disabled):
# for batch in train_dataloader:
#     outputs = model(**batch)
#     print(outputs.loss, outputs.logits.shape)
#     break

# The optimizer performs the parameter updates; it is always used together
# with loss.backward().
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

# loss = outputs.loss
# loss.backward()
# optimizer.step() # apply the update for the current step
# optimizer.zero_grad() # gradients must be cleared every step, otherwise they accumulate (the "poor man's large GPU memory" trick)

# Keep lowering the learning rate as training progresses.
from transformers import get_scheduler

num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

# Train on the GPU when one is available, otherwise on the CPU.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# print(device) # device(type="cuda")

# Combine everything above into one training run; tqdm shows overall progress.
from tqdm.auto import tqdm
from datasets import load_metric

metric = load_metric("glue", "mrpc")
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_epochs):
    # Switch (back) to training mode; the validation pass at the end of the
    # previous epoch left the model in eval mode.
    model.train()
    for batch in train_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Per-epoch report. Only the validation loader is scored here, even though
    # the original note mentions both training and validation accuracy.
    model.eval()
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

    print(f"epoch: {epoch},predict: {metric.compute()}")

# 测试过程如下所示
# from datasets import load_metric

# metric = load_metric("glue", "mrpc")
# model.eval()
# for batch in eval_dataloader:
#     batch = {k:v.to(device) for k, v in batch.items()}
#     with torch.no_grad():
#         outputs = model(**batch)
    
#     logits = outputs.logits
#     predictions = torch.argmax(logits, dim=-1)
#     metric.add_batch(predictions=predictions, references=batch["labels"])
# metric.compute()


Reusing dataset glue (/root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1cb40aaf0134e900.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-9a1e53f79e506f46.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-c41720dd6309dcaf.arrow
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNo

  0%|          | 0/1377 [00:00<?, ?it/s]

epoch: 0,predict: {'accuracy': 0.8235294117647058, 'f1': 0.8727915194346291}
epoch: 1,predict: {'accuracy': 0.8578431372549019, 'f1': 0.8989547038327526}
epoch: 2,predict: {'accuracy': 0.8602941176470589, 'f1': 0.9008695652173914}


In [17]:
# 一个更加集合的过程
# import，各自有各自的作用
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Dataset preprocessing, same steps as tried earlier in the notebook.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
# The tokenizer must be bound to `tokenizer`: that is the name read by
# tokenize_function and DataCollatorWithPadding below. It was previously
# assigned to the misspelled `tokenzier`, so this cell only worked when an
# earlier cell had already defined `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" fields of `example` as a pair.

    Calls the notebook-level `tokenizer` with truncation enabled and returns
    its output unchanged.
    """
    sentence_pair = (example["sentence1"], example["sentence2"])
    return tokenizer(*sentence_pair, truncation=True)

# Tokenize every split in batched mode, drop the sentence1 / sentence2 / idx
# columns, rename "label" to "labels", and switch the output format to torch.
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer)

# Build the DataLoaders: only the training loader shuffles.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the model and move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Hyperparameters and the linearly decaying learning-rate schedule (no warmup).
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

def evaluate(dataloader, metric):
    """Score the notebook-level `model` on every batch of `dataloader`.

    Puts the model in eval mode, runs each batch through it without gradient
    tracking, and feeds the argmax of the logits plus the batch's "labels"
    entry into `metric`.

    Args:
        dataloader: iterable yielding dict batches whose values support
            `.to(device)` and which contain a "labels" entry.
        metric: object providing `add_batch(predictions=..., references=...)`
            and `compute()`.

    Returns:
        Whatever `metric.compute()` returns.
    """
    model.eval()
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    return metric.compute()


progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    # Switch back to training mode once per epoch; evaluate() at the end of
    # the previous epoch left the model in eval mode.
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Report metrics at the end of every epoch.

    # Training set. Open question from the original notes: does creating the
    # metric per pass behave differently from creating it once overall?
    metric1 = load_metric("glue", "mrpc")
    print("epoch: " + str(epoch) + ", 训练集predict: " + str(evaluate(train_dataloader, metric1)))

    # Validation set (the printed label reads "test set", but the loader is
    # built from the "validation" split).
    metric2 = load_metric("glue", "mrpc")
    print("epoch: " + str(epoch) + ", 测试集predict: " + str(evaluate(eval_dataloader, metric2)))

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1cb40aaf0134e900.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-9a1e53f79e506f46.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-c41720dd6309dcaf.arrow
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNo

  0%|          | 0/1377 [00:00<?, ?it/s]

epoch: 0, 训练集predict: {'accuracy': 0.824154852780807, 'f1': 0.8757943385326401}
epoch: 0, 测试集predict: {'accuracy': 0.8112745098039216, 'f1': 0.8710217755443886}
epoch: 1, 训练集predict: {'accuracy': 0.9511995637949836, 'f1': 0.9634470083724729}
epoch: 1, 测试集predict: {'accuracy': 0.8602941176470589, 'f1': 0.8991150442477877}
epoch: 2, 训练集predict: {'accuracy': 0.9877317339149401, 'f1': 0.9909438518816663}
epoch: 2, 测试集predict: {'accuracy': 0.8578431372549019, 'f1': 0.9026845637583893}


In [18]:
# 一个更加集合的过程
# import，各自有各自的作用
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

# Dataset preprocessing, same steps as tried earlier in the notebook.
raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
# The tokenizer must be bound to `tokenizer`: that is the name read by
# tokenize_function and DataCollatorWithPadding below. It was previously
# assigned to the misspelled `tokenzier`, so this cell only worked when an
# earlier cell had already defined `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "sentence1"/"sentence2" fields of `example` as a pair.

    Delegates to the notebook-level `tokenizer` with truncation enabled and
    returns its output unchanged.
    """
    encoded = tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
    )
    return encoded

# Tokenize every split in batched mode, drop the sentence1 / sentence2 / idx
# columns, rename "label" to "labels", and switch the output format to torch.
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

data_collator = DataCollatorWithPadding(tokenizer)

# Build the DataLoaders: only the training loader shuffles.
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the model and move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Hyperparameters and the linearly decaying learning-rate schedule (no warmup).
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=0,
)

# Instantiate the metric a single time to see whether the results differ from
# the per-pass instantiation tried before; worth checking later how others
# write this.
# NOTE(review): reusing one metric object for both splits assumes compute()
# clears the batches accumulated so far. The recorded per-split numbers differ,
# which is consistent with that — confirm against the datasets documentation.
metrics = load_metric("glue", "mrpc")


def evaluate(dataloader, metric):
    """Score the notebook-level `model` on every batch of `dataloader`.

    Puts the model in eval mode, runs each batch through it without gradient
    tracking, and feeds the argmax of the logits plus the batch's "labels"
    entry into `metric`.

    Args:
        dataloader: iterable yielding dict batches whose values support
            `.to(device)` and which contain a "labels" entry.
        metric: object providing `add_batch(predictions=..., references=...)`
            and `compute()`.

    Returns:
        Whatever `metric.compute()` returns.
    """
    model.eval()
    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = torch.argmax(outputs.logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
    return metric.compute()


progress_bar = tqdm(range(num_training_steps))
for epoch in range(num_epochs):
    # Switch back to training mode once per epoch; evaluate() at the end of
    # the previous epoch left the model in eval mode.
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Report metrics at the end of every epoch, using the shared metric object.

    # Training set
    print("epoch: " + str(epoch) + ", 训练集predict: " + str(evaluate(train_dataloader, metrics)))

    # Validation set (the printed label reads "test set", but the loader is
    # built from the "validation" split).
    print("epoch: " + str(epoch) + ", 测试集predict: " + str(evaluate(eval_dataloader, metrics)))

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-1cb40aaf0134e900.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-9a1e53f79e506f46.arrow
Loading cached processed dataset at /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-c41720dd6309dcaf.arrow
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNo

  0%|          | 0/1377 [00:00<?, ?it/s]

epoch: 0, 训练集predict: {'accuracy': 0.8792257360959651, 'f1': 0.9155708023632552}
epoch: 0, 测试集predict: {'accuracy': 0.7843137254901961, 'f1': 0.8603174603174604}
epoch: 1, 训练集predict: {'accuracy': 0.9847328244274809, 'f1': 0.988673139158576}
epoch: 1, 测试集predict: {'accuracy': 0.8431372549019608, 'f1': 0.8869257950530036}
epoch: 2, 训练集predict: {'accuracy': 0.9956379498364231, 'f1': 0.9967676767676769}
epoch: 2, 测试集predict: {'accuracy': 0.8602941176470589, 'f1': 0.9015544041450777}
