# 利用pytorch 手动实现 transformers库中保存模型的功能

首先准备一些前置的代码

In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification
import evaluate
from torch.utils.data import DataLoader
import torch
from torch.optim import Adam

accuracy = evaluate.load('accuracy')
f1_score = evaluate.load('f1')
model = AutoModelForSequenceClassification.from_pretrained('hfl/chinese-macbert-large', trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-macbert-large')


dataset = load_dataset('csv', data_files='D:\\CodeLibrary\\NLP_Task\\classification_demo\\ChnSentiCorp_htl_all.csv', split='train')
dataset = dataset.filter(lambda x : x['review'] is not None)
split_datasets = dataset.train_test_split(test_size=0.1)

def process_datasets(examples):
    tokenized_examples = tokenizer(examples['review'], max_length=32, truncation=True, padding=True)
    tokenized_examples['label'] = examples['label'] # 重新添加 ['label']
    return tokenized_examples

# 带有 remove_columns 的 feature栏 少了 ['review'], 本质上 ['label'] 也被删除了，只是重新添加回去了
tokenized_datasets = split_datasets.map(process_datasets, batched=True, remove_columns=split_datasets['test'].column_names)

trainset, validset = tokenized_datasets['train'], tokenized_datasets['test']
train_loader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=DataCollatorWithPadding(tokenizer))
test_loader = DataLoader(validset, batch_size=32, shuffle=True, collate_fn=DataCollatorWithPadding(tokenizer))

optimizer = Adam(model.parameters(), lr=2e-5)

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

# transformers 库提供了 `TrainingArguments` 封装了训练过程中的许多功能，例如：

In [None]:
from transformers import TrainingArguments

train_args = TrainingArguments(
    output_dir='./temps', ## 模型保存地址
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,
    gradient_checkpointing=True,
    optim='adafactor',
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    logging_steps=1,
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    save_total_limit=3,
    weight_decay=0.01,
    metric_for_best_model='f1',
    load_best_model_at_end=True
)

# 如果用Pytorch实现 参数
```python
output_dir='./temps',
```

In [7]:
import os
import torch

# 先定义目录
output_dir='./temps'

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

assert os.path.exists(output_dir), f"output directory '{output_dir}' does not exist"

best_f1 = 0

def save_checkpoint(epoch, model, optimizer, f1):
    checkpoint_path = os.path.join(output_dir, f"checkpoint_epoch_{epoch}.pt")
    global best_f1
    if f1['f1'] > best_f1:
        best_f1 = f1['f1']
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'f1_score': f1['f1']
        }, checkpoint_path)
        print(f"Checkpoint saved to {checkpoint_path}")

# 在训练函数中增加 `save_checkpoint` 函数 保存模型

In [8]:
def train(epochs=3, log_step=100):
    global_step=0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    for epoch in range(epochs):
        model.train()
        correct = 0
        total = len(tokenized_datasets['train'])
        all_predictions = []
        all_labels = []
        for batch in train_loader:
            batch = {k:v.to(device) for k,v in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            prediction = torch.argmax(output.logits, dim=-1)
            # print('predictions', prediction)
            # print('labels', batch['labels'])
            output.loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f"{epoch}, global_step:{global_step}, loss:{output.loss.item()}")
            global_step += 1
            all_predictions.extend(prediction.cpu().tolist())
            all_labels.extend(batch['labels'].cpu().tolist())
        acc = accuracy.compute(predictions=all_predictions, references=all_labels)
        f1 = f1_score.compute(predictions=all_predictions, references=all_labels)
        save_checkpoint(epoch, model, optimizer, f1)
        print(f"acc: {acc}, f1:{f1}")

In [9]:
train()

0, global_step:0, loss:0.26602548360824585
0, global_step:100, loss:0.1833508163690567
0, global_step:200, loss:0.29030925035476685
Checkpoint saved to ./temps\checkpoint_epoch_0.pt
acc: {'accuracy': 0.8942472810532341}, f1:{'f1': 0.924306053467172}
1, global_step:300, loss:0.11631196737289429
1, global_step:400, loss:0.06651370972394943
Checkpoint saved to ./temps\checkpoint_epoch_1.pt
acc: {'accuracy': 0.9384659416141957}, f1:{'f1': 0.9557886078552333}
2, global_step:500, loss:0.026110773906111717
2, global_step:600, loss:0.09740591049194336
Checkpoint saved to ./temps\checkpoint_epoch_2.pt
acc: {'accuracy': 0.9675157412707499}, f1:{'f1': 0.9765374677002584}
