# 显存优化

## 显存占用分析

- 模型权重：4 byte × 模型参数量（fp32）
- 优化器状态：8 byte × 模型参数量（AdamW 为每个参数保存两份 fp32 状态）
- 梯度：4 byte × 模型参数量（fp32）
- 前向激活值：取决于序列长度、隐层维度、batch 大小等因素

## step1导包

In [1]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer,TrainingArguments
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## step2加载数据集

In [2]:
# Load only the "train" split of the lansinuote/ChnSentiCorp dataset.
dataset = load_dataset(
    "lansinuote/ChnSentiCorp",
    split="train",
)
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 9600
})

## step3划分数据集

In [3]:
# Hold out 10% of the rows as the test set and keep the remaining 90% for training.
# A fixed seed keeps the shuffle-and-split identical across kernel restarts, so
# metrics from different runs are computed on the same held-out rows.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8640
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 960
    })
})

## step4数据集预处理

In [4]:
import torch

# Load the tokenizer that belongs to the hfl/rbt3 checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "hfl/rbt3",
)

def process_function(examples):
    """Tokenize a batch of examples to a fixed length and attach their labels.

    Args:
        examples: A batch mapping that provides the raw texts under "text"
            and the class ids under "label".

    Returns:
        The tokenizer output for the texts, truncated or padded to exactly
        128 tokens, with the class ids stored under the extra key "labels".
    """
    encoded_batch = tokenizer(
        examples["text"],
        truncation=True,        # cut off anything beyond max_length
        padding="max_length",   # pad shorter texts up to max_length
        max_length=128,
    )
    # Copy the class ids over under the "labels" key.
    encoded_batch["labels"] = examples["label"]
    return encoded_batch

# Tokenize both splits in batches and drop the original columns so that only
# the fields returned by process_function remain.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

Map: 100%|██████████| 8640/8640 [00:01<00:00, 6092.35 examples/s]
Map: 100%|██████████| 960/960 [00:00<00:00, 4843.98 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8640
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 960
    })
})

## step5创建模型

In [5]:
# Build a sequence-classification model from the hfl/rbt3 checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    "hfl/rbt3",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## step6创建评估函数

In [6]:
import evaluate
# Load the accuracy and F1 metrics; other metrics suited to the task type can
# be picked from those offered on Hugging Face.
acc_metric, f1_metric = (
    evaluate.load(metric_name) for metric_name in ("accuracy", "f1")
)

In [7]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A pair ``(predictions, labels)``; the predictions are
            reduced to class ids with ``argmax`` over the last axis.

    Returns:
        The dict produced by the accuracy metric, extended in place with the
        entries produced by the F1 metric.
    """
    scores_per_class, references = eval_predict
    predicted_ids = scores_per_class.argmax(axis=-1)
    results = acc_metric.compute(predictions=predicted_ids, references=references)
    # Merge the F1 result into the accuracy dict so both are reported together.
    results.update(f1_metric.compute(predictions=predicted_ids, references=references))
    return results

## step7创建TrainingArguments

In [8]:
train_args = TrainingArguments(
    # --- output, logging and checkpointing ---
    output_dir="./checkpoints",       # output folder
    logging_steps=10,                 # logging frequency
    eval_strategy="epoch",            # evaluation strategy
    save_strategy="epoch",            # checkpoint saving strategy
    save_total_limit=3,               # maximum number of checkpoints kept
    metric_for_best_model="f1",       # metric used to select the best model
    load_best_model_at_end=True,      # reload the best model once training finishes
    # --- schedule and regularisation ---
    num_train_epochs=1,               # number of training epochs
    learning_rate=2e-5,               # learning rate
    weight_decay=0.001,               # weight decay
    per_device_eval_batch_size=4,     # batch size used for evaluation
    # --- memory optimisations ---
    per_device_train_batch_size=2,    # batch size of a single training step
    gradient_accumulation_steps=32,   # optimisation 1: gradient accumulation (2 x 32 samples per update)
    gradient_checkpointing=True,      # optimisation 2: gradient checkpointing, lowers forward-activation memory
    optim="adafactor",                # optimisation 3: Adafactor optimizer (presumably for smaller optimizer state)
)
train_args

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=epoch,
eval_use_gather_object=False,
fp16=False,
fp16

## step8 创建trainer

In [9]:
from transformers import DataCollatorWithPadding

# Freeze every parameter of the BERT backbone; parameters outside `model.bert`
# (such as the newly initialised classifier head) stay trainable.
model.bert.requires_grad_(False)

trainer = Trainer(model=model,
                  args=train_args,
                  # `tokenizer=` is deprecated in Trainer.__init__ and emits a
                  # FutureWarning; `processing_class=` is its replacement.
                  processing_class=tokenizer,
                  train_dataset=tokenized_datasets["train"],
                  eval_dataset=tokenized_datasets["test"],
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)

  trainer = Trainer(model=model,


## step9模型训练

In [10]:
# Run the training loop with the Trainer built in the previous step.
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate on the held-out test split.
trainer.evaluate(eval_dataset=tokenized_datasets["test"])

In [None]:
# Run prediction on the held-out test split.
trainer.predict(test_dataset=tokenized_datasets["test"])

## step10模型预测

In [None]:
# Classify one hand-written sentence with the model.
sen = "我觉得这家酒店不错，饭很好吃！"
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
with torch.inference_mode():
    # Move the encoded inputs to whatever device the model is on; otherwise a
    # model that ended up on the GPU receives CPU tensors and raises a
    # device-mismatch error.
    inputs = tokenizer(sen, return_tensors="pt").to(model.device)

    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

In [None]:
from transformers import pipeline

# Store the id -> label-name mapping on the model config.
model.config.id2label = id2_label
# Run the pipeline on the device the model already lives on instead of
# hard-coding GPU 0, so the cell also works on CPU-only machines.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=model.device)


In [None]:
# Classify the sample sentence with the text-classification pipeline.
pipe(sen)