In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

In [2]:
# Read the ChnSentiCorp CSV as a single "train" split, then discard every row
# whose "review" field is None.
dataset = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
)
dataset = dataset.filter(lambda row: row["review"] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [3]:
# Hold out 10% of the rows as the test split. The fixed seed makes the split
# identical after a kernel restart, so metrics stay comparable between runs.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer published with the "hfl/rbt3" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: Batch mapping that provides a "review" entry (the text
            passed to the tokenizer) and a "label" entry.

    Returns:
        The tokenizer output for the reviews (truncation enabled with
        max_length=128) with the batch's "label" values stored under
        the "labels" key.
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded

# Apply `process_function` to both splits in batches; the columns of the raw
# train split are removed from the result.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

In [5]:
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")  # No manual CUDA placement is needed here: the Trainer decides the device automatically

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
import evaluate
# Load the two metric objects used by `eval_metric`: accuracy first, then F1.
acc_metric, f1_metric = (evaluate.load(metric_name) for metric_name in ("accuracy", "f1"))

In [7]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: Pair ``(predictions, labels)``. ``predictions`` must
            support ``.argmax(axis=-1)``, which is used to pick the
            predicted class id for each example.

    Returns:
        The dict produced by the accuracy metric, updated in place with the
        entries produced by the F1 metric.
    """
    scores, references = eval_predict
    predicted_ids = scores.argmax(axis=-1)
    metrics = acc_metric.compute(predictions=predicted_ids, references=references)
    metrics.update(f1_metric.compute(predictions=predicted_ids, references=references))
    return metrics

### 创建训练参数

In [13]:
train_args = TrainingArguments(
    output_dir="./checkpoints",       # directory that receives the training output
    per_device_train_batch_size=64,
    per_device_eval_batch_size=128,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",            # save a checkpoint per epoch
    save_total_limit=3,               # keep at most 3 checkpoints
    learning_rate=2e-5,
    weight_decay=0.01,
    # The keyword was previously misspelled as `metric_for_best_mdoel`, which
    # TrainingArguments rejects as an unexpected keyword argument.
    metric_for_best_model="f1",       # metric used to pick the best checkpoint
    load_best_model_at_end=True,      # reload the best checkpoint after training
)
train_args

TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=au

### 创建Trainer

In [14]:
from transformers import DataCollatorWithPadding
# Wire the model, training arguments, tokenized splits, padding collator and
# metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric,
)

### 模型训练

In [15]:
# Start training with the Trainer built above; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

  0%|          | 0/330 [00:00<?, ?it/s]

{'train_runtime': 20363.3093, 'train_samples_per_second': 1.029, 'train_steps_per_second': 0.016, 'train_loss': 0.08026240955699573, 'epoch': 3.0}


TrainOutput(global_step=330, training_loss=0.08026240955699573, metrics={'train_runtime': 20363.3093, 'train_samples_per_second': 1.029, 'train_steps_per_second': 0.016, 'train_loss': 0.08026240955699573, 'epoch': 3.0})

### 模型评估

In [12]:
# Evaluate on the held-out test split; the metrics dict is the cell result.
trainer.evaluate(tokenized_datasets["test"])

  0%|          | 0/98 [00:00<?, ?it/s]

{'eval_loss': 0.4299311637878418,
 'eval_accuracy': 0.8970398970398971,
 'eval_f1': 0.9249530956848029,
 'eval_runtime': 92.9531,
 'eval_samples_per_second': 8.359,
 'eval_steps_per_second': 1.054,
 'epoch': 3.0}

In [11]:
def evaluate():
    """Run the model over ``validloader`` and return the aggregated metrics.

    Puts ``model`` in eval mode, feeds every batch of ``validloader`` through
    it without gradient tracking, and accumulates the argmax predictions and
    the batch's ``labels`` into ``clf_metrics``.

    Returns:
        Whatever ``clf_metrics.compute()`` returns after all batches were added.

    NOTE(review): ``validloader`` and ``clf_metrics`` are not defined in any
    cell visible in this notebook -- confirm where they come from before
    running this. This function also reuses the name of the ``evaluate``
    module imported earlier in the notebook, so ``evaluate.load`` is
    unavailable after this cell runs until the module is imported again.
    """
    # torch is not imported by any earlier cell, so bring it into scope here.
    import torch

    model.eval()
    with torch.inference_mode():
        for batch in validloader:
            # Follow the model's actual device instead of assuming the model
            # sits on CUDA whenever CUDA is available.
            batch = {k: v.to(model.device) for k, v in batch.items()}
            output = model(**batch)
            pred = torch.argmax(output.logits, dim=-1)
            clf_metrics.add_batch(predictions=pred.long(), references=batch['labels'].long())
    return clf_metrics.compute()


def train(epoch=3, log_step=100):
    """Manual training loop: optimise ``model`` on ``trainloader``.

    Args:
        epoch: Number of passes over ``trainloader``.
        log_step: The loss is printed whenever the running step counter is a
            multiple of this value (the counter is shared across epochs).

    After every epoch ``evaluate()`` is called and its result is printed.

    NOTE(review): ``trainloader`` and ``optimizer`` are not defined in any
    cell visible in this notebook -- confirm where they come from before
    running this.
    """
    global_step = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            # Follow the model's actual device instead of assuming the model
            # sits on CUDA whenever CUDA is available.
            batch = {k: v.to(model.device) for k, v in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f"ep: {ep}, global_step: {global_step}, loss: {output.loss.item()}")
            global_step += 1
        clf = evaluate()
        print(f"ep: {ep},  {clf}")
            


In [12]:
# Run the manual training loop with its defaults (epoch=3, log_step=100).
# NOTE(review): this retrains the same `model` object that the Trainer cells
# use -- confirm this leftover manual loop is still wanted.
train()

ep: 0, global_step: 0, loss: 0.6765587329864502


In [19]:
# torch is not imported by any earlier cell, so import it where it is used.
import torch

# Classify one example sentence and print a human-readable label.
sen = "我觉得这家酒店的菜不错， 饭很好吃！"
id2_label = {0:"差评！", 1: "好评！"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, max_length=128, padding="max_length", truncation=True, return_tensors='pt')
    # Put the inputs on the same device as the model's weights rather than
    # assuming the model is on CUDA whenever CUDA is available.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果： {id2_label.get(pred.item())}")


输入：我觉得这家酒店的菜不错， 饭很好吃！
模型预测结果： 好评！


## 加载模型

In [1]:
from transformers import AutoModelForSequenceClassification
# Reload a classifier from a checkpoint directory under ./checkpoints.
# NOTE(review): the training run logged earlier in this notebook ended at
# global_step=330 with per-epoch saving, so confirm that "checkpoint-2500"
# actually exists before relying on this path.
model = AutoModelForSequenceClassification.from_pretrained("./checkpoints/checkpoint-2500")
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

In [8]:
from transformers import pipeline, AutoTokenizer
# Recreate the "hfl/rbt3" tokenizer and combine it with the reloaded model in
# a text-classification pipeline.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
)


In [9]:
# Classify one example sentence; the pipeline's list of label/score dicts is
# the cell result.
pipe("我觉得这家酒店的菜不错， 饭很好吃！")

[{'label': 'LABEL_1', 'score': 0.9992902278900146}]