<a href="https://colab.research.google.com/github/yuyu990116/transformers_tutorials/blob/main/pytorch%2Btransformers_SequenceClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate
import torch
import evaluate
from transformers import AutoTokenizer,AutoModelForSequenceClassification,Trainer
from transformers import TrainingArguments
from datasets import load_dataset



In [2]:
# Fetch the ChnSentiCorp splits; the bare name on the last line displays the result.
DATASET_NAME = "lansinuote/ChnSentiCorp"
datasets = load_dataset(DATASET_NAME)
datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})

In [3]:
def _has_label(example):
  """Return True when the example's "label" field is not None."""
  return example["label"] is not None

# Keep only the rows that carry a label.
datasets = datasets.filter(_has_label)

In [4]:
# Tokenizer and classification model are loaded from the same checkpoint id.
CHECKPOINT = "hfl/rbt3"
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def process_data(example):
  """Tokenize the "text" field and attach the "label" field as "labels".

  Args:
    example: mapping with "text" and "label" entries. The map call in this
      cell passes batched=True, so these are presumably lists - TODO confirm.

  Returns:
    The tokenizer output (truncated to at most 128 tokens) with an extra
    "labels" entry copied from example["label"].
  """
  encoded = tokenizer(example["text"], truncation=True, max_length=128)
  encoded["labels"] = example["label"]
  return encoded

# Tokenize every split in batches and drop the original columns, so only the
# fields returned by process_data remain.
tokenized_datasets = datasets.map(
    process_data,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

Map:   0%|          | 0/9600 [00:00<?, ? examples/s]

In [6]:
# Display the tokenized splits to check which columns the mapping step produced.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1200
    })
})

In [7]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
# Author's note: if the loaders are not built this way (with a padding collator),
# a later step seems to fail with an error about `list` having no `cuda`.

# One collator instance is shared by both loaders.
padding_collator = DataCollatorWithPadding(tokenizer)

trainset = tokenized_datasets["train"]
validset = tokenized_datasets["validation"]
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=padding_collator)
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=padding_collator)

In [8]:
# Disabled variant: the same loaders built WITHOUT a collate_fn.
# NOTE(review): the notebook author reports that this version leads to errors
# about list-typed data once batches reach the model - kept for reference only.
# from torch.utils.data import DataLoader


# trainset, validset = tokenized_datasets["train"], tokenized_datasets["validation"]
# trainloader = DataLoader(trainset, batch_size=32, shuffle=True)
# validloader = DataLoader(validset, batch_size=64, shuffle=False)
DataLoader 似乎必须配一个合适的 collate_fn（例如上面用的 DataCollatorWithPadding）：否则 batch 中的字段仍然是 list 而不是张量，调用模型时就会因为 list 类型的数据而报错。

In [9]:
# Peek at a single collated batch to check the tensors the loader yields.
first_batch = next(iter(trainloader), None)
if first_batch is not None:
  print(first_batch)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[ 101, 2769, 3221,  ...,    0,    0,    0],
        [ 101, 6983, 2421,  ...,    0,    0,    0],
        [ 101, 3221, 2769,  ...,    0,    0,    0],
        ...,
        [ 101, 2242, 2391,  ...,    0,    0,    0],
        [ 101, 2190, 8243,  ...,    0,    0,    0],
        [ 101, 3634,  741,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0,
        0, 1, 0, 0, 1, 1, 0, 1])}


In [10]:
from torch.optim import AdamW
# Import the metrics library under an alias. A later cell defines a function
# named `evaluate`, which rebinds the module name imported at the top of the
# notebook; re-running this cell after that point would otherwise fail on
# `evaluate.combine` because `evaluate` is then a function, not the module.
import evaluate as hf_evaluate

# Move the model to the GPU only when one is available.
if torch.cuda.is_available():
    model = model.cuda()
optimizer = AdamW(model.parameters(), lr=1e-4)
# Accuracy and F1 tracked together; the Hugging Face task pages list recommended metrics.
clf_metric = hf_evaluate.combine(["accuracy", "f1"])

In [11]:
def evaluate():
  """Run the model over `validloader` and return the combined metrics.

  Note: this function's name rebinds the global `evaluate`, which the first
  cell imported as a module; `evaluate.<attr>` lookups stop working once this
  cell has run.

  Returns:
    The result of `clf_metric.compute()` after every validation batch has
    been added to the metric.
  """
  use_gpu = torch.cuda.is_available()
  model.eval()
  with torch.inference_mode():
    for cpu_batch in validloader:
      # Move the batch to the GPU only when one is available.
      batch = {name: tensor.cuda() for name, tensor in cpu_batch.items()} if use_gpu else cpu_batch
      logits = model(**batch).logits
      clf_metric.add_batch(
        predictions=logits.argmax(dim=-1).long(),
        references=batch["labels"].long(),
      )
  return clf_metric.compute()

In [18]:
def train(epoch=2,log_step=100):
  """Fine-tune the global `model` on `trainloader`, evaluating after each epoch.

  Args:
    epoch: number of passes over the training loader.
    log_step: the loss is printed whenever the running step count is a
      multiple of this value.
  """
  use_gpu = torch.cuda.is_available()
  global_step = 0
  for ep in range(epoch):
    model.train()
    for cpu_batch in trainloader:
      # Move the batch to the GPU only when one is available.
      batch = {name: tensor.cuda() for name, tensor in cpu_batch.items()} if use_gpu else cpu_batch
      optimizer.zero_grad()
      output = model(**batch)
      output.loss.backward()
      optimizer.step()
      if not global_step % log_step:
        print(f"ep:{ep},step:{global_step},loss:{output.loss}")
      global_step += 1
    # Evaluate once at the end of every epoch.
    clf = evaluate()
    print(f"ep:{ep},clf_metrics:{clf}")

In [19]:
# Run training with the defaults of train(): epoch=2, log_step=100.
train()

ep:0,step:0,loss:2.457594156265259
ep:0,step:100,loss:1.1608948707580566
ep:0,step:200,loss:0.3107556700706482
ep:0,clf_metrics:{'accuracy': 0.8925, 'f1': 0.893476465730801}
ep:1,step:300,loss:0.16866949200630188
ep:1,step:400,loss:0.5496124029159546
ep:1,step:500,loss:0.12389957159757614
ep:1,clf_metrics:{'accuracy': 0.9125, 'f1': 0.9071618037135278}


In [14]:
# Classify one sentence with the fine-tuned model and print the mapped label.
sen = "我觉得这家酒店不错，饭很好吃！"
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Move the inputs to the GPU only when one is available, matching how the
    # model itself is moved; an unconditional .cuda() fails on CPU-only runtimes.
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评！


In [15]:
#简便的预测方法
from transformers import pipeline

model.config.id2label = id2_label
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
pipe(sen)

[{'label': '好评！', 'score': 0.9999895095825195}]