# 文本分类实例

## Step1 导入相关包

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Step2 加载数据集

In [2]:
# Read the hotel-review CSV as a single "train" split, then drop rows whose
# review text is missing so the tokenizer never receives None.
dataset = load_dataset(
    "csv", data_files="./ChnSentiCorp_htl_all.csv", split="train"
).filter(lambda example: example["review"] is not None)
dataset

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)
Generating train split: 7766 examples [00:00, 179317.18 examples/s]
Filter: 100%|██████████| 7766/7766 [00:00<00:00, 308956.41 examples/s]


Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step3 划分数据集

In [3]:
# Hold out 10% of the rows for validation. The seed pins the shuffle so the
# same rows land in each split on every run; without it the validation set
# (and therefore the reported metrics) changes each time the cell is executed.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step4 创建Dataloader

In [4]:
import torch

# Tokenizer for the "hfl/rbt3" checkpoint (the same checkpoint name the model
# is later loaded from).
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")


def process_function(examples):
    """Tokenize a batch of reviews and attach the matching labels.

    Args:
        examples: Batch mapping with a "review" entry (passed to the
            tokenizer) and a "label" entry.

    Returns:
        The tokenizer output with an extra "labels" entry copied from
        ``examples["label"]``.
    """
    # Truncate to at most 128 tokens; no padding is requested here.
    encoded = tokenizer(examples["review"], max_length=128, truncation=True)
    encoded["labels"] = examples["label"]
    return encoded


# Tokenize both splits batch-wise. The original columns are removed so that
# only the tokenizer outputs and "labels" remain in the result.
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)
tokenized_datasets

Map: 100%|██████████| 6988/6988 [00:00<00:00, 16000.35 examples/s]
Map: 100%|██████████| 777/777 [00:00<00:00, 19643.14 examples/s]


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [5]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

trainset = tokenized_datasets["train"]
validset = tokenized_datasets["test"]

# A single collator instance serves both loaders.
padding_collator = DataCollatorWithPadding(tokenizer)

# Training batches are shuffled; validation keeps dataset order and uses a
# larger batch size.
trainloader = DataLoader(
    trainset, batch_size=32, shuffle=True, collate_fn=padding_collator
)
validloader = DataLoader(
    validset, batch_size=64, shuffle=False, collate_fn=padding_collator
)

## Step5 创建模型及优化器

In [6]:
from torch.optim import Adam

# Load the "hfl/rbt3" checkpoint with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Use the GPU when one is present; otherwise the model stays on the CPU.
if torch.cuda.is_available():
    model = model.to("cuda")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Adam over every model parameter with a learning rate of 2e-5.
optimizer = Adam(model.parameters(), lr=2e-5)

## Step6 训练与验证

In [8]:
# Imported under an alias on purpose: a later cell defines a function called
# `evaluate`, which rebinds that name. With a plain `import evaluate`,
# re-running this cell after that definition would call `.combine` on the
# function instead of the library and fail.
import evaluate as hf_evaluate

# Combined metric object reporting accuracy and F1.
clf_metrics = hf_evaluate.combine(["accuracy", "f1"])

In [9]:
def evaluate():
    """Score the model on the validation loader.

    Puts the model in eval mode, runs every batch of ``validloader`` under
    ``torch.inference_mode()``, feeds the argmax predictions and the batch
    labels into ``clf_metrics``, and returns ``clf_metrics.compute()``.

    Returns:
        Whatever ``clf_metrics.compute()`` returns for the accumulated batches.
    """
    model.eval()
    use_gpu = torch.cuda.is_available()
    with torch.inference_mode():
        for batch in validloader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            # Predicted class = index of the largest logit along the last axis.
            clf_metrics.add_batch(
                predictions=torch.argmax(logits, dim=-1).long(),
                references=batch["labels"].long(),
            )
    return clf_metrics.compute()


def train(epoch=3, log_step=100):
    """Fine-tune the model and validate after every epoch.

    Args:
        epoch: Number of passes over ``trainloader``.
        log_step: The training loss is printed whenever the running step
            counter is a multiple of this value.
    """
    use_gpu = torch.cuda.is_available()
    global_step = 0  # counts optimizer steps across all epochs
    for ep in range(epoch):
        # evaluate() switches the model to eval mode, so re-enable training
        # mode at the start of each epoch.
        model.train()
        for batch in trainloader:
            if use_gpu:
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            optimizer.zero_grad()
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            if global_step % log_step == 0:
                print(f"ep: {ep}, global_step: {global_step}, loss: {loss.item()}")
            global_step += 1
        # End-of-epoch validation.
        clf = evaluate()
        print(f"ep: {ep}, {clf}")

## Step7 模型训练

In [10]:
# Run training with the defaults of train(): epoch=3, log_step=100.
train()

ep: 0, global_step: 0, loss: 0.6137725710868835
ep: 0, global_step: 100, loss: 0.33229947090148926
ep: 0, global_step: 200, loss: 0.21843478083610535
ep: 0, {'accuracy': 0.8906048906048906, 'f1': 0.9203373945641987}
ep: 1, global_step: 300, loss: 0.07495684176683426
ep: 1, global_step: 400, loss: 0.14997105300426483
ep: 1, {'accuracy': 0.8957528957528957, 'f1': 0.9254829806807727}
ep: 2, global_step: 500, loss: 0.05643635243177414
ep: 2, global_step: 600, loss: 0.12589700520038605
ep: 2, {'accuracy': 0.8893178893178894, 'f1': 0.9222423146473779}


## Step8 模型预测

In [11]:
sen = "我觉得这家酒店不错，饭很好吃！"
# Maps the predicted class index to a human-readable label.
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Move the inputs to the device the model is on instead of calling
    # .cuda() unconditionally, so this cell also runs on CPU-only machines.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评！


In [12]:
from transformers import pipeline

# Store the label names on the model config so the pipeline reports them
# instead of generic label ids.
model.config.id2label = id2_label
# Run the pipeline on the device the model already lives on. A hard-coded
# device=0 fails on machines without a GPU.
pipe = pipeline(
    "text-classification", model=model, tokenizer=tokenizer, device=model.device
)

In [13]:
# Classify the sample sentence `sen` through the pipeline.
pipe(sen)

[{'label': '好评！', 'score': 0.9965531826019287}]