# 文本分类实例

## Step1 导入相关包

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from datasets import load_dataset

## Step2 加载数据集

In [2]:
# Read the ChnSentiCorp CSV as a single "train" split and drop every row
# whose review text is missing (None).
dataset = load_dataset(
    "csv",
    data_files="./ChnSentiCorp_htl_all.csv",
    split="train",
).filter(lambda row: row["review"] is not None)

dataset

Downloading and preparing dataset csv/default to F:/cache/huggingface/datasets/csv/default-ed3f084e419c4ff6/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to F:/cache/huggingface/datasets/csv/default-ed3f084e419c4ff6/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


Filter:   0%|          | 0/7766 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step4 划分数据集

In [3]:
# Hold out 10% of the rows as the evaluation split. The seed is fixed so that
# Restart & Run All yields the same train/test partition on every run.
datasets = dataset.train_test_split(test_size=0.1, seed=42)

datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## Step5 创建DATALOADER

In [4]:
import torch

# Load the tokenizer published with the "hfl/rbt3" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("hfl/rbt3")

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: Mapping with a "review" entry (the texts) and a "label"
            entry, as supplied by ``datasets.map(..., batched=True)``.

    Returns:
        The tokenizer output for the reviews (truncated to at most 128
        tokens) with an extra "labels" entry copied from ``examples["label"]``.
    """
    encoded = tokenizer(examples["review"], truncation=True, max_length=128)
    encoded["labels"] = examples["label"]
    return encoded

# Tokenize both splits batch-by-batch and drop the original columns so only
# the fields produced by process_function remain.
raw_columns = datasets["train"].column_names
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=raw_columns,
)
tokenized_datasets

Map:   0%|          | 0/6988 [00:00<?, ? examples/s]

Map:   0%|          | 0/777 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 777
    })
})

In [5]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

trainset = tokenized_datasets["train"]
validset = tokenized_datasets["test"]

# A single collator is shared by both loaders; it pads each batch when the
# batch is collated.
collator = DataCollatorWithPadding(tokenizer)

# Training batches are reshuffled every epoch; validation order stays fixed.
trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collator)
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=collator)

In [6]:
# Peek at the first validation batch to sanity-check the collated tensors.
next(iter(validloader))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[ 101,  702,  782,  ...,    0,    0,    0],
        [ 101, 2697, 6230,  ..., 8024, 4294,  102],
        [ 101, 4384, 1862,  ...,    0,    0,    0],
        ...,
        [ 101, 2769, 3221,  ...,    0,    0,    0],
        [ 101, 6820, 6121,  ...,    0,    0,    0],
        [ 101, 6983, 2421,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
        1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 1, 0

## Step6 创建模型及优化器

In [7]:
from torch.optim import Adam

model = AutoModelForSequenceClassification.from_pretrained("hfl/rbt3")

# Use the GPU when one is present; otherwise the model stays on the CPU.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

Some weights of the model checkpoint at hfl/rbt3 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3

In [8]:
# Adam over all model parameters with a learning rate of 2e-5.
optimizer = Adam(model.parameters(), lr=2e-5)

## Step7 训练与验证

In [9]:
def evaluate():
    """Measure the accuracy of ``model`` over ``validloader``.

    The model is switched to eval mode and every validation batch is run
    under ``torch.inference_mode()``. A prediction is the argmax over the
    logits; it counts as correct when it equals the batch label.

    Returns:
        The number of correct predictions divided by ``len(validset)``.
    """
    model.eval()
    acc_num = 0
    with torch.inference_mode():
        for batch in validloader:
            if torch.cuda.is_available():
                # Move every tensor of the batch onto the GPU.
                batch = {name: tensor.cuda() for name, tensor in batch.items()}
            logits = model(**batch).logits
            predictions = logits.argmax(dim=-1).long()
            targets = batch["labels"].long()
            acc_num += predictions.eq(targets).float().sum()
    return acc_num / len(validset)

def train(epoch=3, log_step=100):
    """Fine-tune ``model`` on ``trainloader``, evaluating after every epoch.

    Args:
        epoch: Number of passes over ``trainloader``.
        log_step: The loss is printed whenever the running step counter is a
            multiple of this value (step 0 included). The counter keeps
            increasing across epochs.
    """
    global_step = 0
    for ep in range(epoch):
        # evaluate() leaves the model in eval mode, so re-enter train mode
        # at the start of each epoch.
        model.train()
        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {name: tensor.cuda() for name, tensor in batch.items()}

            # Standard optimisation step: clear gradients, forward, backward, update.
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()

            if global_step % log_step == 0:
                print(f"ep: {ep}, global_step: {global_step}, loss: {output.loss.item()}")
            global_step += 1

        acc = evaluate()
        print(f"ep: {ep}, acc: {acc}")

## Step8 模型训练

In [None]:
# Run fine-tuning with train()'s defaults: 3 epochs, loss logged every 100 steps.
train()

ep: 0, global_step: 0, loss: 0.7281174063682556


## Step9 模型预测

In [12]:
sen = "我觉得这家酒店不错，饭很好吃！"

# Maps the predicted class index to a human-readable label.
id2_label = {0: "差评！", 1: "好评！"}

model.eval()

with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Put the inputs on whatever device the model currently lives on, so the
    # cell also runs on CPU-only machines (an unconditional .cuda() raises there).
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    logits = model(**inputs).logits

    # The predicted class is the index of the largest logit.
    pred = torch.argmax(logits, dim=-1)

    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

输入：我觉得这家酒店不错，饭很好吃！
模型预测结果:好评！


In [13]:
from transformers import pipeline

# Expose the readable label names through the model config so the pipeline
# reports them instead of the default label names.
model.config.id2label = id2_label

# device=0 selects the first GPU; fall back to -1 (CPU) when CUDA is not
# available so the cell does not fail on CPU-only machines.
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

In [14]:
# Classify the same sample sentence through the pipeline wrapper.
pipe(sen)

[{'label': '好评！', 'score': 0.9911834001541138}]