# 文本分类实战  

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import os

In [2]:
# Display the kernel's current working directory.
os.getcwd()

'D:\\CodeLibrary\\NLP_Task\\classification_demo'

## 加载数据集

In [3]:
# Build the CSV path from the working directory instead of hard-coding an
# absolute, machine-specific location. The CSV is assumed to live in the
# kernel's working directory.
data_file = os.path.join(os.getcwd(), 'ChnSentiCorp_htl_all.csv')
raw_dataset = load_dataset('csv', data_files=data_file, split='train')
# Keep only rows whose 'review' field is present; the raw split stays
# available as `raw_dataset`, so re-running this cell is idempotent.
dataset = raw_dataset.filter(lambda x: x['review'] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

In [4]:
# Peek at the review text of the first sample (row-first indexing).
dataset[0]['review']

'距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.'

In [5]:
# Peek at the label of the first sample (row-first indexing).
dataset[0]['label']

1

## 划分数据
```python
def train_test_split(
        self,
        test_size: Union[float, int, None] = None,
        train_size: Union[float, int, None] = None,
        shuffle: bool = True,
        stratify_by_column: Optional[str] = None,
        seed: Optional[int] = None,
        generator: Optional[np.random.Generator] = None,
        keep_in_memory: bool = False,
        load_from_cache_file: Optional[bool] = None,
        train_indices_cache_file_name: Optional[str] = None,
        test_indices_cache_file_name: Optional[str] = None,
        writer_batch_size: Optional[int] = 1000,
        train_new_fingerprint: Optional[str] = None,
        test_new_fingerprint: Optional[str] = None,
    ) -> "DatasetDict":
```

In [6]:
# Hold out 10% of the data as the test split. The seed is fixed so that the
# shuffled split, and every metric computed on it, is reproducible across runs.
datasets = dataset.train_test_split(test_size=0.1, seed=42)
datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 777
    })
})

## 数据预处理

1. `.map()`: 这是`datasets`库中的一个方法，用于对数据集进行变换。它会将一个函数应用到数据集的每个样本上。
2. `batched=True`: 这个参数指示`.map()`方法以批处理的方式应用`process_datasets`函数。这意味着函数将被应用到一批数据上，而不是单个样本，这通常可以提高处理效率。
3. `remove_columns=datasets['train'].column_names`: 这个参数指示`.map()`方法在处理完成后，从数据集中删除指定的列。这里的`datasets['train'].column_names`是一个包含训练集所有列名的列表。这通常用于删除在处理过程中不再需要的原始列，比如原始文本列，在文本被分词和转换为数字表示后。



In [8]:
# Inspect the training split produced by train_test_split.
datasets['train']

Dataset({
    features: ['label', 'review'],
    num_rows: 6988
})

In [9]:
# List the column names of the training split; this list is later passed to
# `.map(..., remove_columns=...)`.
datasets['train'].column_names

['label', 'review']

In [14]:
import torch

tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-macbert-large')


def process_datasets(examples):
    """Tokenize a batch of reviews and carry their labels along.

    Args:
        examples: A batch from the dataset; its 'review' and 'label'
            entries are read.

    Returns:
        The tokenizer output for the batch with an added 'label' entry.
    """
    encoded = tokenizer(examples['review'], truncation=True, max_length=32, padding=True)
    # remove_columns (below) drops every original column, 'label' included,
    # so the labels have to be re-attached to the tokenized batch here.
    encoded['label'] = examples['label']
    return encoded


# Drop all original columns after mapping: 'review' disappears for good,
# while 'label' survives only because process_datasets adds it back.
original_columns = datasets['train'].column_names
tokenized_datasets = datasets.map(process_datasets, batched=True, remove_columns=original_columns)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6988
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 777
    })
})

## 创建模型  

In [16]:
# Load the pretrained checkpoint with a sequence-classification head.
# Per the load-time message below, 'classifier.weight' and 'classifier.bias'
# are newly initialized, so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained('hfl/chinese-macbert-large')

pytorch_model.bin:   0%|          | 0.00/1.31G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-macbert-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 创建评估函数  

In [30]:
import evaluate

# Resolve the local metric scripts from the working directory instead of a
# hard-coded absolute path, so the notebook is not tied to one machine.
# The scripts are assumed to live in the kernel's working directory.
metric_dir = os.getcwd()
acc_metric = evaluate.load(os.path.join(metric_dir, 'metric_accuracy.py'))
# NOTE: the identifier `f1_metirc` (sic) is kept unchanged because
# eval_metric refers to it by this exact name.
f1_metirc = evaluate.load(os.path.join(metric_dir, 'metric_f1.py'))

In [31]:
def eval_metric(eval_predict):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_predict: A pair that unpacks into (predictions, labels); the
            predictions are reduced to class ids with an argmax over the
            last axis.

    Returns:
        The accuracy metric's result dict, extended with the entries
        returned by the F1 metric.
    """
    raw_scores, references = eval_predict
    predicted_ids = raw_scores.argmax(axis=-1)
    results = acc_metric.compute(predictions=predicted_ids, references=references)
    # Merge the F1 result into the accuracy dict so a single dict is returned.
    results.update(f1_metirc.compute(predictions=predicted_ids, references=references))
    return results

## TrainingArguments      

- `gradient_accumulation_steps`：梯度累加。先连续累积多个小 batch 的梯度，再统一更新一次参数，从而在降低显存占用的同时模拟大 batch 训练（本例中等效 batch size = 1 × 32 = 32）。


- `optim`：这里选用 `adafactor` 优化器。与默认的 AdamW 相比，它需要保存的优化器状态更少，因此显存占用更低。

In [32]:
train_args = TrainingArguments(
    # --- output / checkpointing ---
    output_dir="./checkpoints",        # where checkpoints are written
    save_strategy="epoch",             # save a checkpoint every epoch
    save_total_limit=3,                # keep at most 3 checkpoints
    # --- memory-saving settings ---
    per_device_train_batch_size=1,     # micro-batch size for training
    per_device_eval_batch_size=1,      # batch size for evaluation
    gradient_accumulation_steps=32,    # accumulate 32 micro-batches per optimizer step
    gradient_checkpointing=True,       # gradient checkpointing
    optim="adafactor",                 # Adafactor optimizer
    # --- optimisation schedule ---
    num_train_epochs=1,                # number of training epochs
    learning_rate=2e-5,                # learning rate
    weight_decay=0.01,                 # weight decay
    # --- logging / evaluation ---
    logging_steps=10,                  # log every 10 steps
    eval_strategy="epoch",             # evaluate every epoch
    metric_for_best_model="f1",        # metric used to pick the best checkpoint
    load_best_model_at_end=True,       # reload the best checkpoint after training
)

## Trainer

In [33]:
from transformers import DataCollatorWithPadding

# Parameter freezing: switch off gradients for every parameter under
# `model.bert`; parameters outside that submodule are left untouched.
for frozen_param in model.bert.parameters():
    frozen_param.requires_grad = False

padding_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=padding_collator,
    compute_metrics=eval_metric,
)

## model training  

In [34]:
# Run training with the Trainer configured above.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
0,0.6034,0.606409,0.694981,0.820046


TrainOutput(global_step=218, training_loss=0.6196840487488913, metrics={'train_runtime': 185.1208, 'train_samples_per_second': 37.748, 'train_steps_per_second': 1.178, 'total_flos': 406322074411008.0, 'train_loss': 0.6196840487488913, 'epoch': 0.998282770463652})

In [35]:
# Evaluate on the held-out test split (the same split passed as eval_dataset).
trainer.evaluate(tokenized_datasets["test"])

{'eval_loss': 0.6064087748527527,
 'eval_accuracy': 0.694980694980695,
 'eval_f1': 0.8200455580865603,
 'eval_runtime': 11.1152,
 'eval_samples_per_second': 69.905,
 'eval_steps_per_second': 69.905,
 'epoch': 0.998282770463652}

## model predictions     

这训练结果属实难绷（准确率只有约 0.69，下面的差评样例也被预测成了好评），毕竟显存只用了 2G：BERT 主体参数全部冻结，而且只训练了 1 个 epoch。

In [42]:
sen = "杭州酒家的菜很难吃!"
# Maps the predicted class id to a human-readable label string.
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # Move the inputs to whatever device the model currently lives on instead
    # of calling .cuda() unconditionally, which fails on CPU-only machines and
    # can mismatch the model's device.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    logits = model(**inputs).logits
    # The class id is the index of the largest logit.
    pred = torch.argmax(logits, dim=-1)
    print(f"输入：{sen}\n模型预测结果:{id2_label.get(pred.item())}")

输入：杭州酒家的菜很难吃!
模型预测结果:好评！


In [43]:
from transformers import pipeline

# Attach the id -> label-string mapping to the model config so the pipeline
# reports readable labels.
model.config.id2label = id2_label
# Run the pipeline on the model's current device rather than assuming that
# GPU 0 exists.
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer, device=model.device)

In [44]:
# Classify the same sentence through the pipeline.
pipe(sen)

[{'label': '好评！', 'score': 0.7304642796516418}]