# 利用中文微博評價資料進行BERT微調


In [None]:
! pip install transformers datasets
! pip install evaluate

## 下載微博評價資料

In [None]:
# Download the Weibo sentiment CSV into the current working directory.
!wget https://github.com/shhuangmust/AI/raw/refs/heads/113-1/weibo_senti_100k.csv

## 讀取Weibo資料集
- 共有119988筆資料

In [None]:
from datasets import load_dataset, DatasetDict

# Path of the CSV file fetched by the download cell.
csv_path = "weibo_senti_100k.csv"

# The "csv" builder returns a DatasetDict; the rows are read later via ds['train'].
ds = load_dataset("csv", data_files=csv_path)
print(ds)

## 分割資料集
- 80%訓練(train)資料
- 10%測試(test)資料
- 10%驗證(valid)資料


In [None]:
# Fixed seed so the 80/10/10 split is identical on every run; without it the
# rows landing in train/test/valid change each time this cell is executed.
SPLIT_SEED = 42

# Step 1: hold out 20% of the rows.
train_testvalid = ds['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
# Step 2: cut the hold-out in half, giving 10% test and 10% valid.
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
dataset = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})


## 進行分詞

In [None]:
from transformers import AutoTokenizer

# Hub id of the pretrained Chinese BERT checkpoint whose tokenizer is loaded.
BERT_CHECKPOINT = "google-bert/bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)


def tokenize_function(examples):
    """Tokenize the "review" column of a batch.

    Args:
        examples: A batch from the dataset; only its "review" entry is read.

    Returns:
        The tokenizer output for those reviews, produced with
        ``padding="max_length"`` and ``truncation=True``.
    """
    reviews = examples["review"]
    return tokenizer(reviews, padding="max_length", truncation=True)


# batched=True makes map() pass groups of rows to tokenize_function per call.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

## 為簡化訓練，從訓練集與測試集各挑選10000筆資料

In [None]:
# Number of rows kept from each split to shorten training and evaluation.
SUBSET_SIZE = 10000
subset_rows = range(SUBSET_SIZE)

# Shuffle with a fixed seed, then keep the first SUBSET_SIZE rows of each split.
shuffled_train = tokenized_datasets["train"].shuffle(seed=42)
shuffled_test = tokenized_datasets["test"].shuffle(seed=42)
small_train_dataset = shuffled_train.select(subset_rows)
small_eval_dataset = shuffled_test.select(subset_rows)

for subset in (small_train_dataset, small_eval_dataset):
    print(subset)

## 列印一筆資料出來看

In [None]:
# Display the row at index 100 of the tokenized training split (original
# columns plus the fields added by the tokenizer).
tokenized_datasets["train"][100]

## 本次微調需要得到正面/負面的判斷結果，因此挑選AutoModelForSequenceClassification
- 輸出結果為正面/負面，因此num_labels=2

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained Chinese BERT weights under a sequence-classification
# model with two output classes (negative / positive).
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-chinese",
    num_labels=2,
)

## 利用TrainingArguments設定微調參數

In [None]:
from transformers import TrainingArguments
import numpy as np
import evaluate

# Accuracy metric used to score predictions at each evaluation.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)`` that is unpacked below.

    Returns:
        Whatever ``metric.compute`` returns for the predicted vs. reference
        labels.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


# `evaluation_strategy` was renamed to `eval_strategy` (transformers >= 4.41)
# and the old keyword is rejected by newer releases. The install cell pulls an
# unpinned, current transformers, so the new name is used here.
training_args = TrainingArguments(output_dir="test_trainer_chinese", eval_strategy="epoch")


## 利用Trainer進行訓練
- 此處須輸入wandb key

In [None]:
from transformers import Trainer

# Bundle the model, the training arguments, the reduced train/eval subsets
# and the accuracy callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

# Last expression of the cell, so the notebook displays what train() returns.
trainer.train()

## 利用pipeline進行測試
- LABEL_0：負面
- LABEL_1：正面

In [None]:
from transformers import pipeline
from transformers.trainer_utils import get_last_checkpoint

# Resolve the newest checkpoint in the training output directory instead of
# hard-coding a step number: which "checkpoint-<step>" folders exist depends
# on batch size, device count and save settings, and a fixed intermediate step
# would not be the fully trained model.
last_checkpoint = get_last_checkpoint("test_trainer_chinese")
if last_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint found in 'test_trainer_chinese'; run the training cell first."
    )

pipe = pipeline("sentiment-analysis", model=last_checkpoint, tokenizer=tokenizer)

In [None]:
# Classify one sample sentence; the notebook displays the pipeline's output.
pipe("我喜歡這個產品")