## 1.載入套件與環境設置

In [None]:
pip install transformers datasets scikit-learn accelerate

In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # 有助除錯 CUDA 問題

import torch
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np1
from collections import Counter
from datasets import concatenate_datasets

In [2]:
!pip install --upgrade datasets
!rm -rf ~/.cache/huggingface/datasets



## 2.資料處理

In [3]:
# 載入中文資料集（只保留正面與負面樣本）
dataset = load_dataset("tyqiangz/multilingual-sentiments", "chinese")
binary_data = dataset["train"].filter(lambda x: x["label"] in [0, 1])

# 檢查數量分布
print(Counter(binary_data["label"]))

# 平衡類別數量
label_counts = Counter(binary_data["label"])
min_count = min(label_counts.values())

label_0 = binary_data.filter(lambda x: x["label"] == 0).select(range(min_count))
label_1 = binary_data.filter(lambda x: x["label"] == 1).select(range(min_count))

balanced_dataset = concatenate_datasets([label_0, label_1]).shuffle(seed=42)

# 快速測試：只取最多 8000 筆資料（實際數量不足則全取）
max_samples = min(8000, len(balanced_dataset))
balanced_dataset = balanced_dataset.select(range(max_samples))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

multilingual-sentiments.py:   0%|          | 0.00/6.23k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/13.5M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/330k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/339k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/120000 [00:00<?, ? examples/s]

Counter({1: 40000, 0: 40000})


Filter:   0%|          | 0/80000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/80000 [00:00<?, ? examples/s]

In [4]:
# 再分割成訓練與驗證集
train_texts, val_texts, train_labels, val_labels = train_test_split(
    balanced_dataset["text"], balanced_dataset["label"], test_size=0.2, random_state=42, stratify=balanced_dataset["label"]
)

In [5]:
print("訓練集：", Counter(train_labels))
print("驗證集：", Counter(val_labels))

訓練集： Counter({0: 3211, 1: 3189})
驗證集： Counter({0: 803, 1: 797})


## 3.分詞與資料集準備

In [6]:
# 載入中文 BERT 分詞器
model_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

In [7]:
# 分詞
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

In [8]:
# 建立 Dataset
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)


## 4.模型載入與訓練

In [9]:
# 載入模型
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 訓練參數
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,  # 最多保留2個 checkpoint
    save_steps=500,
    eval_steps=500,
    logging_first_step=True
)

# 評估指標
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np1.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="macro"),
    }

# 建立 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 訓練
trainer.train()

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
1,1.0601
10,0.8054
20,0.8291
30,0.6851
40,0.6638
50,0.6095
60,0.5564
70,0.5324
80,0.4272
90,0.4112


TrainOutput(global_step=1200, training_loss=0.33739990055561064, metrics={'train_runtime': 1785.1794, 'train_samples_per_second': 10.755, 'train_steps_per_second': 0.672, 'total_flos': 5051732262912000.0, 'train_loss': 0.33739990055561064, 'epoch': 3.0})

## 5.模型評估

In [10]:
# 評估
eval_result = trainer.evaluate()
print("Evaluation result:", eval_result)

Evaluation result: {'eval_loss': 0.4932881295681, 'eval_accuracy': 0.855625, 'eval_f1': 0.8555897435897436, 'eval_runtime': 35.662, 'eval_samples_per_second': 44.866, 'eval_steps_per_second': 0.701, 'epoch': 3.0}


## 6.推論功能

In [11]:
# 推論函數
def predict_sentiment(texts):
    if isinstance(texts, str):
        texts = [texts]

    encodings = tokenizer(texts, truncation=True, padding=True, max_length=512, return_tensors="pt")

    # 把所有 tensor 搬到與模型相同裝置
    device = model.device
    encodings = {k: v.to(device) for k, v in encodings.items()}

    model.eval()
    with torch.no_grad():
        outputs = model(**encodings)

    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)
    preds = torch.argmax(probs, dim=-1)
    label_map = {0: "正面", 1: "負面"}

    return [
        {
            "text": texts[i],
            "prediction": label_map[preds[i].item()],
            "confidence": round(probs[i][preds[i]].item(), 4)
        }
        for i in range(len(texts))
    ]

## 7.測試案例

In [12]:

test_texts_simplified = [
    "這家店的服務態度真的非常差。",              # 負面
    "我非常喜歡這部電影，演員很棒！",            # 正面
    "東西還可以，但沒有我期待的那麼好。",        # 中性偏負
    "太失望了，再也不會來了。",                  # 負面
    "這是我吃過最好吃的牛肉麵！",                # 正面
    "店員很親切，讓人感覺很舒服。",              # 正面
    "產品質感普通，有待加強。",                  # 負面或中性
    "整體體驗還不錯，下次會再來。",              # 正面
    "不值得這個價錢。",                          # 負面
    "客服反應迅速，問題很快就解決了。",          # 正面
    "這款手機的續航力真的很弱。",               # 負面
    "功能齊全，但外觀設計不夠吸引人。",          # 中性偏負
    "雖然延遲了一下，但還是準時送達。",          # 中性偏正
    "超出我的預期！真的太棒了。",                # 正面
    "包裝破損，內容物也有刮傷，失望。",          # 負面
    "價格合理，品質也很不錯。",                  # 正面
    "拍照效果不如宣傳所說，差強人意。",          # 負面
    "這次的購物體驗非常愉快。",                  # 正面
    "我不知道該怎麼評價，感覺普普通通。",        # 中性
    "根本就是詐騙，千萬不要買！",                # 負面
]

from pprint import pprint
pprint(predict_sentiment(test_texts_simplified))

[{'confidence': 0.9908, 'prediction': '負面', 'text': '這家店的服務態度真的非常差。'},
 {'confidence': 0.9919, 'prediction': '正面', 'text': '我非常喜歡這部電影，演員很棒！'},
 {'confidence': 0.9935, 'prediction': '負面', 'text': '東西還可以，但沒有我期待的那麼好。'},
 {'confidence': 0.9898, 'prediction': '負面', 'text': '太失望了，再也不會來了。'},
 {'confidence': 0.9902, 'prediction': '正面', 'text': '這是我吃過最好吃的牛肉麵！'},
 {'confidence': 0.9907, 'prediction': '正面', 'text': '店員很親切，讓人感覺很舒服。'},
 {'confidence': 0.9936, 'prediction': '負面', 'text': '產品質感普通，有待加強。'},
 {'confidence': 0.9816, 'prediction': '正面', 'text': '整體體驗還不錯，下次會再來。'},
 {'confidence': 0.9917, 'prediction': '負面', 'text': '不值得這個價錢。'},
 {'confidence': 0.9893, 'prediction': '正面', 'text': '客服反應迅速，問題很快就解決了。'},
 {'confidence': 0.9926, 'prediction': '負面', 'text': '這款手機的續航力真的很弱。'},
 {'confidence': 0.9943, 'prediction': '負面', 'text': '功能齊全，但外觀設計不夠吸引人。'},
 {'confidence': 0.9886, 'prediction': '正面', 'text': '雖然延遲了一下，但還是準時送達。'},
 {'confidence': 0.9925, 'prediction': '正面', 'text': '超出我的預期！真的太棒了。'},
 {'confid