By [Yulandy Chiu](https://www.youtube.com/@YulandySpace)

Created with the assistance of Gemini/Claude/ChatGPT and modified by Yulandy Chiu

Version: 2025/02

Videos:
* [Hugging Face 快速入門：三大核心功能 Hub + Spaces + Transformers 庫 | HF API Key & Google Colab Secret 設定](https://youtu.be/e0xBXA3hUpQ)

YouTube: [Yulandy Chiu的AI觀測站](https://www.youtube.com/@YulandySpace)

Facebook: [Yulandy Chiu的AI資訊站](https://www.facebook.com/yulandychiu)

 This code is licensed under the Creative Commons Attribution-NonCommercial 4.0
 International License (CC BY-NC 4.0). You are free to use, modify, and share this code for non-commercial purposes, provided you give appropriate credit. For more details, see the LICENSE file or visit: https://creativecommons.org/licenses/by-nc/4.0/
 © 2025 Yulandy Chiu


測試Hugging Face API Key設定

In [None]:
from google.colab import userdata
# Read the secret named 'Python_practice' through the Colab `userdata` helper.
Python_practice = userdata.get('Python_practice')

# Never echo the key itself: notebook output is stored with the notebook and
# is exposed whenever the file is shared or committed. Confirming that a
# non-empty value was returned is enough to validate the setup.
if Python_practice:
    print(f"Hugging Face API key loaded ({len(Python_practice)} characters).")
else:
    print("Hugging Face API key 'Python_practice' is empty or not set.")

BertForSequenceClassification (transformer) 模型直接使用

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

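## BertForSequenceClassification is built on the Transformer architecture:
# - it is specifically optimized for classification tasks
# - it uses only the encoder part
# - it adds a classification head on top

# BERT stands for "Bidirectional Encoder Representations from Transformers"; it was developed by the Google AI Language team.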

# Sample data: every row pairs one review sentence with its sentiment label
# (1 = positive, 0 = negative). Unzipping the rows produces the two parallel
# lists `texts` and `labels`, so a sentence and its label cannot drift apart.
texts, labels = (
    list(column)
    for column in zip(
        ("這部電影真的很好看", 1),
        ("服務態度很差", 0),
        ("價格合理，品質優良", 1),
        ("完全不推薦這家餐廳", 0),
        ("這家餐廳的菜品非常美味", 1),
        ("電影情節很無聊", 0),
        ("服務人員態度親切", 1),
        ("價格太貴了，不值得", 0),
        ("環境非常舒適", 1),
        ("等候時間太長了", 0),
    )
)

# 自定義數據集
class SentimentDataset(Dataset):
    """Dataset that pairs pre-tokenized texts with their sentiment labels.

    All texts are tokenized in a single call inside ``__init__`` (with
    ``truncation=True``, ``padding=True`` and ``return_tensors='pt'``), so
    ``__getitem__`` only has to index into the stored tensors.

    Args:
        texts: Sequence of input strings, handed to ``tokenizer`` as one batch.
        labels: Sequence of labels, one per text and in the same order.
        tokenizer: Callable that accepts the keyword arguments listed above
            plus ``max_length`` and returns a mapping of batched tensors
            (the script passes a ``BertTokenizer``).
        max_length: Value forwarded to the tokenizer's ``max_length`` argument.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=64):
        # Fail fast: a length mismatch would otherwise only show up later as
        # an IndexError or as a dataset whose size disagrees with its inputs.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )

        self.encodings = tokenizer(texts,
                                   truncation=True,
                                   padding=True,
                                   max_length=max_length,
                                   return_tensors='pt')
        self.labels = labels

    def __len__(self):
        """Return the number of labelled samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields for sample ``idx`` plus a ``'labels'`` tensor."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

def train_model(model, train_loader, val_loader, device, epochs=3,
                save_path='best_model.pt'):
    """Fine-tune ``model`` and keep the checkpoint with the best validation accuracy.

    Each epoch runs one pass over ``train_loader`` with AdamW (lr=2e-5), then
    evaluates on ``val_loader`` without gradients. Whenever the validation
    accuracy beats the best value seen so far, ``model.state_dict()`` is
    written to ``save_path``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        train_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        val_loader: Iterable of batches with the same keys.
        device: Device the batch tensors are moved to before the forward pass.
        epochs: Number of passes over ``train_loader``.
        save_path: Destination file for the best ``state_dict``.

    Returns:
        The best validation accuracy observed (0.0 if no epoch ever scored
        above zero, in which case no checkpoint is written).
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    best_accuracy = 0.0

    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        # Count batches while iterating instead of calling len(train_loader):
        # this also works for iterables without __len__ and lets an empty
        # loader be handled without dividing by zero.
        num_batches = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)

            loss = outputs.loss
            train_loss += loss.item()
            num_batches += 1
            loss.backward()
            optimizer.step()

        # Validation phase
        model.eval()
        correct = 0
        val_count = 0

        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask)

                predictions = torch.argmax(outputs.logits, dim=-1)
                correct += (predictions == labels).sum().item()
                val_count += labels.size(0)

        # An empty loader yields 0.0 rather than a ZeroDivisionError.
        val_accuracy = correct / val_count if val_count else 0.0
        average_loss = train_loss / num_batches if num_batches else 0.0

        print(f'Epoch {epoch + 1}:')
        print(f'Average training loss: {average_loss:.4f}')
        print(f'Validation accuracy: {val_accuracy:.4f}\n')

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), save_path)

    return best_accuracy

def predict_sentiment(text, model, tokenizer, device, max_length=64):
    """Classify a single text as positive or negative.

    Args:
        text: The sentence to classify.
        model: Callable invoked as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``.logits``; it is switched to eval mode.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors for ``text``.
        device: Device the encoded tensors are moved to.
        max_length: Value forwarded to the tokenizer's ``max_length`` argument
            (previously fixed at 64).

    Returns:
        A ``(sentiment, confidence)`` tuple. ``sentiment`` is ``"正面"`` when
        the predicted class index is 1 and ``"負面"`` otherwise; ``confidence``
        is the softmax probability of the predicted class as a float.
    """
    model.eval()
    encoding = tokenizer(text,
                         truncation=True,
                         padding=True,
                         max_length=max_length,
                         return_tensors='pt')

    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask)
        probabilities = F.softmax(outputs.logits, dim=-1)
        # One reduction along the class axis gives both the winning
        # probability and its index, so the two can never disagree.
        confidence, prediction = probabilities.max(dim=-1)

    sentiment = "正面" if prediction.item() == 1 else "負面"
    return sentiment, confidence.item()


# Select the compute device: CUDA when torch reports it as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"使用設備: {device}")

# Load the tokenizer and a two-label sequence-classification model from the
# 'bert-base-chinese' checkpoint, then move the model onto the chosen device.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)
model = model.to(device)

# Hold out 20% of the samples for validation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified, and 20% of the 10 samples leaves
# only 2 validation items - confirm this is acceptable for the demo.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Training side: tokenized dataset, served in shuffled batches of four.
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
train_loader = DataLoader(dataset=train_dataset, batch_size=4, shuffle=True)

# Validation side: same batch size, order left untouched.
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)
val_loader = DataLoader(dataset=val_dataset, batch_size=4)


# Sentences used for the demo predictions below.
# NOTE(review): these four strings are identical to the first four entries of
# `texts`, so they are not unseen data.
test_texts = [
    "這部電影真的很好看",
    "服務態度很差",
    "價格合理，品質優良",
    "完全不推薦這家餐廳",
]

# Run each sentence through the model and report label and confidence.
# NOTE(review): train_model() is not called anywhere above in the visible
# code, so presumably the classification head is still untrained here and
# the predictions are not meaningful - confirm this is the intended baseline.
print("Pretrained model測試結果:")
for text in test_texts:
    sentiment, confidence = predict_sentiment(
        text=text,
        model=model,
        tokenizer=tokenizer,
        device=device,
    )
    print(f"\n文本: {text}")
    print(f"預測情感: {sentiment}")
    print(f"信心度: {confidence*100:.2f}%")