<a href="https://colab.research.google.com/github/xi1224-pan/ml-1/blob/main/%E8%A8%93%E7%B7%B4%E5%A5%A7%E5%BE%B7%E8%B3%BD(%E5%B7%B2%E8%A8%93%E7%B7%B4).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets scikit-learn --quiet
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
import torch
from torch import nn
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np

# 1. Load the labelled data and hold out 20% of it for validation.
df = pd.read_csv("/content/奧德賽標註.csv")
# Stratify on the label column so the small validation split keeps the same
# class proportions as the full dataset; a purely random split of so few rows
# can leave one class badly under-represented and make accuracy misleading.
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)

# 2. Load the tokenizer and model (still using the "roberta" checkpoint, num_labels=3).
#    The checkpoint is loaded through the BERT tokenizer/model classes.
model_name = "uer/roberta-base-finetuned-jd-binary-chinese"
tokenizer = BertTokenizer.from_pretrained(model_name)
# The checkpoint ships a 2-class classifier head; asking for num_labels=3 with
# ignore_mismatched_sizes=True lets loading proceed with a newly initialised
# 3-class head, which therefore has to be trained before use.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3, ignore_mismatched_sizes=True)

# 3. Tokenize 資料
def tokenize_fn(texts, labels):
    """Tokenize a collection of texts and attach their labels.

    Args:
        texts: Iterable of strings; materialised into a list before being
            handed to the module-level ``tokenizer``.
        labels: Iterable of integer class ids, one per text.

    Returns:
        The tokenizer's output (PyTorch tensors, padded, truncated to at most
        64 tokens) with an extra ``"labels"`` entry holding the labels as a
        ``torch.long`` tensor.
    """
    text_list = list(texts)
    batch = tokenizer(
        text_list,
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors="pt",
    )
    # Convert to a plain list first so the tensor is built from the values only.
    label_list = list(labels)
    batch["labels"] = torch.tensor(label_list, dtype=torch.long)
    return batch


# Encode the training and validation frames separately; each call returns
# input_ids / attention_mask plus the "labels" tensor added by tokenize_fn.
train_enc = tokenize_fn(train_df["text"], train_df["label"])
val_enc = tokenize_fn(val_df["text"], val_df["label"])

# 4. Build the DataLoaders
# Each dataset item is an (input_ids, attention_mask, labels) tuple, in that order.
train_dataset = torch.utils.data.TensorDataset(train_enc["input_ids"], train_enc["attention_mask"], train_enc["labels"])
val_dataset = torch.utils.data.TensorDataset(val_enc["input_ids"], val_enc["attention_mask"], val_enc["labels"])

# Only the training loader is shuffled; validation keeps its original order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 5. Training setup
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
# NOTE(review): loss_fn is not referenced by the training loop in this cell,
# which reads the loss from outputs.loss instead — confirm whether it is still needed.
loss_fn = nn.CrossEntropyLoss()

# 6. Start training: EPOCHS passes over the training set, each followed by a
#    validation pass that reports accuracy.
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # Each batch is an (input_ids, attention_mask, labels) tuple; move all of it to the device.
        input_ids, attention_mask, labels = [b.to(device) for b in batch]
        # Labels are passed in so the loss can be read from outputs.loss.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        # Reset gradients after the update so they do not carry over into the next batch.
        optimizer.zero_grad()
    # Report the mean per-batch training loss for this epoch.
    print(f"Epoch {epoch+1} - Train loss: {total_loss / len(train_loader):.4f}")

    # Validation phase
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]
            # No labels are passed here; only the logits are needed.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Predicted class = index of the largest logit in each row.
            preds = torch.argmax(outputs.logits, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    acc = correct / total
    print(f"Epoch {epoch+1} - Validation Accuracy: {acc:.4f}")

# 7. Save the model
# The model and tokenizer are written to the same directory, which is the path
# the inference cell later loads both of them from.
model.save_pretrained("/mnt/data/mario_sentiment_model")
tokenizer.save_pretrained("/mnt/data/mario_sentiment_model")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at uer/roberta-base-finetuned-jd-binary-chinese and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([3]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 9/9 [00:00<00:00,  9.44it/s]


Epoch 1 - Train loss: 1.0602
Epoch 1 - Validation Accuracy: 0.6667


Epoch 2: 100%|██████████| 9/9 [00:00<00:00, 10.50it/s]


Epoch 2 - Train loss: 0.9356
Epoch 2 - Validation Accuracy: 0.8333


Epoch 3: 100%|██████████| 9/9 [00:00<00:00, 10.59it/s]


Epoch 3 - Train loss: 0.8494
Epoch 3 - Validation Accuracy: 0.8889


Epoch 4: 100%|██████████| 9/9 [00:00<00:00, 10.61it/s]


Epoch 4 - Train loss: 0.7005
Epoch 4 - Validation Accuracy: 0.8889


Epoch 5: 100%|██████████| 9/9 [00:00<00:00, 10.59it/s]


Epoch 5 - Train loss: 0.5637
Epoch 5 - Validation Accuracy: 0.9444


Epoch 6: 100%|██████████| 9/9 [00:00<00:00, 10.58it/s]


Epoch 6 - Train loss: 0.4058
Epoch 6 - Validation Accuracy: 0.9444


Epoch 7: 100%|██████████| 9/9 [00:00<00:00, 10.50it/s]


Epoch 7 - Train loss: 0.2603
Epoch 7 - Validation Accuracy: 0.9444


Epoch 8: 100%|██████████| 9/9 [00:00<00:00, 10.36it/s]


Epoch 8 - Train loss: 0.1888
Epoch 8 - Validation Accuracy: 0.8889


Epoch 9: 100%|██████████| 9/9 [00:00<00:00, 10.63it/s]


Epoch 9 - Train loss: 0.0858
Epoch 9 - Validation Accuracy: 0.8889


Epoch 10: 100%|██████████| 9/9 [00:00<00:00, 10.59it/s]


Epoch 10 - Train loss: 0.0499
Epoch 10 - Validation Accuracy: 0.9444


('/mnt/data/mario_sentiment_model/tokenizer_config.json',
 '/mnt/data/mario_sentiment_model/special_tokens_map.json',
 '/mnt/data/mario_sentiment_model/vocab.txt',
 '/mnt/data/mario_sentiment_model/added_tokens.json')

In [None]:
# Check the class balance: number of rows for each value in the "label" column.
df["label"].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,30
1,30
0,30


In [None]:
!pip install transformers pandas numpy torch --quiet
# 重新載入必要模組並執行模型初始化與分析邏輯
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
import numpy as np

# 1. Load the model and tokenizer from the directory written by the training cell.
model_path = "/mnt/data/mario_sentiment_model"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Put the model in evaluation mode; this cell only runs inference.
model.eval()

# 2. 定義情緒分析函式
def predict_with_score(text):
    """Classify one text and compute a signed sentiment score.

    The text is encoded with the module-level ``tokenizer`` and run through the
    module-level ``model`` on ``device`` with gradients disabled.

    Args:
        text: The text to analyse.

    Returns:
        dict with two keys:
            "情緒分類": the label of the most probable class
                (0 -> "負面", 1 -> "中立", 2 -> "正面").
            "情緒分數": probability of class 2 minus probability of class 0,
                rounded to 4 decimal places.
    """
    class_names = {0: "負面", 1: "中立", 2: "正面"}

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits
        distribution = torch.nn.functional.softmax(logits, dim=1)
        first_row = distribution[0]
        # Positive probability minus negative probability.
        polarity = first_row[2].item() - first_row[0].item()
        predicted = torch.argmax(distribution, dim=1).item()

    return {"情緒分類": class_names[predicted], "情緒分數": round(polarity, 4)}

# 3. 分析整份檔案（逐行分析）
def analyze_file(df, text_column="text", strong_threshold=0.5, neutral_margin=0.0):
    """Run sentiment analysis on every row of a DataFrame and append a summary row.

    Each value of ``df[text_column]`` is passed to ``predict_with_score``; the
    per-sentence results are collected and followed by one "【總結】" row whose
    score is the mean of the per-sentence scores.

    Per-sentence scores are a difference of two probabilities, so they (and
    their mean) lie in [-1, 1]. The previous "strongly positive" cut-off of
    ``> 1`` could therefore never be reached; it is now a configurable
    threshold inside that range.

    Args:
        df: DataFrame holding the texts to analyse.
        text_column: Name of the column that contains the text.
        strong_threshold: Mean score above which the overall verdict is
            "結論為強烈正面". Should lie in (0, 1].
        neutral_margin: Half-width of the band around 0 that is reported as
            "結論為中立". The default of 0.0 keeps the original behaviour
            (neutral only when the mean is exactly 0).

    Returns:
        DataFrame with columns "句子", "情緒分類", "情緒分數": one row per input
        row plus the final summary row. For an empty input the summary row
        reports "無資料可分析" with a NaN score instead of a misleading verdict.
    """
    results = []
    # Iterate the text column directly; no other column of the row is needed.
    for text in df[text_column]:
        analysis = predict_with_score(text)
        results.append({
            "句子": text,
            "情緒分類": analysis["情緒分類"],
            "情緒分數": analysis["情緒分數"]
        })

    # Summary
    if not results:
        # np.mean([]) would be NaN, which fails every comparison below and
        # would otherwise be reported as a negative conclusion.
        overall = "無資料可分析"
        summary_score = float("nan")
    else:
        avg_score = float(np.mean([r["情緒分數"] for r in results]))
        if avg_score > strong_threshold:
            overall = "結論為強烈正面"
        elif avg_score > neutral_margin:
            overall = "結論為正面"
        elif avg_score >= -neutral_margin:
            overall = "結論為中立"
        else:
            overall = "結論為負面"
        summary_score = round(avg_score, 4)

    results.append({
        "句子": "【總結】",
        "情緒分類": overall,
        "情緒分數": summary_score
    })

    return pd.DataFrame(results)
# Load the input data
df_input = pd.read_csv("/content/hk4g4.csv")  # make sure there is a column named "text"

# Run the analysis
result_df = analyze_file(df_input, text_column="text")

# Export the results
# NOTE(review): "utf-8-sig" adds a BOM, presumably so spreadsheet tools detect
# the encoding of the Chinese text — confirm this is the intent.
result_df.to_csv("/mnt/data/情緒分析結果.csv", index=False, encoding="utf-8-sig")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━