<a href="https://colab.research.google.com/github/xi1224-pan/ml-1/blob/main/%E8%A8%93%E7%B7%B4%E5%A5%A7%E5%BE%B7%E8%B3%BD(%E5%B7%B2%E8%A8%93%E7%B7%B4).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets scikit-learn --quiet
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
import torch
from torch import nn
from torch.optim import AdamW
from tqdm import tqdm
import numpy as np
from google.colab import drive
# Mount Google Drive so the labelled CSV (and, later, the saved model) are reachable.
drive.mount('/content/drive')
# 1. Load the labelled data (the code below reads its "text" and "label" columns)
df = pd.read_csv("/content/drive/MyDrive/奧德賽標註.csv")
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
# NOTE(review): the split is not stratified by label — consider stratify=df["label"].
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# 2. Load the tokenizer and model (still the roberta checkpoint, with num_labels=3)
model_name = "uer/roberta-base-finetuned-jd-binary-chinese"
tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels=3 requests a 3-class head; ignore_mismatched_sizes=True allows loading even though
# the checkpoint's head has a different size (presumably 2 classes, going by the checkpoint
# name — TODO confirm), in which case the new head starts out untrained.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=3, ignore_mismatched_sizes=True)

# 3. Tokenize 資料
def tokenize_fn(texts, labels):
    """Encode texts with the module-level ``tokenizer`` and attach their labels.

    Args:
        texts: Iterable of strings to encode.
        labels: Iterable of integer class ids, aligned with ``texts``.

    Returns:
        The tokenizer output (PyTorch tensors, padded, truncated to at most
        64 tokens) with an extra ``"labels"`` entry holding the class ids as
        a ``torch.long`` tensor.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": 64,
        "return_tensors": "pt",
    }
    batch = tokenizer(list(texts), **tokenizer_options)
    # Materialise the labels as a plain list first (callers pass a pandas
    # Series) so torch.tensor receives an ordinary Python sequence.
    label_ids = list(labels)
    batch["labels"] = torch.tensor(label_ids, dtype=torch.long)
    return batch


# Encode the training and validation splits once, up front.
train_enc = tokenize_fn(train_df["text"], train_df["label"])
val_enc = tokenize_fn(val_df["text"], val_df["label"])

# 4. Build the DataLoaders
# Each dataset item is an (input_ids, attention_mask, labels) tuple, in that order;
# the training loop unpacks its batches in the same order.
train_dataset = torch.utils.data.TensorDataset(train_enc["input_ids"], train_enc["attention_mask"], train_enc["labels"])
val_dataset = torch.utils.data.TensorDataset(val_enc["input_ids"], val_enc["attention_mask"], val_enc["labels"])

# Only the training data is shuffled.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# 5. Training setup
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
# NOTE(review): loss_fn is never used by the training loop in this cell, which reads the
# loss from the model output (outputs.loss) instead.
loss_fn = nn.CrossEntropyLoss()

# 6. Start training: one optimisation pass and one validation pass per epoch
EPOCHS = 10
for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    total_loss = 0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in progress:
        # Batches arrive as (input_ids, attention_mask, labels).
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        # Passing labels makes the model compute the loss itself.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch+1} - Train loss: {total_loss / len(train_loader):.4f}")

    # ---- validation pass ----
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # The predicted class is the index of the largest logit.
            preds = outputs.logits.argmax(dim=1)
            correct += int((preds == labels).sum())
            total += labels.shape[0]
    acc = correct / total
    print(f"Epoch {epoch+1} - Validation Accuracy: {acc:.4f}")

# 7. Save the model to Google Drive
model_save_path = "/content/drive/MyDrive/mario_評論模型"
# The model and its tokenizer go into the same directory, model first, so the
# inference cells can reload both from a single path.
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)

In [None]:
# Count how many rows carry each label, to check the class balance of the labelled data.
df["label"].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,30
1,30
0,30


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Reload the model and tokenizer from the directory the training cell saved them to.
model_path = "/content/drive/MyDrive/mario_評論模型"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Put the model in evaluation mode for inference.
model.eval()
# 2. Define the sentiment-analysis function
def predict_with_score(text):
    """Classify one piece of text and compute a signed sentiment score.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The text to analyse; it is truncated to at most 64 tokens.

    Returns:
        A dict with two entries:
        ``"情緒分類"`` - name of the most probable class
        (0 -> "負面", 1 -> "中立", 2 -> "正面");
        ``"情緒分數"`` - probability of class 2 minus probability of class 0,
        rounded to 4 decimal places.
    """
    class_names = {0: "負面", 1: "中立", 2: "正面"}

    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=64,
    )
    # Move every input tensor to the device the model lives on.
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**encoded).logits
        class_probs = torch.nn.functional.softmax(logits, dim=1)
        first_row = class_probs[0]
        # positive minus negative
        sentiment_score = first_row[2].item() - first_row[0].item()
        predicted = class_probs.argmax(dim=1).item()

    return {
        "情緒分類": class_names[predicted],
        "情緒分數": round(sentiment_score, 4),
    }

# 3. 分析整份檔案（逐行分析）
def analyze_file(df, text_column="text", *, predictor=None,
                 strong_threshold=0.5, neutral_margin=0.0):
    """Score every text in ``df`` and append an overall summary row.

    Args:
        df: DataFrame holding the texts to analyse.
        text_column: Name of the column that contains the text.
        predictor: Callable taking one string and returning a dict with the
            keys ``"情緒分類"`` and ``"情緒分數"``. Defaults to the
            module-level ``predict_with_score``.
        strong_threshold: Average score above which the summary is
            "結論為強烈正面". The previous hard-coded value of 1 could never
            be exceeded, because each score is a difference of two
            probabilities and so lies in [-1, 1].
        neutral_margin: Averages whose absolute value is at most this are
            summarised as "結論為中立". The default 0.0 keeps the original
            "exactly zero" rule.

    Returns:
        A DataFrame with the columns ``"句子"``, ``"情緒分類"`` and
        ``"情緒分數"``: one row per analysed text followed by a final
        ``"【總結】"`` row carrying the average score (rounded to 4 decimals)
        and the overall conclusion. When there is nothing to analyse the
        frame is empty (columns only) rather than reporting a conclusion
        drawn from no data.
    """
    # Local import keeps the function usable even when the cell that imports
    # pandas at the top of the notebook has not been run in this session.
    import pandas as pd

    if predictor is None:
        predictor = predict_with_score

    columns = ["句子", "情緒分類", "情緒分數"]
    results = []
    for sentence in df[text_column]:
        # Blank CSV cells arrive as NaN/None; the tokenizer cannot encode
        # them, so skip them instead of aborting the whole run.
        if not isinstance(sentence, str) and pd.isna(sentence):
            continue
        analysis = predictor(sentence)
        results.append({
            "句子": sentence,
            "情緒分類": analysis["情緒分類"],
            "情緒分數": analysis["情緒分數"],
        })

    if not results:
        return pd.DataFrame(columns=columns)

    # Overall conclusion from the mean per-sentence score.
    avg_score = sum(r["情緒分數"] for r in results) / len(results)
    if avg_score > strong_threshold:
        overall = "結論為強烈正面"
    elif abs(avg_score) <= neutral_margin:
        overall = "結論為中立"
    elif avg_score > 0:
        overall = "結論為正面"
    else:
        overall = "結論為負面"

    results.append({
        "句子": "【總結】",
        "情緒分類": overall,
        "情緒分數": round(avg_score, 4),
    })

    return pd.DataFrame(results, columns=columns)
# Load the input data
df_input = pd.read_csv("/content/mario_comments_testset.csv")  # make sure there is a "text" column

# Run the analysis
result_df = analyze_file(df_input, text_column="text")

# Export the results
# NOTE(review): utf-8-sig is presumably chosen so spreadsheet tools detect the encoding of
# the Chinese text — confirm with the consumer of this file.
result_df.to_csv("/content/情緒分析結果.csv", index=False, encoding="utf-8-sig")

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Reload the model and tokenizer from the directory the training cell saved them to.
model_path = "/content/drive/MyDrive/mario_評論模型"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Put the model in evaluation mode for inference.
model.eval()
# 2. Define the sentiment-analysis functions
def predict_with_score(text):
    """Return the predicted sentiment class and a signed score for ``text``.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The text to analyse; it is truncated to at most 64 tokens.

    Returns:
        A dict with ``"情緒分類"`` (name of the most probable class:
        0 -> "負面", 1 -> "中立", 2 -> "正面") and ``"情緒分數"``
        (probability of class 2 minus probability of class 0, rounded to
        4 decimal places).
    """
    batch = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    # Put every input tensor on the same device as the model.
    batch = {key: value.to(device) for key, value in batch.items()}

    with torch.no_grad():
        result = model(**batch)
        distribution = torch.nn.functional.softmax(result.logits, dim=1)
        positive_prob = distribution[0][2].item()
        negative_prob = distribution[0][0].item()
        polarity = positive_prob - negative_prob  # positive minus negative
        winner = torch.argmax(distribution, dim=1).item()

    names_by_class = {0: "負面", 1: "中立", 2: "正面"}
    classification = names_by_class[winner]
    return {"情緒分類": classification, "情緒分數": round(polarity, 4)}
def predict_sentiment_with_score(text, *, strong_threshold=0.5, neutral_margin=0.0):
    """Derive a sentiment label for ``text`` from its signed sentiment score.

    The score is the probability of the positive class minus the probability
    of the negative class, so it always lies in [-1, 1]. Relies on the
    module-level ``tokenizer`` and ``model``.

    Args:
        text: The text to analyse; it is truncated to at most 64 tokens.
        strong_threshold: Scores above this are labelled "強烈正面". The
            previous hard-coded value of 1 could never be exceeded.
        neutral_margin: Scores whose absolute value is at most this are
            labelled "中立". The default 0.0 keeps the original
            "exactly zero" rule; widen it to get a real neutral band.

    Returns:
        A dict with ``"情緒分類"`` (one of "強烈正面", "正面", "中立",
        "負面") and ``"情緒分數"`` (the score rounded to 4 decimal places).
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Moving the model and re-selecting eval mode on every call keeps the
    # model and the inputs on the same device.
    model.to(device)
    model.eval()

    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=1)
        # The model has three classes (0 negative, 1 neutral, 2 positive), so
        # the positive probability is index 2; index 1 is the neutral class.
        score = probs[0][2].item() - probs[0][0].item()

    # Classification rules: adjust through the keyword arguments as needed.
    if score > strong_threshold:
        sentiment = "強烈正面"
    elif abs(score) <= neutral_margin:
        sentiment = "中立"
    elif score > 0:
        sentiment = "正面"
    else:
        sentiment = "負面"

    return {
        "情緒分類": sentiment,
        "情緒分數": round(score, 4),
    }
# Quick manual check: print the prediction for three sample comments.
print(predict_sentiment_with_score("跟小朋友一起絕對是首推吧，小朋友選耀希就無敵了，加上，又沒時間限制可以慢慢通關"))
print(predict_sentiment_with_score("關卡都過不了，好好玩喔。"))
print(predict_sentiment_with_score("難玩至極"))