In [None]:
!pip install numpy==1.26.4



In [None]:
import shutil

# Remove the cached Hugging Face and PyTorch downloads so they are fetched
# again; ignore_errors=True makes a missing directory a no-op.
for cache_dir in ('/root/.cache/huggingface', '/root/.cache/torch'):
    shutil.rmtree(cache_dir, ignore_errors=True)

In [None]:
!pip install --upgrade --force-reinstall transformers

Collecting transformers
  Using cached transformers-4.52.3-py3-none-any.whl.metadata (40 kB)
Collecting filelock (from transformers)
  Using cached filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.32.2-py3-none-any.whl.metadata (14 kB)
Collecting numpy>=1.17 (from transformers)
  Using cached numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting packaging>=20.0 (from transformers)
  Using cached packaging-25.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pyyaml>=5.1 (from transformers)
  Using cached PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers)
  Using cached regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
C

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments,  EarlyStoppingCallback
import torch
# Colab-only helper used to attach Google Drive to the runtime's filesystem.
from google.colab import drive
# Mount Drive at /content/drive; the CSV inputs, the logging directory and the
# saved model used in this cell all live under /content/drive/MyDrive.
drive.mount('/content/drive')

# 1. Load the three annotated CSV files from Google Drive and merge them into
#    a single frame with a fresh 0..n-1 index.
df1 = pd.read_csv("/content/drive/MyDrive/奧德賽標註.csv")
df2 = pd.read_csv("/content/drive/MyDrive/反諷標註.csv")
df3 = pd.read_csv("/content/drive/MyDrive/新評論.csv")
df = pd.concat([df1, df2, df3], ignore_index=True)

# Fail early, with a readable message, if the merged data lacks the two
# columns that the split, the tokenizer and the Trainer rely on.
missing_columns = {"text", "label"} - set(df.columns)
if missing_columns:
    raise KeyError(f"Missing required column(s): {sorted(missing_columns)}")

# Rows without a text or a label cannot be tokenized or stratified, and a
# single NaN turns an integer label column into floats. Drop such rows and
# report how many were removed; on clean data this is a no-op.
n_rows_before = len(df)
df = df.dropna(subset=["text", "label"]).reset_index(drop=True)
if len(df) != n_rows_before:
    print(f"Dropped {n_rows_before - len(df)} row(s) with missing text/label")

# Normalize dtypes: the tokenizer is fed the text column and the model is
# trained on the label column as class ids.
df["text"] = df["text"].astype(str)
df["label"] = df["label"].astype(int)

# 2. Split into train/validation (80/20), stratified on the label so both
#    splits keep the same class proportions; random_state fixes the shuffle.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# 3. Wrap each pandas split in a Hugging Face Dataset (train first, then
#    validation).
train_dataset, val_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df)
)

# 4. Load the tokenizer and the sequence-classification model from the same
#    pretrained checkpoint. The classification head is sized for 3 labels.
model_name = "hfl/chinese-roberta-wwm-ext"
num_classes = 3

tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
)

# 5. Tokenize
def tokenize_fn(batch):
    """Tokenize the ``text`` field of a batch with the module-level tokenizer.

    Args:
        batch: Mapping with a ``"text"`` entry. It is passed to
            ``Dataset.map(..., batched=True)``, so presumably this is a list
            of strings.

    Returns:
        The tokenizer's output for the batch, with every sequence truncated
        and padded to exactly 64 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 64,
    }
    return tokenizer(batch["text"], **encode_options)

# Columns handed to the model; everything else stays in the dataset but is
# not returned when indexing.
model_columns = ["input_ids", "attention_mask", "label"]

# Tokenize each split in batches, then expose the model columns as PyTorch
# tensors.
train_dataset = train_dataset.map(tokenize_fn, batched=True)
train_dataset.set_format(type="torch", columns=list(model_columns))

val_dataset = val_dataset.map(tokenize_fn, batched=True)
val_dataset.set_format(type="torch", columns=list(model_columns))

# 6. Training configuration, grouped by concern.

# Logging: write logs to Google Drive every 10 optimizer steps.
logging_options = dict(
    logging_dir="/content/drive/MyDrive/logs",
    logging_strategy="steps",
    logging_steps=10,
)

# Schedule: 5 epochs with a batch size of 8 for both training and evaluation.
schedule_options = dict(
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Evaluate and checkpoint once per epoch, and reload the checkpoint with the
# best "accuracy" (as reported by compute_metrics) when training finishes.
checkpoint_options = dict(
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

training_args = TrainingArguments(
    output_dir="./mario_model",
    **logging_options,
    **schedule_options,
    **checkpoint_options,
)

# 7. 評估函式
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``. The logits
            are presumably shaped (num_examples, num_classes), since the
            predicted class is taken as the argmax over dimension 1.

    Returns:
        A dict with a single ``"accuracy"`` entry, as computed by
        ``accuracy_score(labels, predictions)``.
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit in each row.
    predicted_classes = torch.tensor(logits).argmax(dim=1)
    accuracy = accuracy_score(labels, predicted_classes)
    return {"accuracy": accuracy}

# 8. Build the Trainer and start fine-tuning.
# Early stopping is configured with a patience of 3 evaluations; it monitors
# the metric selected by metric_for_best_model in the training arguments.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)
trainer.train()

# 9. Save the fine-tuned model and its tokenizer side by side on Google Drive.
#    The tokenizer call stays last so the cell still displays the saved paths.
save_dir = "/content/drive/MyDrive/mario_sentiment_新模型"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/144 [00:00<?, ? examples/s]

Map:   0%|          | 0/36 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy
1,1.2881,1.037219,0.472222
2,0.822,0.553196,0.75
3,0.2442,0.485203,0.777778
4,0.0904,0.27783,0.916667
5,0.019,0.357886,0.833333


('/content/drive/MyDrive/mario_sentiment_新模型/tokenizer_config.json',
 '/content/drive/MyDrive/mario_sentiment_新模型/special_tokens_map.json',
 '/content/drive/MyDrive/mario_sentiment_新模型/vocab.txt',
 '/content/drive/MyDrive/mario_sentiment_新模型/added_tokens.json')