In [None]:
import torch
# Sanity check: report whether PyTorch can see a CUDA device in this runtime.
print(torch.cuda.is_available())

In [None]:
#print(torch.cuda.get_device_name(0))

In [None]:
!pip install transformers sentencepiece accelerate
!pip install -U datasets

## 載入資料集與模型

In [None]:
from datasets import load_dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

from transformers import set_seed

# Fix the RNG state before the model is built: the classification head that is
# added on top of the pretrained encoder is randomly initialised, so without a
# seed every kernel restart would start fine-tuning from different weights.
SEED = 42  # same value as the TrainingArguments default for `seed`
set_seed(SEED)

# Load the dataset.
dataset = load_dataset("Maciel/FinCUGE-Instruction")

# Load the pretrained Chinese BERT checkpoint together with its tokenizer.
MODEL_CHECKPOINT = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(MODEL_CHECKPOINT)
# num_labels=3: one output per sentiment class (positive / negative / neutral).
model = BertForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=3)


用 Dataset.filter() 只保留「论坛情绪分析任务」的樣本（其餘任務的資料會被濾除）

In [None]:
from datasets import DatasetDict

# Keep a reference to the unfiltered dataset. `dataset` is rebound to the
# filtered version below, so on a re-run of this cell a plain
# `dataset_org = dataset` would capture the already-filtered data and the
# "before" counts printed at the end would be wrong. Reuse the reference
# captured on the first run instead.
dataset_org = globals().get("dataset_org", dataset)

# Keep only the forum sentiment analysis task.
TARGET_TASK = '论坛情绪分析任务'
filtered_dataset = dataset_org.filter(lambda example: example['desc'] == TARGET_TASK)
dataset = filtered_dataset

# Show the number of examples per split before and after filtering.
print(f"Train before :{len(dataset_org['train'])}, after :{len(dataset['train'])}")
print(f"Eval before :{len(dataset_org['eval'])}, after :{len(dataset['eval'])}")


從自然語言中抽出情緒詞

In [None]:
import re

# Sentiment keyword (as it appears in the free-text answer) -> class id.
label_map = {
    "积极": 0,
    "消极": 1,
    "中性": 2
}


def extract_label(output_str):
    """Map a free-text answer to its class id by keyword lookup.

    The keywords of ``label_map`` are tried in insertion order and the id of
    the first keyword that occurs as a substring of ``output_str`` is returned.

    Raises:
        ValueError: if none of the keywords occurs in ``output_str``.
    """
    matching_ids = (
        class_id
        for keyword, class_id in label_map.items()
        if keyword in output_str
    )
    # None is a safe sentinel here because every class id is an int.
    first_match = next(matching_ids, None)
    if first_match is None:
        # Nothing matched: treat the sample as invalid.
        raise ValueError(f"無法從 output 中解析出標籤: {output_str}")
    return first_match

資料預處理

In [None]:
# NOTE: an earlier variant of this function tokenized examples['input'] alone
# (no instruction prefix, same truncation / padding settings) and was annotated
# "acc : 0.78". It used to be kept here as a triple-quoted string, i.e. dead
# code that was still evaluated on every run of the cell.
def preprocess_function(examples, max_length=128):
    """Tokenize a batch of examples and attach integer class labels.

    Args:
        examples: a batch in column form (the function is mapped with
            ``batched=True``); the "instruction", "input" and "output"
            columns are read.
        max_length: length every sequence is truncated / padded to.

    Returns:
        The tokenizer output with an additional "labels" list holding the
        result of ``extract_label`` for each "output" entry.

    Raises:
        ValueError: propagated from ``extract_label`` when an "output" entry
            contains none of the known sentiment keywords.
    """
    # Model input is the instruction followed by the text, separated by a space.
    texts = [
        instruction + " " + text
        for instruction, text in zip(examples["instruction"], examples["input"])
    ]
    tokenized = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
    # Extract the sentiment keyword from each free-text answer.
    tokenized["labels"] = [extract_label(answer) for answer in examples["output"]]
    return tokenized


tokenized_datasets = dataset.map(preprocess_function, batched=True)


訓練模型

In [None]:
import os
from transformers import EarlyStoppingCallback
# Switch off Weights & Biases reporting; set before TrainingArguments is built.
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=1e-5,
    warmup_steps=500,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch. The two strategies have to match
    # for the best checkpoint to be reloaded when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

#驗證指標
from sklearn.metrics import accuracy_score, f1_score
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class of each
            example is the arg-max of its logits over the last axis.

    Returns:
        dict with the keys 'accuracy' and 'f1'.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    accuracy = accuracy_score(gold, predicted)
    macro_f1 = f1_score(gold, predicted, average='macro')
    return {'accuracy': accuracy, 'f1': macro_f1}

# Early stopping: give up after 3 consecutive evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
)

trainer.train()


In [None]:
from datetime import datetime

# Tag the output directory with the current time so each run is saved separately.
timestamp = f"{datetime.now():%Y%m%d_%H%M%S}"
model_path = f"./results/model_{timestamp}"
os.makedirs(model_path, exist_ok=True)

# Store the model and the tokenizer side by side so the directory can be
# passed to from_pretrained() for both.
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)
# Persist the trainer state as well.
trainer.save_state()

In [None]:
# Display the run timestamp; the inference cell uses it to rebuild the path of the saved model.
timestamp

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the tokenizer and the fine-tuned model from the directory written by the save step.
model_path = f"./results/model_{timestamp}"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.eval()  # inference mode

# Class id -> display name. The ids must mirror the mapping used to build the
# training labels ("积极": 0, "消极": 1, "中性": 2), i.e. 0 = positive,
# 1 = negative, 2 = neutral; any other order mislabels every prediction.
# The mapping gets its own name so that the training-time `label_map`
# (keyword -> id, read by extract_label) is not overwritten, which would
# break a re-run of the preprocessing cell.
id2label = {0: "積極", 1: "消極", 2: "中性"}

text = "這支股票今天開低走高，明天應該會繼續漲！"

# Pre-processing: prepend the instruction so the input has the same
# "<instruction> <text>" layout that preprocess_function builds for training.
# NOTE(review): confirm this string is identical to the dataset's instruction
# text for the forum sentiment task.
input_text = "这个文本的情感倾向是积极、消极还是中性的。 " + text
# Tokenize the input.
inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True, max_length=128)

# Predict without tracking gradients.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=1).item()
    predicted_label = id2label[predicted_class_id]

# Print the result.
print(f"輸入句子：{text}")
print(f"預測情感：{predicted_label}")


In [None]:
#rm -r results

類別平衡?

In [None]:
from collections import Counter
# Count how many training examples carry each class id, to check class balance.
Counter(tokenized_datasets["train"]["labels"])

CONFUSION MATRIX

In [None]:
from sklearn.metrics import confusion_matrix
# Run the trainer's model over the eval split and take the arg-max class of each example.
eval_output = trainer.predict(tokenized_datasets["eval"])
preds = eval_output.predictions.argmax(axis=1)
labels = tokenized_datasets["eval"]["labels"]
# Rows are the true classes, columns the predicted classes.
print(confusion_matrix(labels, preds))
