使用 Hugging Face 框架建立 BERT 模型，針對金融領域文本進行情緒分析訓練。

In [1]:
pip install --upgrade fsspec gcsfs



In [2]:
pip install fsspec==2024.10.0 gcsfs --upgrade




In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

import os
os.environ["WANDB_DISABLED"] = "true"
# 載入金融情緒分析的 PhraseBank 資料集
dataset = load_dataset("takala/financial_phrasebank", "sentences_50agree")

train_texts, val_texts, train_labels, val_labels = train_test_split(
    dataset['train']['sentence'][:500], dataset['train']['label'][:500], test_size=0.2, random_state=42
)

# 選擇適合的 BERT 模型 (如 'bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

# 4. 將數據包裝為 PyTorch Dataset
class FinancialDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # 返回一個字典，符合 Trainer 的要求
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item


train_dataset = FinancialDataset(train_encodings, train_labels)
val_dataset = FinancialDataset(val_encodings, val_labels)

# 5. 加載預訓練的 BERT 模型並指定分類數
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# 6. 設置訓練參數
training_args = TrainingArguments(
    output_dir="./results",          # 模型輸出的保存路徑
    evaluation_strategy="epoch",    # 每個 epoch 進行一次評估
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",           # 日志保存路徑
    logging_steps=10,               # 訓練過程中每 10 步記錄一次
    load_best_model_at_end=True,     # 訓練結束後加載最佳模型
    report_to="none"
)

# 7. 使用 Trainer 進行模型訓練
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5377,0.554129
2,0.3548,0.499384
3,0.3346,0.483422


TrainOutput(global_step=75, training_loss=0.4424799760182699, metrics={'train_runtime': 1267.1351, 'train_samples_per_second': 0.947, 'train_steps_per_second': 0.059, 'total_flos': 49333765824000.0, 'train_loss': 0.4424799760182699, 'epoch': 3.0})

In [10]:
test_texts = [
    "The company's profit has increased significantly this quarter.",
    "The increase in costs negatively affected the revenue.",
    "The company's performance remained stable."
]
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt").to(device)

outputs = model(**test_encodings)

# 獲取情緒預測
preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
predicted_labels = [label_map[pred] for pred in preds]
print(predicted_labels)


['Positive', 'Positive', 'Positive']
