In [None]:
!pip install pandas transformers scikit-learn torch numpy


In [None]:
# !pip install pandas transformers scikit-learn torch numpy

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer, DistilBertTokenizer, DistilBertForSequenceClassification, LlamaTokenizer, LlamaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Custom dataset class
class SentimentDataset(Dataset):
    """Map-style dataset that pairs raw texts with integer class labels.

    Tokenization happens lazily, one sample at a time, using whatever object is
    currently stored in ``self.tokenizer`` (``train_and_evaluate`` replaces that
    attribute before each training run).

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus(text, **options)``.
        max_len: Length every sample is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, item):
        """Tokenize sample ``item`` and return it as a dict.

        Returns:
            Dict with keys ``text`` (the raw string), ``input_ids`` and
            ``attention_mask`` (flattened tokenizer outputs) and ``labels``
            (a ``torch.long`` scalar tensor).
        """
        sample_text = self.texts[item]
        sample_label = self.labels[item]

        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_options)

        sample = {'text': sample_text}
        # flatten() collapses each tensor to 1-D (the tokenizer output
        # presumably carries a leading batch axis of size 1 -- TODO confirm).
        for field in ('input_ids', 'attention_mask'):
            sample[field] = encoded[field].flatten()
        sample['labels'] = torch.tensor(sample_label, dtype=torch.long)
        return sample

# Load the train / validation / test splits
train_df = pd.read_csv("/kaggle/input/stockemotion/train_stockemo.csv", encoding="utf-8")
val_df = pd.read_csv("/kaggle/input/stockemotion/val_stockemo.csv", encoding="utf-8")
test_df = pd.read_csv("/kaggle/input/stockemotion/test_stockemo.csv", encoding="utf-8")

# Label encoding: the encoder is fitted on the training labels only and then
# reused unchanged for the validation and test splits.
label_encoder = LabelEncoder()
train_labels = label_encoder.fit_transform(train_df['emo_label'])
val_labels = label_encoder.transform(val_df['emo_label'])
test_labels = label_encoder.transform(test_df['emo_label'])

# Create the dataset instances
max_len = 128
# Load the placeholder tokenizer once and share it between the three splits
# instead of instantiating it three times; train_and_evaluate assigns the
# model-specific tokenizer to each dataset before training anyway.
base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_dataset = SentimentDataset(train_df['processed'].tolist(), train_labels, base_tokenizer, max_len)
val_dataset = SentimentDataset(val_df['processed'].tolist(), val_labels, base_tokenizer, max_len)
test_dataset = SentimentDataset(test_df['processed'].tolist(), test_labels, base_tokenizer, max_len)

def compute_metrics(p):
    """Turn a prediction bundle into accuracy and weighted-F1 scores.

    Args:
        p: Object exposing ``predictions`` (per-class scores, argmax is taken
            over axis 1) and ``label_ids`` (the reference labels).

    Returns:
        Dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    predicted_classes = np.argmax(p.predictions, axis=1)
    reference_classes = p.label_ids
    return {
        'accuracy': accuracy_score(reference_classes, predicted_classes),
        'f1': f1_score(reference_classes, predicted_classes, average='weighted'),
    }

def train_and_evaluate(model_name, model_class, tokenizer_class, train_dataset, val_dataset, test_dataset, num_labels):
    """Fine-tune one pretrained classifier and return its test-set predictions.

    Args:
        model_name: Checkpoint identifier handed to both ``from_pretrained``
            calls; it is also embedded in the output and logging directories.
        model_class: Class exposing ``from_pretrained(name, num_labels=...,
            ignore_mismatched_sizes=True)``.
        tokenizer_class: Class exposing ``from_pretrained(name)``.
        train_dataset: Training dataset; its ``tokenizer`` attribute is
            overwritten in place with this model's tokenizer.
        val_dataset: Validation dataset (same in-place tokenizer swap); used
            for the per-epoch and final evaluation.
        test_dataset: Test dataset (same in-place tokenizer swap); used for
            the returned predictions.
        num_labels: Number of target classes for the classification head.

    Returns:
        1-D NumPy array with the predicted class index (argmax over axis 1 of
        the prediction scores) for every test sample.
    """
    import inspect  # local import: only needed for the keyword check below

    # Initialise the tokenizer and the model.
    tokenizer = tokenizer_class.from_pretrained(model_name)
    # ignore_mismatched_sizes is passed so that a checkpoint whose head size
    # differs from num_labels can still be loaded (see the from_pretrained docs).
    model = model_class.from_pretrained(model_name, num_labels=num_labels, ignore_mismatched_sizes=True)

    # Point every split at the tokenizer that belongs to this model.
    for dataset in (train_dataset, val_dataset, test_dataset):
        dataset.tokenizer = tokenizer

    # Newer transformers releases renamed `evaluation_strategy` to
    # `eval_strategy` and reject the old keyword; older releases only know the
    # old one. Pick whichever name the installed TrainingArguments accepts.
    accepted_arguments = inspect.signature(TrainingArguments).parameters
    if "eval_strategy" in accepted_arguments:
        eval_strategy_key = "eval_strategy"
    else:
        eval_strategy_key = "evaluation_strategy"

    # Training configuration.
    training_args = TrainingArguments(
        output_dir=f'./results/{model_name}',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=f'./logs/{model_name}',
        logging_steps=10,
        report_to="none",  # disable wandb logging
        **{eval_strategy_key: "epoch"},
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Train the model.
    trainer.train()

    # Evaluate on the validation set.
    eval_result = trainer.evaluate()
    print(f"Validation results for {model_name}: {eval_result}")

    # Predict on the test set; only the prediction scores are needed here.
    scores, _, _ = trainer.predict(test_dataset)
    return np.argmax(np.asarray(scores), axis=1)

# Train and evaluate each model

In [3]:

# Fine-tune the FinBERT tone checkpoint and keep its test-set predictions for the ensemble vote.
finbert_predictions = train_and_evaluate(
    'yiyanghkust/finbert-tone',
    BertForSequenceClassification,
    BertTokenizer,
    train_dataset,
    val_dataset,
    test_dataset,
    num_labels=len(label_encoder.classes_),
)


In [None]:
# from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig, TrainingArguments, Trainer
# distilbert_predictions = train_and_evaluate('distilbert-base-uncased', DistilBertForSequenceClassification, DistilBertTokenizer, train_dataset, val_dataset, test_dataset, num_labels=len(label_encoder.classes_))

In [5]:
# Fine-tune plain BERT (uncased base checkpoint) and keep its test-set predictions for the ensemble vote.
bert_predictions = train_and_evaluate(
    'bert-base-uncased',
    BertForSequenceClassification,
    BertTokenizer,
    train_dataset,
    val_dataset,
    test_dataset,
    num_labels=len(label_encoder.classes_),
)


In [6]:
# Fine-tune the Twitter RoBERTa emotion checkpoint and keep its test-set predictions for the ensemble vote.
roberta_predictions = train_and_evaluate(
    'cardiffnlp/twitter-roberta-base-emotion',
    RobertaForSequenceClassification,
    RobertaTokenizer,
    train_dataset,
    val_dataset,
    test_dataset,
    num_labels=len(label_encoder.classes_),
)


In [7]:
# Majority vote over the three fine-tuned models, one test sample at a time.
# bincount tallies the votes per label index; argmax returns the first maximum,
# so a tie goes to the lowest label index.
final_predictions = [
    np.bincount(
        [finbert_predictions[idx], bert_predictions[idx], roberta_predictions[idx]]
    ).argmax()
    for idx in range(len(test_labels))
]

# Accuracy and weighted F1 of the ensemble on the test set
accuracy = accuracy_score(test_labels, final_predictions)
f1 = f1_score(test_labels, final_predictions, average='weighted')

print(f"Test Accuracy: {accuracy}")
print(f"Test F1 Score: {f1}")

In [None]:

# You can keep adding more models here: train them with train_and_evaluate and
# list the resulting variable name in `ensemble_candidates` below.

# Voting mechanism
# Only prediction arrays that actually exist in the notebook namespace take
# part in the vote. The original cell indexed distilbert_predictions and
# spanbert_predictions unconditionally and raised NameError whenever those
# models had not been trained (the DistilBERT cell is commented out).
ensemble_candidates = [
    "finbert_predictions",
    "roberta_predictions",
    "bert_predictions",
    "distilbert_predictions",
    "spanbert_predictions",
]
available_names = [name for name in ensemble_candidates if name in globals()]
skipped_names = [name for name in ensemble_candidates if name not in globals()]
if skipped_names:
    print(f"Skipping models without predictions: {skipped_names}")
if not available_names:
    raise RuntimeError("No model predictions are available; run at least one train_and_evaluate cell first.")
available_predictions = [globals()[name] for name in available_names]

final_predictions = []

for i in range(len(test_labels)):
    votes = [model_predictions[i] for model_predictions in available_predictions]
    # bincount tallies votes per label index; argmax picks the first maximum,
    # so ties are resolved in favour of the lowest label index.
    final_predictions.append(np.bincount(votes).argmax())

# Compute accuracy and weighted F1
accuracy = accuracy_score(test_labels, final_predictions)
f1 = f1_score(test_labels, final_predictions, average='weighted')

print(f"Test Accuracy: {accuracy}")
print(f"Test F1 Score: {f1}")