# In [None]:
import numpy as np
import inspect
from datasets import load_dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import accuracy_score, f1_score

# Checkpoint location and the parquet file backing each split.
model_path = "./bert"
train_file = "split_data/train.parquet"
test_file = "split_data/test.parquet"

# Read both parquet files into one dataset object keyed by split name.
ds = load_dataset(
    "parquet",
    data_files={"train": train_file, "test": test_file},
)

# Lookup tables between class names and integer ids; id2label is the
# inverse of label2id.
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {class_id: name for name, class_id in label2id.items()}

def map_label(example):
    """Replace the textual label of one example with its integer id.

    The value stored under ``example["label"]`` is converted with ``str()``
    and stripped of surrounding whitespace before being looked up in the
    module-level ``label2id`` table.

    Args:
        example: A mapping with a ``"label"`` entry.

    Returns:
        The same ``example`` object, modified in place.

    Raises:
        KeyError: If the cleaned label is not a key of ``label2id``.
    """
    cleaned_label = str(example["label"]).strip()
    example["label"] = label2id[cleaned_label]
    return example

# Run map_label over the dataset so the "label" column holds the integer
# ids defined in label2id.
ds = ds.map(map_label)

# Load the BERT tokenizer stored at model_path.
tokenizer = BertTokenizer.from_pretrained(model_path)

def tokenize(batch):
    """Tokenize the ``"text"`` entries of a batch with the module tokenizer.

    Sequences are truncated and padded to a fixed length of 128 tokens.

    Args:
        batch: A mapping whose ``"text"`` entry is passed to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for ``batch["text"]``.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encode_options)

# Tokenize the dataset in batches, then have it return only the
# input_ids, attention_mask and label columns as torch tensors.
ds = ds.map(tokenize, batched=True)
ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Load the sequence classifier from model_path. The number of output classes
# is derived from label2id so it cannot drift from the label mapping.
model = BertForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(label2id),
    label2id=label2id,
    id2label=id2label,
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: An object that unpacks into ``(logits, labels)``.

    Returns:
        A dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predicted)
    macro_f1 = f1_score(labels, predicted, average="macro")
    return {"accuracy": accuracy, "f1_macro": macro_f1}

# Stay compatible with both spellings of the evaluation switch
# (evaluation_strategy / eval_strategy) by inspecting which keyword the
# installed TrainingArguments actually accepts.
ta_params = inspect.signature(TrainingArguments.__init__).parameters
training_kwargs = dict(
    output_dir="./bert_cls_out",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    logging_steps=50,
)
# Prefer the current name: releases that accept both spellings treat
# `evaluation_strategy` as deprecated and emit a warning when it is used.
if "eval_strategy" in ta_params:
    training_kwargs["eval_strategy"] = "epoch"
elif "evaluation_strategy" in ta_params:
    training_kwargs["evaluation_strategy"] = "epoch"
else:
    # Without periodic evaluation there is no metric to rank checkpoints by,
    # and TrainingArguments rejects load_best_model_at_end when the save and
    # evaluation strategies differ, so best-model selection is switched off.
    training_kwargs.pop("load_best_model_at_end", None)
    training_kwargs.pop("metric_for_best_model", None)
    print("Warning: 该 transformers 版本不支持 evaluation_strategy/eval_strategy，将不自动评估。")

args = TrainingArguments(**training_kwargs)

# Assemble the Trainer arguments. Newer transformers releases take the
# tokenizer through `processing_class` and deprecate (later drop) the
# `tokenizer` keyword, so the supported name is picked from the signature.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["test"],
    compute_metrics=compute_metrics,
)
trainer_params = inspect.signature(Trainer.__init__).parameters
if "processing_class" in trainer_params:
    trainer_kwargs["processing_class"] = tokenizer
else:
    trainer_kwargs["tokenizer"] = tokenizer

trainer = Trainer(**trainer_kwargs)

trainer.train()

# Write the model and the tokenizer to the same output directory.
final_dir = "./bert_cls_final_en_uncased"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)
print(f"训练完成，模型已保存到 {final_dir}")
