<a href="https://colab.research.google.com/github/syq-tju/Bert/blob/main/BertFineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets torch




In [2]:
pip install accelerate




In [3]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from torch.nn.functional import softmax

# 安装加速库（如果需要的话，可以在命令行执行）
# pip install accelerate

# 加载数据集
data_files = {'train': 'fake_news_dataset.csv', 'test': 'fake_news_dataset.csv'}
dataset = load_dataset('csv', data_files=data_files)

# 加载预训练的分词器和模型
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 数据预处理函数
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding=True, max_length=512)

# 对数据集应用预处理
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# 定义训练参数
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# 初始化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test']
)

# 开始训练
trainer.train()

# 评估模型性能
evaluation_results = trainer.evaluate()
print(evaluation_results)

# 对一个示例进行预测
test_sentence = "New studies show that eating cheese can extend your life."
encoded_input = tokenizer(test_sentence, return_tensors='pt')
with torch.no_grad():
    output = model(**encoded_input)
    probabilities = softmax(output.logits, dim=1)
    prediction = torch.argmax(probabilities)

print(f"Text: {test_sentence}")
print(f"Predicted class: {'Real' if prediction.item() == 0 else 'Fake'}")
print(f"Confidence: {probabilities[0][prediction].item():.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


{'eval_loss': 0.6429685950279236, 'eval_runtime': 0.6101, 'eval_samples_per_second': 9.835, 'eval_steps_per_second': 1.639, 'epoch': 3.0}
Text: New studies show that eating cheese can extend your life.
Predicted class: Fake
Confidence: 0.5553
