<a href="https://colab.research.google.com/github/syq-tju/Bert/blob/main/BertChinese.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.nn.functional import softmax

# 加载分词器和模型
model_name = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # 假设是二分类任务

# 示例中文文本
texts = ["这是一个非常棒的产品。", "这个产品真的很糟糕。"]

# 对文本进行编码
inputs = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

# 使用模型进行预测
with torch.no_grad():
    outputs = model(**inputs)
    predictions = softmax(outputs.logits, dim=1)

# 输出每个文本的预测结果
for i, text in enumerate(texts):
    print(f"Text: {text}")
    print(f"Predicted class: {'Positive' if predictions[i][1] > 0.5 else 'Negative'}, Score: {predictions[i][1]:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Text: 这是一个非常棒的产品。
Predicted class: Negative, Score: 0.2718
Text: 这个产品真的很糟糕。
Predicted class: Negative, Score: 0.3618


In [1]:
!pip install datasets
!pip install accelerate





In [6]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import numpy as np
import torch
from torch.nn.functional import softmax

# 加载分词器和模型
model_name = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # 假设二分类任务

# 假设你有一个包含文本和标签的数据集
texts = ["这是一个非常棒的产品。", "这个产品真的很糟糕。"]
labels = [1, 0]  # 1为正面，0为负面

# 编码文本
encoding = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']

# 创建PyTorch数据集
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

dataset = TextDataset(encoding, labels)

# 训练参数设置
training_args = TrainingArguments(
    output_dir='./results',          # 输出目录
    num_train_epochs=10,              # 训练轮次
    per_device_train_batch_size=8,   # 每个设备的批大小
    warmup_steps=500,                # 预热步骤
    weight_decay=0.01,               # 权重衰减
    logging_dir='./logs',            # 日志目录
    logging_steps=10,
)

# 初始化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# 训练模型
trainer.train()

# 使用训练后的模型进行预测
model.eval()  # 设置为评估模式
with torch.no_grad():
    outputs = model(**encoding)
    predictions = softmax(outputs.logits, dim=1)

for i, text in enumerate(texts):
    print(f"Text: {text}")
    print(f"Predicted class: {'Positive' if predictions[i][1] > 0.5 else 'Negative'}, Score: {predictions[i][1]:.4f}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,0.7881


Text: 这是一个非常棒的产品。
Predicted class: Positive, Score: 0.6394
Text: 这个产品真的很糟糕。
Predicted class: Positive, Score: 0.6265
