In [1]:
import pandas as pd
data_train = pd.read_csv("../dataset/train.csv")
data_test = pd.read_csv("../dataset/test.csv")

In [2]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset

# 载入预训练的BERT模型和分词器
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=24)  # 假设有24个类别

# 准备数据（示例） 
labels = data_train.label  # 用于示例的标签，根据你的数据集修改
texts = data_train.text.to_list()

# 对文本数据进行分词和编码
encoded_texts = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=128, add_special_tokens=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# 创建数据集和数据加载器
input_ids = encoded_texts["input_ids"]
attention_mask = encoded_texts["attention_mask"]
labels = torch.tensor(labels)
dataset = TensorDataset(input_ids, attention_mask, labels)

In [4]:
# 定义训练参数
batch_size = 15
learning_rate = 1e-5
num_epochs = 3

In [5]:
# 创建数据加载器
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [6]:
# 定义优化器
optimizer = AdamW(model.parameters(), lr=learning_rate)



In [7]:
# 模型微调
model.train()
# 保存微调后的模型
# model.save_pretrained("bert_text_classification")


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [8]:
# for epoch in range(num_epochs):
for batch in dataloader:
    input_ids, attention_mask, labels = batch
    outputs = model(
        input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    break


: 

In [None]:
# 推理和预测
model.eval()
with torch.no_grad():
    new_texts = ["New text samples for prediction.", "More text samples."]
    encoded_new_texts = tokenizer(new_texts, padding=True, truncation=True, return_tensors='pt')
    input_ids = encoded_new_texts["input_ids"]
    attention_mask = encoded_new_texts["attention_mask"]
    outputs = model(input_ids, attention_mask=attention_mask)
    predicted_labels = torch.argmax(outputs.logits, dim=1)
    
print("Predicted labels:", predicted_labels)