In [11]:
import pandas as pd
# Read the training and test splits from the dataset directory.
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ("../dataset/train.csv", "../dataset/test.csv")
)

In [12]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from torch.utils.data import DataLoader, TensorDataset

# Load the pretrained BERT tokenizer and sequence-classification model.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=24,  # assumes the dataset has 24 classes
)

# Pull the raw texts and their labels out of the training frame
# (example setup -- adapt the column names to your own dataset).
texts = data_train.text.to_list()
labels = data_train.label

# Tokenize and encode the texts: special tokens added, sequences padded and
# truncated to at most 128 tokens, results returned as PyTorch tensors.
encoded_texts = tokenizer(
    texts,
    add_special_tokens=True,
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Build the tensor dataset that backs the data loader.
input_ids = encoded_texts["input_ids"]
attention_mask = encoded_texts["attention_mask"]
# Derive the label tensor straight from the source column instead of rebinding
# `labels` from itself: this keeps the cell idempotent, so re-running it (even
# after `labels` has been reassigned elsewhere in the notebook) always yields
# one label per training example. The labels are stored as int64 (torch.long).
labels = torch.tensor(data_train.label.to_numpy(), dtype=torch.long)
dataset = TensorDataset(input_ids, attention_mask, labels)


In [14]:
# Training hyperparameters.
num_epochs = 3        # number of passes over the training data
batch_size = 16       # examples per mini-batch
learning_rate = 1e-5  # optimizer learning rate

In [15]:
# Wrap the dataset in a loader that yields shuffled mini-batches.
dataloader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=batch_size,
)

In [16]:
# Define the optimizer.
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated in
# favour of it. `eps` and `weight_decay` are pinned to the defaults of the
# deprecated class so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)



In [17]:
# Fine-tune the model.
model.train()
for epoch in range(num_epochs):
    # Unpack each batch into batch-specific names so the notebook-level
    # `input_ids`, `attention_mask` and `labels` tensors (the full training
    # set) are not overwritten by the last mini-batch.
    for batch_input_ids, batch_attention_mask, batch_labels in dataloader:
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(
            batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # Clear gradients so they do not accumulate into the next step.
        optimizer.zero_grad()

# Save the fine-tuned model.
# model.save_pretrained("bert_text_classification")


In [None]:
# Inference and prediction.
model.eval()
with torch.no_grad():  # no gradients are needed for prediction
    new_texts = ["New text samples for prediction.", "More text samples."]
    # Tokenize with the same max_length (128) that was used for the training
    # texts, so inference inputs are truncated the same way.
    encoded_new_texts = tokenizer(
        new_texts,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=128,
    )
    # Distinct names keep the training-set `input_ids` / `attention_mask`
    # tensors intact, so earlier cells can still be re-run after this one.
    new_input_ids = encoded_new_texts["input_ids"]
    new_attention_mask = encoded_new_texts["attention_mask"]
    outputs = model(new_input_ids, attention_mask=new_attention_mask)
    # The predicted class is the index of the largest logit for each text.
    predicted_labels = torch.argmax(outputs.logits, dim=1)

print("Predicted labels:", predicted_labels)