In [None]:
from torch import nn
import os
import torch
# Load the datasets
from datasets import load_dataset
# BERT tokenizer
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

# Set CUDA_LAUNCH_BLOCKING=1 (a flag commonly used to make CUDA errors surface
# at the call that caused them; NOTE(review): it slows training, confirm it is
# still wanted outside of debugging).
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


# Read the training and validation CSV files with identical loader settings.
dataset_train, dataset_val = (
    load_dataset('csv', data_files=csv_path, encoding='utf-8')
    for csv_path in ('data/train.csv', 'data/valid.csv')
)



tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


def encode_batch(batch):
    """Tokenize the ``text`` field of a dataset row or batch of rows.

    Args:
        batch: mapping with a ``text`` entry, either a single string or a list
            of strings (the latter when called through ``map(batched=True)``).

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``, ...) padded or
        truncated to exactly 96 tokens per text, as plain Python lists.
    """
    # No return_tensors here: asking for "pt" tensors per example adds a
    # leading singleton axis to every stored encoding, which then has to be
    # squeezed away again after batching.
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=96)


# batched=True hands the tokenizer many rows per call instead of one at a time.
dataset_train = dataset_train.map(encode_batch, batched=True).remove_columns("text")
dataset_val = dataset_val.map(encode_batch, batched=True).remove_columns("text")
# Expose only the columns the training loop reads, as torch tensors.
dataset_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
dataset_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define the model
# Prefer the first GPU, but fall back to CPU so this cell still runs without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4).to(device)


# Per-class weights passed to the loss below (one entry per label id 0-3).
weights = torch.tensor([0.1915, 0.2766, 0.3404, 0.1915], dtype=torch.float32).to(device)
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated.
# eps=1e-6 mirrors that class's default (torch's own default is 1e-8) so the
# update rule stays numerically close to the previous setup.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=1e-4, eps=1e-6)

train_dataloader = DataLoader(dataset_train['train'], shuffle=True, batch_size=16)
# Validation order does not affect the metrics, so keep it deterministic.
valid_dataloader = DataLoader(dataset_val['train'], shuffle=False, batch_size=16)
# Targets equal to 4 are skipped by the loss.
# NOTE(review): the model has 4 classes (ids 0-3); confirm whether label 4
# actually occurs in the data or whether ignore_index can be dropped.
certrion = nn.CrossEntropyLoss(ignore_index=4, weight=weights)



In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score
import torch

epochs = 20
# Run on whichever device the model already lives on instead of hard-coding it.
device = next(model.parameters()).device


def _forward_batch(batch):
    """Move one DataLoader batch to the model's device and run the model.

    Args:
        batch: dict of tensors with ``input_ids``, ``attention_mask`` and
            ``labels`` entries.

    Returns:
        ``(logits, labels)`` where both tensors are on the model's device.
    """
    batch = {k: v.to(device) for k, v in batch.items()}
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    # Encodings may carry a singleton axis at position 1 (e.g. when every
    # example was tokenized with return_tensors="pt"); drop it only if present
    # so already-flat (batch, seq_len) inputs pass through untouched.
    if input_ids.dim() == 3:
        input_ids = input_ids.squeeze(1)
    if attention_mask.dim() == 3:
        attention_mask = attention_mask.squeeze(1)
    outputs = model(input_ids=input_ids,
                    attention_mask=attention_mask)
    return outputs.logits, batch["labels"]


for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        logits, labels = _forward_batch(batch)

        loss = certrion(logits, labels)
        total_loss += loss.item()

        # Backpropagate, update the weights, then clear gradients for the next step
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # ---- validation pass ----
    model.eval()
    true_labels = []
    predictions = []
    with torch.no_grad():
        for batch in tqdm(valid_dataloader):
            logits, labels = _forward_batch(batch)
            predicted_labels = torch.argmax(logits, dim=1)
            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    # Compute metrics
    accuracy = accuracy_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions, average='macro')  # 'macro': unweighted mean of the per-class scores
    f1 = f1_score(true_labels, predictions, average='macro')
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_dataloader)}")
    print(f"Epoch {epoch+1}, Accuracy: {accuracy}, Recall: {recall}, F1 Score: {f1}")

100%|██████████| 3110/3110 [05:33<00:00,  9.32it/s]
100%|██████████| 611/611 [00:13<00:00, 45.29it/s]


Epoch 1, Loss: 0.42527758348194156
Epoch 1, Accuracy: 0.800613810741688, Recall: 0.6656466702942392, F1 Score: 0.647775556158309


100%|██████████| 3110/3110 [05:33<00:00,  9.33it/s]
100%|██████████| 611/611 [00:13<00:00, 45.33it/s]


Epoch 2, Loss: 0.30080432383361544
Epoch 2, Accuracy: 0.8002046035805627, Recall: 0.666704409213122, F1 Score: 0.645903365323835


 33%|███▎      | 1021/3110 [01:49<03:48,  9.13it/s]