In [1]:
!pip install transformers torch scikit-learn


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 加载数据
data = pd.read_csv('./imdbs.csv')  # 使用正确的路径
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.1, random_state=42)


In [3]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch

# 初始化 tokenizer 和 model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Tokenization
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=512)

# 转换为 torch dataset
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']), torch.tensor(train_encodings['attention_mask']), torch.tensor(y_train.values))
test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']), torch.tensor(test_encodings['attention_mask']), torch.tensor(y_test.values))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from torch.optim import AdamW
from transformers import get_scheduler
import numpy as np

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
optim = AdamW(model.parameters(), lr=1e-5)

num_epochs = 20
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optim, num_warmup_steps=0, num_training_steps=num_training_steps)

model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        # 更新这里的批次处理方式
        input_ids, attention_mask, labels = [tensor.to(device) for tensor in batch]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()

        optim.step()
        lr_scheduler.step()
        optim.zero_grad()

        print(f"Epoch: {epoch}, Loss: {loss.item()}")


Epoch: 0, Loss: 0.011813975870609283
Epoch: 0, Loss: 0.013225291855633259
Epoch: 0, Loss: 0.011208174750208855
Epoch: 0, Loss: 0.009458571672439575
Epoch: 0, Loss: 0.008070779033005238
Epoch: 0, Loss: 0.010138371028006077
Epoch: 0, Loss: 0.007704258430749178
Epoch: 0, Loss: 0.007010070141404867
Epoch: 0, Loss: 0.006595974788069725
Epoch: 0, Loss: 0.0050117867067456245
Epoch: 0, Loss: 0.006049966439604759
Epoch: 0, Loss: 0.005655188579112291
Epoch: 1, Loss: 0.006603015586733818
Epoch: 1, Loss: 0.004771417938172817
Epoch: 1, Loss: 0.0035569719038903713
Epoch: 1, Loss: 0.0047248550690710545
Epoch: 1, Loss: 0.0039009570609778166
Epoch: 1, Loss: 0.003919094335287809
Epoch: 1, Loss: 0.003292639972642064
Epoch: 1, Loss: 0.003140582237392664
Epoch: 1, Loss: 0.0037586400285363197
Epoch: 1, Loss: 0.003500710939988494
Epoch: 1, Loss: 0.0030736045446246862
Epoch: 1, Loss: 0.002740195021033287
Epoch: 2, Loss: 0.0027477070689201355
Epoch: 2, Loss: 0.002667754888534546
Epoch: 2, Loss: 0.0030064901802

In [12]:
from sklearn.metrics import accuracy_score

model.eval()
predictions, true_labels = [], []
with torch.no_grad():
    for batch in DataLoader(test_dataset, batch_size=8):
        # 同样地，对批次数据进行解包，并发送到相应设备
        input_ids, attention_mask, labels = [tensor.to(device) for tensor in batch]

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=-1).tolist())
        true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy}")


Test Accuracy: 0.8
