### 处理数据

In [5]:
from datasets import load_dataset

datasets_path = "/root/datasets"
# 读取数据并保存在当前文件夹
# datasets = load_dataset(datasets_path,split={"train":"train[:20]","test":"test[:20]"})
datasets = load_dataset(datasets_path)

In [9]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
})

In [10]:
from transformers import AutoTokenizer

# pretrained_model_path = "D:/hugging_face/models/bert-base-cased"
pretrained_model_path = "/root/models/distilbert-base-uncased" #TODO

tokenizer = AutoTokenizer.from_pretrained(pretrained_model_path)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True,max_length= 512) # max_length 必须指定值
    

tokenized_datasets = datasets.map(tokenize_function, batched=True)



Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [11]:
# 移除 text 列, 因为模型不接受原始文本作为输入:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
# 将 label 列重命名为 labels, 因为模型期望参数的名称为 labels:
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")

In [12]:
# 设置数据集的格式以返回 PyTorch 张量
tokenized_datasets.set_format("torch")

In [13]:
tokenized_datasets['train']['labels']

tensor([0, 0, 0,  ..., 1, 1, 1])

In [14]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=16)
eval_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16)

### 训练

In [31]:
from transformers import AutoModelForSequenceClassification
# 加载模型并指定期望的标签数
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_path, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /root/models/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [33]:
from transformers import get_scheduler

num_epochs = 2  #TODO
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [3]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [35]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()

# 初始化一个列表来保存损失值
losses = []

# 存储学习率的列表
learning_rates = []

# 用于比较 得到最好的模型参数
best_loss = 1e10

# 设置最小误差的模型保存路经
save_model_path = "./my_awesome_model"

for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        losses.append(loss)
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        current_lr = lr_scheduler.get_last_lr()[0]  # 获取当前学习率
        learning_rates.append(current_lr)
        optimizer.zero_grad()
        progress_bar.update(1)
    if (epoch+1) % 1 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')  
    if loss.item() < best_loss:
        best_loss = loss.item()
        tokenizer.save_pretrained(save_model_path)
        model.save_pretrained(save_model_path, safe_serialization=False)

  0%|          | 0/3126 [00:00<?, ?it/s]

Epoch [1/2], Loss: 0.0414
Epoch [2/2], Loss: 0.1104


### 评估

In [None]:
import evaluate
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

In [2]:
datasets_path = "D:/datasets/imdb" #TODO

datasets = load_dataset(datasets_path)

tokenizer = AutoTokenizer.from_pretrained("./my_awesome_model")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True,max_length= 512) # max_length 必须指定值

tokenized_datasets = datasets.map(tokenize_function, batched=True)


# 移除 text 列, 因为模型不接受原始文本作为输入:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
# 将 label 列重命名为 labels, 因为模型期望参数的名称为 labels:
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# 设置数据集的格式以返回 PyTorch 张量
tokenized_datasets.set_format("torch")


eval_dataloader = DataLoader(tokenized_datasets['test'], batch_size=16)


  return self.fget.__get__(instance, owner)()


NameError: name 'eval_dataloader' is not defined

In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = AutoModelForSequenceClassification.from_pretrained("./my_awesome_model")

model.to(device)

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])


In [None]:

progress_bar = tqdm(range(1563))

for i, batch in enumerate(eval_dataloader):
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)
    logits = outputs.logits
    predicted_class_id = logits.argmax(-1)
    print(predicted_class_id.tolist())
    print(batch['labels'].tolist())
    prediction = predicted_class_id.tolist()
    reference = batch['labels'].tolist()
    clf_metrics.add_batch(predictions=prediction, references=reference)
    progress_bar.update(1)
clf_metrics.compute()