<a href="https://colab.research.google.com/github/weiyuli20/llm_alignment/blob/main/BERT_%E5%88%86%E7%B1%BB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


# 1.加载bert模型

In [2]:
from transformers import BertTokenizer, BertModel


tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = BertModel.from_pretrained('bert-base-cased')

print(tokenizer.tokenize('Hello, my dog is cute'))

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

['Hello', ',', 'my', 'dog', 'is', 'cute']


In [3]:
text = 'this is a simple sentence'
bert_input = tokenizer(text,padding='max_length',
                       max_length = 10,
                       truncation  = True,
                       return_tensors = 'pt')
print(bert_input['input_ids'])
print(bert_input['token_type_ids'])
print(bert_input['attention_mask'])
print(tokenizer.decode(bert_input.input_ids[0]))

#BertTokenizer 负责输入文本的所有必要转换，可以看到，在分词时已经加入[CLS][sep]

tensor([[ 101, 1142, 1110,  170, 3014, 5650,  102,    0,    0,    0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]])
[CLS] this is a simple sentence [SEP] [PAD] [PAD] [PAD]


#2.加载数据集
使用 BBC 新闻分类数据集。有 2126 个不同的文本，每个文本都标记在 5 个类别中的一个下：

In [4]:
from datasets import load_dataset

ds = load_dataset("SetFit/bbc-news")

README.md:   0%|          | 0.00/880 [00:00<?, ?B/s]

train.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/1225 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
ds['train'][0]

{'text': 'wales want rugby league training wales could follow england s lead by training with a rugby league club.  england have already had a three-day session with leeds rhinos  and wales are thought to be interested in a similar clinic with rivals st helens. saints coach ian millward has given his approval  but if it does happen it is unlikely to be this season. saints have a week s training in portugal next week  while wales will play england in the opening six nations match on 5 february.  we have had an approach from wales   confirmed a saints spokesman.  it s in the very early stages but it is something we are giving serious consideration to.  st helens  who are proud of their welsh connections  are obvious partners for the welsh rugby union  despite a spat in 2001 over the collapse of kieron cunningham s proposed £500 000 move to union side swansea. a similar cross-code deal that took iestyn harris from leeds to cardiff in 2001 did go through  before the talented stand-off retu

#对数据集进行处理，处理成BERT模型需要的格式

In [6]:
def tokenize_function(dataset):
  return tokenizer(dataset['text'],max_length = 512,truncation = True)

ds = ds.map(tokenize_function, batched  =True)

Map:   0%|          | 0/1225 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer = tokenizer)  #动态批次填充，按照各个批次的最大长度进行填充

# 3. 构建分类模型，微调BERT实现新闻文本分类

In [None]:
import torch
import torch.nn  as nn
class BertClassifier(nn.Module):
  def __init__(self,dropout = 0.5):
    super().__init__()
    self.bert = BertModel.from_pretrained('bert-base-cased')
    self.dropout = nn.Dropout(dropout)
    self.classifier = nn.Linear(768,5)
    self.relu = nn.ReLU()

  def forward(self,x):
    input_ids = x['input_ids']
    attention_mask = x['attention_mask']
    _,pooled_output = self.bert(input_ids = input_ids,attention_mask = attention_mask,return_dict = False)
    dropout_output = self.dropout(pooled_output)
    linear_output = self.classifier(dropout_output)
    final_layer = self.relu(linear_output)
    return final_layer

    # pooled_output是[cls]对应的embedding

#更简单的写法，使用HUGGINGFACE的TrainerAPI

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('bert-base-cased',num_labels = 5)

# 4.进行训练(全参微调）

In [15]:
#定义
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer",
                                  eval_strategy="steps",
                                  eval_steps=100,)

In [13]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    # eval_pred 是 (predictions, labels) 的元组
    predictions, labels = eval_pred

    # 如果 predictions 是概率分布（如模型输出的 logits），需要先转成类别标签
    if predictions.ndim > 1:
        predictions = np.argmax(predictions, axis=1)

    # 计算准确率
    accuracy = accuracy_score(y_true=labels, y_pred=predictions)
    # 可以同时计算其他指标（如之前的 F1 分数）
    f1 = f1_score(y_true=labels, y_pred=predictions, average='weighted')

    return {"accuracy": accuracy, "f1": f1}

In [16]:
from transformers import Trainer


trainer = Trainer(
    model,
    training_args,
    train_dataset=ds['train'],
    eval_dataset=ds["test"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


Step,Training Loss,Validation Loss,Accuracy,F1
100,No log,0.372503,0.945,0.944622
200,No log,0.200963,0.968,0.967894
300,No log,0.1934,0.973,0.973032
400,No log,0.166945,0.975,0.975


TrainOutput(global_step=462, training_loss=0.08015990154051678, metrics={'train_runtime': 481.7381, 'train_samples_per_second': 7.629, 'train_steps_per_second': 0.959, 'total_flos': 964965745284462.0, 'train_loss': 0.08015990154051678, 'epoch': 3.0})


## 5.冻结一部分参数，只训练分类头




In [20]:
for param in model.base_model.parameters():
    param.requires_grad = False  # 冻结预训练的主体参数

# 确认只有头部参数可训练
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"可训练参数: {trainable_params} ({trainable_params/total_params:.5%})")


可训练参数: 3845 (0.00355%)


In [22]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=ds['train'],
    eval_dataset=ds["test"],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
100,No log,0.185596,0.975,0.974996
200,No log,0.187973,0.975,0.974996
300,No log,0.189357,0.975,0.974996
400,No log,0.189118,0.975,0.974996


TrainOutput(global_step=462, training_loss=0.02071167380262763, metrics={'train_runtime': 239.8591, 'train_samples_per_second': 15.321, 'train_steps_per_second': 1.926, 'total_flos': 964965745284462.0, 'train_loss': 0.02071167380262763, 'epoch': 3.0})

In [21]:
state_dict = model.state_dict()

# 查看所有参数的key（参数名称）
print("模型参数的key列表：")
for key in state_dict.keys():
    print(key)

模型参数的key列表：
bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.wei