In [1]:
import torch, transformers
from transformers import BertForMaskedLM, BertTokenizer
import torch.nn.functional as F

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# 添加LoRA适配器
from transformers.adapters import LoRAConfig

config1 = LoRAConfig(r=8, alpha=16)
config2 = LoRAConfig(r=8, alpha=16)
model.add_adapter("lora_adapter1", config=config1)
model.add_adapter("lora_adapter2", config=config2)
model.set_active_adapters("lora_adapter1")  # 设置适配器1为活动适配器
model.train_adapter("lora_adapter1", train_embeddings=True)
model.set_active_adapters("lora_adapter2")  # 设置适配器2为活动适配器
model.train_adapter("lora_adapter2", train_embeddings=True)

# 将模型移动到设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForMaskedLM(
  (shared_parameters): ModuleDict()
  (bert): BertModel(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict()
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(
                in_features=768, out_features=768, bias=True
                (loras): ModuleDict(
                  (lora_adapter1): LoRA()
                  (lora_adapter2): LoRA()
                )
              )
              (key): Linear(
                in_features=768, out_features=768, bias=True
                (loras): ModuleDict

In [3]:
# emb_params = 0
# trainable_params = 0
# frozen_params = 0
# for name, param in model.named_parameters():
#     if "lora" in name:
#         param.requires_grad = True
#         emb_params += param.numel()
#     else:
#         param.requires_grad = False

#     if not param.requires_grad:
#         print(f"🥶 Frozen layer '{name}'")
#         frozen_params += param.numel()
#     else:
#         print(f"🚀 Trainable layer '{name}'")
#         trainable_params += param.numel()

# print(f"Total frozen parameters: {frozen_params}")
# print(f"Total trainable parameters: {trainable_params}")

🥶 Frozen layer 'bert.embeddings.word_embeddings.weight'
🥶 Frozen layer 'bert.embeddings.position_embeddings.weight'
🥶 Frozen layer 'bert.embeddings.token_type_embeddings.weight'
🥶 Frozen layer 'bert.embeddings.LayerNorm.weight'
🥶 Frozen layer 'bert.embeddings.LayerNorm.bias'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.query.weight'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.query.bias'
🚀 Trainable layer 'bert.encoder.layer.0.attention.self.query.loras.lora_adapter1.lora_A'
🚀 Trainable layer 'bert.encoder.layer.0.attention.self.query.loras.lora_adapter1.lora_B'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.key.weight'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.key.bias'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.value.weight'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.value.bias'
🚀 Trainable layer 'bert.encoder.layer.0.attention.self.value.loras.lora_adapter1.lora_A'
🚀 Trainable layer 'bert.encoder.layer.0.attention.self.value.loras.lora

In [3]:
#truth data
path = '/afs/inf.ed.ac.uk/user/s20/s2057508/Documents/phdwork/ASRErrorCorrection/'
data1_truth = path + '/05pt/ses1_truth.txt'
data2_truth = path + '/05pt/ses2_truth.txt'
data3_truth = path + '/05pt/ses3_truth.txt'
data4_truth = path + '/05pt/ses4_truth.txt'
data5_truth = path + '/05pt/ses5_truth.txt'

In [4]:
#trans data
path = '/afs/inf.ed.ac.uk/user/s20/s2057508/Documents/phdwork/ASRErrorCorrection/'
data1_trans = path + '/05pt/ses1_trans.txt'
data2_trans = path + '/05pt/ses2_trans.txt'
data3_trans = path + '/05pt/ses3_trans.txt'
data4_trans = path + '/05pt/ses4_trans.txt'
data5_trans = path + '/05pt/ses5_trans.txt'

In [5]:
with open(data1_truth, 'r', encoding='utf-8') as f1:
    lines1_truth = f1.readlines()
with open(data2_truth, 'r', encoding='utf-8') as f2:
    lines2_truth = f2.readlines()
with open(data3_truth, 'r', encoding='utf-8') as f3:
    lines3_truth = f3.readlines()
with open(data4_truth, 'r', encoding='utf-8') as f4:
    lines4_truth = f4.readlines()
with open(data5_truth, 'r', encoding='utf-8') as f5:
    lines5_truth = f5.readlines()

with open(data1_trans, 'r', encoding='utf-8') as f6:
    lines1_trans = f6.readlines()
with open(data2_trans, 'r', encoding='utf-8') as f7:
    lines2_trans = f7.readlines()
with open(data3_trans, 'r', encoding='utf-8') as f8:
    lines3_trans = f8.readlines()
with open(data4_trans, 'r', encoding='utf-8') as f9:
    lines4_trans = f9.readlines()
with open(data5_trans, 'r', encoding='utf-8') as f10:
    lines5_trans = f10.readlines()

In [6]:
# train data
import random
fold1_truth = lines2_truth + lines3_truth + lines4_truth + lines5_truth
fold2_truth = lines1_truth + lines3_truth + lines4_truth + lines5_truth
fold3_truth = lines1_truth + lines2_truth + lines4_truth + lines5_truth
fold4_truth = lines1_truth + lines2_truth + lines3_truth + lines5_truth
fold5_truth = lines1_truth + lines2_truth + lines3_truth + lines4_truth
fold1_trans = lines2_trans + lines3_trans + lines4_trans + lines5_trans
fold2_trans = lines1_trans + lines3_trans + lines4_trans + lines5_trans
fold3_trans = lines1_trans + lines2_trans + lines4_trans + lines5_trans
fold4_trans = lines1_trans + lines2_trans + lines3_trans + lines5_trans
fold5_trans = lines1_trans + lines2_trans + lines3_trans + lines4_trans

combined_fold1 = list(zip(fold1_truth, fold1_trans))
combined_fold2 = list(zip(fold2_truth, fold2_trans))
combined_fold3 = list(zip(fold3_truth, fold3_trans))
combined_fold4 = list(zip(fold4_truth, fold4_trans))
combined_fold5 = list(zip(fold5_truth, fold5_trans))
random.shuffle(combined_fold1)
random.shuffle(combined_fold2)
random.shuffle(combined_fold3)
random.shuffle(combined_fold4)
random.shuffle(combined_fold5)
# val data
combined_lines1 = list(zip(lines1_truth, lines1_trans))
combined_lines2 = list(zip(lines2_truth, lines2_trans))
combined_lines3 = list(zip(lines3_truth, lines3_trans))
combined_lines4 = list(zip(lines4_truth, lines4_trans))
combined_lines5 = list(zip(lines5_truth, lines5_trans))

In [7]:
train_data = combined_fold1
val_data = combined_lines1
# train_data = combined_fold2
# val_data = combined_lines2
# train_data = combined_fold3
# val_data = combined_lines3
# train_data = combined_fold4
# val_data = combined_lines4
# train_data = combined_fold5
# val_data = combined_lines5

In [8]:
train_truth, train_trans = [tokenizer.encode(line_truth.strip(), add_special_tokens=True) for line_truth, _ in train_data], [tokenizer.encode(line_trans.strip(), add_special_tokens=True) for _, line_trans in train_data]
train_truth, train_trans = [torch.tensor(b) for b in train_truth], [torch.tensor(b) for b in train_trans]
train_truth, train_trans = torch.nn.utils.rnn.pad_sequence(train_truth, batch_first=True, padding_value=tokenizer.pad_token_id), torch.nn.utils.rnn.pad_sequence(train_trans, batch_first=True, padding_value=tokenizer.pad_token_id)

train_mask_truth, train_mask_trans = torch.ones_like(train_truth), torch.ones_like(train_trans)  # 创建与train_data形状相同的全1张量
train_mask_truth[train_truth == tokenizer.pad_token_id] = 0  # 将填充token位置的mask设置为0
train_mask_trans[train_trans == tokenizer.pad_token_id] = 0  # 将填充token位置的mask设置为0

# 生成随机的mask
mask_prob = 0.2  # 随机mask的概率
mask_token_id = tokenizer.mask_token_id  # 获取mask token的ID
rand_mask_truth = torch.rand(train_truth.shape) < mask_prob  # 随机生成掩码
rand_mask_trans = torch.rand(train_trans.shape) < mask_prob  # 随机生成掩码

train_truth[rand_mask_truth] = mask_token_id  # 将随机选中的token替换为mask token
train_mask_truth[rand_mask_truth] = 0  # 将随机选中的token位置的mask设置为0
train_trans[rand_mask_trans] = mask_token_id  # 将随机选中的token替换为mask token
train_mask_trans[rand_mask_trans] = 0  # 将随机选中的token位置的mask设置为0

In [9]:
val_truth, val_trans = [tokenizer.encode(line_truth.strip(), add_special_tokens=True) for line_truth, _ in val_data], [tokenizer.encode(line_trans.strip(), add_special_tokens=True) for _, line_trans in val_data]
val_truth, val_trans = [torch.tensor(b) for b in val_truth], [torch.tensor(b) for b in val_trans]
val_truth, val_trans = torch.nn.utils.rnn.pad_sequence(val_truth, batch_first=True, padding_value=tokenizer.pad_token_id), torch.nn.utils.rnn.pad_sequence(val_trans, batch_first=True, padding_value=tokenizer.pad_token_id)

val_mask_truth, val_mask_trans = torch.ones_like(val_truth), torch.ones_like(val_trans)  # 创建与train_data形状相同的全1张量
val_mask_truth[val_truth == tokenizer.pad_token_id] = 0  # 将填充token位置的mask设置为0
val_mask_trans[val_trans == tokenizer.pad_token_id] = 0  # 将填充token位置的mask设置为0

# 生成随机的mask
mask_prob = 0.2  # 随机mask的概率
mask_token_id = tokenizer.mask_token_id  # 获取mask token的ID
rand_mask_truth = torch.rand(val_truth.shape) < mask_prob  # 随机生成掩码
rand_mask_trans = torch.rand(val_trans.shape) < mask_prob  # 随机生成掩码

val_truth[rand_mask_truth] = mask_token_id  # 将随机选中的token替换为mask token
val_mask_truth[rand_mask_truth] = 0  # 将随机选中的token位置的mask设置为0
val_trans[rand_mask_trans] = mask_token_id  # 将随机选中的token替换为mask token
val_mask_trans[rand_mask_trans] = 0  # 将随机选中的token位置的mask设置为0

In [10]:
from torch.optim import AdamW

batch_size = 32
num_epochs = 10
learning_rate = 1e-5
adam_epsilon = 1e-8
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)
best_val_loss = None

In [14]:
# 微调过程

import time

for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    total_loss = 0
    
    for i in range(0, len(train_truth), batch_size):
        # 获取适配器1数据批次        
        batch_inputs_truth = train_truth[i:i+batch_size].to(device)
        batch_masks_truth = train_mask_truth[i:i+batch_size].to(device)
        labels_truth = batch_inputs_truth.clone().detach()
        labels_truth[labels_truth == tokenizer.pad_token_id] = -100

        # 获取适配器2数据批次        
        batch_inputs_trans = train_trans[i:i+batch_size].to(device)
        batch_masks_trans = train_mask_trans[i:i+batch_size].to(device)
        labels_trans = batch_inputs_trans.clone().detach()
        labels_trans[labels_trans == tokenizer.pad_token_id] = -100

        # Forward pass
        outputs_truth = model(input_ids=batch_inputs_truth, attention_mask=batch_masks_truth, labels=labels_truth)
        logits_truth = outputs_truth.logits
        outputs_trans = model(input_ids=batch_inputs_trans, attention_mask=batch_masks_trans, labels=labels_trans)
        logits_trans = outputs_trans.logits

        # Calculate loss only for non-padding tokens
        active_loss_truth = batch_masks_truth.view(-1) != 0
        active_logits_truth = logits_truth.view(-1, model.config.vocab_size)[active_loss_truth]
        active_labels_truth = labels_truth.view(-1)[active_loss_truth]
        loss_truth = F.cross_entropy(active_logits_truth, active_labels_truth)

        active_loss_trans = batch_masks_trans.view(-1) != 0
        active_logits_trans = logits_trans.view(-1, model.config.vocab_size)[active_loss_trans]
        active_labels_trans = labels_trans.view(-1)[active_loss_trans]
        loss_trans = F.cross_entropy(active_logits_trans, active_labels_trans)

        loss = loss_truth + loss_trans
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    # Calculate average loss
    avg_loss = total_loss / (len(train_data) // batch_size)
    print(f"Epoch {epoch+1} / {num_epochs}: Average Train Loss = {avg_loss:.4f}, Truth Loss = {loss_truth:.4f}, Trans Loss = {loss_trans:.4f}, Time: {epoch_time:.2f} seconds")
#     pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
#     print(pytorch_total_params)

    model.eval()
    val_loss = 0.0
    
    with torch.no_grad():
        for i in range(0, len(val_truth), batch_size):
            # 获取适配器1数据批次        
            batch_inputs_truth = val_truth[i:i+batch_size].to(device)
            batch_masks_truth = val_mask_truth[i:i+batch_size].to(device)
            labels_truth = batch_inputs_truth.clone().detach()
            labels_truth[labels_truth == tokenizer.pad_token_id] = -100

            # 获取适配器2数据批次        
            batch_inputs_trans = val_trans[i:i+batch_size].to(device)
            batch_masks_trans = val_mask_trans[i:i+batch_size].to(device)
            labels_trans = batch_inputs_trans.clone().detach()
            labels_trans[labels_trans == tokenizer.pad_token_id] = -100

            # Forward pass
            outputs_truth = model(input_ids=batch_inputs_truth, attention_mask=batch_masks_truth, labels=labels_truth)
            logits_truth = outputs_truth.logits
            outputs_trans = model(input_ids=batch_inputs_trans, attention_mask=batch_masks_trans, labels=labels_trans)
            logits_trans = outputs_trans.logits

            # Calculate loss only for non-padding tokens
            active_loss_truth = batch_masks_truth.view(-1) != 0
            active_logits_truth = logits_truth.view(-1, model.config.vocab_size)[active_loss_truth]
            active_labels_truth = labels_truth.view(-1)[active_loss_truth]
            loss_truth = F.cross_entropy(active_logits_truth, active_labels_truth)

            active_loss_trans = batch_masks_trans.view(-1) != 0
            active_logits_trans = logits_trans.view(-1, model.config.vocab_size)[active_loss_trans]
            active_labels_trans = labels_trans.view(-1)[active_loss_trans]
            loss_trans = F.cross_entropy(active_logits_trans, active_labels_trans)

            loss = loss_truth + loss_trans

            val_loss += loss.item()
    
    # Calculate average validation loss
    avg_val_loss = val_loss / (len(val_data) // batch_size)
    end_time = time.time()
    epoch_time = end_time - start_time
    print(f"Epoch {epoch+1} / {num_epochs}: Average Valid Loss = {avg_val_loss:.4f}, Truth Loss = {loss_truth:.4f}, Trans Loss = {loss_trans:.4f}, Time: {epoch_time:.2f} seconds")
    
    torch.cuda.empty_cache()
    
    # Save the best model
    if best_val_loss is None or avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "truth_model_fold5.pt")

Epoch 1 / 10: Average Loss = 2.2763, Truth Loss = 0.7810, Loss Trans = 0.8475, Time: 113.09 seconds
Epoch 1 / 10: Average Validation Loss = 1.7888, Truth Loss = 1.1987, Trans Loss = 1.2712, Time: 112.58 seconds
Epoch 2 / 10: Average Loss = 1.2554, Truth Loss = 0.1826, Loss Trans = 0.1897, Time: 112.58 seconds
Epoch 2 / 10: Average Validation Loss = 0.4387, Truth Loss = 0.2770, Trans Loss = 0.2956, Time: 113.25 seconds


KeyboardInterrupt: 