In [1]:
import torch, transformers
from transformers import BertForMaskedLM, BertTokenizer
import torch.nn.functional as F

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForMaskedLM.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# 添加LoRA适配器
from transformers.adapters import LoRAConfig

config = LoRAConfig(r=8, alpha=16)
model.add_adapter("lora_adapter", config=config)
model.train_adapter("lora_adapter", train_embeddings=True)

# 将模型移动到设备
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [13]:
# emb_params = 0
# trainable_params = 0
# frozen_params = 0
# for name, param in model.named_parameters():
#     if "lora" in name:
#         param.requires_grad = True
#         emb_params += param.numel()
#     else:
#         param.requires_grad = False

#     if not param.requires_grad:
#         print(f"🥶 Frozen layer '{name}'")
#         frozen_params += param.numel()
#     else:
#         print(f"🚀 Trainable layer '{name}'")
#         trainable_params += param.numel()

# print(f"Total frozen parameters: {frozen_params}")
# print(f"Total trainable parameters: {trainable_params}")

🥶 Frozen layer 'bert.embeddings.word_embeddings.weight'
🥶 Frozen layer 'bert.embeddings.position_embeddings.weight'
🥶 Frozen layer 'bert.embeddings.token_type_embeddings.weight'
🥶 Frozen layer 'bert.embeddings.LayerNorm.weight'
🥶 Frozen layer 'bert.embeddings.LayerNorm.bias'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.query.weight'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.query.bias'
🚀 Trainable layer 'bert.encoder.layer.0.attention.self.query.loras.lora_adapter.lora_A'
🚀 Trainable layer 'bert.encoder.layer.0.attention.self.query.loras.lora_adapter.lora_B'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.key.weight'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.key.bias'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.value.weight'
🥶 Frozen layer 'bert.encoder.layer.0.attention.self.value.bias'
🚀 Trainable layer 'bert.encoder.layer.0.attention.self.value.loras.lora_adapter.lora_A'
🚀 Trainable layer 'bert.encoder.layer.0.attention.self.value.loras.lora_ad

In [4]:
# trainable_params = []

# for name, param in model.named_parameters():
#     if "lora" in name:
#         param.requires_grad = True
#         trainable_params.append(param)

In [3]:
path = '/afs/inf.ed.ac.uk/user/s20/s2057508/Documents/phdwork/ASRErrorCorrection/'
data1 = path + '/05pt/ses1_truth.txt'
data2 = path + '/05pt/ses2_truth.txt'
data3 = path + '/05pt/ses3_truth.txt'
data4 = path + '/05pt/ses4_truth.txt'
data5 = path + '/05pt/ses5_truth.txt'

In [4]:
with open(data1, 'r', encoding='utf-8') as f1:
    lines1 = f1.readlines()
with open(data2, 'r', encoding='utf-8') as f2:
    lines2 = f2.readlines()
with open(data3, 'r', encoding='utf-8') as f3:
    lines3 = f3.readlines()
with open(data4, 'r', encoding='utf-8') as f4:
    lines4 = f4.readlines()
with open(data5, 'r', encoding='utf-8') as f5:
    lines5 = f5.readlines()

In [5]:
import random
fold1 = lines2 + lines3 + lines4 + lines5
fold2 = lines1 + lines3 + lines4 + lines5
fold3 = lines1 + lines2 + lines4 + lines5
fold4 = lines1 + lines2 + lines3 + lines5
fold5 = lines1 + lines2 + lines3 + lines4
random.shuffle(fold1)
random.shuffle(fold2)
random.shuffle(fold3)
random.shuffle(fold4)
random.shuffle(fold5)

In [6]:
# train_data = fold1
# val_data = lines1
# train_data = fold2
# val_data = lines2
# train_data = fold3
# val_data = lines3
# train_data = fold4
# val_data = lines4
train_data = fold5
val_data = lines5

In [7]:
train_data = [tokenizer.encode(line.strip(), add_special_tokens=True) for line in train_data]
train_data = [torch.tensor(b) for b in train_data]
train_data = torch.nn.utils.rnn.pad_sequence(train_data, batch_first=True, padding_value=tokenizer.pad_token_id)

train_mask = torch.ones_like(train_data)  # 创建与train_data形状相同的全1张量
train_mask[train_data == tokenizer.pad_token_id] = 0  # 将填充token位置的mask设置为0

# 生成随机的mask
mask_prob = 0.2  # 随机mask的概率
mask_token_id = tokenizer.mask_token_id  # 获取mask token的ID
rand_mask = torch.rand(train_data.shape) < mask_prob  # 随机生成掩码

train_data[rand_mask] = mask_token_id  # 将随机选中的token替换为mask token
train_mask[rand_mask] = 0  # 将随机选中的token位置的mask设置为0

In [8]:
val_data = [tokenizer.encode(line.strip(), add_special_tokens=True) for line in val_data]
val_data = [torch.tensor(b) for b in val_data]
val_data = torch.nn.utils.rnn.pad_sequence(val_data, batch_first=True, padding_value=tokenizer.pad_token_id)

val_mask = torch.ones_like(val_data)
val_mask[val_data == tokenizer.pad_token_id] = 0

# 生成随机的mask
mask_prob = 0.2
mask_token_id = tokenizer.mask_token_id
rand_mask = torch.rand(val_data.shape) < mask_prob

val_data[rand_mask] = mask_token_id
val_mask[rand_mask] = 0

In [9]:
from torch.optim import AdamW

batch_size = 32
num_epochs = 10
learning_rate = 1e-5
adam_epsilon = 1e-8
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=adam_epsilon)
best_val_loss = None

In [10]:
# 微调过程

import time

for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    total_loss = 0
    
    for i in range(0, len(train_data), batch_size):
        batch_inputs = train_data[i:i+batch_size].to(device)
        batch_masks = train_mask[i:i+batch_size].to(device)
        labels = batch_inputs.clone().detach()
        labels[labels == tokenizer.pad_token_id] = -100

        # Forward pass
        outputs = model(input_ids=batch_inputs, attention_mask=batch_masks, labels=labels)
        logits = outputs.logits

        # Calculate loss only for non-padding tokens
        active_loss = batch_masks.view(-1) != 0
        active_logits = logits.view(-1, model.config.vocab_size)[active_loss]
        active_labels = labels.view(-1)[active_loss]
        loss = F.cross_entropy(active_logits, active_labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    # Calculate average loss
    avg_loss = total_loss / (len(train_data) // batch_size)
    print(f"Epoch {epoch+1} / {num_epochs}: Average Train Loss = {avg_loss:.4f}, Time: {epoch_time:.2f} seconds")
#     pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
#     print(pytorch_total_params)

    model.eval()
    val_loss = 0.0
    
    with torch.no_grad():
        for i in range(0, len(val_data), batch_size):
            batch_inputs = val_data[i:i+batch_size].to(device)
            batch_masks = val_mask[i:i+batch_size].to(device)
            labels = batch_inputs.clone().detach()
            labels[labels == tokenizer.pad_token_id] = -100
            
            # Forward pass
            outputs = model(input_ids=batch_inputs, attention_mask=batch_masks, labels=labels)
            logits = outputs.logits

            # Calculate loss only for non-padding tokens
            active_loss = batch_masks.view(-1) != 0
            active_logits = logits.view(-1, model.config.vocab_size)[active_loss]
            active_labels = labels.view(-1)[active_loss]
            loss = F.cross_entropy(active_logits, active_labels)
            
            val_loss += loss.item()
    
    # Calculate average validation loss
    avg_val_loss = val_loss / (len(val_data) // batch_size)
    end_time = time.time()
    epoch_time = end_time - start_time
    print(f"Epoch {epoch+1} / {num_epochs}: Average Valid Loss = {avg_val_loss:.4f}, Time: {epoch_time:.2f} seconds")
    
    torch.cuda.empty_cache()
    
    # Save the best model
    if best_val_loss is None or avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), "truth_model_fold5.pt")

Epoch 1 / 10: Average Loss = 2.6370943399300253
917562
Epoch 1 / 10: Average Validation Loss = 1.7663666605949402, Time: 46.27532768249512 seconds


KeyboardInterrupt: 