In [1]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn as nn
import torch.optim as optim

bert_model_id = "/Users/zhangnengwei/nnnewworkspace/bert-base-chinese"
# 使用BERT的预训练中文词表
tokenizer_cn = BertTokenizer.from_pretrained(bert_model_id)



In [2]:
# 准备数据
# 先用这个数据作为训练数据，模型训练完成之后，
# 向模型输入这个source_sentence,模型能输出target_sentence 那就很满足。
source_sentence = "你真是个傻逼"
target_sentence = "你真是个小可爱"



In [3]:

# 标记化并添加特殊标记
# 为啥要添加特殊标记呢？
# 是为了数据可以很规整，模型可以训练起来，并且模型可以识别需要的数据。
source_tokens = tokenizer_cn.tokenize(source_sentence)
source_tokens = ['[CLS]'] + source_tokens + ['[SEP]']

target_tokens = tokenizer_cn.tokenize(target_sentence)
target_tokens = ['[CLS]'] + target_tokens + ['[SEP]']


In [153]:
source_tokens

['[CLS]', '你', '真', '是', '个', '傻', '逼', '[SEP]']

In [5]:
tokenizer_cn.cls_token_id

101

In [6]:
target_tokens

['[CLS]', '你', '真', '是', '个', '小', '可', '爱', '[SEP]']

In [7]:

# 转换为ID
source_ids = tokenizer_cn.convert_tokens_to_ids(source_tokens)
target_ids = tokenizer_cn.convert_tokens_to_ids(target_tokens)


In [8]:
source_ids

[101, 872, 4696, 3221, 702, 1004, 6873, 102]

In [9]:
target_ids

[101, 872, 4696, 3221, 702, 2207, 1377, 4263, 102]

In [10]:

# 确定最大长度（例如设置为12）
# 为啥设置这个最大长度？
# 为了整齐划一，模型可以训练.
max_source_length = 12
max_target_length = 12

# 填充（PAD）并创建注意力掩码
padding_length_source = max_source_length - len(source_ids)
source_ids = source_ids + [0] * padding_length_source
attention_mask_source = [1] * len(source_tokens) + [0] * padding_length_source

padding_length_target = max_target_length - len(target_ids)
target_ids = target_ids + [0] * padding_length_target
attention_mask_target = [1] * len(target_tokens) + [0] * padding_length_target


In [11]:
source_ids

[101, 872, 4696, 3221, 702, 1004, 6873, 102, 0, 0, 0, 0]

In [12]:
attention_mask_source

[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]

In [13]:
target_ids

[101, 872, 4696, 3221, 702, 2207, 1377, 4263, 102, 0, 0, 0]

In [14]:
attention_mask_target

[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]

In [15]:

# 转换为PyTorch张量
source_ids_tensor = torch.tensor([source_ids])
attention_mask_source_tensor = torch.tensor([attention_mask_source])

target_ids_tensor = torch.tensor([target_ids])
attention_mask_target_tensor = torch.tensor([attention_mask_target])


In [16]:
source_ids_tensor.shape

torch.Size([1, 12])

In [17]:
attention_mask_source_tensor.shape

torch.Size([1, 12])

In [18]:
target_ids_tensor.shape

torch.Size([1, 12])

In [19]:
attention_mask_target_tensor.shape

torch.Size([1, 12])

In [20]:
# 这里的[1,12] 相当于 [batch_size,seq_length]

In [21]:

# 定义模型
class Seq2SeqModel(nn.Module):
    def __init__(self, decoder_vocab_size):
        super().__init__()
        # 现在只看数据流的情况下，直接使用bert模型的编码器和解码器
        encoder = BertModel.from_pretrained(bert_model_id)
        decoder = BertModel.from_pretrained(bert_model_id)
        d_model = decoder.config.hidden_size  # 隐藏层的维度，也就是模型中间分析的特征数量

        self.encoder = encoder
        self.decoder = decoder
        self.linear = nn.Linear(d_model, decoder_vocab_size)

    def forward(self, input_ids, attention_mask, decoder_input_ids, decoder_attention_mask):
        encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # 编码器的输出
        encoder_hidden_states = encoder_outputs.last_hidden_state
        decoder_outputs = self.decoder(input_ids=decoder_input_ids, attention_mask=decoder_attention_mask,
                                       encoder_hidden_states=encoder_hidden_states)
        # 解码器的输出
        return self.linear(decoder_outputs.last_hidden_state)


In [22]:

# 初始化模型
model = Seq2SeqModel(tokenizer_cn.vocab_size)

In [23]:
# 前向传播
logits = model(input_ids=source_ids_tensor,
               attention_mask=attention_mask_source_tensor,
               # 将最后一位干掉，让模型去训练，生成这一位。
               decoder_input_ids=target_ids_tensor[:, :-1],
               decoder_attention_mask=attention_mask_target_tensor[:, :-1])

FileNotFoundError: [Errno 2] No such file or directory

In [None]:
target_ids_shifted = target_ids_tensor[:, 1:].contiguous().view(-1)

In [173]:
target_ids_shifted.shape

torch.Size([11])

In [174]:
logits = logits.contiguous().view(-1, tokenizer_cn.vocab_size)

In [175]:
logits.shape

torch.Size([11, 21128])

In [176]:
# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss(ignore_index=0)  # 忽略填充值0

In [177]:
# 计算损失
loss = criterion(logits, target_ids_shifted)

In [178]:
loss

tensor(10.1809, grad_fn=<NllLossBackward0>)

In [179]:
# 到这里一次就运行结束了。
# 下面开始循环训练的过程

In [180]:
# 定义优化器
optimizer = optim.Adam(model.parameters(), lr=0.0001)

In [181]:

# 循环训练
epochs = 100
for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    # 前向传播
    logits = model(input_ids=source_ids_tensor,
                   attention_mask=attention_mask_source_tensor,
                   decoder_input_ids=target_ids_tensor[:, :-1],
                   decoder_attention_mask=attention_mask_target_tensor[:, :-1])

    # 调整目标张量形状以适应损失函数
    # contiguous() 方法在 PyTorch 中的作用是确保张量的内存布局是连续的，
    # 从而使得一些需要连续内存布局的操作（如 view）能够顺利进行。
    # 了解和正确使用 contiguous() 是处理复杂张量操作时的一项重要技能。
    # .view(-1)是一种常用的方法，用于将张量展平（flatten）成一个一维张量。
    target_ids_shifted = target_ids_tensor[:, 1:].contiguous().view(-1)
    logits = logits.contiguous().view(-1, tokenizer_cn.vocab_size)

    # 计算损失
    loss = criterion(logits, target_ids_shifted)

    # 反向传播和优化
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")

Epoch 1/100, Loss: 10.106040000915527
Epoch 2/100, Loss: 8.465002059936523
Epoch 3/100, Loss: 7.590599536895752
Epoch 4/100, Loss: 6.773575305938721
Epoch 5/100, Loss: 5.4766340255737305
Epoch 6/100, Loss: 4.685584545135498
Epoch 7/100, Loss: 3.8850173950195312
Epoch 8/100, Loss: 3.201249837875366
Epoch 9/100, Loss: 2.6361939907073975
Epoch 10/100, Loss: 2.285773515701294
Epoch 11/100, Loss: 1.7995291948318481
Epoch 12/100, Loss: 1.500985860824585
Epoch 13/100, Loss: 1.1916507482528687
Epoch 14/100, Loss: 0.8764806985855103
Epoch 15/100, Loss: 0.8087360858917236
Epoch 16/100, Loss: 0.6433466672897339
Epoch 17/100, Loss: 0.5131864547729492
Epoch 18/100, Loss: 0.4078104794025421
Epoch 19/100, Loss: 0.33131086826324463
Epoch 20/100, Loss: 0.2964836657047272
Epoch 21/100, Loss: 0.2502151131629944
Epoch 22/100, Loss: 0.22394880652427673
Epoch 23/100, Loss: 0.1938513219356537
Epoch 24/100, Loss: 0.18232785165309906
Epoch 25/100, Loss: 0.1472962647676468
Epoch 26/100, Loss: 0.1368027925491333

In [128]:

# 保存模型和优化器状态字典
save_path = "seq2seq_model.pth"
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, save_path)

print(f"模型已保存到 {save_path}")

模型已保存到 seq2seq_model.pth


In [129]:

# 加载模型
checkpoint = torch.load(save_path)

# 重新初始化模型和优化器
loaded_model = Seq2SeqModel(tokenizer_cn.vocab_size)
loaded_model.load_state_dict(checkpoint['model_state_dict'])

loaded_optimizer = optim.Adam(loaded_model.parameters(), lr=0.0001)
loaded_optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

print("模型和优化器已加载")


模型和优化器已加载


In [184]:

# 使用模型推理

# 设置模型为评估模式
loaded_model.eval()

def translate(model, source_sentence, max_target_length=12):
    # 将源句子标记化并转换为ID
    source_tokens = tokenizer_cn.tokenize(source_sentence)
    source_tokens = ['[CLS]'] + source_tokens + ['[SEP]']
    source_ids = tokenizer_cn.convert_tokens_to_ids(source_tokens)

    # 填充源输入
    padding_length_source = max_source_length - len(source_ids)
    source_ids = source_ids + [0] * padding_length_source
    attention_mask_source = [1] * len(source_tokens) + [0] * padding_length_source

    # 转换为PyTorch张量
    source_ids_tensor = torch.tensor([source_ids])
    attention_mask_source_tensor = torch.tensor([attention_mask_source])

    # 初始化解码器输入
    decoder_input_ids = torch.tensor([[tokenizer_cn.cls_token_id]])

    # 用于存储生成的目标序列
    generated_ids = []

    for _ in range(max_target_length):
        # 前向传播
        with torch.no_grad():
            logits = model(input_ids=source_ids_tensor,
                           attention_mask=attention_mask_source_tensor,
                           decoder_input_ids=decoder_input_ids,
                           decoder_attention_mask=torch.ones_like(decoder_input_ids))

        # 获取当前时间步的预测结果
        # [1, 21128]
        next_token_logits = logits[:, -1, :]
        # 找出概率最大的哪一个
        next_token_id = next_token_logits.argmax(dim=-1).item()
        print(f"next_token_id : {next_token_id}")

        # 添加到生成的序列中
        generated_ids.append(next_token_id)

        # 更新解码器输入
        decoder_input_ids = torch.cat([decoder_input_ids, torch.tensor([[next_token_id]])], dim=-1)

        # 如果预测到了[SEP]标记，则停止生成
        if next_token_id == tokenizer_cn.sep_token_id:
            break

    # 转换生成的ID为标记
    generated_tokens = tokenizer_cn.convert_ids_to_tokens(generated_ids)
    return tokenizer_cn.convert_tokens_to_string(generated_tokens)


In [185]:

# 测试
source_sentence = "她是个傻逼"
translation = translate(loaded_model, source_sentence)
print(f"翻译: {translation}")

next_token_id : 4106
next_token_id : 3221
next_token_id : 702
next_token_id : 2207
next_token_id : 1377
next_token_id : 4263
next_token_id : 102
翻译: 瀕 是 个 小 可 爱 [SEP]
