In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/chinese-couplets/couplet/vocabs
/kaggle/input/chinese-couplets/couplet/test/out.txt
/kaggle/input/chinese-couplets/couplet/test/in.txt
/kaggle/input/chinese-couplets/couplet/test/.in.txt.swp
/kaggle/input/chinese-couplets/couplet/test/.out.txt.swp
/kaggle/input/chinese-couplets/couplet/train/out.txt
/kaggle/input/chinese-couplets/couplet/train/in.txt


In [2]:
file_in='/kaggle/input/chinese-couplets/couplet/train/in.txt'
file_out='/kaggle/input/chinese-couplets/couplet/train/out.txt'

In [3]:
###读取数据
try:
    enc_tokens = []
    dec_tokens = []
    cnt=0

    with open(file_in, 'r', encoding='utf-8') as f1, open(file_out, 'r', encoding='utf-8') as f2:
        for line1, line2 in zip(f1, f2):
            cnt+=1
            if cnt==1000:
                break
            chs=line1+" , "+line2
            chs = chs.split(" ")
            chs = [element for element in chs if element != "\n"]

            
            for i in range(1,len(chs)):
                enc=chs[:i]
                dec=['<s>']+chs[i:]+['</s>']
                enc_tokens.append(enc)
                dec_tokens.append(dec)
            
            
            
except FileNotFoundError:
    print("文件未找到，请检查文件路径。")
except Exception as e:
    print(f"读取文件时出现错误: {e}")

In [4]:
print(enc_tokens[:5])
print("----------------------")
print(dec_tokens[:5])

[['晚'], ['晚', '风'], ['晚', '风', '摇'], ['晚', '风', '摇', '树'], ['晚', '风', '摇', '树', '树']]
----------------------
[['<s>', '风', '摇', '树', '树', '还', '挺', ',', '晨', '露', '润', '花', '花', '更', '红', '</s>'], ['<s>', '摇', '树', '树', '还', '挺', ',', '晨', '露', '润', '花', '花', '更', '红', '</s>'], ['<s>', '树', '树', '还', '挺', ',', '晨', '露', '润', '花', '花', '更', '红', '</s>'], ['<s>', '树', '还', '挺', ',', '晨', '露', '润', '花', '花', '更', '红', '</s>'], ['<s>', '还', '挺', ',', '晨', '露', '润', '花', '花', '更', '红', '</s>']]


In [5]:
len(dec_tokens)

18760

In [6]:
vocab_file='/kaggle/input/chinese-couplets/couplet/vocabs'

In [7]:
###读取词典
with open(vocab_file,encoding='utf-8') as f:
    vocab=f.read().split("\n")
    vocab=['<pad>']+[tk for tk in vocab if tk !='']
    vocab={tk:i for i,tk in enumerate(vocab)}


In [8]:
vocab['<s>']

1

In [9]:
import torch.nn as nn
import math
import torch

In [10]:
###模型
class PositionalEncoding(nn.Module):
    def __init__(self,emb_size,dropout,maxlen=5000):
        super().__init__()
        ##行缩放数值
        den=torch.exp(-torch.arange(0,emb_size,2)*math.log(10000)/emb_size)
        ##位置编码索引
        pos=torch.arange(0,maxlen).reshape(maxlen,1)
        ##编码矩阵
        pos_embedding=torch.zeros((maxlen,emb_size))
        pos_embedding[:,0::2]=torch.sin(pos*den)
        pos_embedding[:,1::2]=torch.cos(pos*den)
        ##添加batch对应维度
        pos_embedding=pos_embedding.unsqueeze(0)
        ##dropout
        self.dropout=nn.Dropout(dropout)
        ##注册当前矩阵不参与参数更新
        self.register_buffer('pos_embedding',pos_embedding)

    def forward(self,token_embedding):
        token_len=token_embedding.size(1)
        add_emb=self.pos_embedding[:,:token_len,:]+token_embedding
        return self.dropout(add_emb)

class Seq2SeqTransformer(nn.Module):

    def __init__(self,d_model,nhead,num_enc_layers,num_dec_layers,
                dim_forward,dropout,enc_voc_size,dec_voc_size):
        super().__init__()
        ##transformer
        self.transformer=nn.Transformer(d_model=d_model,
                                       nhead=nhead,
                                       num_encoder_layers=num_enc_layers,
                                       num_decoder_layers=num_dec_layers,
                                       dim_feedforward=dim_forward,
                                       dropout=dropout,
                                                   batch_first=True  # 启用批次优先模式
)
        ##encoder input embedding
        self.enc_emb=nn.Embedding(enc_voc_size,d_model)
        ##decoder input embedding
        self.dec_emb=nn.Embedding(dec_voc_size,d_model)
        ##predict generate linear
        self.predict=nn.Linear(d_model,dec_voc_size)
        ##positional encoding
        self.pos_encoding=PositionalEncoding(d_model,dropout)

    def forward(self,enc_inp,dec_inp,tgt_mask,enc_pad_mask,dec_pad_mask):
        ##multi head attention 之前基于位置编码embedding生成
        enc_emb=self.pos_encoding(self.enc_emb(enc_inp))
        dec_emb=self.pos_encoding(self.dec_emb(dec_inp))

        ##调用transformer 计算
        outs=self.transformer(src=enc_emb,tgt=dec_emb,tgt_mask=tgt_mask,
                             src_key_padding_mask=enc_pad_mask,
                             tgt_key_padding_mask=dec_pad_mask)
        #推理
        return self.predict(outs)

       ##推理环节使用
    def encode(self,enc_inp):
        enc_emb=self.pos_encoding(self.enc_emb(enc_inp))
        return self.transformer.encoder(enc_emb)

    def decode(self,dec_inp,memory,dec_mask):
        dec_emb=self.pos_encoding(self.dec_emb(dec_inp))
        return self.transformer.decoder(dec_emb,memory,dec_mask)
        

In [11]:
def get_proc(enc_voc, dec_voc):

    # 嵌套函数定义
    # 外部函数变量生命周期会延续到内部函数调用结束 （闭包）

    def batch_proc(data):
        """
        批次数据处理并返回
        """
        enc_ids, dec_ids= [],[]
        for enc,dec in data:
            # token -> token index
            enc_idx = [vocab[tk] for tk in enc]
            dec_idx = [vocab[tk] for tk in dec]

            # encoder_input
            enc_ids.append(torch.tensor(enc_idx))
            # decoder_input
            dec_ids.append(torch.tensor(dec_idx))
        
        
        # 数据转换张量 [batch, max_token_len]
        # 用批次中最长token序列构建张量
        enc_input = pad_sequence(enc_ids, batch_first=True)
        dec_input = pad_sequence(dec_ids, batch_first=True)

        # 返回数据都是模型训练和推理的需要
        return enc_input, dec_input

    # 返回回调函数
    return batch_proc

In [12]:
from torch.utils.data import DataLoader

ds = list(zip(enc_tokens,dec_tokens))
dl = DataLoader(ds, batch_size=64, shuffle=True, collate_fn=get_proc(vocab, vocab))

In [13]:
d_model=512
nhead=8
num_enc_layers=6
num_dec_layers=6
dim_feedforward=2048
dropout=0.3


In [14]:
device = torch.device('cuda')


In [15]:
import torch.optim as optim
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence


In [16]:

# 生成掩码
def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask

In [17]:
# 构建训练模型
# 模型构建
model = Seq2SeqTransformer(
    d_model=512,
    nhead=8,
    num_enc_layers=6,
    num_dec_layers=6,
    dim_forward=2048,
    dropout=0.3,
enc_voc_size=len(vocab),
dec_voc_size=len(vocab))
model.to(device)

# 优化器、损失
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
train_loss_cnt=0
total_loss=0
epochs=2
# 训练
for epoch in range(epochs):
    model.train()
    tpbar = tqdm(dl)
    for enc_input, dec_input in tpbar:
        enc_input = enc_input.long()
        dec_input = dec_input.long()
        enc_input = enc_input.to(device)
        dec_input = dec_input.to(device)

        # 生成掩码
        tgt_mask = generate_square_subsequent_mask(dec_input.size(1)).to(device)
        enc_pad_mask = (enc_input == 0).to(device)
        dec_pad_mask = (dec_input == 0).to(device)
        # 前向传播 
        logits = model(enc_input, dec_input,tgt_mask,enc_pad_mask,dec_pad_mask)
        # 计算损失
        tgt_output = dec_input[:, 1:].contiguous().view(-1)  # 移除 <bos>，形状：(batch * (tgt_len-1),)
        output = logits[:, :-1].contiguous().view(-1, logits.size(-1))  # 移除最后一个位置（假设是 <eos>）
        
       
        loss = criterion(output, tgt_output)
            
        # 反向传播和优化
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)  # 梯度裁剪
        optimizer.step()
            
        total_loss += loss.item()
        
        # 打印训练信息
        tpbar.set_description(f'Epoch {epoch+1}, Loss: {loss.item():.4f}')


torch.save(model.state_dict(), 'transformer.bin')

Epoch 1, Loss: 3.6583: 100%|██████████| 294/294 [00:53<00:00,  5.53it/s]
Epoch 2, Loss: 4.1365: 100%|██████████| 294/294 [00:52<00:00,  5.61it/s]


In [None]:
 ##1、通过词典把token转换为token_index
    ##2、通过Dataloader把encoder,decoder封装为带有batch的训练数据
    #3 Dataloader的collate_fn调用自定义转换方法填充模型训练数据
    ##3.1  encoder矩阵使用pad_sequence填充
    ##3.2 decoder前面部分训练数据dec_token_matrix[:,:-1,:]
    ##3.3 decoder后面部分训练目标 dec_token_matrix[:,1:,:]
    ##创建mask
    #4.1、dec_mask上三角填充 -inf的mask
    #4.2 enc_pad_mask :{enc矩阵==0}
    #4.3 dec_pad_mask:{dec矩阵==0}
    ##5 创建模型（根据GPU内存大小设计编码和解码器参数和层数）优化器 损失
    ##6 训练模型并保存