In [24]:
from collections import Counter 
import torch 
import torch.optim as optim
import numpy as np 
import torch.nn as nn 

import torchtext 
from torch import nn, Tensor
import torch.nn.functional as F
import pandas as pd
import jieba
from torch.utils.data import DataLoader
from torch.utils import data
import math
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from tqdm import tqdm
import json
from torch.nn.functional import pad, log_softmax
from torch.utils.data import Dataset
from torchtext.data import get_tokenizer
from nltk.translate.bleu_score import corpus_bleu

def read_file(json_path):
    datas=[]
    tokenizer = get_tokenizer('basic_english')
    with open(json_path, 'r', encoding="utf-8") as fp:
        lines = fp.readlines()
        for line in lines:
            line = json.loads(line)
            for i in line:
                english, chinese = i[0], i[1]
                   
            english = tokenizer(english)       
            chinese = list(jieba.cut(chinese))     
            
            english = " ".join(english)
            chinese = " ".join(chinese)             
            datas.append([english,chinese])       

    return datas

trainDatas = read_file('/home/yhz2023/code_file/train.json')
print(trainDatas[0])

class TranslationCorpus:
    def __init__(self):
        self.trainDatas = trainDatas
        self.x_max_len = max(len(row[0].split()) for row in trainDatas) + 1  
        self.y_max_len = max(len(row[1].split()) for row in trainDatas) + 2  
        
        x_datas = Counter(word for row in self.trainDatas for word in row[0].split())
        y_datas = Counter(word for row in self.trainDatas for word in row[1].split())  
        
        x_vocab = {'<pad>': 0, **{word: i+1 for i, word in enumerate(x_datas)}}                    
        y_vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2, **{word: i+3 for i, word in enumerate(y_datas)}} 
        self.x_vocab, self.y_vocab = x_vocab, y_vocab

        self.x_idx2word = {v: k for k, v in self.x_vocab.items()}
        self.y_idx2word = {v: k for k, v in self.y_vocab.items()}

    def make_batch(self, batch_size, test_batch=False):
        input_batch, output_batch, target_batch = [], [], [] 
        
        # 随机选择句子索引
        shuffleDatas = torch.randperm(len(self.trainDatas))[:batch_size] 

        for index in shuffleDatas:            
            
            x_sentence, y_sentence = self.trainDatas[index]           
            x_sentence_key = [self.x_vocab[word] for word in x_sentence.split()]
            y_sentence_key = [self.y_vocab[word] for word in y_sentence.split()]               
         
            x_sentence_key_all = x_sentence_key + [self.x_vocab['<pad>']] * (self.x_max_len - len(x_sentence_key))
            y_sentence_key_all = [self.y_vocab['<sos>']] + y_sentence_key + [self.y_vocab['<eos>']]
            y_sentence_key_all = y_sentence_key_all + [self.y_vocab['<pad>']] * (self.y_max_len - len(y_sentence_key_all)) 
           
            decode_x_sentence_key_all=[self.y_vocab['<sos>']] + ([self.y_vocab['<pad>']] * (self.y_max_len - 2))     if test_batch else y_sentence_key_all[:-1] 

            input_batch.append(x_sentence_key_all)              
            output_batch.append( decode_x_sentence_key_all )    
            target_batch.append(y_sentence_key_all[1:])      
           
        x = torch.LongTensor(input_batch)              
        decode_x = torch.LongTensor(output_batch)       
        y = torch.LongTensor(target_batch)              
        return x, decode_x, y

corpus = TranslationCorpus()
batch_size = 64
x, decode_x, y = corpus.make_batch(batch_size) 

d_k = 64          
d_v = 64           
d_embedding = 512  
n_heads = 8         
n_layers = 6        

# 词表长度
x_vocab_len=len(corpus.x_vocab)
y_vocab_len=len(corpus.y_vocab) 

# 位置编码长度
x_pos_emb_len=corpus.x_max_len+1
y_pos_emb_len=corpus.y_max_len+1


# 多头自注意力类
class MultiHeadAttention(nn.Module):
    def __init__(self):
        super().__init__()
        self.W_Q = nn.Linear(d_embedding, d_k * n_heads) 
        self.W_K = nn.Linear(d_embedding, d_k * n_heads) 
        self.W_V = nn.Linear(d_embedding, d_v * n_heads) 
        self.linear = nn.Linear(n_heads * d_v, d_embedding)
        self.layer_norm = nn.LayerNorm(d_embedding)

        
    def forward(self, Q, K, V, encodePadMask):  
        residual, batch_size = Q, Q.shape[0]

        QQ = self.W_Q(Q).view(batch_size, -1, n_heads, d_k).transpose(1,2) 
        KK = self.W_K(K).view(batch_size, -1, n_heads, d_k).transpose(1,2)  
        VV = self.W_V(V).view(batch_size, -1, n_heads, d_v).transpose(1,2)    

        encodePadMask = encodePadMask.unsqueeze(1).repeat(1, n_heads, 1, 1) 
                        
        scaled_attention = torch.matmul(QQ, KK.transpose(-1, -2)) / np.sqrt(d_k)       
        scaled_attention.masked_fill_(encodePadMask, -1e9) 
        attention_weight = nn.Softmax(dim=-1)(scaled_attention)       
        
        context = torch.matmul(attention_weight, VV)   
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_heads * d_v)       
        output = self.linear(context)         
        output = self.layer_norm(output + residual)           
        return output, attention_weight 

# 前馈网络类
class PoswiseFeedForwardNet(nn.Module):
    def __init__(self, d_ff=2048):
        super().__init__()
        
        self.conv1 = nn.Conv1d(in_channels=d_embedding, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_embedding, kernel_size=1)
        self.layer_norm = nn.LayerNorm(d_embedding)

    def forward(self, inputs):                               
        residual = inputs 
        output = nn.ReLU()(self.conv1(inputs.transpose(1, 2)))
        output = self.conv2(output).transpose(1, 2)    
        output = self.layer_norm(output + residual)         
        
        return output

def PositionalEncoding(n_position, embedding_dim):     
    position_table = np.zeros((n_position, embedding_dim)) 
    for pos_i in range(n_position):
        for hid_j in range(embedding_dim):
            angle = pos_i / np.power(10000, 2 * (hid_j // 2) / embedding_dim)
            position_table[pos_i, hid_j] = angle    
    
    position_table[:, 0::2] = np.sin(position_table[:, 0::2])  
    position_table[:, 1::2] = np.cos(position_table[:, 1::2])  
      
    return torch.FloatTensor(position_table) 

def encode_pad_mask(seq_q, seq_k):
    batch_size, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()

    pad_attn_mask = seq_k.data.eq(0).unsqueeze(1)   
    pad_attn_mask = pad_attn_mask.expand(batch_size, len_q, len_k)    
    return pad_attn_mask

def decoder_subsequent_mask(seq):
    attn_shape = [seq.size(0), seq.size(1), seq.size(1)]  
    subsequent_mask = np.triu(np.ones(attn_shape), k=1)
    subsequent_mask = torch.from_numpy(subsequent_mask).byte()   
    return subsequent_mask 

# 编码器层
class EncoderLayer(nn.Module):
    def __init__(self):
        super().__init__()        
        self.multiHeadAttention = MultiHeadAttention()            
        self.poswiseFeedForwardNet = PoswiseFeedForwardNet()   
    def forward(self, x, encodePadMask): 
        encode_output, encode_attention_weight = self.multiHeadAttention(x,x,x,encodePadMask)    
        encode_output = self.poswiseFeedForwardNet(encode_output) 
       
        return encode_output, encode_attention_weight 

# 编码器
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()        
        self.x_emb = nn.Embedding(x_vocab_len, d_embedding) 
        self.pos_emb = nn.Embedding.from_pretrained( PositionalEncoding(x_pos_emb_len, d_embedding) , freeze=True) 
        self.layers = nn.ModuleList(EncoderLayer() for _ in range(n_layers))
    def forward(self, x):  
        pos_ids = torch.arange(1, x.size(1) + 1).unsqueeze(0).to(x)
        x_pos_emb = self.x_emb(x) + self.pos_emb(pos_ids)
        encodePadMask = encode_pad_mask(x, x)
        encode_attention_weights = []

        for layer in self.layers: 
            encode_output, encode_attention_weight = layer(x_pos_emb, encodePadMask)
            encode_attention_weights.append(encode_attention_weight)
        return encode_output, encode_attention_weights

# 解码器层
class DecoderLayer(nn.Module):
    def __init__(self):
        super().__init__()        
        self.maskMultiHeadAttention = MultiHeadAttention()    
        self.multiHeadAttention = MultiHeadAttention()          
        self.pos_ffn = PoswiseFeedForwardNet()            
    def forward(self, decode_x, encode_output, encodePadMask_add_decoderSubsequentMask, encodePadMask_decodeX_x): 
        mask_decoder_output, mask_decoder_attention_weight = self.maskMultiHeadAttention(decode_x, decode_x, decode_x, encodePadMask_add_decoderSubsequentMask)
        decoder_output, decoder_attention_weight = self.multiHeadAttention(mask_decoder_output, encode_output, encode_output, encodePadMask_decodeX_x)       
        decoder_output = self.pos_ffn(decoder_output)
        return decoder_output, mask_decoder_attention_weight, decoder_attention_weight

#  解码器
class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.y_emb = nn.Embedding(y_vocab_len, d_embedding) 
        self.pos_emb = nn.Embedding.from_pretrained( PositionalEncoding(y_pos_emb_len, d_embedding), freeze=True)     
        self.layers = nn.ModuleList([DecoderLayer() for _ in range(n_layers)])
    def forward(self, decode_x, x, encode_output): 
        pos_ids = torch.arange(1, decode_x.size(1) + 1).unsqueeze(0).to(decode_x)
        y_pos_emb = self.y_emb(decode_x) + self.pos_emb(pos_ids)        
        encodePadMask_decodeX_decodeX = encode_pad_mask(decode_x, decode_x)    
        decoderSubsequentMask = decoder_subsequent_mask(decode_x)

        encodePadMask_add_decoderSubsequentMask = torch.gt((encodePadMask_decodeX_decodeX + decoderSubsequentMask), 0) 
        encodePadMask_decodeX_x = encode_pad_mask(decode_x, x)   
       
        mask_decoder_attention_weights, decoder_attention_weights = [], [] 
       
        for layer in self.layers:
            decoder_output, mask_decoder_attention_weight, decoder_attention_weight = layer(y_pos_emb, encode_output, encodePadMask_add_decoderSubsequentMask, encodePadMask_decodeX_x)
            mask_decoder_attention_weights.append(mask_decoder_attention_weight)
            decoder_attention_weights.append(decoder_attention_weight)        
          
        return decoder_output, mask_decoder_attention_weights, decoder_attention_weights

class Transformer(nn.Module):
    def __init__(self):
        super().__init__()        
        self.encoder = Encoder()        
        self.decoder = Decoder() 
        self.projection = nn.Linear(d_embedding, y_vocab_len, bias=False)
        
    def forward(self, x, decode_x):    
        encode_output, encode_attention_weights = self.encoder(x)
        decoder_output, mask_decoder_attention_weights, decoder_attention_weights = self.decoder(decode_x, x, encode_output)
        outputs = self.projection(decoder_output)  
     
        return outputs, encode_attention_weights, mask_decoder_attention_weights, decoder_attention_weights

def tokens_to_sentence(tokens, idx2word):
    return [idx2word[token] for token in tokens if token != corpus.y_vocab['<pad>']]  # Excluding padding

def calculate_bleu(model, data_loader, corpus):
    model.eval()
    references = []
    hypotheses = []

    with torch.no_grad():
        for x, decode_x, y in data_loader:
            outputs, _, _, _ = model(x, decode_x)
            _, predicted_indices = torch.max(outputs, dim=2)

            # Convert predicted indices to words
            for i in range(predicted_indices.size(0)):
                hypothesis = tokens_to_sentence(predicted_indices[i].tolist(), corpus.y_idx2word)
                reference = [tokens_to_sentence(y[i].tolist(), corpus.y_idx2word)]
                
                hypotheses.append(hypothesis)
                references.append(reference)

    # Calculate BLEU score
    bleu_score = corpus_bleu(references, hypotheses)
    return bleu_score
    
model = Transformer() 
criterion = nn.CrossEntropyLoss() 
optimizer = optim.Adam(model.parameters(), lr=0.0001) 
epochs = 20
for epoch in range(epochs):
    optimizer.zero_grad() 
    x, decode_x, y = corpus.make_batch(batch_size) # 创建训练数据-[编码器输入,解码器输入,目标数据]    
    outputs, _, _, _ = model(x, decode_x) # 获取模型输出 
    loss = criterion(outputs.view(-1, y_vocab_len), y.view(-1)) 
    if (epoch + 1) % 1 == 0:
        bleu_score = calculate_bleu(model, [corpus.make_batch(batch_size, test_batch=True) for _ in range(10)], corpus)
        print(f"Epoch: {epoch + 1:04d}, Loss: {loss:.6f}")
    loss.backward()     
    optimizer.step()
    

['if structural reforms simply lower all wages and prices , it may indeed be difficult in the short-term to counter the drop in aggregate demand .', '如果 结构 改革 降低 了 所有 工资 和 物价 ， 那么 克服 短期 总需求 下降 的确 十分困难 。']
Epoch: 0001, Loss: 3.021563
Epoch: 0002, Loss: 2.666291
Epoch: 0003, Loss: 2.346162
Epoch: 0004, Loss: 2.047945
Epoch: 0005, Loss: 1.762816
Epoch: 0006, Loss: 1.488882
Epoch: 0007, Loss: 1.229650
Epoch: 0008, Loss: 0.991173
Epoch: 0009, Loss: 0.779623
Epoch: 0010, Loss: 0.599071
Epoch: 0011, Loss: 0.451447
Epoch: 0012, Loss: 0.335408
Epoch: 0013, Loss: 0.247433
Epoch: 0014, Loss: 0.182554
Epoch: 0015, Loss: 0.135583
Epoch: 0016, Loss: 0.101789
Epoch: 0017, Loss: 0.077460
Epoch: 0018, Loss: 0.059849
Epoch: 0019, Loss: 0.047009
Epoch: 0020, Loss: 0.037548
