In [1]:
import datetime
import torch
from torch import optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import unicodedata
import re
import random
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np
import datetime

In [2]:
# 定义特殊标记
SOS_token = 0
EOS_token = 1
PAD_token = 2  # 用于填充

# 定义语言处理类
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {SOS_token: "SOS", EOS_token: "EOS", PAD_token: "<pad>"}
        self.n_words = 3  # 计数 SOS、EOS 和 PAD

    def index_words(self, sentence):
        for word in sentence.split(' '):
            self.index_word(word)

    def index_word(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def normalize_string(s):
    s = unicode_to_ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def read_langs(lang1, lang2, reverse=False):
    print("Reading lines...")
    lines = open(f'{lang1}-{lang2}.txt', 'r', encoding='utf-8').read().strip().split('\n')
    pairs = [[normalize_string(s) for s in l.split('\t')] for l in lines]

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

MAX_LENGTH = 10

def filter_pair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and len(p[1].split(' ')) < MAX_LENGTH

def filter_pairs(pairs):
    return [pair for pair in pairs if filter_pair(pair)]

def prepare_data(lang1_name, lang2_name, reverse=False):
    input_lang, output_lang, pairs = read_langs(lang1_name, lang2_name, reverse)
    print(f"Read {len(pairs)} sentence pairs")

    pairs = filter_pairs(pairs)
    print(f"Trimmed to {len(pairs)} sentence pairs")

    print("Indexing words...")
    for pair in pairs:
        input_lang.index_words(pair[0])
        output_lang.index_words(pair[1])

    return input_lang, output_lang, pairs

In [3]:
def evaluate(model, iterator, criterion):

    model.eval()

    epoch_loss = 0

    with torch.no_grad():

        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg len, batch size]
            #output = [trg len, batch size, output dim]

            output_dim = output.shape[-1]

            output = output[1:].view(-1, output_dim)
            trg = trg[1:].view(-1)

            #trg = [(trg len - 1) * batch size]
            #output = [(trg len - 1) * batch size, output dim]

            loss = criterion(output, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(iterator)



In [4]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers,batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):

        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        return hidden

# 定义解码器
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, n_layers,batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden):
        input = input.unsqueeze(1)

        embedded = self.dropout(self.embedding(input))
        output, hidden = self.rnn(embedded, hidden)
        prediction = self.fc_out(output.squeeze(1))
        #prediction = self.fc_out(output)
        return prediction, hidden

# 定义seq2seq模型
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        trg_len = trg.shape[1]
        batch_size = trg.shape[0]
        output_dim = self.decoder.output_dim

        outputs = torch.zeros(batch_size,trg_len, output_dim).to(self.device)

        hidden = self.encoder(src)

        # 解码器的输入为目标序列的第一个词
        input = trg[:, 0]

        for t in range(1, trg_len):

            output, hidden = self.decoder(input, hidden)
            outputs[:,t] = output

            # 决定是否使用teacher forcing
            teacher_force = random.random() < teacher_forcing_ratio
            input = trg[:,t] if teacher_force else output.argmax(1)

        return outputs

In [5]:
SOS_token = 0
EOS_token = 1
PAD_token = 2  # 用于填充
USE_CUDA = torch.cuda.is_available()
#USE_CUDA = False
# 准备数据
input_lang, output_lang, pairs = prepare_data('eng', 'fra', True)
input_vocab_size = len(input_lang.index2word)
output_vocab_size = len(output_lang.index2word)

print(f"Input vocabulary size: {input_vocab_size}")
print(f"Output vocabulary size: {output_vocab_size}")

# 设置设备
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 创建训练数据
def indexes_from_sentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def variable_from_sentence(lang, sentence):
    indexes = indexes_from_sentence(lang, sentence)
    indexes.append(EOS_token)  # 添加 EOS_token
    var = torch.LongTensor(indexes)
    if USE_CUDA:
        var = var.cuda()  # 转移到 GPU
    return var

def variables_from_pair(pair):
    input_variable = variable_from_sentence(input_lang, pair[0])
    target_variable = variable_from_sentence(output_lang, pair[1])
    return (input_variable, target_variable)

train_data = [variables_from_pair(pair) for pair in pairs]
input_variables, target_variables = zip(*train_data)

# 使用 pad_sequence 填充输入和目标张量
input_tensor = pad_sequence(input_variables, batch_first=True, padding_value=PAD_token )  # [batch_size ,max_length]
target_tensor = pad_sequence(target_variables, batch_first=True, padding_value=PAD_token )  #[batch_size ,max_length]

# 创建 DataLoader
dataset = TensorDataset(input_tensor, target_tensor)
train_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# 模型参数
ENC_EMB_DIM = 300
DEC_EMB_DIM = 300
HID_DIM = 256
N_LAYERS = 1
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# 初始化模型
device = 'cuda' if torch.cuda.is_available() else 'cpu'
enc = Encoder(len(input_lang.index2word), ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT).to(device)
dec = Decoder(len(output_lang.index2word), DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT).to(device)
model = Seq2Seq(enc, dec, device).to(device)

# 训练设置
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.NLLLoss()

# 训练函数
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0

    for src, trg in iterator:
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()

        intrg=trg[:,:-1]
        # 创建一个全0的Tensor，其形状与第二维的每个子Tensor相同，除了第一个维度是1
        zero_tensor = torch.zeros((intrg.size(0), 1)).long().to(device)

        # 使用torch.cat在第二维（dim=1）上拼接这两个Tensor
        resultin_trg = torch.cat((zero_tensor, intrg), dim=1)

        output = model(src, resultin_trg)#[batch_size,sequence_len,output_dim]
        logp = nn.functional.log_softmax(output, dim=-1)


        output_dim = output.shape[-1]

        logp = logp.reshape(-1, output_dim)  # 使用 reshape
        trg = trg.reshape(-1)  # 使用 reshape

        loss = criterion(logp, trg)  # 计算损失
        #print(loss)
        loss.backward()
       # print(output.size())
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)  # 梯度裁剪
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# 开始训练
N_EPOCHS = 10
CLIP = 1

for epoch in range(1, N_EPOCHS + 1):
    start = datetime.datetime.now()
    loss = train(model, train_loader, optimizer, criterion, CLIP)
    end = datetime.datetime.now()
    print(f'Epoch {epoch}, Loss: {loss:.4f},用时:{end-start}')

Reading lines...
Read 20000 sentence pairs
Trimmed to 19920 sentence pairs
Indexing words...
Input vocabulary size: 6069
Output vocabulary size: 3439
Epoch 1, Loss: 3.2278,用时:0:00:11.880610
Epoch 2, Loss: 2.6748,用时:0:00:14.263614
Epoch 3, Loss: 2.4429,用时:0:00:11.574438
Epoch 4, Loss: 2.2787,用时:0:00:09.636923
Epoch 5, Loss: 2.1500,用时:0:00:09.511301
Epoch 6, Loss: 2.0473,用时:0:00:09.547615
Epoch 7, Loss: 1.9608,用时:0:00:09.525235
Epoch 8, Loss: 1.8867,用时:0:00:09.549306
Epoch 9, Loss: 1.8225,用时:0:00:09.228362
Epoch 10, Loss: 1.7727,用时:0:00:09.446836
