In [4]:
sentences = [
    "我 喜欢 玩具",
    "我 爱 爸爸",
    "我 讨厌 挨打" 
]
word_list = list(set(" ".join(sentences).split()))
word_to_idx = {w: i for i, w in enumerate(word_list)}
idx_to_word = {i: w for i, w in enumerate(word_list)}

voc_size = len(word_list)
print(f"词汇表: {word_to_idx}")
print(f"词汇表大小: {voc_size}")

词汇表: {'玩具': 0, '爸爸': 1, '喜欢': 2, '挨打': 3, '我': 4, '讨厌': 5, '爱': 6}
词汇表大小: 7


In [5]:
import torch
import random

batch_size = 2


def make_batch():
    input_batch = []
    ouput_batch = []
    selected_sentences = random.sample(sentences, batch_size)
    for sent in selected_sentences:
        word = sent.split()
        inp = [word_to_idx[w] for w in word[:-1]]
        target = word_to_idx[word[-1]]
        input_batch.append(inp)
        ouput_batch.append(target)
    input_batch = torch.LongTensor(input_batch)
    ouput_batch = torch.LongTensor(ouput_batch)
    return input_batch, ouput_batch


input_batch, output_batch = make_batch()
print(f"输入批处理数据: {input_batch}")

input_words = []
for input_idx in input_batch:
    input_words.append([idx_to_word[idx.item()] for idx in input_idx])
print(f"输入批处理数据的原始词: {input_words}")

print(f"目标批处理数据: {output_batch}")
target_words = [idx_to_word[idx.item()] for idx in output_batch]
print(f"目标批处理数据的原始词: {target_words}")

输入批处理数据: tensor([[4, 6],
        [4, 2]])
输入批处理数据的原始词: [['我', '爱'], ['我', '喜欢']]
目标批处理数据: tensor([1, 0])
目标批处理数据的原始词: ['爸爸', '玩具']


In [10]:
import torch.nn as nn

class NPLM(nn.Module):
    
    def __init__(self, voc_size, embedding_size, n_hidden):
        super().__init__()
        self.voc_size = voc_size
        self.embedding_size = embedding_size
        self.n_hidden = n_hidden
        self.C = nn.Embedding(voc_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, n_hidden, batch_first=True)
        self.linear = nn.Linear(n_hidden, voc_size)
    
    def forward(self, X):
        X = self.C(X)
        lstm_out, _ = self.lstm(X)
        output = self.linear(lstm_out[:, -1, :])
        return output
    
embedding_size = 2
n_hidden = 2
rnn_model = NPLM(voc_size, embedding_size, n_hidden)
print(f"RNN 模型结构: {rnn_model}")
        

RNN 模型结构: NPLM(
  (C): Embedding(7, 2)
  (lstm): LSTM(2, 2, batch_first=True)
  (linear): Linear(in_features=2, out_features=7, bias=True)
)


In [11]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(rnn_model.parameters(), lr=0.1)
for epoch in range(5000):
    optimizer.zero_grad()
    input_batch, target_batch = make_batch()
    output = rnn_model(input_batch)
    loss = criterion(output, target_batch)
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch+1}, Loss: {loss:.6f}")
    loss.backward()
    optimizer.step()

Epoch: 1000, Loss: 0.000304
Epoch: 2000, Loss: 0.000107
Epoch: 3000, Loss: 0.000048
Epoch: 4000, Loss: 0.000026
Epoch: 5000, Loss: 0.000015


In [12]:
inputs = [["我", "讨厌"], ["我", "喜欢"]]
input_idx = [[word_to_idx[w] for w in sent] for  sent in inputs]
input_batch = torch.LongTensor(input_batch)

predict = rnn_model(input_batch).data.max(1)[1]
predict_str = [idx_to_word[p.item()] for p in predict.squeeze()]
for input_seq, pred in zip(inputs, predict_str):
    print(f"{input_seq} -> {pred}")

['我', '讨厌'] -> 爸爸
['我', '喜欢'] -> 玩具
