# 1. 构建实验语料库

In [1]:
sentences = [
    "我 喜欢 玩具",
    "我 爱 爸爸",
    "我 讨厌 挨打" 
]
word_list = list(set(" ".join(sentences).split()))
word_to_idx = {w: i for i, w in enumerate(word_list)}
idx_to_word = {i: w for i, w in enumerate(word_list)}

voc_size = len(word_list)
print(f"词汇表: {word_to_idx}")
print(f"词汇表大小: {voc_size}")

词汇表: {'讨厌': 0, '喜欢': 1, '爱': 2, '我': 3, '玩具': 4, '爸爸': 5, '挨打': 6}
词汇表大小: 7


# 2. 生成 NPLM 训练数据

In [2]:
import torch
import random
batch_size = 2
def make_batch():
    input_batch = []
    ouput_batch = []
    selected_sentences = random.sample(sentences, batch_size)
    for sent in selected_sentences:
        word = sent.split()
        inp = [word_to_idx[w] for w in word[:-1]]
        target = word_to_idx[word[-1]]
        input_batch.append(inp)
        ouput_batch.append(target)
    input_batch = torch.LongTensor(input_batch)
    ouput_batch = torch.LongTensor(ouput_batch)
    return input_batch, ouput_batch


In [3]:
input_batch, output_batch = make_batch()
print(f"输入批处理数据: {input_batch}")

input_words = []
for input_idx in input_batch:
    input_words.append([idx_to_word[idx.item()] for idx in input_idx])
print(f"输入批处理数据的原始词: {input_words}")

print(f"目标批处理数据: {output_batch}")
target_words = [idx_to_word[idx.item()] for idx in output_batch]
print(f"目标批处理数据的原始词: {target_words}")

输入批处理数据: tensor([[3, 2],
        [3, 0]])
输入批处理数据的原始词: [['我', '爱'], ['我', '讨厌']]
目标批处理数据: tensor([5, 6])
目标批处理数据的原始词: ['爸爸', '挨打']


# 3. 定义 NPLM

In [4]:
import torch.nn as nn

class NPLM(nn.Module):
    def __init__(self, n_step, voc_size, embedding_size, n_hidden):
        super().__init__()
        self.n_step = n_step
        self.voc_size = voc_size
        self.embedding_size = embedding_size
        self.n_hidden = n_hidden
        self.C = nn.Embedding(voc_size, embedding_size)
        self.linear1 = nn.Linear(n_step*embedding_size, n_hidden)
        self.linear2 = nn.Linear(n_hidden, voc_size)
    
    def forward(self, X):
        X = self.C(X)
        X = X.view(-1, self.n_step * self.embedding_size)
        hidden = torch.tanh(self.linear1(X))
        output = self.linear2(hidden)
        return output

# 4. 实例化 NPLM

In [5]:
n_step = 2
n_hidden = 2
embedding_size = 2
model = NPLM(n_step, voc_size, embedding_size, n_hidden)
print(f"MPLM Model: {model}")

MPLM Model: NPLM(
  (C): Embedding(7, 2)
  (linear1): Linear(in_features=4, out_features=2, bias=True)
  (linear2): Linear(in_features=2, out_features=7, bias=True)
)


# 5. 训练模型

In [6]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)
for epoch in range(5000):
    optimizer.zero_grad()
    input_batch, target_batch = make_batch()
    output = model(input_batch)
    loss = criterion(output, target_batch)
    if (epoch + 1) % 1000 == 0:
        print(f"Epoch: {epoch+1}, Loss: {loss:.6f}")
    loss.backward()
    optimizer.step()

Epoch: 1000, Loss: 0.000629
Epoch: 2000, Loss: 0.000186
Epoch: 3000, Loss: 0.000093
Epoch: 4000, Loss: 0.000048
Epoch: 5000, Loss: 0.000036


# 6. 预测

In [7]:
inputs = [["我", "讨厌"], ["我", "喜欢"]]
input_idx = [[word_to_idx[w] for w in sent] for  sent in inputs]
input_batch = torch.LongTensor(input_batch)

predict = model(input_batch).data.max(1)[1]
predict_str = [idx_to_word[p.item()] for p in predict.squeeze()]
for input_seq, pred in zip(inputs, predict_str):
    print(f"{input_seq} -> {pred}")

['我', '讨厌'] -> 玩具
['我', '喜欢'] -> 挨打
