In [None]:
# edited by xyk
# code by Tae Hwan Jung @graykode

# NNLM: Neural Network Language Model

import torch
import torch.nn as nn
import torch.optim as optim
# torch.optim provides the optimization algorithms (Adam is used below)

In [None]:
def make_batch():
    """Build (context ids, next-word id) training pairs from the corpus.

    Reads the module-level ``sentences`` and ``word_dict``. Each sentence is
    split on whitespace; every word except the last one is looked up to form
    the context, and the last word is looked up as the prediction target
    (a causal language-model setup).

    Returns:
        A tuple ``(input_batch, target_batch)``: a list of context id lists
        and a list of target ids, both in the order of ``sentences``.
    """
    # Encode sentence by sentence (context first, then target) so dictionary
    # lookups happen in the same order as the sentences appear.
    encoded = [
        ([word_dict[token] for token in tokens[:-1]], word_dict[tokens[-1]])
        for tokens in (sentence.split() for sentence in sentences)
    ]
    input_batch = [context for context, _ in encoded]
    target_batch = [label for _, label in encoded]

    print("make batch: ", input_batch, target_batch)

    return input_batch, target_batch

In [None]:
# Model
class NNLM(nn.Module):
    """Feed-forward neural network language model.

    Scores the next word as ``b + W x + U tanh(d + H x)``, where ``x`` is the
    concatenation of the embeddings of the context words. All sizes are read
    from the module-level names ``n_class`` (vocabulary size), ``m``
    (embedding size), ``n_step`` (number of context words) and ``n_hidden``
    (hidden layer size).
    """

    def __init__(self):
        super().__init__()
        # Width of the concatenated context embeddings fed to H and W.
        context_width = n_step * m
        # C: embedding table with one m-dimensional row per vocabulary word.
        self.C = nn.Embedding(n_class, m)
        # H: weights from the concatenated embeddings to the hidden layer.
        self.H = nn.Linear(context_width, n_hidden, bias=False)
        # d: hidden layer bias, initialized to ones.
        self.d = nn.Parameter(torch.ones(n_hidden))
        # U: weights from the hidden layer to the output scores.
        self.U = nn.Linear(n_hidden, n_class, bias=False)
        # W: direct weights from the concatenated embeddings to the output scores.
        self.W = nn.Linear(context_width, n_class, bias=False)
        # b: output bias, initialized to ones.
        self.b = nn.Parameter(torch.ones(n_class))

    def forward(self, X):
        """Return one row of ``n_class`` next-word scores per context in ``X``.

        ``X`` holds word indices for the embedding lookup. The returned
        scores are unnormalized: no softmax is applied in this method.
        """
        # Look up the embedding of every context word: [batch_size, n_step, m]
        embedded = self.C(X)
        # Concatenate the word vectors of each context: [batch_size, n_step * m]
        flat = embedded.view(-1, n_step * m)
        # Hidden layer with tanh activation: [batch_size, n_hidden]
        hidden = torch.tanh(self.d + self.H(flat))
        # Direct input-to-output path plus bias: [batch_size, n_class]
        direct = self.b + self.W(flat)
        return direct + self.U(hidden)

In [None]:
# Hyper-parameters, toy corpus, vocabulary, model and training tensors.
# (In the original script everything from here on ran under
# `if __name__ == '__main__':`.)
n_step = 2 # number of steps, n-1 in paper
n_hidden = 2 # number of hidden size, h in paper
m = 2 # embedding size, m in paper

sentences = ["i like dog", "i love coffee", "i hate milk", "you hate apple", "you like dog"]
# Alternative corpus with one extra sentence:
# sentences = ["i like dog", "i love coffee", "i hate milk", "you hate apple", "you like dog", "you hate dog"]

# Sort the unique words: iterating a set of strings has no stable order across
# interpreter runs, so without sorting the word ids would change between runs.
word_list = sorted(set(" ".join(sentences).split()))
word_dict = {w: i for i, w in enumerate(word_list)}    # word -> id
number_dict = {i: w for i, w in enumerate(word_list)}  # id -> word
print(word_dict)
n_class = len(word_dict)  # number of Vocabulary

# Seed the RNG before the model is built so its initial weights are the same
# on every run.
torch.manual_seed(0)
model = NNLM()

# Cross-entropy loss on the model's output scores, optimized with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Encode the corpus, then convert the id lists to integer tensors.
input_batch, target_batch = make_batch()
input_batch = torch.LongTensor(input_batch)
target_batch = torch.LongTensor(target_batch)

In [None]:
# Training: full-batch optimization for r steps.
r = 10000
for epoch in range(r):
    # Clear the gradients accumulated by the previous step.
    optimizer.zero_grad()
    output = model(input_batch)
    # output : [batch_size, n_class], target_batch : [batch_size]

    loss = criterion(output, target_batch)
    # Report the loss every 1000 steps.
    if (epoch + 1) % 1000 == 0:
        print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss.item()))

    # Show the raw scores from the final step.
    if epoch + 1 == r:
        print("output: ", output)

    # Back-propagate and update the parameters.
    loss.backward()
    optimizer.step()

# Predict: index of the highest score per row. Gradients are not needed for
# inference, so run the forward pass under no_grad. keepdim=True keeps the
# result as one column per row, [batch_size, 1].
with torch.no_grad():
    predict = model(input_batch).argmax(dim=1, keepdim=True)

# Test: show each context (every word but the last, the same slice make_batch
# uses) next to the predicted word. flatten() always yields a 1-D tensor, so
# this also works when there is only a single sentence.
print([sen.split()[:-1] for sen in sentences], '->', [number_dict[i] for i in predict.flatten().tolist()])