In [1]:
"""
Train an n-gram neural language model and use it to predict text.
"""

import torch
import torch.nn as nn
import torch.optim as optim


class NLM(nn.Module):
    """
    Feed-forward n-gram neural language model.

    The ids of the ``context_size`` preceding tokens are embedded, the
    embeddings are concatenated, passed through one ReLU hidden layer and
    projected to log-probabilities over the vocabulary.

    Args:
        context_size: number of context tokens per example (n-gram size - 1).
        vocab_size: number of distinct token ids.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the hidden layer.
    """
    def __init__(self, context_size, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear_W = nn.Linear(context_size*embedding_dim, hidden_dim)
        self.linear_U = nn.Linear(hidden_dim, vocab_size)

    def forward(self, context_word_ids):
        """
        Compute next-token log-probabilities.

        Args:
            context_word_ids: integer tensor of token ids, either a single
                context of shape (context_size,) or a batch of shape
                (batch, context_size).

        Returns:
            Tensor of shape (batch, vocab_size) holding log-probabilities;
            batch is 1 for a single context.
        """
        # Concatenate the embeddings of each context into one row. Reshaping
        # to the input width of linear_W (context_size*embedding_dim) keeps
        # the single-context case at (1, ...) and lets batches pass through.
        # shape: (batch, context_size*embedding_dim)
        projection_layer = self.embedding(context_word_ids).reshape(
            -1, self.linear_W.in_features)
        # shape: (batch, hidden_dim)
        hidden_layer = torch.relu(self.linear_W(projection_layer))
        # shape: (batch, vocab_size)
        output_layer = self.linear_U(hidden_layer)
        # shape: (batch, vocab_size)
        return torch.log_softmax(output_layer, dim=1)

    
def build_vocab(sentences):
    """
    Build token <-> index lookup tables from whitespace-tokenized sentences.

    Indices are assigned in order of first appearance, so the mapping is the
    same on every run (iterating a set of strings is not, because string
    hashing is randomized per interpreter process).

    Args:
        sentences: iterable of strings; each is split on whitespace.

    Returns:
        (token_to_index, index_to_token): two dicts that are inverses of each
        other, with indices 0 .. number_of_distinct_tokens - 1.
    """
    token_to_index = {}
    for sentence in sentences:
        for token in sentence.split():
            # len() is evaluated before insertion, so a new token gets the
            # next free index and a known token keeps its existing one.
            token_to_index.setdefault(token, len(token_to_index))
    index_to_token = {index: token for token, index in token_to_index.items()}
    return token_to_index, index_to_token


def ngram_generator(context_size, sentences, token_to_index):
    """
    Yield (context token ids tensor, target token id tensor) training pairs.

    Each sentence is split on whitespace and a window of ``context_size``
    tokens is slid over it; the token right after the window is the target.
    e.g. (tensor([12,  0, 10]), tensor([4]))
    corresponding to (['I', 'have', 'to'], 'make')

    Sentences with ``context_size`` tokens or fewer have no target position
    and yield nothing.

    Args:
        context_size: number of context tokens per pair (n-gram size - 1).
        sentences: iterable of whitespace-tokenized strings.
        token_to_index: dict mapping every token to its integer id.

    Yields:
        (context, target): long tensors of shape (context_size,) and (1,).

    Raises:
        KeyError: if a token is missing from ``token_to_index``.
    """
    for s in sentences:
        tokens = s.split()
        # The range is empty when the sentence is too short, instead of
        # running past the end of `tokens`.
        for t in range(len(tokens) - context_size):
            context_ids = [token_to_index[w] for w in tokens[t:t+context_size]]
            target_id = token_to_index[tokens[t+context_size]]
            context = torch.tensor(context_ids, dtype=torch.long)
            target = torch.tensor([target_id], dtype=torch.long)
            yield context, target

In [85]:
# Toy corpus: four near-identical sentences that differ in a few words.
sentences = ["I have to make sure when I get home to foo the cat .",
             "I have to make foo when I get home to feed the cat .",
             "I have to make sure when you get home to feed the cat .",
             "I have to make sure when I get house to feed the dog ."]
token_to_index, index_to_token = build_vocab(sentences)

# Hyperparameters
context_size = 3  # ngram size - 1
EMBEDDING_DIM = 32
HIDDEN_DIM = 32
NUM_EPOCHS = 30
LEARNING_RATE = 0.01
SEED = 0

# Seed torch's RNG so the parameter initialization is the same on every run.
torch.manual_seed(SEED)
model = NLM(context_size=context_size, vocab_size=len(token_to_index),
            embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM)
criterion = nn.NLLLoss()  # the model already outputs log-probabilities
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

# The n-gram tensors do not change between epochs, so build them once.
training_pairs = list(ngram_generator(context_size, sentences, token_to_index))

# training: one SGD step per (context, target) pair
for _ in range(NUM_EPOCHS):
    for context, target in training_pairs:
        optimizer.zero_grad()
        log_prob = model(context)
        loss = criterion(log_prob, target)
        loss.backward()
        optimizer.step()

In [86]:
print('corpus sentences:')
for s in sentences:
    print(s)
print()

# Seed context; must contain exactly context_size tokens from the vocabulary.
context_words = ['I', 'have', 'to']
assert len(context_words) == context_size
print('predicted sentence:')
for w in context_words:
    print(w, end=' ')

# Upper bound on generated tokens, so greedy decoding cannot loop forever if
# the model falls into a cycle that never emits '.'.
MAX_GENERATED_TOKENS = 50

# predict: greedy decoding, feeding each predicted word back as context
with torch.no_grad():  # inference only; no autograd graph needed
    for _ in range(MAX_GENERATED_TOKENS):
        context_tensor = torch.tensor(
            [token_to_index[w] for w in context_words], dtype=torch.long)
        predicted_log_prob = model(context_tensor)
        next_word_index = torch.argmax(predicted_log_prob).item()
        next_word = index_to_token[next_word_index]
        print(next_word, end=' ')
        if next_word == '.':
            break
        # Slide the context window forward by one token.
        context_words = context_words[1:] + [next_word]

corpus sentences:
I have to make sure when I get home to foo the cat .
I have to make foo when I get home to feed the cat .
I have to make sure when you get home to feed the cat .
I have to make sure when I get house to feed the dog .

predicted sentence:
I have to make sure when I get home to feed the cat . 

In [74]:
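# The model generates the most plausible sentence even though that exact sentence is not in the corpus.
# It might also be interesting to try different values of context_size.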