In [184]:
import os
import torchtext
import torch.nn as nn
import torch.utils.data as tud
import torch.nn.functional as F
from collections import Counter

In [208]:
#prepare data
BATCH_SIZE = 32
sequence_len = 50
vocab_size = 30000
train_file, dev_file, test_file = [os.path.join('./data', file) \
                                   for file in ['text8.train', 'text8.dev', 'text8.test']]
train_raw = open(train_file).readlines()[0]
dev_raw = open(dev_file).readlines()[0]
test_raw = open(test_file).readlines()[0]
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def tokenize(text):
    return text.split(' ')

vocab = Counter(train_raw.split(' ')).most_common(vocab_size - 1)
idx_to_word = [item[0] for item in vocab]
idx_to_word.append('UNK')
word_to_idx = {word: i for i, word in enumerate(idx_to_word)}

class LanguageDataset(tud.Dataset):
    def __init__(self, text, sequence_len, idx_to_word, word_to_idx, vocab_size, device):
        super(LanguageDataset, self).__init__()
        self.device = device
        self.vocab_size = vocab_size
        self.idx_to_word = idx_to_word
        self.word_to_idx = word_to_idx
        self.word_encode = [self.word_to_idx.get(word, self.vocab_size - 1) for word in text]
        self.word_encode = torch.LongTensor(self.word_encode).to(device)
        self.sequence_len = sequence_len
    
    def __len__(self):
        return len(self.word_encode) - self.sequence_len
    
    def __getitem__(self, idx):
        x = self.word_encode[idx: min(idx + self.sequence_len, len(self.word_encode) - 1)]
        y = self.word_encode[idx + 1: min(idx + self.sequence_len + 1, len(self.word_encode))]
        return x, y

In [209]:
#data loader
train_data = LanguageDataset(train_raw, sequence_len, idx_to_word, word_to_idx, vocab_size, device)
dev_data = LanguageDataset(dev_raw, sequence_len, idx_to_word, word_to_idx, vocab_size, device)
test_data = LanguageDataset(test_raw, sequence_len, idx_to_word, word_to_idx, vocab_size, device)
train_iter = tud.DataLoader(train_data, batch_size = BATCH_SIZE, shuffle = True)
dev_iter = tud.DataLoader(dev_data, batch_size = BATCH_SIZE, shuffle = True)
test_iter = tud.DataLoader(test_data, batch_size = BATCH_SIZE, shuffle = True)

In [211]:
for i, (x, y) in enumerate(train_iter):
    print(x.shape)
    print(y.shape)
    print(' '.join([idx_to_word[idx] for idx in x[0]]))
    print(' '.join([idx_to_word[idx] for idx in y[0]]))
    break

torch.Size([32, 50])
torch.Size([32, 50])
a n d UNK p i c n i c s UNK a c r o s s UNK a r g e n t i n a UNK v e g e t a b l e s UNK a n d UNK s a l a d s
n d UNK p i c n i c s UNK a c r o s s UNK a r g e n t i n a UNK v e g e t a b l e s UNK a n d UNK s a l a d s UNK


In [234]:
#model
import torch
embed_size, hidden_size = 300, 1000
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(LanguageModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.hidden_size = hidden_size
    
    def forward(self, x, hidden):
        input_x = self.embed(x)     #batch_size * sequence_len * embedding_size
        output, hidden = self.lstm(input_x, hidden)  #output: batch_size * sequence_len * embedding_size
        output_vocab = self.linear(output)   #output_vocab:  batch_size * sequence_len * vocab_size
        return output_vocab, hidden
    
    def init_hidden(self, bsz, requires_grad=True):
        weight = next(self.parameters())
        return (weight.new_zeros((1, bsz, self.hidden_size), requires_grad=requires_grad),
                    weight.new_zeros((1, bsz, self.hidden_size), requires_grad=requires_grad))
        

In [236]:
#train
model = LanguageModel(vocab_size, embed_size, hidden_size).to(device)
learning_rate = 4e-4
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)
requires_grad = False
GRAD_CLIP = 1.0

def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
    
def evaluate(model, data_iter):
    model.eval()
    loss_all = 0.
    count = 0.
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        print(len(data_iter))
        for i, (x, y) in enumerate(data_iter):
            hidden = repackage_hidden(hidden)
            output, hidden = model(x, hidden)
            loss = loss_fn(output.view(-1, vocab_size), y.view(-1))
            loss_all += loss * x.shape[0]
            count += x.shape[0]
    mode.train()
    return loss_all / count
    
dev_loss_list = []
model_path = './best_mode.pth'
for epoch in range(2):
    model.train()
    hidden = model.init_hidden(BATCH_SIZE)
    for i, (x, y) in enumerate(train_iter):
        hidden = repackage_hidden(hidden)
        output, hidden = model(x, hidden)
        loss = loss_fn(output.view(-1, vocab_size), y.view(-1))
        loss.requires_grad_(True)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
        if i % 100 == 0:
            print("Epoch: {}, iter: {}, train loss: {}".format(epoch, i, loss))
            
        if i % 1000 == 0:
            dev_loss = evaluate(model, dev_iter)
            print("Epoch: {}, iter: {}, dev loss: {}".format(epoch, i, dev_loss))
            if len(dev_loss_list) == 0 or dev_loss < min(dev_loss_list):
                torch.save(model.state_dict(), model_path)
            else:
                scheduler.step()
            dev_loss_list.append(dev_loss)

NameError: name 'data_iter0' is not defined

In [None]:
test_model = LanguageModel(vocab_size, embed_size, hidden_size)
test_model.load_state_dict(torch.load(model_path))
words_list = []
input_x = torch.randint(vocab_size, (1, 1), dtype=torch.long)
hidden = test_model.init_hidden(1)
for i in range(100):
    output, hidden = model(input_x, hidden)
    y = torch.argmax(output.view(-1))
    input_x.fill_(y)
    word = idx_to_word[y]
    words_list.append(word)
print(' '.join(word_list))

In [92]:
import torch
rnn = nn.LSTM(10, 20, 2) #embedding_size, hidden_size, num_layer
input = torch.randn(5, 3, 10)   #sequence_len, batch_size, embedding_size
h0 = torch.randn(2, 3, 20)   #num_layer, batch_size, hidden_size
c0 = torch.randn(2, 3, 20)   #num_layer, batch_size, hidden_size
output, (hn, cn) = rnn(input, (h0, c0))
#output: sequence_len, batch_size, embedding_size
weights = next(rnn.parameters())