# 语言模型
利用pytorch训练一个语言模型

学习利用torchtext来创建vocabulary，然后把数据读成batch_size的格式。

In [1]:
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random
import torch.nn as nn

# USE_CUDA = torch.cuda.is_available()

# Hyper-parameters shared by the data pipeline, the model and the training loop.
BATCH_SIZE = 32          # number of parallel text streams per batch
MAX_VOCAB_SIZE = 50000   # cap on vocabulary size (special tokens are added on top)
EMBEDDING_SIZE = 100     # word-embedding dimension
HIDDEN_SIZE = 100        # LSTM hidden-state dimension
NUM_EPOCHS = 2           # passes over the training data
LEARNING_RATE = 1e-3     # initial learning rate for the optimizer

# Pin every random number generator to the same seed so runs are reproducible.
torch.manual_seed(53113)
np.random.seed(53113)
random.seed(53113)
# if USE_CUDA:
#     torch.manual_seed(53113)

利用torchtext来读取数据
- torchtext的一个重要概念就是Field，它决定了你的数据会被如何处理；我们创建一个名为TEXT的Field对象来处理文本数据；
- torchtext提供了LanguageModelingDataset这个类来帮我们处理语言模型数据集；
- build_vocab可以根据我们提供的训练数据集来创建最高频词的单词表，max_size来帮助我们限定单词总量；
- BPTTIterator可以连续地得到连贯的句子，BPTT的全称是back propagation through time

In [2]:
# Field describing how raw text is processed; lower=True lower-cases the tokens.
TEXT = torchtext.data.Field(lower=True)

# Load the train / validation / test splits of text8 as language-modeling
# datasets that all share the TEXT field.
train, val, test = torchtext.datasets.LanguageModelingDataset.splits(
    path='./text8',
    train='text8.train.txt',
    validation='text8.dev.txt',
    test='text8.test.txt',
    text_field=TEXT,
)


In [3]:
# Build the vocabulary from the training split only, capped via max_size.
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
# Vocabulary size; the recorded run shows 50002 (the cap plus <unk> and <pad>).
len(TEXT.vocab)

50002

单词表有50002个单词而不是50000个，是因为torchtext给我们增加了两个特殊的token：<unk>表示未知的单词，<pad>表示padding

In [4]:
# Vocabulary lookup: index -> string
type(TEXT.vocab.itos)  # list
# First ten entries; the recorded output starts with '<unk>' and '<pad>'.
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']

In [5]:
# Vocabulary lookup: string -> index
print(type(TEXT.vocab.stoi))  # collections.defaultdict
# Index of the word 'july' (498 in the recorded run).
TEXT.vocab.stoi['july']

<class 'collections.defaultdict'>


498

In [6]:
# One BPTT iterator per split; each batch carries bptt_len consecutive tokens
# for each of the BATCH_SIZE parallel streams.
train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=BATCH_SIZE,
    # device=torch.device('cuda'),
    bptt_len=50,
    repeat=False,
    shuffle=True,
)

In [7]:
# Pull a single batch from the training iterator so it can be inspected.
it = iter(train_iter)
batch = next(it)
# Display the batch summary (its .text and .target fields).
batch


[torchtext.data.batch.Batch of size 32]
	[.text]:[torch.LongTensor of size 50x32]
	[.target]:[torch.LongTensor of size 50x32]

batch.text表示的是输入的句子，batch.target表示预测的句子，维度都是50*32  
下面看看.text的内容


In [8]:
# Token indices of the input batch; the recorded run shows a 50x32 LongTensor.
batch.text

tensor([[4815,   50,    6,  ..., 9116,   33,    7],
        [3143, 2748,  495,  ...,  893,  277,  317],
        [  13,    8,  850,  ...,  664,  824, 1602],
        ...,
        [   8,   34,  522,  ..., 5237,    3,   12],
        [3628, 1266,  968,  ...,    3,    2,    6],
        [   2,   54,   78,  ...,   12,  185, 3027]])

将index转化成string

In [9]:
# The second dimension is the batch dimension, so column 0 is the first stream
# of the batch. Decode it for the input first and then for the target.
for column in (batch.text, batch.target):
    print(" ".join(TEXT.vocab.itos[i] for i in column[:, 0].data.cpu()))

anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the
originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization


可以看到.text的内容和.target的内容仅相差一个位置的内容

模型的输入是一串文字，输出也是一串文字，它们之间相差一个位置，因为语言模型的目标是根据之前的单词预测下一个单词

# 定义模型
- 继承nn.Module
- 初始化函数
- forward函数
- 其余可以根据模型需要定义的相关函数

In [10]:
class RNNModel(nn.Module):
    """LSTM language model: embedding -> single-layer LSTM -> linear decoder.

    Args:
        vocab_size: Number of embedding rows and size of the output layer.
        embed_size: Dimension of each word embedding.
        hidden_size: Dimension of the LSTM hidden and cell states.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_text, hidden):
        """Score every vocabulary word at every position of ``input_text``.

        Args:
            input_text: Tensor of token indices, shape (seq_len, batch_size).
            hidden: Tuple ``(h, c)`` of LSTM states, each of shape
                (1, batch_size, hidden_size).

        Returns:
            Tuple ``(out_vocab, hidden)``: ``out_vocab`` holds the unnormalized
            scores with shape (seq_len, batch_size, vocab_size) and ``hidden``
            is the updated ``(h, c)`` state tuple.
        """
        emb = self.embed(input_text)
        # output: (seq_len, batch_size, hidden_size)
        output, hidden = self.lstm(emb, hidden)
        seq_len, batch_size, hidden_dim = output.shape
        # Flatten time and batch so the decoder sees one row per token:
        # (seq_len * batch_size, hidden_size) -> (seq_len * batch_size, vocab_size)
        out_vocab = self.linear(output.view(-1, hidden_dim))
        # Restore the time/batch axes from the LSTM output's shape. Using the
        # flattened tensor's own shape here would yield
        # (seq_len * batch_size, vocab_size, 1) instead.
        out_vocab = out_vocab.view(seq_len, batch_size, -1)
        return out_vocab, hidden

    def init_hidden(self, bsz, requires_grad=True):
        """Return zero-filled ``(h, c)`` states for a batch of ``bsz`` sequences.

        The states are created from an existing parameter so they share its
        dtype and device.

        Args:
            bsz: Batch size of the states to create.
            requires_grad: Whether the returned tensors should track gradients.

        Returns:
            Tuple of two tensors, each of shape (1, bsz, hidden_size).
        """
        weight = next(self.parameters())
        state_shape = (1, bsz, self.hidden_size)
        # Honor the caller's requires_grad instead of always passing True.
        return (weight.new_zeros(state_shape, requires_grad=requires_grad),
                weight.new_zeros(state_shape, requires_grad=requires_grad))
        

初始化一个模型

In [11]:
# Instantiate the language model with the vocabulary size and the
# hyper-parameters defined at the top of the notebook.
model = RNNModel(
    vocab_size=len(TEXT.vocab),
    embed_size=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
)
# if USE_CUDA:
#     model = model.cuda()

In [16]:
# Display the first parameter tensor yielded by model.parameters().
next(model.parameters())

Parameter containing:
tensor([[-0.4029,  0.3864,  0.5945,  ..., -1.9139, -0.3455, -0.9382],
        [ 0.2939, -0.0949,  0.7116,  ..., -0.2196, -0.2233, -1.8917],
        [-0.3359,  1.6099,  1.1068,  ...,  0.2782, -0.6299,  2.3107],
        ...,
        [ 1.1677,  0.9386, -0.7086,  ..., -0.4453,  0.1337,  2.0498],
        [-1.4050, -1.4229, -1.1365,  ...,  0.8331,  0.9180,  0.5792],
        [ 1.4029, -0.7460,  1.0759,  ..., -1.0833, -0.9645,  0.5022]],
       requires_grad=True)

训练模型

In [17]:
def repackge_hidden(h):
    """Detach hidden state(s) from the autograd graph that produced them.

    Args:
        h: A tensor, or an arbitrarily nested iterable of tensors
            (for example the ``(h, c)`` tuple of an LSTM).

    Returns:
        The detached tensor, or a tuple mirroring the structure of ``h``
        with every tensor detached.
    """
    if not isinstance(h, torch.Tensor):
        # Container of states: detach each element recursively.
        return tuple(map(repackge_hidden, h))
    return h.detach()

In [18]:
# Cross-entropy between the predicted word scores and the target word indices.
loss_fn = nn.CrossEntropyLoss()
# Adam over all model parameters, starting at LEARNING_RATE.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# Every scheduler.step() multiplies the current learning rate by gamma.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)

In [22]:
def evaluate(model, data):
    """Return the average per-token loss of ``model`` over all of ``data``.

    The model is put in eval mode for the pass and switched back to train
    mode before returning. Relies on the module-level ``loss_fn``,
    ``BATCH_SIZE``, ``TEXT`` and ``repackge_hidden``.

    Args:
        model: Model exposing ``init_hidden(bsz, requires_grad)`` and
            returning ``(output, hidden)`` when called with a batch and state.
        data: Iterable of batches that have ``.text`` and ``.target`` tensors.

    Returns:
        Sum of the token-weighted batch losses divided by the token count.

    Raises:
        ZeroDivisionError: If ``data`` yields no batches.
    """
    model.eval()
    total_loss = 0.
    total_count = 0.
    with torch.no_grad():
        hidden = model.init_hidden(bsz=BATCH_SIZE, requires_grad=False)
        for batch in data:
            text, target = batch.text, batch.target
            hidden = repackge_hidden(hidden)
            output, hidden = model(text, hidden)
            loss = loss_fn(output.view(-1, len(TEXT.vocab)), target.view(-1))
            # loss is a mean over the batch's tokens, so weight it by the
            # token count and accumulate across batches. Plain assignment
            # would keep only the last batch.
            n_tokens = text.numel()
            total_loss += loss.item() * n_tokens
            total_count += n_tokens
    # Restore training mode before the division so it also happens when
    # data was empty and the division raises.
    model.train()
    return total_loss / total_count

In [23]:
# Validation losses that set a new best, in the order they were reached.
# Kept outside the epoch loop so the best score survives across epochs.
# Resetting it per epoch would let the first evaluation of a later epoch
# overwrite the checkpoint even with a worse model.
val_losses = []
for epoch in range(NUM_EPOCHS):
    model.train()
    # Start every epoch from a zero hidden state.
    hidden = model.init_hidden(bsz=BATCH_SIZE)
    for i, batch in enumerate(train_iter):
        data, target = batch.text, batch.target
        # Carry the state values over from the previous batch, but cut the
        # graph so backpropagation stops at the batch boundary.
        hidden = repackge_hidden(hidden)
        output, hidden = model(data, hidden)
        loss = loss_fn(output.view(-1, len(TEXT.vocab)), target.view(-1))
        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 5.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()
        if i % 3 == 0:
            print("loss", loss.item())

        # Periodically evaluate on the validation set and checkpoint.
        if i % 5 == 0:
            # Keep only the model that performs best on validation.
            val_loss = evaluate(model, val_iter)
            if not val_losses or val_loss < min(val_losses):
                val_losses.append(val_loss)
                torch.save(model.state_dict(), 'lm.model')
                print("best mdel saved to lm.model")
            else:
                # No improvement: decay the learning rate.
                scheduler.step()

loss 5.804832935333252


NameError: name 'vla_loss' is not defined

由于计算资源有限，这边只能在cpu上计算，速度特别慢，先把代码保存上传