In [None]:
import torch
import torch.nn as nn
import torchtext
from torchtext.vocab import Vectors
import numpy as np
import random


In [None]:
class LanguageModel(nn.Module):
    """LSTM language model: embedding -> dropout -> LSTM -> dropout -> linear decoder.

    The LSTM is built without ``batch_first``, so inputs are sequence-first:
    token indices of shape ``(seq_len, batch)``.

    Args:
        vocab_size: Number of rows in the embedding table and number of output logits.
        embedding_size: Width of each token embedding (LSTM input size).
        hidden_size: Width of the LSTM hidden and cell states.
        nlayers: Number of stacked LSTM layers.
        dropout: Dropout probability used on the embeddings, between LSTM
            layers, and on the LSTM output.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, nlayers, dropout=0.5):
        super(LanguageModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.hidden_size = hidden_size
        self.nlayers = nlayers

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, nlayers, dropout=dropout)
        # Decoder: projects every hidden state onto vocabulary logits.
        self.linear = nn.Linear(hidden_size, vocab_size)
        self.init_weights()

    def init_weights(self):
        """Initialise the embedding and decoder weights uniformly in [-0.1, 0.1].

        The decoder bias is zeroed. LSTM parameters keep PyTorch's default
        initialisation.
        """
        initrange = 0.1
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        nn.init.zeros_(self.linear.bias)
        # The decoder weight must be initialised here; re-initialising the
        # embedding a second time would leave it at the default nn.Linear init.
        nn.init.uniform_(self.linear.weight, -initrange, initrange)

    def forward(self, input, hidden):
        """Run one chunk of tokens through the model.

        Args:
            input: Token indices, ``(seq_len, batch)``.
            hidden: ``(h, c)`` tuple as produced by :meth:`init_hidden` or by a
                previous call to ``forward``.

        Returns:
            ``(logits, hidden)`` where ``logits`` has shape
            ``(seq_len, batch, vocab_size)`` and ``hidden`` is the updated
            ``(h, c)`` tuple returned by the LSTM.
        """
        embed = self.drop(self.embedding(input))
        output, hidden = self.rnn(embed, hidden)
        output = self.drop(output)
        seq_len, batch_size, feature_size = output.size(0), output.size(1), output.size(2)
        # Flatten time and batch so the decoder sees one row per token, then
        # restore the (seq_len, batch, vocab) layout for the caller.
        logits = self.linear(output.view(-1, feature_size))
        return logits.view(seq_len, batch_size, logits.size(1)), hidden

    def init_hidden(self, batch_size, requires_grad=True):
        """Return a zero ``(h, c)`` state for a batch of ``batch_size`` sequences.

        Both tensors have shape ``(nlayers, batch_size, hidden_size)`` and are
        created with the dtype and device of the model's first parameter.
        """
        # new_zeros inherits dtype/device from an existing parameter, so the
        # state follows the model when it is moved to another device.
        weight = next(self.parameters())
        shape = (self.nlayers, batch_size, self.hidden_size)
        return (
            weight.new_zeros(shape, requires_grad=requires_grad),
            weight.new_zeros(shape, requires_grad=requires_grad),
        )

In [None]:
def repackage_hidden(hidden):
    """Detach a hidden state from the autograd graph that produced it.

    A tensor is returned detached; any other iterable (e.g. the LSTM's
    ``(h, c)`` pair) is processed element by element, recursively, and
    returned as a tuple.
    """
    if not isinstance(hidden, torch.Tensor):
        # Container of states: detach each member and rebuild it as a tuple.
        return tuple(map(repackage_hidden, hidden))
    return hidden.detach()

In [None]:
def evaluate(model, dev_iter):
    """Return the token-weighted average loss of ``model`` over ``dev_iter``.

    The model is put in eval mode for the pass and switched back to train
    mode before returning. Reads the notebook-level globals ``BATCH_SIZE``,
    ``use_cuda`` and ``loss_fn``.

    Args:
        model: Module exposing ``init_hidden(batch_size, requires_grad=...)``
            and ``forward(data, hidden) -> (output, hidden)``.
        dev_iter: Iterable of batches with ``.text`` and ``.target`` tensors.

    Returns:
        Sum of per-batch losses, each weighted by its token count, divided by
        the total token count.
    """
    model.eval()
    total_loss = 0.
    total_count = 0.

    # A single no_grad scope covers the whole pass. Nothing inside it builds
    # an autograd graph, so the hidden state can be carried across batches
    # without detaching it.
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)

        for batch in dev_iter:
            data, target = batch.text, batch.target
            if use_cuda:
                data, target = data.cuda(), target.cuda()

            output, hidden = model(data, hidden)
            # Flatten to one row of logits per token; the logit width is taken
            # from the output itself rather than from a global.
            loss = loss_fn(output.view(-1, output.size(-1)), target.view(-1))

            # loss_fn yields a per-batch mean, so weight it by the number of
            # tokens in the batch before accumulating.
            n_tokens = data.numel()
            total_count += n_tokens
            total_loss += loss.item() * n_tokens

    loss = total_loss / total_count
    model.train()
    return loss

In [None]:
use_cuda = torch.cuda.is_available()

# Reproducibility: one seed value shared by every RNG that is seeded here, so
# they cannot drift apart when the seed is changed.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if use_cuda:
    torch.cuda.manual_seed(SEED)

# Hyperparameters
BATCH_SIZE = 32         # sequences per batch (iterator width and hidden-state batch dimension)
EMBEDDING_SIZE = 500    # token embedding width
MAX_VOCAB_SIZE = 50000  # max_size passed to TEXT.build_vocab
hidden_size = 1000      # LSTM hidden/cell state width
learning_rate = 0.001   # initial Adam learning rate
GRAD_CLIP = 1.          # max gradient norm for clip_grad_norm_
NUM_EPOCHS = 2          # passes over the training iterator


# Text field for the corpus; lower=True asks torchtext to lower-case tokens.
TEXT = torchtext.legacy.data.Field(lower=True)
train, val, test = torchtext.legacy.datasets.LanguageModelingDataset.splits(
    path=".",
    train="train.txt",
    validation="dev.txt",
    test="test.txt",
    text_field=TEXT,
)

# The vocabulary is built from the training split only.
# NOTE(review): len(TEXT.vocab) can exceed MAX_VOCAB_SIZE, presumably because
# torchtext adds special tokens such as <unk>/<pad> — confirm.
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)
print("词汇表数量: {}".format(len(TEXT.vocab)))

VOCAB_SIZE = len(TEXT.vocab)

# Batches are created on the CPU; the training and evaluation code moves them
# to the GPU itself when use_cuda is set. An explicit torch.device replaces
# the deprecated integer form (-1), which torchtext only accepts with a
# warning before falling back to the CPU.
train_iter, val_iter, test_iter = torchtext.legacy.data.BPTTIterator.splits(
    (train, val, test),
    batch_size=BATCH_SIZE,
    device=torch.device("cpu"),
    bptt_len=50,
    repeat=False,
    shuffle=True,
)

# Two-layer LSTM language model with dropout 0.5.
model = LanguageModel(
    vocab_size=VOCAB_SIZE,
    embedding_size=EMBEDDING_SIZE,
    hidden_size=hidden_size,
    nlayers=2,
    dropout=0.5,
)
if use_cuda:
    # nn.Module.cuda() moves the parameters in place and returns the module itself.
    model.cuda()

# Cross-entropy over vocabulary logits, optimised with Adam; every
# scheduler.step() multiplies the learning rate by gamma.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.5)

# Validation losses recorded so far, used to detect a new best model.
val_losses = []
for epoch in range(NUM_EPOCHS):
    model.train()
    # Fresh zero state at the start of every epoch.
    hidden = model.init_hidden(BATCH_SIZE)

    for i, batch in enumerate(train_iter):
        data, target = batch.text, batch.target
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        # Carry the state across batches but cut the graph, so backprop stops
        # at the batch boundary instead of reaching back through earlier batches.
        hidden = repackage_hidden(hidden)

        optimizer.zero_grad()
        output, hidden = model(data, hidden)
        loss = loss_fn(output.view(-1, VOCAB_SIZE), target.view(-1))
        loss.backward()

        # Clip the global gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()

        if i % 1000 == 0:
            print("epoch", epoch, "iter", i, "loss", loss.item())

        # Also true at i == 0, so every epoch runs a validation pass right
        # after its first update.
        if i % 10000 == 0:
            val_loss = evaluate(model, val_iter)

            if len(val_losses) == 0 or val_loss < min(val_losses):
                print("best model, val_loss: ", val_loss)
                torch.save(model.state_dict(), "LanguageModel.th")
            else:
                # No improvement: decay the learning rate. The optimizer must
                # NOT be rebuilt here, because that would reset the rate to its
                # initial value, detach it from the scheduler, and discard
                # Adam's running moment estimates.
                scheduler.step()
            val_losses.append(val_loss)

