In [6]:
import argparse
import time
import math
import os
import torch
import torch.nn as nn
import torch.onnx

# import data
# import model


In [None]:

# Experiment configuration. Every hyperparameter is exposed as an argparse
# option so the same definitions work from a notebook and from a script;
# later cells read the values from `args`.
parser = argparse.ArgumentParser(description='PyTorch Wikitext-2 RNN/LSTM/GRU/Transformer Language Model')
parser.add_argument('--data', type=str, default='./data.gigaspeech',
                    help='location of data corpus')
parser.add_argument('--model', type=str, default='LSTM',
                    help='type of network (RNN_TANH, RNN_RELU, LSTM, GRU, Transformer)')
parser.add_argument('--emsize', type=int, default=200,
                    help='size of word embeddings')
parser.add_argument('--nhid', type=int, default=200,
                    help='number of hidden units per layer')
parser.add_argument('--nlayers', type=int, default=2,
                    help='number of layers')
parser.add_argument('--lr', type=float, default=20,
                    help='initial learning rate')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--epochs', type=int, default=40,
                    help='upper epoch limit')
parser.add_argument('--batch_size', type=int, default=20, metavar='N',
                    help='batch size')
# bptt = BackPropagation Through Time: the number of time steps in each
# training chunk (get_batch uses it as the sequence length).
parser.add_argument('--bptt', type=int, default=35,
                    help='sequence length')
# Must be an optional ('--dropout'): without the dashes argparse treats it as
# a required positional argument and the default is never applied.
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout applied to layers (0 = no dropout)')
parser.add_argument('--tied', action='store_true',
                    help='tie the word embedding and softmax weights')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--log-interval', type=int, default=500, metavar='N',
                    help='report interval')
parser.add_argument('--save', type=str, default='model.pt',
                    help='path to save the final model')
parser.add_argument('--onnx-export', type=str, default='',
                    help='path to export the final model in onnx format')
# '--nhear' was a typo for '--nhead'; it is kept as an alias so existing
# invocations still parse. The value is stored as `args.nhead`.
parser.add_argument('--nhead', '--nhear', dest='nhead', type=int, default=2,
                    help='the number of heads in the encoder/decoder of the transformer model')
parser.add_argument('--dry-run', action='store_true',
                    help='verify the code and model')

# Inside a notebook sys.argv holds the kernel's own arguments (e.g.
# `-f kernel.json`), which makes a bare parse_args() exit with an error.
# Parse an explicit list instead; to override a default, put the flags in the
# list, e.g. parser.parse_args(args=['--model', 'GRU', '--epochs', '5']).
args = parser.parse_args(args=[])


In [15]:
# Use the GPU only when one is actually available so the notebook also runs
# on CPU-only machines instead of failing in `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


def batchify(data, batch_size):
    """Arrange a 1-D token stream into ``batch_size`` parallel columns.

    The stream is cut into ``batch_size`` equal, contiguous pieces and each
    piece becomes one column, so consecutive rows of a column hold
    consecutive tokens. Trailing tokens that do not fill a complete row are
    dropped.

    Args:
        data: 1-D tensor of token ids.
        batch_size: number of columns (independent streams).

    Returns:
        Tensor of shape ``(len(data) // batch_size, batch_size)`` moved to
        ``device``.
    """
    num_batches = data.size(0) // batch_size
    # Trim the remainder that would not cleanly fit.
    data = data.narrow(0, 0, num_batches * batch_size)
    # Split into batch_size contiguous rows, then transpose so that time runs
    # along dim 0. A plain reshape(-1, batch_size) would interleave tokens
    # across columns, and row t+1 would no longer be the successor of row t.
    data = data.reshape(batch_size, num_batches).t().contiguous()
    return data.to(device)

def get_batch(source, i, bptt=None):
    """Slice one input/target chunk starting at row ``i`` of ``source``.

    Args:
        source: batchified tensor whose first dimension is time.
        i: index of the first row of the chunk.
        bptt: maximum chunk length; defaults to ``args.bptt``.

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i+seq_len]`` and
        ``target`` is the same window shifted one row ahead, flattened to
        1-D. ``seq_len`` is shortened near the end of ``source`` so that a
        target row exists for every input row.
    """
    if bptt is None:
        bptt = args.bptt
    # The last row can only serve as a target, hence the "- 1".
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].reshape(-1)
    return data, target

In [29]:
# Scratch check: enumerate() pairs a running counter (x) with each value
# produced by the stepped range (y).
for x, y in enumerate(range(0, 3, 2)):
    print(f"{x} {y}")

0 0
1 2


In [14]:
# Load the corpus and batchify its three splits. Training uses the
# configured batch size; validation and test share a fixed evaluation size.
# NOTE(review): `data` must be the project's data module, but the
# `import data` line in the first cell is commented out -- confirm it is
# imported before this cell runs.
corpus = data.Corpus(args.data)
eval_batch_size = 10
train_data, valid_data, test_data = (
    batchify(split, size)
    for split, size in (
        (corpus.train, args.batch_size),
        (corpus.valid, eval_batch_size),
        (corpus.test, eval_batch_size),
    )
)

1

In [None]:
# Loss function, vocabulary size and trainable-parameter count.
# NOTE(review): no cell visible here constructs `model`; confirm it is built
# before this cell runs on a fresh kernel.
criterion = nn.NLLLoss()
ntokens = len(corpus.dictionary)
print("vocabulary Size", ntokens)
num_params = sum(
    map(lambda p: p.numel(),
        filter(lambda p: p.requires_grad, model.parameters()))
)


In [None]:
def repackage_hidden(h):
    """Return ``h`` with every tensor detached from the autograd graph.

    A tensor is detached directly; any other iterable is walked recursively
    and rebuilt as a tuple of detached members.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

In [None]:
def evaluate(data_source):
    """Compute the average per-row loss of ``model`` over ``data_source``.

    Args:
        data_source: batchified tensor (time along dim 0) produced with
            ``eval_batch_size`` columns.

    Returns:
        The loss summed over all chunks, weighted by chunk length, divided
        by ``len(data_source) - 1``.
    """
    model.eval()  # turn off the model's dropout
    total_loss = 0.
    ntokens = len(corpus.dictionary)
    is_transformer = args.model == 'Transformer'
    if not is_transformer:
        hidden = model.init_hidden(eval_batch_size)

    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, args.bptt):
            data, targets = get_batch(data_source, i)
            if is_transformer:
                # NOTE(review): assumes the Transformer model takes only
                # `data` and returns per-token scores that can be viewed as
                # (-1, ntokens) -- confirm against the model definition.
                output = model(data)
                output = output.view(-1, ntokens)
            else:
                output, hidden = model(data, hidden)
                hidden = repackage_hidden(hidden)
            # .item() extracts the Python float from the 0-dim loss tensor.
            total_loss += len(data) * criterion(output, targets).item()
    return total_loss / (len(data_source) - 1)

In [None]:
def train(lr=None):
    """Run one pass over ``train_data``, updating ``model`` after every batch.

    Each batch is back-propagated, its gradients are clipped to
    ``args.clip`` and the parameters are updated with a plain SGD step.
    The running loss is reported every ``args.log_interval`` batches.

    Args:
        lr: learning rate for the SGD step. When omitted, a notebook-level
            ``lr`` variable is used if one exists, otherwise ``args.lr``.
    """
    if lr is None:
        # Honour a notebook-level `lr` (e.g. one that is annealed between
        # epochs) and fall back to the configured initial rate.
        lr = globals().get('lr', args.lr)

    model.train()
    total_loss = 0.
    start_time = time.time()
    ntokens = len(corpus.dictionary)
    is_transformer = args.model == 'Transformer'
    if not is_transformer:
        hidden = model.init_hidden(args.batch_size)
    num_batches = len(train_data) // args.bptt

    for batch, i in enumerate(range(0, train_data.size(0) - 1, args.bptt)):
        data, targets = get_batch(train_data, i)
        model.zero_grad()
        if is_transformer:
            # NOTE(review): assumes the Transformer model takes only `data`
            # and returns scores viewable as (-1, ntokens) -- confirm
            # against the model definition.
            output = model(data)
            output = output.view(-1, ntokens)
        else:
            # Detach the hidden state so gradients stop at the chunk start.
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
        loss = criterion(output, targets)
        loss.backward()

        # Clip and apply the update for THIS batch; doing it after the loop
        # would apply only the final batch's gradients once per epoch.
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        for p in model.parameters():
            if p.grad is not None:  # parameters unused in this step have no grad
                p.data.add_(p.grad, alpha=-lr)

        total_loss += loss.item()

        if batch % args.log_interval == 0 and batch > 0:
            cur_loss = total_loss / args.log_interval
            elapsed = time.time() - start_time
            # math.exp overflows for very large finite losses; report inf instead.
            ppl = math.exp(cur_loss) if cur_loss < 700 else float('inf')
            print('| {:5d}/{:5d} batches | lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                      batch, num_batches, lr,
                      elapsed * 1000 / args.log_interval, cur_loss, ppl))
            # Start a fresh reporting window.
            total_loss = 0.
            start_time = time.time()

        if getattr(args, 'dry_run', False):
            # --dry-run: one batch is enough to verify the code path.
            break
    