<a href="https://colab.research.google.com/github/tamirmal/tau_dl_proj/blob/master/HW2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
from google.colab import drive
drive.mount('/content/drive')
!cp "/content/drive/My Drive/Deep learning/HW2/PTB.zip" ./
!mkdir -p ./PTB/
!unzip -o ./PTB.zip -d ./PTB/
cuda = torch.device('cuda')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Archive:  ./PTB.zip
  inflating: ./PTB/ptb.char.train.txt  
  inflating: ./PTB/ptb.char.valid.txt  
  inflating: ./PTB/ptb.test.txt      
  inflating: ./PTB/ptb.train.txt     
  inflating: ./PTB/ptb.valid.txt     
  inflating: ./PTB/README            
  inflating: ./PTB/ptb.char.test.txt  


In [0]:
ARGS = {}
ARGS['BATCH_SIZE'] = 20
ARGS['EPOCHS'] = 13
ARGS['BPTT_LEN'] = 20         # sequence length to unroll / backpropegate through time
ARGS['HIDDEN_DIM'] = 200      # hidden state vector dimension
ARGS['N_LAYERS'] = 2          # Number of hidden layers
ARGS['LOG_BATCH_INTVL'] = 600 # print training info every <VAL> minibatches

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
from torchtext import data
from torchtext.datasets import LanguageModelingDataset
 
TEXT = data.Field(lower=True, tokenize='spacy', unk_token='<unk>')

ptb_train = LanguageModelingDataset("./PTB/ptb.train.txt", TEXT)
ptb_valid = LanguageModelingDataset("./PTB/ptb.valid.txt", TEXT)
#ptb_test = LanguageModelingDataset("./PTB/ptb.test.txt", TEXT)

TEXT.build_vocab(ptb_train)

train_iter = data.BPTTIterator(
    ptb_train,
    batch_size=ARGS['BATCH_SIZE'],
    bptt_len=ARGS['BPTT_LEN'],
    device=cuda,
    repeat=False,
    shuffle=True)
valid_iter = data.BPTTIterator(
    ptb_valid,
    batch_size=ARGS['BATCH_SIZE'],
    bptt_len=ARGS['BPTT_LEN'],
    device=cuda,
    repeat=False,
    shuffle=True)

#################################
## unit test
#################################
run = False

if run is True:
  print(len(train_iter))

  for batch in train_iter:
    print(batch)
    text, target = batch.text, batch.target
    text, target = text[:,0], target[:,0]
    print(text)
    print(len(text))
    print(target)
    print(len(target))

    text_s = [TEXT.vocab.itos[word_idx] for word_idx in text]
    target_s = [TEXT.vocab.itos[word_idx] for word_idx in target]

    print(text_s)
    print(target_s)

    break
###################################

In [0]:
class MyLSTM(nn.Module):
  def __init__(self):
    super(MyLSTM, self).__init__()
    self.lstm = torch.nn.LSTM(ARGS['HIDDEN_DIM'], ARGS['HIDDEN_DIM'], ARGS['N_LAYERS'])

    # we need a encoder/decoder to convert to/from one-hot vectors of the vocabulary
    self.encoder = nn.Embedding(len(TEXT.vocab), ARGS['HIDDEN_DIM'])
    self.encoder.weight.data.uniform_(-0.1, 0.1)
    self.decoder = nn.Linear(ARGS['HIDDEN_DIM'], len(TEXT.vocab))
    self.decoder.bias.data.zero_()
    self.decoder.weight.data.uniform_(-0.1, 0.1)

    self.num_layers = self.lstm.num_layers
    # End

  def forward(self, input, hidden):
    # https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
    # we can do the entire sequence all at once.
    # the first value returned by LSTM is all of the hidden states throughout
    # the sequence. the second is just the most recent hidden state
    # (compare the last slice of "out" with "hidden" below, they are the same)
    # The reason for this is that:
    # "out" will give you access to all hidden states in the sequence
    # "hidden" will allow you to continue the sequence and backpropagate,
    # by passing it as an argument  to the lstm at a later time
    # Add the extra 2nd dimension

    x = self.encoder(input)
    output, hidden = self.lstm(x, hidden)
    decoded = self.decoder(output)
    return decoded, hidden
    # End

In [0]:
# inspired by : https://github.com/pytorch/examples/tree/master/word_language_model
import torch.optim as optim
import time
import math

def repackage_hidden(h):
    # Wraps hidden states in new Tensors, to detach them from their history.
    #
    # detach h,c so we can safely use them as input to next minibatch
    # this make sure that gradients dont backprop between minibatches
    # https://discuss.pytorch.org/t/help-clarifying-repackage-hidden-in-word-language-model/226
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(v) for v in h)
# End


def train_epoch(model, e, criter, opt):
  # loss and time variables for logging prints (not used in model training)
  total_loss = 0.
  start_time = time.time()
  model.to(cuda)
  model.train() 
  h = torch.zeros(model.num_layers, ARGS['BATCH_SIZE'], ARGS['HIDDEN_DIM']).to(cuda)
  c = torch.zeros(model.num_layers, ARGS['BATCH_SIZE'], ARGS['HIDDEN_DIM']).to(cuda)
  hidden = (h, c)
  for i, batch in enumerate(train_iter):
    data, target = batch.text, batch.target
    data.to(cuda), target.to(cuda)
    model.zero_grad()
    for v in hidden: v.to(cuda)
    hidden = repackage_hidden(hidden)
    output, hidden = model(data, hidden)
    loss = criter(output.view(-1, output.shape[-1]), target.view(-1))
    loss.backward()
    opt.step()
    total_loss += loss.item()
    # log info
    if i % ARGS['LOG_BATCH_INTVL'] == 0 and i > 0:
      cur_loss = total_loss / ARGS['LOG_BATCH_INTVL']
      elapsed = time.time() - start_time
      print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
          e, i, len(train_iter), elapsed * 1000 / ARGS['LOG_BATCH_INTVL'], cur_loss, math.exp(cur_loss)))
      total_loss = 0
      start_time = time.time()
# End

def lr_scheduler_factor(e):
# "We train it for 4 epochs with a learning rate of 1 and then we decrease the learning rate
# by a factor of 2 after each epoch, for a total of 13 training epochs"
  if e < 4:
    return 1
  elif e > 13:
    return 1
  else:
    return 0.5
# End

def test(model, test_iter, criter):
    model.to(cuda)
    # Turn on evaluation mode which disables dropout.
    model.eval()
    total_loss = 0.
    with torch.no_grad():
        h = torch.zeros(model.num_layers, ARGS['BATCH_SIZE'], ARGS['HIDDEN_DIM']).to(cuda)
        c = torch.zeros(model.num_layers, ARGS['BATCH_SIZE'], ARGS['HIDDEN_DIM']).to(cuda)
        hidden = (h, c)
        for i, batch in enumerate(test_iter):
            data, target = batch.text, batch.target
            data.to(cuda), target.to(cuda)
            for v in hidden: v.to(cuda)
            output, hidden = model(data, hidden)
            hidden = repackage_hidden(hidden)
            loss = criter(output.view(-1, output.shape[-1]), target.view(-1))
            total_loss += loss.item()
    return total_loss / len(test_iter)

def train(model):
  criterion = nn.CrossEntropyLoss()
  optimizer = optim.SGD(model.parameters(), lr=1)
  lr_scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_scheduler_factor)
  for e in range(ARGS['EPOCHS']):
    epoch_start_time = time.time()
    train_epoch(model, e, criterion, optimizer)
    val_loss = test(model, valid_iter, criterion)
    print('| end of epoch {:3d} | time: {:5.2f}s | lr {} | valid loss {:5.2f} | valid ppl {:8.2f}'.format(
        e, (time.time() - epoch_start_time), lr_scheduler.get_lr(), val_loss, math.exp(val_loss)))
    lr_scheduler.step() # update the LR for next epoch
# End

In [21]:
lstm = MyLSTM()
#test(lstm, valid_iter)
train(lstm)

| epoch   0 |   600/ 2713 batches | ms/batch  4.09 | loss  6.39 | ppl   595.05
| epoch   0 |  1200/ 2713 batches | ms/batch  3.83 | loss  5.65 | ppl   284.79
| epoch   0 |  1800/ 2713 batches | ms/batch  3.80 | loss  5.41 | ppl   223.68
| epoch   0 |  2400/ 2713 batches | ms/batch  3.81 | loss  5.28 | ppl   197.07
| end of epoch   0 | time: 10.82s | lr [1] | valid loss  5.26 | valid ppl   193.21
| epoch   1 |   600/ 2713 batches | ms/batch  4.08 | loss  5.21 | ppl   183.54
| epoch   1 |  1200/ 2713 batches | ms/batch  3.83 | loss  5.11 | ppl   166.41
| epoch   1 |  1800/ 2713 batches | ms/batch  3.83 | loss  4.99 | ppl   147.63
| epoch   1 |  2400/ 2713 batches | ms/batch  3.80 | loss  4.93 | ppl   137.99
| end of epoch   1 | time: 10.81s | lr [1] | valid loss  4.93 | valid ppl   138.95
| epoch   2 |   600/ 2713 batches | ms/batch  4.07 | loss  4.88 | ppl   132.10
| epoch   2 |  1200/ 2713 batches | ms/batch  3.80 | loss  4.82 | ppl   123.66
| epoch   2 |  1800/ 2713 batches | ms/batch