In [14]:
%matplotlib inline

import numpy as np
from matplotlib import pyplot as plt
import time
import os
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from tests import test_prediction, test_generation
from config import LANGUAGEMODEL_CONFIG as LC

# Use the GPU when PyTorch reports one as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(DEVICE)

cpu


In [15]:
# Display the hyperparameter dictionary imported from config as LC.
print(LC)

{'seq_len': 200, 'lr': 0.01, 'nepochs': 3, 'batch_size': 16, 'embed_size': 256, 'hidden_size': 256, 'nlayers': 1}


In [16]:
# load all that we need
#
# allow_pickle=True is passed explicitly. The training set is an object array
# (one variable-length token array per article), and NumPy >= 1.16.3 refuses to
# load object arrays unless pickling is enabled. Passing it for every file keeps
# the loader behaviour of older NumPy releases, where it was the default; the
# fixture and vocab files may not strictly need it.
# NOTE(security): unpickling can execute arbitrary code -- only load files that
# come from a trusted source.

dataset = np.load('../dataset/wiki.train.npy', allow_pickle=True)
fixtures_pred = np.load('../fixtures/dev_fixtures/prediction.npz', allow_pickle=True)  # dev
fixtures_gen = np.load('../fixtures/dev_fixtures/generation.npy', allow_pickle=True)  # dev
fixtures_pred_test = np.load('../fixtures/test_fixtures/prediction.npz', allow_pickle=True)  # test
fixtures_gen_test = np.load('../fixtures/test_fixtures/generation.npy', allow_pickle=True)  # test
vocab = np.load('../dataset/vocab.npy', allow_pickle=True)
print("{} articles in training set".format(dataset.shape))
print("{} vocabulary items".format(vocab.shape))

(579,) articles in training set
(33278,) vocabulary items


In [17]:
class TextDataset(Dataset):
    """Fixed-length windows over one long token array.

    The token array is truncated to a multiple of ``seq_len`` and reshaped to
    ``(n_seq, seq_len)``. Item ``i`` is the pair ``(row[:-1], row[1:])``: the
    input tokens and the same tokens shifted by one position (the next-token
    targets), each of length ``seq_len - 1``.

    :param text: 1-D array-like of integer token ids
    :param seq_len: number of tokens per stored row
    :raises ValueError: if ``seq_len`` is smaller than 1
    """
    def __init__(self, text, seq_len=LC['seq_len']):
        if seq_len < 1:
            raise ValueError("seq_len must be a positive integer, got {}".format(seq_len))
        text = np.asarray(text)
        n_seq = len(text) // seq_len
        # astype returns a new array; the result has to be kept for the cast
        # to have any effect.
        text = text[:n_seq * seq_len].astype(np.int32)
        # An explicit row count (instead of -1) keeps the reshape valid when
        # the text is shorter than one sequence and n_seq is 0.
        self.data = torch.tensor(text).view(n_seq, seq_len)

    def __getitem__(self, i):
        txt = self.data[i]
        return txt[:-1], txt[1:]

    def __len__(self):
        return self.data.size(0)

def collate(seq_list):
    """
    Assemble a batch from a list of (input, target) sequence pairs.

    Each sequence becomes one column of the result, so both returned
    tensors have dimension seq_len * batch_size.
    """
    input_columns = []
    target_columns = []
    for pair in seq_list:
        # Add a trailing axis so every sequence is a (seq_len, 1) column.
        input_columns.append(pair[0].unsqueeze(1))
        target_columns.append(pair[1].unsqueeze(1))
    batch_inputs = torch.cat(input_columns, dim=1)
    batch_targets = torch.cat(target_columns, dim=1)
    return batch_inputs, batch_targets

In [18]:
# Draw a random ordering of the articles and reorder the training set with it.
new_indices = np.random.permutation(len(dataset))
new_dataset = dataset[new_indices]
# Join the shuffled articles end to end into a single flat array of length N.
dataset_flatten = np.ravel(np.concatenate(new_dataset))
print("length of one long string generated from the articles ", dataset_flatten.shape)

length of one long string generated from the articles  (2075677,)


In [19]:
# data loader

class LanguageModelDataLoader(DataLoader):
    """
        TODO: Define data loader logic here

        Placeholder: both methods raise NotImplementedError until the
        loader logic is written.
    """
    def __init__(self, dataset, batch_size, shuffle=True):
        # NotImplementedError is the exception class; `raise NotImplemented`
        # would fail with a TypeError because NotImplemented is not an exception.
        raise NotImplementedError("LanguageModelDataLoader.__init__ is not implemented yet")


    def __iter__(self):
        # concatenate your articles and build into batches

        raise NotImplementedError("LanguageModelDataLoader.__iter__ is not implemented yet")

        
        

In [20]:
# model

class LanguageModel(nn.Module):
    """
        Embedding -> LSTM -> linear scoring layer over the vocabulary.

        Layer sizes (embed_size, hidden_size, nlayers) are read from the LC
        config dictionary.

        :param charcount: vocabulary size (number of embedding rows and of
                          output scores per position)
    """
    def __init__(self, charcount):
        super(LanguageModel, self).__init__()
        self.vocab_size = charcount
        self.embed_size = LC['embed_size']
        self.hidden_size = LC['hidden_size']
        self.nlayers = LC['nlayers']
        self.embedding = nn.Embedding(self.vocab_size, self.embed_size)
        self.rnn = nn.LSTM(input_size=self.embed_size, hidden_size=self.hidden_size, num_layers=self.nlayers)
        self.scoring = nn.Linear(self.hidden_size, self.vocab_size)

    def forward(self, seq_batch): # dimension of seq_batch: seq_len * batch_size
        """
            Score every vocabulary item at every position of the batch.

            :param seq_batch: integer token ids, seq_len * batch_size
            :return: scores, seq_len * batch_size * vocab_size
        """
        embed = self.embedding(seq_batch) # seq_len * batch_size * embed_size
        # No initial state is passed, so the LSTM starts from zeros.
        output_lstm, _ = self.rnn(embed) # seq_len * batch_size * hidden_size
        # nn.Linear acts on the last dimension only, so the leading
        # (seq_len, batch_size) axes are preserved as-is. The previous
        # flatten + view(-1, len(seq_batch), vocab) put seq_len in the middle
        # axis and returned a (batch_size, seq_len, vocab) shaped tensor.
        return self.scoring(output_lstm) # seq_len * batch_size * vocab_size


In [21]:
# model trainer

class LanguageModelTrainer:
    """
        Runs training epochs, evaluates on the fixture sets and saves results.

        Per-epoch results are appended to the list attributes created in
        __init__ (train_losses, val_losses, predictions, generated, ...).
    """
    def __init__(self, model, loader, max_epochs=1, run_id='exp'):
        """
            Use this class to train your model

            :param model: the network to train
            :param loader: iterable yielding (inputs, targets) batches
            :param max_epochs: total number of epochs, used in log messages
            :param run_id: name of the sub-directory of 'experiments' used by save()
        """
        # feel free to add any other parameters here
        self.model = model
        self.loader = loader
        self.train_losses = []
        self.val_losses = []
        self.predictions = []
        self.predictions_test = []
        self.generated_logits = []
        self.generated = []
        self.generated_logits_test = []
        self.generated_test = []
        self.epochs = 0
        self.max_epochs = max_epochs
        self.run_id = run_id

        # TODO: Define your optimizer and criterion here
        self.optimizer = torch.optim.Adam(model.parameters(), lr=LC['lr'], weight_decay=1e-6)
        self.criterion = nn.CrossEntropyLoss()

    def train(self):
        """
            Train for one epoch and record the mean batch loss.

            :raises ValueError: if the loader yields no batches
        """
        self.model.train() # set to training mode
        epoch_loss = 0.0
        num_batches = 0
        for inputs, targets in self.loader:
            epoch_loss += self.train_batch(inputs, targets)
            num_batches += 1
        if num_batches == 0:
            # Without this guard an empty loader would surface as an unbound
            # loop variable instead of a readable error.
            raise ValueError("loader produced no batches; cannot compute an epoch loss")
        epoch_loss = epoch_loss / num_batches
        self.epochs += 1
        print('[TRAIN]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs, self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)

    def train_batch(self, inputs, targets):
        """
            Run one optimisation step on a single batch.

            :param inputs: integer token ids fed to the model
            :param targets: integer token ids, one per model output position
            :return: the batch loss as a Python float
        """
        # Use the trainer's own model (not a notebook-level global) and move
        # the batch to whichever device that model lives on.
        device = next(self.model.parameters()).device
        inputs = inputs.long().to(device)
        targets = targets.long().to(device)
        self.model.zero_grad()
        outputs = self.model(inputs)
        # Flatten to (positions, vocab) vs (positions,) for CrossEntropyLoss;
        # reshape also accepts non-contiguous tensors.
        loss = self.criterion(outputs.reshape(-1, outputs.size(2)), targets.reshape(-1))
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def test(self):
        """
            Evaluate on the dev fixtures, store predictions/generations for
            dev and test fixtures, and return the dev NLL.
        """
        # don't change these
        self.model.eval() # set to eval mode
        predictions = TestLanguageModel.prediction(fixtures_pred['inp'], self.model) # get predictions
        self.predictions.append(predictions)
        nll = test_prediction(predictions, fixtures_pred['out'])

        generated_logits = TestLanguageModel.generation(fixtures_gen, 20, self.model) # predictions for 20 words
        generated_logits_test = TestLanguageModel.generation(fixtures_gen_test, 20, self.model) # predictions for 20 words

        generated = test_generation(fixtures_gen, generated_logits, vocab)
        generated_test = test_generation(fixtures_gen_test, generated_logits_test, vocab)
        self.val_losses.append(nll)

        self.generated.append(generated)
        self.generated_test.append(generated_test)
        self.generated_logits.append(generated_logits)
        self.generated_logits_test.append(generated_logits_test)

        # generate predictions for test data
        predictions_test = TestLanguageModel.prediction(fixtures_pred_test['inp'], self.model) # get predictions
        self.predictions_test.append(predictions_test)

        print('[VAL]  Epoch [%d/%d]   NLL: %.4f'
                      % (self.epochs, self.max_epochs, nll))
        return nll

    def save(self):
        """
            Write the model weights and the most recent predictions and
            generated text under experiments/<run_id>/, tagged with the
            current epoch count.
        """
        # don't change these
        model_path = os.path.join('experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()},
            model_path)
        np.save(os.path.join('experiments', self.run_id, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join('experiments', self.run_id, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-test-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_test[-1])


In [22]:
class TestLanguageModel:
    """
        Namespace for the prediction and generation entry points, which are
        called on the class itself (no instance is created).
    """
    @staticmethod
    def prediction(inp, model):
        """
            TODO: write prediction code here

            :param inp:
            :param model: the language model to query
            :return: a np.ndarray of logits
        """
        # NotImplementedError is the exception class; `raise NotImplemented`
        # would fail with a TypeError because NotImplemented is not an exception.
        raise NotImplementedError("TestLanguageModel.prediction is not implemented yet")


    @staticmethod
    def generation(inp, forward, model):
        """
            TODO: write generation code here

            Generate a sequence of words given a starting sequence.
            :param inp: Initial sequence of words (batch size, length)
            :param forward: number of additional words to generate
            :param model: the language model to query
            :return: generated words (batch size, forward)
        """
        raise NotImplementedError("TestLanguageModel.generation is not implemented yet")
        

In [23]:
# TODO: define other hyperparameters here

# Epoch count and batch size both come from the shared config dictionary.
NUM_EPOCHS, BATCH_SIZE = LC['nepochs'], LC['batch_size']


In [24]:
# Name the run after the current Unix time (whole seconds).
run_id = str(int(time.time()))
# makedirs creates ./experiments and the run directory in one call.
# exist_ok=True avoids the check-then-create race and lets the cell be
# re-run within the same second without raising FileExistsError.
os.makedirs('./experiments/%s' % run_id, exist_ok=True)
print("Saving models, predictions, and generated words to ./experiments/%s" % run_id)

Saving models, predictions, and generated words to ./experiments/1540598901


In [25]:
# Build the network on DEVICE, batch the flattened corpus, and hand both to the trainer.
model = LanguageModel(len(vocab)).to(DEVICE)
loader = DataLoader(
    TextDataset(dataset_flatten),
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate,
)
trainer = LanguageModelTrainer(model, loader, max_epochs=NUM_EPOCHS, run_id=run_id)

In [None]:
# Train for NUM_EPOCHS epochs; after each one evaluate and keep the outputs
# of the epoch with the lowest validation NLL seen so far.
best_nll = float('inf')  # any finite NLL improves on this
for epoch in range(NUM_EPOCHS):
    trainer.train()
    nll = trainer.test()
    if nll < best_nll:
        best_nll = nll
        # Report trainer.epochs (1-based) rather than the 0-based loop index:
        # it is the number used in the saved file names and in the trainer's
        # own [TRAIN]/[VAL] log lines.
        print("Saving model, predictions and generated output for epoch " +
              str(trainer.epochs) + " with NLL: " + str(best_nll))
        trainer.save()
    

In [None]:
# Don't change these
# plot training curves
# Two separate figures are drawn, each with one point per completed epoch
# (x axis runs from 1 to trainer.epochs).

# Figure 1: per-epoch training losses recorded in trainer.train_losses.
plt.figure()
plt.plot(range(1, trainer.epochs + 1), trainer.train_losses, label='Training losses')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Figure 2: per-epoch validation NLL recorded in trainer.val_losses.
plt.figure()
plt.plot(range(1, trainer.epochs + 1), trainer.val_losses, label='Validation NLL')
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

In [None]:
# Print the generated text stored by the most recent evaluation.
print(trainer.generated[-1])