In [45]:
%matplotlib inline

import numpy as np
from matplotlib import pyplot as plt
import time
import os
import torch
import torch.nn as nn
import math
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from tests import test_prediction, test_generation
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [46]:
# load all that we need

dataset = np.load('../dataset/wiki.train.npy',allow_pickle=True)
# to pass generation part, I use this model with only 100 datasets and 2 epochs
# dataset = dataset[:100] 
fixtures_pred = np.load('../fixtures/prediction.npz',allow_pickle=True)  # dev
fixtures_gen = np.load('../fixtures/generation.npy',allow_pickle=True)  # dev
fixtures_pred_test = np.load('../fixtures/prediction_test.npz',allow_pickle=True)  # test
fixtures_gen_test = np.load('../fixtures/generation_test.npy',allow_pickle=True)  # test
vocab = np.load('../dataset/vocab.npy',allow_pickle=True)

In [47]:
# data loader

class LanguageModelDataLoader(DataLoader):
    """
        TODO: Define data loader logic here
    """
    def __init__(self, dataset, batch_size, shuffle=True):
        self.dataset = dataset
        self.batch_size = batch_size
        
        
#         raise NotImplemented

        self.i = 0
        self.shuffle()
        self.dataset_s = np.concatenate(self.dataset)
        self.max = len(self.dataset_s)
        
        
    def shuffle(self):
        np.random.shuffle(self.dataset)
        


    def __iter__(self):
        # concatenate your articles and build into batches
#         np.random.shuffle(self.dataset)
#         temp = np.concatenate(self.dataset)
        self.shuffle()
        self.i = 0
        self.dataset_s = np.concatenate(self.dataset)
        self.max = len(self.dataset_s)
#         print(self.dataset_s[self.i])
        return self
    
    def BPTT_length(self):
        a = np.random.uniform(low=0, high=1)
        b = 0
        if a > 0.95:
            b = int(np.random.normal(loc=35,scale=5))
        else:
            b = int(np.random.normal(loc=70,scale=5))
        return b
                    
        

        

    def __next__(self):
#         if self.i <= self.max:
#             self.i += self.batch_size
#             if self.i >= self.max:
#                 return self.dataset[self.i - self.batch_size:self.max-1],self.dataset[self.i - self.batch_size+1:self.max]
#             else:
#                 return self.dataset[self.i - self.batch_size:self.i],self.dataset[self.i - self.batch_size+1:self.i+1]
#         else:
#             raise StopIteration
        len_batch = self.BPTT_length()
#         print(self.i,len_batch)
        if self.i + len_batch*self.batch_size < self.max:
            temp_input = self.dataset_s[self.i:self.i+len_batch*self.batch_size]
            temp_input = temp_input.reshape(self.batch_size,len_batch)
            temp_label = self.dataset_s[self.i+1:self.i+len_batch*self.batch_size+1]
            temp_label = temp_label.reshape(self.batch_size,len_batch)
            self.i+=len_batch*self.batch_size
            return torch.from_numpy(temp_input),torch.from_numpy(temp_label)
                
        else:
            raise StopIteration




In [48]:
# l = LanguageModelDataLoader(dataset,80)
# for idx,(i,j) in enumerate(l):
#     if(idx<5):
#         print(idx,i,j)

In [49]:
# model

class LanguageModel(nn.Module):
    """
        TODO: Define your model here
    """
#     def __init__(self, vocab_size):
#         super(LanguageModel, self).__init__()
        
#         raise NotImplemented


#     def forward(self, x):
#         # Feel free to add extra arguments to forward (like an argument to pass in the hiddens)
#         raise NotImplemented

    def __init__(self,vocab_size,embed_size = 400,hidden_size = 1028, nlayers =3):
        super(LanguageModel,self).__init__()
        self.vocab_size=vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.nlayers=nlayers
        self.embedding = nn.Embedding(vocab_size,embed_size) # Embedding layer
        nn.init.uniform_(self.embedding.weight, -0.1, 0.1)
        self.rnn = nn.LSTM(input_size = embed_size,hidden_size=hidden_size,num_layers=nlayers) # Recurrent network
        # You can also try GRUs instead of LSTMs.
        
        self.scoring = nn.Linear(hidden_size,vocab_size) # Projection layer
        nn.init.uniform_(self.scoring.weight, -0.1, 0.1)
        for param in self.rnn.parameters():
            nn.init.uniform_(param.data, -1/math.sqrt(hidden_size), 1/math.sqrt(hidden_size))
        
    def forward(self,seq_batch): #L x N
        # returns 3D logits
        seq_batch.to(DEVICE)
        batch_size = seq_batch.size(1)
        embed = self.embedding(seq_batch) #L x N x E
        hidden = None
        output_lstm,hidden = self.rnn(embed,hidden) #L x N x H
        output_lstm_flatten = output_lstm.view(-1,self.hidden_size) #(L*N) x H
        output_flatten = self.scoring(output_lstm_flatten) #(L*N) x V
        return output_flatten.view(-1,batch_size,self.vocab_size)

    


In [50]:
# model trainer

class LanguageModelTrainer:
    def __init__(self, model, loader, max_epochs=1, run_id='exp'):
        """
            Use this class to train your model
        """
        # feel free to add any other parameters here
        self.model = model
        self.loader = loader
        self.train_losses = []
        self.val_losses = []
        self.predictions = []
        self.predictions_test = []
        self.generated_logits = []
        self.generated = []
        self.generated_logits_test = []
        self.generated_test = []
        self.epochs = 0
        self.max_epochs = max_epochs
        self.run_id = run_id
        
        # TODO: Define your optimizer and criterion here
        self.optimizer = torch.optim.Adam(model.parameters(),lr=0.001, weight_decay=1e-6)
#         self.optimizer = torch.optim.SGD(model.parameters(), lr=30, weight_decay=1e-6)
        self.criterion = nn.CrossEntropyLoss(reduction="sum").to(DEVICE)

    def train(self):
        self.model.train() # set to training mode
        epoch_loss = 0
        num_batches = 0
        self.model = self.model.to(DEVICE)
        for batch_num, (inputs, targets) in enumerate(self.loader):
            epoch_loss += self.train_batch(inputs, targets)
            if batch_num % 50 == 0:
                print(batch_num, epoch_loss / (batch_num + 1))
        epoch_loss = epoch_loss / (batch_num + 1)
        self.epochs += 1
        print('[TRAIN]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs , self.max_epochs, epoch_loss))
        self.train_losses.append(epoch_loss)

    def train_batch(self, inputs, targets):
        """ 
            TODO: Define code for training a single batch of inputs
        
        """
        inputs = inputs.transpose(0,1).long().to(DEVICE)
        targets = targets.transpose(0,1).long().to(DEVICE)
        
        outputs = self.model(inputs) # 3D
        loss = self.criterion(outputs.view(-1,outputs.size(2)),targets.contiguous().view(-1))/(inputs.size(0)*inputs.size(1)) # Loss of the flattened outputs
        self.optimizer.zero_grad()
        loss.backward()
#         nn.utils.clip_grad_norm_(model.parameters(), 0.20)
        self.optimizer.step()
        
        return loss.item()

    
    def test(self):
        # don't change these
        self.model.eval() # set to eval mode
        predictions = TestLanguageModel.prediction(fixtures_pred['inp'], self.model) # get predictions
        self.predictions.append(predictions)
        generated_logits = TestLanguageModel.generation(fixtures_gen, 10, self.model) # generated predictions for 10 words
        generated_logits_test = TestLanguageModel.generation(fixtures_gen_test, 10, self.model)
        nll = test_prediction(predictions, fixtures_pred['out'])
        generated = test_generation(fixtures_gen, generated_logits, vocab)
        generated_test = test_generation(fixtures_gen_test, generated_logits_test, vocab)
        self.val_losses.append(nll)
        
        self.generated.append(generated)
        self.generated_test.append(generated_test)
        self.generated_logits.append(generated_logits)
        self.generated_logits_test.append(generated_logits_test)
        
        # generate predictions for test data
        predictions_test = TestLanguageModel.prediction(fixtures_pred_test['inp'], self.model) # get predictions
        self.predictions_test.append(predictions_test)
            
        print('[VAL]  Epoch [%d/%d]   Loss: %.4f'
                      % (self.epochs , self.max_epochs, nll))
        return nll

    def save(self):
        # don't change these
        model_path = os.path.join('experiments', self.run_id, 'model-{}.pkl'.format(self.epochs))
        torch.save({'state_dict': self.model.state_dict()},
            model_path)
        np.save(os.path.join('experiments', self.run_id, 'predictions-{}.npy'.format(self.epochs)), self.predictions[-1])
        np.save(os.path.join('experiments', self.run_id, 'predictions-test-{}.npy'.format(self.epochs)), self.predictions_test[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-{}.npy'.format(self.epochs)), self.generated_logits[-1])
        np.save(os.path.join('experiments', self.run_id, 'generated_logits-test-{}.npy'.format(self.epochs)), self.generated_logits_test[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated[-1])
        with open(os.path.join('experiments', self.run_id, 'generated-{}-test.txt'.format(self.epochs)), 'w') as fw:
            fw.write(self.generated_test[-1])


In [51]:
class TestLanguageModel:
    def prediction(inp, model):
        """
            TODO: write prediction code here
            
            :param inp:
            :return: a np.ndarray of logits
        """
        inp = torch.from_numpy(inp).long()

        model.to('cpu')
        output_prediction = model(inp.transpose(0,1))
        output_prediction = output_prediction[-1,:,:]
        return output_prediction.cpu().detach().numpy()
#         return output_prediction

        
    def generation(inp, forward, model):
        """
            TODO: write generation code here

            Generate a sequence of words given a starting sequence.
            :param inp: Initial sequence of words (batch size, length)
            :param forward: number of additional words to generate
            :return: generated words (batch size, forward)
        """      
        
        inp = torch.from_numpy(inp).long()
        inp.to(DEVICE)
        print('input',inp.shape)
        generated_words = []
        output = model(inp.transpose(0,1)) # L x 1 x H
        print('output',output.shape)
        _,current_word = torch.max(output,dim=2) # 1 x 1
        
        print('current_word',current_word.shape)
        print('current_word',current_word[-1,:])
        generated_words.append(current_word[-1,:])
        
#         print('generated_words',current_word[:,-1].shape)
        

        if forward > 1:
            for i in range(forward-1):
                output = model(current_word.long()) # 1 x 1 x H
                _,current_word = torch.max(output,dim=2) # 1
                generated_words.append(current_word[-1,:])
#         for i in generated_words:
#             print('len',i.shape)
            
        print('out1',generated_words)  
#         print('out2',torch.cat(generated_words,dim=0).reshape((len(inp),forward)))

     

        return torch.cat(generated_words,dim=0).reshape((len(inp),forward))
    
    
#         generated_words = []
#         embed = self.embedding(seq).unsqueeze(1) # L x 1 x E
#         hidden = None
#         output_lstm, hidden = self.rnn(embed,hidden) # L x 1 x H
#         output = output_lstm[-1] # 1 x H
#         scores = self.scoring(output) # 1 x V
#         _,current_word = torch.max(scores,dim=1) # 1 x 1
#         generated_words.append(current_word)
#         if n_words > 1:
#             for i in range(n_words-1):
#                 embed = self.embedding(current_word).unsqueeze(0) # 1 x 1 x E
#                 output_lstm, hidden = self.rnn(embed,hidden) # 1 x 1 x H
#                 output = output_lstm[0] # 1 x H
#                 scores = self.scoring(output) # V
#                 _,current_word = torch.max(scores,dim=1) # 1
#                 generated_words.append(current_word)
#         return torch.cat(generated_words,dim=0)
        

    
    

        

In [52]:
# TODO: define other hyperparameters here

NUM_EPOCHS = 6
BATCH_SIZE = 80


In [53]:
run_id = str(int(time.time()))
if not os.path.exists('./experiments'):
    os.mkdir('./experiments')
os.mkdir('./experiments/%s' % run_id)
print("Saving models, predictions, and generated words to ./experiments/%s" % run_id)

Saving models, predictions, and generated words to ./experiments/1575318871


In [54]:
model = LanguageModel(len(vocab))
loader = LanguageModelDataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)
trainer = LanguageModelTrainer(model=model, loader=loader, max_epochs=NUM_EPOCHS, run_id=run_id)

In [None]:
best_nll = 1e30 
for epoch in range(NUM_EPOCHS):
    trainer.train()
    nll = trainer.test()
    if nll < best_nll:
        best_nll = nll
        print("Saving model, predictions and generated output for epoch "+str(epoch)+" with NLL: "+ str(best_nll))
        trainer.save()
    

0 10.409656524658203
50 7.963303332235299
100 7.6372222192216626
150 7.474385343640056
200 7.329805542580524
250 7.20038755766424
300 7.086971347910225
350 6.989493311640204
[TRAIN]  Epoch [1/6]   Loss: 6.9401
input torch.Size([32, 20])
output torch.Size([20, 32, 33278])
current_word torch.Size([20, 32])
current_word tensor([31353,  1420,  1424, 15340,  1424,    76,    76, 25821, 31353,    79,
        31353, 31353, 31543, 31353,    76,    76,    79, 13276,    73, 31353,
        22968,  1419,  1419, 31353,    76,  1419, 32747,  1420,    79,    79,
        31353,    76])
out1 [tensor([31353,  1420,  1424, 15340,  1424,    76,    76, 25821, 31353,    79,
        31353, 31353, 31543, 31353,    76,    76,    79, 13276,    73, 31353,
        22968,  1419,  1419, 31353,    76,  1419, 32747,  1420,    79,    79,
        31353,    76]), tensor([ 1419,  1420,  1419, 31353,  1419, 15340, 31353, 31353,  1419,  1420,
         1419,  1419, 31353,  1419, 15340, 15340, 12733,  1419, 25821,  1419,
    

[VAL]  Epoch [1/6]   Loss: 5.7014
Saving model, predictions and generated output for epoch 0 with NLL: 5.701422
0 6.257383346557617
50 6.121310346266803
100 6.1028416369221
150 6.074655933885385
200 6.041427137839854
250 6.023664877234227
300 6.000851708789204
350 5.97959366687003
[TRAIN]  Epoch [2/6]   Loss: 5.9669
input torch.Size([32, 20])
output torch.Size([20, 32, 33278])
current_word torch.Size([20, 32])
current_word tensor([31353,  1420,  1424,  1419,  1424,  1419, 31543, 25821, 31353,    79,
           76, 14658, 31543, 31353,    76,  1419,    76,  1420,  1425, 31353,
        22968,   117,  1419, 31353,    76,  1419, 32846,  1420,     1,    79,
        31352,    79])
out1 [tensor([31353,  1420,  1424,  1419,  1424,  1419, 31543, 25821, 31353,    79,
           76, 14658, 31543, 31353,    76,  1419,    76,  1420,  1425, 31353,
        22968,   117,  1419, 31353,    76,  1419, 32846,  1420,     1,    79,
        31352,    79]), tensor([ 1419,  1420,  1419,  1419,  1419, 25821, 31

[VAL]  Epoch [2/6]   Loss: 5.1277
Saving model, predictions and generated output for epoch 1 with NLL: 5.127701
0 5.458985805511475
50 5.599488613652248
100 5.6087433654483005
150 5.603679249618227
200 5.595096189584305
250 5.585170457087665
300 5.585105433416525
350 5.576030780107547
[TRAIN]  Epoch [3/6]   Loss: 5.5667
input torch.Size([32, 20])
output torch.Size([20, 32, 33278])
current_word torch.Size([20, 32])
current_word tensor([31353,  1420,  1424,  1419,  1424, 15340,    79,    79, 31353,    79,
           76, 14658, 31543, 31353,  1414,  1419,    79, 13276,  1423, 31353,
        31353,   117,  1419, 31353,    76,  1419, 32846,  1420,     1,    79,
        31352,    79])
out1 [tensor([31353,  1420,  1424,  1419,  1424, 15340,    79,    79, 31353,    79,
           76, 14658, 31543, 31353,  1414,  1419,    79, 13276,  1423, 31353,
        31353,   117,  1419, 31353,    76,  1419, 32846,  1420,     1,    79,
        31352,    79]), tensor([21201,  1420,  1419,    76,  1419, 31353

[VAL]  Epoch [3/6]   Loss: 4.8035
Saving model, predictions and generated output for epoch 2 with NLL: 4.803519
0 5.324710369110107
50 5.276250091253543
100 5.300411583173393
150 5.30213739066724
200 5.305447350687055
250 5.296139660109562
300 5.293462688344658
350 5.282022629707967
[TRAIN]  Epoch [4/6]   Loss: 5.2756
input torch.Size([32, 20])
output torch.Size([20, 32, 33278])
current_word torch.Size([20, 32])
current_word tensor([31353,  1420, 15142,  1419, 25821,  1419,    79,    79, 31353,    79,
           76, 31543, 31543, 31353,  1414,  1419,    76, 13276,  1425, 31353,
        31353,    86, 29456, 31353,    79,  1419, 32846,  1420,     1,    79,
        31352,    79])
out1 [tensor([31353,  1420, 15142,  1419, 25821,  1419,    79,    79, 31353,    79,
           76, 31543, 31543, 31353,  1414,  1419,    76, 13276,  1425, 31353,
        31353,    86, 29456, 31353,    79,  1419, 32846,  1420,     1,    79,
        31352,    79]), tensor([21201,  1420,    76,  1419, 31353, 25821, 

[VAL]  Epoch [4/6]   Loss: 4.6619
Saving model, predictions and generated output for epoch 3 with NLL: 4.661934
0 4.923999309539795
50 4.911514525320015
100 4.982503211144174
150 4.998737117312602
200 5.003912076428162
250 5.008687746952255
300 5.01900216828153
350 5.028179588480892
[TRAIN]  Epoch [5/6]   Loss: 5.0286
input torch.Size([32, 20])
output torch.Size([20, 32, 33278])
current_word torch.Size([20, 32])
current_word tensor([ 1419,  1420, 29762,  1419,  1424,  1419,    79,    79, 31353,    79,
           76, 31353, 23482, 31353,  8986,  1419, 23592, 13276,  1423, 31353,
        31353,   946, 29456, 31353,    76,  1419, 32846,  1420,     1,    79,
        31352,    79])
out1 [tensor([ 1419,  1420, 29762,  1419,  1424,  1419,    79,    79, 31353,    79,
           76, 31353, 23482, 31353,  8986,  1419, 23592, 13276,  1423, 31353,
        31353,   946, 29456, 31353,    76,  1419, 32846,  1420,     1,    79,
        31352,    79]), tensor([   76,  1420,  1424,    73, 31514, 25821, 

[VAL]  Epoch [5/6]   Loss: 4.5832
Saving model, predictions and generated output for epoch 4 with NLL: 4.583221
0 4.754510402679443


In [None]:
# Don't change these
# plot training curves
plt.figure()
plt.plot(range(1, trainer.epochs + 1), trainer.train_losses, label='Training losses')
plt.plot(range(1, trainer.epochs + 1), trainer.val_losses, label='Validation losses')
plt.xlabel('Epochs')
plt.ylabel('NLL')
plt.legend()
plt.show()

In [None]:
# see generated output
print (trainer.generated[-1]) # get last generated output