In [5]:
import numpy as np

from pprint import pprint
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors, GloVe

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.optim as optim
from torch.nn import Linear, RNN, LSTM, GRU
from torch.nn.functional import softmax, relu

In [6]:
#use_cuda = torch.cuda.is_available()
use_cuda = False
def get_variable(x):
    """ Converts tensors to cuda, if available. """
    if use_cuda:
        return x.cuda()
    return x

def get_numpy(x):
    """ Get numpy array for both cuda and not. """
    if use_cuda:
        return x.cpu().data.numpy()
    return x.data.numpy()

In [20]:
#Initialize the two fields: Sequence and Class
SEQ = data.Field(sequential=True)
LABEL = data.Field(sequential=False)

#Load the data
train_set, validation_set, test_set = data.TabularDataset.splits(path='./data/',
                                                                 train='train_filtered.txt',
                                                                 validation='val_filtered.txt',
                                                                 test='test_filtered.txt', 
                                                                 format = 'csv',
                                                                 fields=[('sequence', SEQ), ('label', LABEL)])

In [21]:
print('train_set.fields:', list(train_set.fields.keys()))
print('validation_set.fields:', list(validation_set.fields.keys()))
print('test_set.fields:', list(test_set.fields.keys()))
print()
print('size of training set', len(train_set))
print('size of validation set', len(validation_set))
print()
print('content of first training sample:')
pprint(vars(train_set[0]))

train_set.fields: ['sequence', 'label']
validation_set.fields: ['sequence', 'label']
test_set.fields: ['sequence', 'label']

size of training set 14611
size of validation set 2084

content of first training sample:
{'label': '3',
 'sequence': ['T',
              'G',
              'G',
              'G',
              'C',
              'T',
              'C',
              'C',
              'C',
              'G',
              'C',
              'C',
              'T',
              'C',
              'A',
              'G',
              'T',
              'G',
              'C',
              'G',
              'C',
              'A',
              'T',
              'G',
              'T',
              'T',
              'C',
              'A',
              'C',
              'T',
              'G',
              'G',
              'G',
              'C',
              'G',
              'T',
              'C',
              'T',
              'T',
              'C',
          

              'T',
              'G',
              'G',
              'T',
              'C',
              'C',
              'T',
              'G',
              'A',
              'T',
              'G',
              'G',
              'G',
              'C',
              'A',
              'G',
              'G',
              'A',
              'G',
              'A',
              'T',
              'G',
              'G',
              'A',
              'C',
              'C',
              'C',
              'G',
              'C',
              'C',
              'A',
              'A',
              'A',
              'T',
              'C',
              'C',
              'A',
              'G',
              'A',
              'G',
              'G',
              'A',
              'G',
              'G',
              'T',
              'G',
              'A',
              'A',
              'A',
              'A',
              'C',
              'G',
            

In [22]:
# build the vocabularies
url = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.simple.vec'
SEQ.build_vocab(train_set, max_size=None, vectors=Vectors('wiki.simple.vec', url=url))
LABEL.build_vocab(train_set)

In [23]:
print('Text fields:')
print(' size of vocabulary:', len(SEQ.vocab))
print(" vocabulary's embedding dimension:", SEQ.vocab.vectors.size())
print(' no. times the "N" appear in the dataset:', SEQ.vocab.freqs['N'])

print('\nLabel fields:')
#print('keys of LABEL.vocab:', list(LABEL.vocab.__dict__.keys()))
print(" list of vocabulary (int-to-str):", LABEL.vocab.itos)
print(" list of vocabulary (str-to-int):", dict(LABEL.vocab.stoi))

Text fields:
 size of vocabulary: 7
 vocabulary's embedding dimension: torch.Size([7, 300])
 no. times the "N" appear in the dataset: 85

Label fields:
 list of vocabulary (int-to-str): ['<unk>', '1', '0', '3', '4', '2', '5']
 list of vocabulary (str-to-int): {'<unk>': 0, '1': 1, '0': 2, '3': 3, '4': 4, '2': 5, '5': 6}


In [33]:
# make iterator for splits
train_iter, val_iter = data.BucketIterator.splits((train_set, validation_set),
                                                     batch_size=128, 
                                                     device=0 if use_cuda else -1,
                                                     sort_key=lambda x: len(x.sequence), #Sorting within the batch
                                                     sort_within_batch=False,
                                                     repeat=False)

The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.
The `device` argument should be set by using `torch.device` or passing a string as an argument. This behavior will be deprecated soon and currently defaults to cpu.


In [34]:
# print batch information
batch = next(iter(train_iter))
print("dimension of batch's text:", batch.sequence.size())
print("first sequence in text:", batch.sequence[:,0])
print("correct label index:", batch.label[0])
print("the actual label:", LABEL.vocab.itos[get_numpy(batch.label[0])])

dimension of batch's text: torch.Size([11747, 128])
first sequence in text: tensor([4, 3, 5,  ..., 1, 1, 1])
correct label index: tensor(1)
the actual label: 1


In [35]:
# size of embeddings
embedding_dim = SEQ.vocab.vectors.size()[1]
num_embeddings = SEQ.vocab.vectors.size()[0]
num_classes = len(LABEL.vocab.itos)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        #learn a new embedding
        self.embeddings = nn.Embedding(num_embeddings, embedding_dim)

        # use pretrained embeddings
        self.embeddings.weight.data.copy_(SEQ.vocab.vectors)
        self.embeddings.weight.detach_()
        
        
        self.rnn_1 = LSTM(input_size=embedding_dim,
                         hidden_size=100,
                         num_layers=1,
                         bidirectional=False)
        
        self.l_out = Linear(in_features=200,
                            out_features=num_classes,
                            bias=False)
        
    def forward(self, x):
        out = {}
        
        # get embeddings
        x = self.embeddings(x)
        
        # rnn returns output and last hidden state
        x, hn = self.rnn_1(x)
        
        # get a fixed sized hidden representation of the entire sequence
        out['hidden'] = x = torch.cat((torch.mean(x, dim=0), torch.max(x, dim=0)[0]), dim=1)
        
        # classify
        out['out'] = softmax(self.l_out(x), dim=1)
        return out

net = Net()
if use_cuda:
    net.cuda()
    print("using cuda")
print(net)

Net(
  (embeddings): Embedding(7, 300)
  (rnn_1): LSTM(300, 100)
  (l_out): Linear(in_features=200, out_features=7, bias=False)
)


In [36]:
# check which params require grad
{p[0]: p[1].requires_grad for p in net.named_parameters()}

{'embeddings.weight': False,
 'rnn_1.weight_ih_l0': True,
 'rnn_1.weight_hh_l0': True,
 'rnn_1.bias_ih_l0': True,
 'rnn_1.bias_hh_l0': True,
 'l_out.weight': True}

In [37]:
criterion = nn.CrossEntropyLoss()
# we filter the model's parameters such that we can remove the embedding layer, 
# which does not have requires_grad
optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=0.001)

def accuracy(ys, ts):
    # making a one-hot encoded vector of correct (1) and incorrect (0) predictions
    correct_prediction = torch.eq(torch.max(ys, 1)[1], ts)
    # averaging the one-hot encoded vector
    return torch.mean(correct_prediction.float())

def construct_sentences(batch):
    return [" ".join([SEQ.vocab.itos[elm] 
                      for elm in get_numpy(batch.sequence[:,i])])
            for i in range(batch.sequence.size()[1])]

def get_labels(batch):
    return [LABEL.vocab.itos[get_numpy(batch.label[i])] for i in range(len(batch.label))]

In [None]:
max_iter = 25000
eval_every = 1000
log_every = 500

train_loss, train_accs = [], []

net.train()
for i, batch in enumerate(train_iter):
    if i % eval_every == 0:
        net.eval()
        val_losses, val_accs, val_lengths = 0, 0, 0
        val_meta = {'label_idx': [], 'sentences': [], 'labels': []}
        for val_batch in val_iter:
            output = net(val_batch.sequence)
            # batches sizes might vary, which is why we cannot just mean the batch's loss
            # we multiply the loss and accuracies with the batch's size,
            # to later divide by the total size
            val_losses += criterion(output['out'], val_batch.label) * val_batch.batch_size
            val_accs += accuracy(output['out'], val_batch.label) * val_batch.batch_size
            val_lengths += val_batch.batch_size
            
            for key, _val in output.items():
                if key not in val_meta:
                    val_meta[key] = []
                val_meta[key].append(get_numpy(_val)) 
            val_meta['label_idx'].append(get_numpy(val_batch.label))
            val_meta['sentences'].append(construct_sentences(val_batch))
            val_meta['labels'].append(get_labels(val_batch))
        
        for key, _val in val_meta.items():
            val_meta[key] = np.concatenate(_val)
        
        # divide by the total accumulated batch sizes
        val_losses /= val_lengths
        val_accs /= val_lengths
        
        print("### EVAL loss: {:.2f} accs: {:.2f}".format(get_numpy(val_losses),
                                                          get_numpy(val_accs)))
        net.eval()

        
    output = net(batch.text)
    batch_loss = criterion(output['out'], batch.label)
    
    train_loss.append(get_numpy(batch_loss))
    train_accs.append(get_numpy(accuracy(output['out'], batch.label)))
    
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()
    
    if i % log_every == 0:        
        print("train, it: {} loss: {:.2f} accs: {:.2f}".format(i, 
                                                               np.mean(train_loss), 
                                                               np.mean(train_accs)))
        # reset
        train_loss, train_accs = [], []
        
    
    if max_iter < i:
        break