In [4]:
# !pip3 install torchtext==0.4

Collecting torchtext==0.4
[?25l  Downloading https://files.pythonhosted.org/packages/43/94/929d6bd236a4fb5c435982a7eb9730b78dcd8659acf328fd2ef9de85f483/torchtext-0.4.0-py3-none-any.whl (53kB)
[K     |████████████████████████████████| 61kB 759kB/s eta 0:00:01
Installing collected packages: torchtext
Successfully installed torchtext-0.4.0


In [5]:
import torch
import torchtext
from torch import nn 
import torch.nn.functional as F
from torchtext.vocab import Vectors, GloVe
from torch.autograd import Variable
import torch.optim as optim
import torchtext
import numpy as np
import torch.distributions
from torchtext import datasets
from torchtext import data
# from models.LSTM import LSTMClassifier

In [6]:
# model.py

class GANet(torch.nn.Module):
		def __init__(self, batch_size, num_classes, mlp_out_size, vocab_size, embedding_length, weights, aux_hidden_size = 100, backbone_hidden_size = 100, tau = 1, biDirectional_aux = False, biDirectional_backbone = False):
			super(GANet, self).__init__() 
			"""
			Arguments
			---------
			batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
			output_size : 6 = (For TREC dataset)
			hidden_sie : Size of the hidden_state of the LSTM   (// Later BiLSTM)
			vocab_size : Size of the vocabulary containing unique words
			embedding_length : Embeddding dimension of GloVe word embeddings
			weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 

			--------

			"""

			self.batch_size = batch_size
			self.num_classes = num_classes
			self.vocab_size = vocab_size
			self.embedding_length = embedding_length
			self.aux_hidden_size = aux_hidden_size
			self.backbone_hidden_size = backbone_hidden_size 
			self.mlp_out_size = mlp_out_size
			self.biDirectional_aux = biDirectional_aux
			self.biDirectional_backbone = biDirectional_backbone
			self.tau = tau

			self.word_embeddings = nn.Embedding(vocab_size, embedding_length)
			self.word_embeddings.weights = nn.Parameter(weights, requires_grad=False)

			self.auxiliary = AuxiliaryNet(self.batch_size, self.aux_hidden_size, self.embedding_length, self.biDirectional_aux, tau = self.tau)
			self.backbone = BackboneNet(self.batch_size, self.backbone_hidden_size, self.embedding_length, self.biDirectional_backbone)
	 
			if(self.biDirectional_backbone):
				self.mlp = MLP(self.backbone_hidden_size * 2, self.mlp_out_size)
				self.FF = nn.Linear(self.backbone_hidden_size * 2,num_classes)
			else:
				self.mlp = MLP(self.backbone_hidden_size, self.mlp_out_size)
				self.FF = nn.Linear(self.backbone_hidden_size,num_classes)
			# self.softmax = nn.Softmax(dim = -1)
			

		def masked_Softmax(self, logits, mask):
			mask_bool = mask >0
			logits[~mask_bool] = float('-inf')
			return torch.softmax(logits, dim=1)	

		def forward(self,input_sequence, is_train = True):
			input_ = self.word_embeddings(input_sequence)
			g_t = self.auxiliary(input_, is_train)
			out_lstm = self.backbone(input_)

			if is_train:
				e_t = self.mlp(out_lstm)
				alpha = torch.softmax(e_t, dim = 1)
			else:
				e_t = self.mlp(out_lstm)               # change if possible!
				alpha = self.masked_Softmax(e_t, g_t)

			c_t = torch.bmm(alpha.transpose(1,2), out_lstm)
			logits = self.FF(c_t)
			final_output = torch.softmax(logits, dim = -1)
			# final_output = final_output.max(2)[1]
			final_output = final_output.squeeze(1)


			return final_output, g_t


In [7]:
class AuxiliaryNet(torch.nn.Module):
    """
    Arguments
    ---------
    batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
    aux_hidden_size : Size of the hidden_state of the LSTM   (* Later BiLSTM, check dims for BiLSTM *)
    embedding_length : Embeddding dimension of GloVe word embeddings
    --------
    """
    def __init__(self, batch_size, auxiliary_hidden_size, embedding_length, biDirectional = False, num_layers = 1, tau=1):
        super(AuxiliaryNet, self).__init__()
        self.batch_size = batch_size
        self.hidden_size = auxiliary_hidden_size
        self.embedding_length = embedding_length	
        self.biDirectional	= biDirectional
        self.num_layers = num_layers

        self.aux_lstm = nn.LSTM(self.embedding_length, self.hidden_size, bidirectional = self.biDirectional, num_layers = self.num_layers, batch_first = True)   # Dropout  
        if(self.biDirectional):
            self.aux_linear = nn.Linear(self.hidden_size * 2,1)
        else:
            self.aux_linear = nn.Linear(self.hidden_size,1)
        self.sigmoid = torch.nn.Sigmoid()
        self.tau = tau


    def forward(self, input_sequence, is_train = True, batch_size=None):

        # input : Dimensions (batch_size x seq_len x embedding_length)
        out_lstm, (final_hidden_state, final_cell_state) = self.aux_lstm(input_sequence) # ouput dim: (batch_size x seq_len x hidden_size) 
        out_linear = self.aux_linear(out_lstm)                                           # p_t dim: (batch_size x seq_len x 1)
        p_t = self.sigmoid(out_linear)

        if is_train:
            p_t = p_t.repeat(1,1,2)
            p_t[:,:,0] = 1 - p_t[:,:,0] 
            g_hat = F.gumbel_softmax(p_t, self.tau, hard=False)   
            g_t = g_hat[:,:,1]
        else:
            # size : same as p_t [ batch_size x seq_len x 1]
            m = torch.distributions.bernoulli.Bernoulli(p_t)   
            g_t = m.sample()
        return g_t

In [8]:
class BackboneNet(torch.nn.Module):
	"""
		Arguments
		---------
		batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
		backbone_hidden_size : Size of the hidden_state of the LSTM   (* Later BiLSTM, check dims for BiLSTM *)
		embedding_length : Embeddding dimension of GloVe word embeddings
		--------
		"""
	def __init__(self, batch_size, backbone_hidden_size, embedding_length, biDirectional = False, num_layers = 2):

		super(BackboneNet, self).__init__()
		self.batch_size = batch_size
		self.hidden_size = backbone_hidden_size
		self.embedding_length = embedding_length
		self.biDirectional	= biDirectional
		self.num_layers = num_layers

		self.backbone_lstm = nn.LSTM(self.embedding_length, self.hidden_size, bidirectional = self.biDirectional, batch_first = True, num_layers = self.num_layers)   # Dropout  

	def forward(self, input_sequence, batch_size=None):
		out_lstm, (final_hidden_state, final_cell_state) = self.backbone_lstm(input_sequence)   # ouput dim: ( batch_size x seq_len x hidden_size )
		return out_lstm


In [9]:
class MLP(torch.nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MLP, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim

        self.ff_1 = nn.Linear(self.input_dim, self.output_dim)
        self.relu = nn.ReLU()
        self.ff_2 = nn.Linear(self.output_dim,1)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x):
        out_1 = self.ff_1(x)
        out_relu = self.relu(out_1)
        out_2 = self.ff_2(out_relu)
        out_sigmoid = self.sigmoid(out_2)

        return out_sigmoid 

In [10]:
def clip_gradient(model, clip_value):
    params = list(filter(lambda p: p.grad is not None, model.parameters()))
    for p in params:
        p.grad.data.clamp_(-clip_value, clip_value)
    
def train_model(model, train_iter, epoch, batch_size):
    total_epoch_loss = 0
    total_epoch_acc = 0
    # model.cuda()
    optim = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()))
    steps = 0
    model.train()
    for idx, batch in enumerate(train_iter):
        text = batch.text[0]
        target = batch.label
        target = torch.autograd.Variable(target).long()
        if torch.cuda.is_available():
            text = text.cuda()
            target = target.cuda()
        if (text.size()[0] is not batch_size):# One of the batch returned by BucketIterator has length different than 32.
            continue
        optim.zero_grad()
        prediction, g_t = model(text, is_train = True)
        # print("prediction = ", prediction.shape)
        # print("target = ", target.shape)
        # print("prediction = ", prediction)
        # print("target = ", target)
        # loss = loss_fn(prediction, target, g_t)
        loss =  loss_fn(prediction, target)
        # print("loss = ", loss)
        num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).float().sum()
        acc = 100.0 * num_corrects/len(batch)
        loss.backward()
        clip_gradient(model, 1e-1)
        optim.step()
        steps += 1
        
        # if steps % 10 == 0:
            # print (f'Epoch: {epoch+1}, Idx: {idx+1}, Training Loss: {loss.item():.4f}, Training Accuracy: {acc.item(): .2f}%')
            # break
        
        total_epoch_loss += loss.item()
        total_epoch_acc += acc.item()

        
    return total_epoch_loss/len(train_iter), total_epoch_acc/len(train_iter)

def eval_model(model, val_iter):
    total_epoch_loss = 0
    total_epoch_acc = 0
    total_attention =  0
    total_samples = 0 
    model.eval()
    with torch.no_grad():
        for idx, batch in enumerate(val_iter):
            text = batch.text[0]
            if (text.size()[0] is not 32):
                continue
            target = batch.label
            target = torch.autograd.Variable(target).long()
            if torch.cuda.is_available():
                text = text.cuda()
                target = target.cuda()
            prediction, g_t = model(text, is_train = False)
            # Sanity check
            # print("Test Prediction: ", prediction)
            # print("Gate values: ", g_t)

            # For density calculation
            total_attention += torch.sum(g_t)
            # print(total_attention)
            # print(g_t.shape)
            total_samples += g_t.shape[0] * g_t.shape[1]
            # loss = loss_fn(prediction, target, g_t)
            loss =  loss_fn(prediction, target)
            num_corrects = (torch.max(prediction, 1)[1].view(target.size()).data == target.data).sum()
            acc = 100.0 * num_corrects/len(batch)
            total_epoch_loss += loss.item()
            total_epoch_acc += acc.item()

    return total_epoch_loss/len(val_iter), total_epoch_acc/len(val_iter), total_attention/total_samples 


In [11]:
# data.py
def load_TREC_data(batch_size= 32, embedding_length = 100):
    # set up fields
    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length= 10)
    # LABEL = data.LabelField()
    LABEL = data.LabelField(dtype=torch.float)

    # make splits for data
    train, test = datasets.TREC.splits(TEXT, LABEL)

    # build the vocabulary
    TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=embedding_length))
    LABEL.build_vocab(train)
    print(LABEL.vocab.__dict__)

    # make iterator for splits
    train_iter, test_iter = data.BucketIterator.splits(
      (train, test), batch_size= batch_size, device=0)

    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    return TEXT, vocab_size, word_embeddings, train_iter, test_iter

In [12]:
# main.py
TEXT, vocab_size, word_embeddings, train_iter, test_iter = load_TREC_data()

downloading train_5500.label


train_5500.label: 100%|██████████| 336k/336k [00:00<00:00, 1.74MB/s]


downloading TREC_10.label


TREC_10.label: 100%|██████████| 23.4k/23.4k [00:00<00:00, 427kB/s]
.vector_cache/glove.6B.zip: 862MB [32:52, 437kB/s]                                


{'freqs': Counter({'ENTY': 1250, 'HUM': 1223, 'DESC': 1162, 'NUM': 896, 'LOC': 835, 'ABBR': 86}), 'itos': ['ENTY', 'HUM', 'DESC', 'NUM', 'LOC', 'ABBR'], 'unk_index': None, 'stoi': defaultdict(None, {'ENTY': 0, 'HUM': 1, 'DESC': 2, 'NUM': 3, 'LOC': 4, 'ABBR': 5}), 'vectors': None}


In [13]:
def loss_fn(output, target, g_t, lambda_ = 1e-4):
  T = len(g_t)
  # loss = -nn.LogSoftmax(output[target], dim = 1) + (lambda_ * torch.sum(g_t))/T
  loss = F.cross_entropy(output, target) + (lambda_ * torch.sum(g_t))/T
  return loss

In [14]:
loss_fn = F.cross_entropy

learning_rate = 2e-5
batch_size = 32
output_size = 2
hidden_size = 256
embedding_length = 100
num_classes = 6
mlp_out_size = 32
weights = word_embeddings
aux_hidden_size = 100
batch_hidden_size = 100
tau = 0.5

model = GANet(batch_size, num_classes, mlp_out_size, vocab_size, embedding_length, weights, tau= tau, biDirectional_aux=False, biDirectional_backbone=False)

 99%|█████████▉| 397670/400000 [00:30<00:00, 23881.79it/s]

In [0]:
for epoch in range(50):
    train_loss, train_acc = train_model(model, train_iter, epoch, batch_size)
    # val_loss, val_acc = eval_model(model, valid_iter)
    
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%')
    # print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc:.2f}%, Val. Loss: {val_loss:3f}, Val. Acc: {val_acc:.2f}%')

Epoch: 01, Train Loss: 1.062, Train Acc: 96.97%
Epoch: 02, Train Loss: 1.062, Train Acc: 96.97%
Epoch: 03, Train Loss: 1.061, Train Acc: 97.09%
Epoch: 04, Train Loss: 1.061, Train Acc: 97.04%
Epoch: 05, Train Loss: 1.061, Train Acc: 97.06%
Epoch: 06, Train Loss: 1.061, Train Acc: 97.06%
Epoch: 07, Train Loss: 1.060, Train Acc: 97.15%
Epoch: 08, Train Loss: 1.061, Train Acc: 97.04%
Epoch: 09, Train Loss: 1.061, Train Acc: 97.09%
Epoch: 10, Train Loss: 1.060, Train Acc: 97.13%
Epoch: 11, Train Loss: 1.060, Train Acc: 97.13%
Epoch: 12, Train Loss: 1.061, Train Acc: 97.09%
Epoch: 13, Train Loss: 1.060, Train Acc: 97.15%
Epoch: 14, Train Loss: 1.060, Train Acc: 97.19%
Epoch: 15, Train Loss: 1.059, Train Acc: 97.22%
Epoch: 16, Train Loss: 1.059, Train Acc: 97.22%
Epoch: 17, Train Loss: 1.060, Train Acc: 97.13%
Epoch: 18, Train Loss: 1.061, Train Acc: 97.08%
Epoch: 19, Train Loss: 1.059, Train Acc: 97.22%
Epoch: 20, Train Loss: 1.059, Train Acc: 97.24%
Epoch: 21, Train Loss: 1.059, Train Acc:

In [0]:
test_loss, test_acc, density = eval_model(model, test_iter)
print(f'Test Loss: {test_loss:.3f}, Test Acc: {test_acc:.2f}, Density: {density:.4f} ')

Test Loss: 1.136, Test Acc: 77.54, Density: 0.4812 


In [0]:
def test_sentence(test_sen):

  test_sen = TEXT.preprocess(test_sen)
  print(test_sen)
  test_sen = [[TEXT.vocab.stoi[x] for x in test_sen]]
  # print(test_sen)

  test_sen = np.asarray(test_sen)
  test_sen = torch.LongTensor(test_sen)
  test_tensor = Variable(test_sen, volatile=True)

  # print(test_tensor)
  model.eval()
  prediction, g_t = model(test_tensor, is_train = False)
  print("prediction =", prediction)
  print("g =", g_t)
  out_class = torch.argmax(prediction)
  return out_class

# ['ENTY', 'HUM', 'DESC', 'NUM', 'LOC', 'ABBR']
test_sen0 = "What does the six-footed Musca domestica become when it enters a house ?" # class = Entity - 0
test_sen1 = "Who killed Gandhi?"   # Class: HUM - 1
test_sen2 = "What does target heart rate mean ?" # class = "DESC"
test_sen3 = "How old was Joan of Arc when she died ?" # class = "NUM"
test_sen4 = "Where on the body is a mortarboard worn ?" # class = "LOC"
test_sen5 = "What does I.V. stand for ?" # class = "ABBR"
x = test_sentence(test_sen5)
print(x)