In [32]:
import random
import spacy
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext import data
nlp = spacy.load('en')

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Reproducing same results
SEED = 2019

#Torch
torch.manual_seed(SEED)

#Cuda algorithms
torch.backends.cudnn.deterministic = True  

### Data Preprocessing

In [4]:
TEXT = data.Field(tokenize='spacy',batch_first=True,include_lengths=True)
LABEL = data.LabelField(dtype = torch.float,batch_first=True)

In [None]:
print(TEXT)
print(LABEL)

<torchtext.data.field.Field object at 0x7fbb08cac580>
<torchtext.data.field.LabelField object at 0x7fbb08cac310>


In [5]:
fields = [(None, None), ('text',TEXT),('label', LABEL)]

In [6]:
print(fields)

[(None, None), ('text', <torchtext.data.field.Field object at 0x7f0588c7e7f0>), ('label', <torchtext.data.field.LabelField object at 0x7f0532a40198>)]


In [7]:
#loading custom dataset
training_data=data.TabularDataset(path = '/content/drive/My Drive/Colab Notebooks/datasets and models/quora.csv',format = 'csv',fields = fields,skip_header = True)

#print preprocessed text
print(vars(training_data.examples[0]))

{'text': ['Why', 'are', 'most', 'indian', 'parents', 'against', 'even', 'liking', 'someone', '?'], 'label': '1'}


In [8]:
train_data, valid_data = training_data.split(split_ratio=0.7, random_state = random.seed(SEED))

In [9]:
print(train_data)

<torchtext.data.dataset.Dataset object at 0x7f0532a40278>


#### 1 .Preparing input and output sequences:

Parameters:

    1. min_freq: Ignores the words in vocabulary which has frequency less than specified one and map it to        unknown token.
    2. Two special tokens known as unknown and padding will be added to the vocabulary
      - Unknown token is used to handle Out Of Vocabulary words
      - Padding token is used to make input sequences of same length


In [11]:
#initialize glove embeddings
TEXT.build_vocab(train_data,min_freq=3,vectors = "glove.6B.100d")  
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [08:30, 1.69MB/s]                           
100%|█████████▉| 399540/400000 [00:22<00:00, 19010.24it/s]

In [12]:
#No. of unique tokens in text
print("Size of TEXT vocabulary:",len(TEXT.vocab))

Size of TEXT vocabulary: 17135


In [14]:
#No. of unique tokens in label
print("Size of LABEL vocabulary:",len(LABEL.vocab))

Size of LABEL vocabulary: 46


In [15]:
#Commonly used words
print(TEXT.vocab.freqs.most_common(10))  

[('?', 76227), ('the', 39445), ('to', 26136), ('a', 22200), (',', 20512), ('of', 19570), ('in', 18979), ('and', 18489), ('Why', 17491), ('is', 17378)]


In [16]:
#Word dictionary
print(TEXT.vocab.stoi)   



In [17]:
#check whether cuda is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  

In [18]:
#set batch size
BATCH_SIZE = 64

In [19]:
#Load an iterator
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, valid_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True,
    device = device)

### Model Architecture

In [20]:
class Model(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout):
        
        #Constructor
        super().__init__()          
        
        #embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #lstm layer
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        #dense layer
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        
        #activation function
        self.act = nn.Sigmoid()
        
    def forward(self, text, text_lengths):
        #text = [batch size,sent_length]
        embedded = self.embedding(text)
        
        #embedded = [batch size, sent_len, emb dim]
      
        #packed sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths,batch_first=True)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        
        #hidden = [batch size, num layers * num directions,hid dim]
        #cell = [batch size, num layers * num directions,hid dim]
        
        #concat the final forward and backward hidden state
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
                
        #hidden = [batch size, hid dim * num directions]
        dense_outputs=self.fc(hidden)

        #Final activation function
        outputs=self.act(dense_outputs)
        
        return outputs

#### 1 .Setting Hyperparameters

In [21]:
#define hyperparameters
size_of_vocab = len(TEXT.vocab)
embedding_dim = 100
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 2
bidirection = True
dropout = 0.2

#instantiate the model
model = Model(size_of_vocab, embedding_dim, num_hidden_nodes,num_output_nodes, num_layers, 
                   bidirectional = True, dropout = dropout)

In [22]:
#architecture
print(model)

Model(
  (embedding): Embedding(17135, 100)
  (lstm): LSTM(100, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)


In [23]:
#No. of trianable parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [24]:
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 1,772,957 trainable parameters


In [25]:
#Initialize the pretrained embedding
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.1638,  0.6046,  1.0789,  ..., -0.3140,  0.1844,  0.3624],
        ...,
        [-0.5879,  0.0165, -0.6412,  ...,  0.1248,  0.0362, -0.3741],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

In [26]:
#define optimizer and loss
optimizer = optim.Adam(model.parameters())
criterion = nn.BCELoss()

In [27]:
#define metric
def binary_accuracy(preds, y):
    #round predictions to the closest integer
    rounded_preds = torch.round(preds)
    
    correct = (rounded_preds == y).float() 
    acc = correct.sum() / len(correct)
    return acc

In [28]:
#push to cuda if available
model = model.to(device)
criterion = criterion.to(device)

In [29]:
def train(model, iterator, optimizer, criterion):
    
    #initialize every epoch 
    epoch_loss = 0
    epoch_acc = 0
    
    #set the model in training phase
    model.train()  
    
    for batch in iterator:
        
        #resets the gradients after every batch
        optimizer.zero_grad()   
        
        #retrieve text and no. of words
        text, text_lengths = batch.text   
        
        #convert to 1D tensor
        predictions = model(text, text_lengths).squeeze()  
        
        #compute the loss
        loss = criterion(predictions, batch.label)        
        
        #compute the binary accuracy
        acc = binary_accuracy(predictions, batch.label)   
        
        #backpropage the loss and compute the gradients
        loss.backward()       
        
        #update the weights
        optimizer.step()      
        
        #loss and accuracy
        epoch_loss += loss.item()  
        epoch_acc += acc.item()    
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [30]:
def evaluate(model, iterator, criterion):
    
    #initialize every epoch
    epoch_loss = 0
    epoch_acc = 0

    #deactivating dropout layers
    model.eval()
    
    #deactivates autograd
    with torch.no_grad():
    
        for batch in iterator:
        
            #retrieve text and no. of words
            text, text_lengths = batch.text
            
            #convert to 1d tensor
            predictions = model(text, text_lengths).squeeze()
            
            #compute loss and accuracy
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            #keep track of loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [36]:
N_EPOCHS = 20
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
     
    #train the model
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    
    #evaluate the model
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    #save the best model
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

	Train Loss: -0.151 | Train Acc: 90.26%
	 Val. Loss: 0.406 |  Val. Acc: 88.84%
	Train Loss: -1.063 | Train Acc: 91.92%
	 Val. Loss: 0.382 |  Val. Acc: 88.15%
	Train Loss: -1.149 | Train Acc: 92.90%
	 Val. Loss: 0.395 |  Val. Acc: 88.20%
	Train Loss: -1.167 | Train Acc: 93.84%
	 Val. Loss: 0.350 |  Val. Acc: 88.03%
	Train Loss: -1.207 | Train Acc: 94.57%
	 Val. Loss: 0.425 |  Val. Acc: 87.66%
	Train Loss: -1.278 | Train Acc: 95.53%
	 Val. Loss: 0.496 |  Val. Acc: 87.39%
	Train Loss: -1.276 | Train Acc: 96.21%
	 Val. Loss: 0.589 |  Val. Acc: 86.89%
	Train Loss: -1.288 | Train Acc: 96.73%
	 Val. Loss: 0.684 |  Val. Acc: 86.36%
	Train Loss: -1.343 | Train Acc: 97.47%
	 Val. Loss: 0.693 |  Val. Acc: 86.44%
	Train Loss: -1.355 | Train Acc: 97.97%
	 Val. Loss: 0.705 |  Val. Acc: 86.27%
	Train Loss: -1.365 | Train Acc: 98.28%
	 Val. Loss: 0.813 |  Val. Acc: 86.11%
	Train Loss: -1.369 | Train Acc: 98.55%
	 Val. Loss: 0.917 |  Val. Acc: 86.09%
	Train Loss: -1.325 | Train Acc: 98.74%
	 Val. Loss:

In [37]:
#load weights
path='saved_weights.pt'
model.load_state_dict(torch.load(path))
model.eval()                             

Model(
  (embedding): Embedding(17135, 100)
  (lstm): LSTM(100, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act): Sigmoid()
)

In [38]:
def predict(model, sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  #tokenize the sentence 
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          #convert to integer sequence
    length = [len(indexed)]                                    #compute no. of words
    tensor = torch.LongTensor(indexed).to(device)              #convert to tensor
    tensor = tensor.unsqueeze(1).T                             #reshape in form of batch,no. of words
    length_tensor = torch.LongTensor(length)                   #convert to tensor
    prediction = model(tensor, length_tensor)                  #prediction 
    return prediction.item()      

In [39]:
#make predictions
predict(model, "Are there any sports that you don't like?")

#insincere question
predict(model, "Why Indian girls go crazy about marrying Shri. Rahul Gandhi ji?")

0.9782972931861877