In [1]:
#importing the required libraries
import torch
import torchtext
import spacy
spacy.load('en')

<spacy.lang.en.English at 0x7f752cfab748>

In [2]:
#arguments specify the processing of data like which type of tokenization to be used etc.
TEXT = torchtext.data.Field(lower=True,tokenize='spacy')
Label = torchtext.data.LabelField(tensor_type=torch.FloatTensor)

In [3]:
train , test = torchtext.datasets.IMDB.splits(TEXT,Label)

In [4]:
print(len(train),len(test))

25000 25000


In [5]:
#I am reducing the size of my train set that I will use because of non-availability of GPU and time constraint
train,unused_train=train.split(split_ratio = 0.1)
test,unused_test = test.split(split_ratio = 0.04)

In [6]:
print(len(train),len(test))

2500 1000


In [7]:
#now we will create the validation set and for that we will use test set, not train set
train,validation = train.split(split_ratio=0.8)

In [8]:
print(len(train),len(test),len(validation))

2000 1000 500


In [9]:
#building the vocabulary/ the lookup table for our words and using the pretrained word vector embeddings(glove model)
#keep in mind the ram access of your python version to avoid memory error
TEXT.build_vocab(train,vectors='glove.6B.100d',max_size=25000)
Label.build_vocab(train)

In [10]:
#create iterators with batches to feed into the network for training,validation,testing 
#using bucket iterators which puts examples of similar length into same bucket means by the amount of <pad> required for them
batch_size = 8

train_iter, valid_iter, test_iter = torchtext.data.BucketIterator.splits((train, validation, test), batch_size=batch_size, sort_key=lambda x: len(x.text), repeat=False)

In [11]:
#now define the model, here we will use LSTM which is a modified version of RNN as well as dropout as our regularizer
#LSTM will be bidirectional 

class lstm_arch(torch.nn.Module):
    
    def __init__(self,embed_dim,vocab_size,hid_dim,out_dim,dropout_prob): 
        super().__init__()
        self.embedding = torch.nn.Embedding(vocab_size,embed_dim)           #prepare the lookup table for word embeddings
        self.rnn = torch.nn.LSTM(embed_dim,hid_dim,bias=True,num_layers=2,bidirectional=True,dropout=dropout_prob)  #LSTM 2 layered and bidirectional
        self.fc = torch.nn.Linear(hid_dim*2,out_dim)          #fully connected layer for output
        self.dropout = torch.nn.Dropout(p = dropout_prob)
    
    def forward(self,feed_data):
        embed_out = self.dropout(self.embedding(feed_data))
        rnn_out,(rnn_hid,rnn_cell) = self.rnn(embed_out)
        hidden = self.dropout(torch.cat((rnn_hid[-2,:,:], rnn_hid[-1,:,:]), dim=1))
        return self.fc(hidden.squeeze(0))

In [12]:
#creating an instance of our lstm_arch class or the specific model to train and test
model = lstm_arch(vocab_size=len(TEXT.vocab),embed_dim=100,hid_dim=256,out_dim=1,dropout_prob=0.5)

In [13]:
#now loading the values of pretrained embeddings into the embedding layer of our model
pretrained = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained)


tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0156,  0.3192,  0.2856,  ...,  0.5062, -0.0928,  0.5417],
        [ 0.4981, -0.6555,  0.1981,  ...,  0.4539, -0.6788,  0.2721]])

In [14]:
optimizer = torch.optim.Adam(model.parameters(),betas=(0.7,0.995),lr=0.005)  #optimizer for our model used adam here
criterion = torch.nn.BCEWithLogitsLoss()                   #criterion to evaluate loss

In [15]:
#function to train our model we calculate both loss and accuracy of the model
def train(model, iterator, optimizer, criterion,epoch):
    
    model.train()
    for i in range(epoch):
        epoch_loss = 0
        epoch_acc = 0
        for batch in iterator:
        
            optimizer.zero_grad()
        
            predictions = model(batch.text).squeeze(1)
        
            loss = criterion(predictions, batch.label)
        
            rounded_predictions = torch.round(torch.nn.functional.sigmoid(predictions))
            correct = (rounded_predictions == batch.label).float() 
            acc = correct.sum()/len(correct)
        
            loss.backward()
        
            optimizer.step()
        
            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
        print('loss: ',epoch_loss / len(iterator),' accuracy: ' ,epoch_acc / len(iterator))    

In [16]:
#function for evaluation of model's performance
def evaluate(model, iterator, optimizer, criterion,epoch):
    
    model.eval()
    
    with torch.no_grad():
        for i in range(epoch):
            epoch_loss = 0
            epoch_acc = 0
            for batch in iterator:

                predictions = model(batch.text).squeeze(1)
                loss = criterion(predictions, batch.label)
            
                rounded_predictions = torch.round(torch.nn.functional.sigmoid(predictions))
                correct = (rounded_predictions == batch.label).float() 
                acc = correct.sum()/len(correct)
        
                epoch_loss += loss.item()
                epoch_acc += acc.item()
        
            print('loss: ',epoch_loss / len(iterator),' accuracy: ' ,epoch_acc / len(iterator))

In [17]:
#here losses are a bit high and accuracy is low compared to the one where we train the model with 25000 training examples and validate on 5000 examples taken from test set and finally we evaluate on 20000 test examples
for i in range(3):
    print('epoch %d :'%(i+1))
    train(model,train_iter,optimizer,criterion,1)
    evaluate(model,valid_iter,optimizer,criterion,1)
    print('\n')
    
    

epoch 1 :
loss:  0.716819109916687  accuracy:  0.51


  return Variable(arr, volatile=not train)


loss:  0.6988581352763705  accuracy:  0.47023809523809523


epoch 2 :
loss:  0.7548297755718231  accuracy:  0.4965
loss:  0.6881475155315702  accuracy:  0.5376984126984127


epoch 3 :
loss:  0.7458459858894348  accuracy:  0.542
loss:  0.7193611689976284  accuracy:  0.501984126984127




In [18]:
#checking the results on the test set we can see that our model generalizes well enough just for more accuracy use all data
print('test results: ')
evaluate(model,test_iter,optimizer,criterion,1)

test results: 


  return Variable(arr, volatile=not train)


loss:  0.7121250720024109  accuracy:  0.533
