<a href="https://colab.research.google.com/github/venkatanadikatla/pytorch/blob/main/RNN_with_SST_Better_Accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip uninstall -y torchtext
!pip install torchtext==0.6.0

Found existing installation: torchtext 0.6.0
Uninstalling torchtext-0.6.0:
  Successfully uninstalled torchtext-0.6.0
Collecting torchtext==0.6.0
  Using cached torchtext-0.6.0-py3-none-any.whl (64 kB)
Installing collected packages: torchtext
Successfully installed torchtext-0.6.0


In [134]:
import copy # this module provides functions to duplicate objects. It seems to be imported but not yet used in the code
import torch # The MAIN PyTorch package
from torch import nn # contains the essential modules for building NN in pytorch.
from torch import optim # Provides optimization algorithms, such as SGD, Adam, etc
import torchtext # A library for text processing that works well with pytorch (Currently a version 0.6.0 is being used in this code)
from torchtext import data # A module in torchtext used for data handling
from torchtext import datasets # provides datasets, including various NLP datasets.

# Text field: inputs are token sequences (sequential=True), batches are laid out
# with the batch dimension first (batch_first=True), and all text is lowercased
# (lower=True).
TEXT = data.Field(sequential=True, batch_first=True, lower=True)

# Label field: a Field subclass intended for classification labels.
LABEL = data.LabelField()

# Load the Stanford Sentiment Treebank (SST) and split it into
# training, validation and test sets.
train_data, val_data, test_data = datasets.SST.splits(TEXT, LABEL)

# Build the vocabularies (token -> index and label -> index mappings) from the
# training split only, so the text can be converted to the numeric form the
# network consumes.
TEXT.build_vocab(train_data)
LABEL.build_vocab(train_data)

# Hyperparameters
vocab_size = len(TEXT.vocab)  # number of entries in the text vocabulary
label_size = len(LABEL.vocab)  # number of distinct labels (classes) in the training data
padding_idx = TEXT.vocab.stoi['<pad>']  # index of the '<pad>' token used to pad sequences to equal length
embedding_dim = 300  # size of the word embeddings (dense vector representation of words)
hidden_dim = 512  # hidden size used by the model

# Build the iterators: data.BucketIterator.splits creates one iterator each for
# the training, validation and test sets, all with a batch size of 64.
train_iter, val_iter, test_iter = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=64)



In [135]:
class RNN(torch.nn.Module):
  """LSTM sentence classifier: embedding -> stacked LSTM -> linear layer.

  Args:
    vocab_size: number of rows in the embedding table.
    embedding_dim: size of each token embedding.
    hidden_dim: size of the LSTM hidden state.
    output_dim: number of scores produced per example (one per class).
    padding_idx: index of the padding token, forwarded to the embedding layer.
    num_layers: number of stacked LSTM layers. Defaults to 3, the value that
      used to be hard-coded.
  """

  def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx, num_layers=3):
    super().__init__()
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.hidden_dim = hidden_dim
    # Take the class count from the constructor argument; reading a module-level
    # `label_size` made the class unusable outside the notebook and could
    # silently disagree with `output_dim`.
    self.label_size = output_dim
    self.num_layers = num_layers
    self.padding_idx = padding_idx
    self.embedding = torch.nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
    # A single source of truth for the layer count keeps `zero_state` in sync
    # with the LSTM itself.
    self.lstm = torch.nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
    self.fc = torch.nn.Linear(hidden_dim, output_dim)

  def zero_state(self, batch_size, device=None):
    """Return an all-zero ``(h_0, c_0)`` pair for a batch of ``batch_size``.

    Each tensor has shape ``(num_layers, batch_size, hidden_dim)``. ``device``
    is optional; when omitted the tensors are created on the default device.
    """
    shape = (self.num_layers, batch_size, self.hidden_dim)
    return (torch.zeros(shape, device=device),
            torch.zeros(shape, device=device))

  def forward(self, text):
    """Map a batch of token indices to class scores.

    ``text`` is indexed batch-first (``text.size(0)`` is used as the batch
    size, matching ``batch_first=True`` on the LSTM). Returns a tensor of shape
    ``(batch, output_dim)``.
    """
    embedded = self.embedding(text)
    # Allocate the initial state directly on the input's device instead of
    # building it on the CPU and copying it over.
    h_0, c_0 = self.zero_state(text.size(0), device=text.device)
    _, (hidden, _) = self.lstm(embedded, (h_0, c_0))
    # hidden[-1] is the final hidden state of the top LSTM layer.
    return self.fc(hidden[-1])







In [128]:
def train_model(model, train_iter, optimizer, criterion, num_epochs=20):
  """Train ``model`` for ``num_epochs`` passes over ``train_iter``.

  After every epoch the mean batch loss and the accuracy (in percent) are
  printed. Nothing is returned.

  Args:
    model: module called as ``model(batch.text)`` to produce class scores.
    train_iter: iterable of batches exposing ``.text`` and ``.label`` tensors;
      it is iterated once per epoch.
    optimizer: optimizer whose ``zero_grad()`` / ``step()`` run once per batch.
    criterion: loss called as ``criterion(output, labels)``.
    num_epochs: number of passes over ``train_iter``.
  """
  model.train()
  # Send each batch to wherever the model's parameters live so training keeps
  # working after the model has been moved to another device.
  first_param = next(model.parameters(), None)
  device = None if first_param is None else first_param.device

  for epoch in range(num_epochs):
    epoch_loss = 0.0
    num_batches = 0
    correct = 0
    total = 0
    for batch in train_iter:
      text, labels = batch.text, batch.label
      if device is not None:
        text, labels = text.to(device), labels.to(device)

      optimizer.zero_grad()
      output = model(text)
      loss = criterion(output, labels)
      loss.backward()
      optimizer.step()

      epoch_loss += loss.item()
      num_batches += 1
      correct += (output.argmax(1) == labels).sum().item()
      total += labels.size(0)

    # Report NaN rather than raising ZeroDivisionError when no batch was seen.
    avg_loss = epoch_loss / num_batches if num_batches else float('nan')
    epoch_accuracy = 100 * correct / total if total else float('nan')

    print(f' Epoch {epoch+1}, Train Loss: {avg_loss}, Train Accuracy: {epoch_accuracy}')






In [129]:
def eval_model(model, val_iter, criterion):
  """Evaluate ``model`` on ``val_iter`` and print the validation metrics.

  Prints the mean batch loss and the accuracy (in percent) over all examples
  yielded by ``val_iter``. Nothing is returned.

  Args:
    model: module called as ``model(batch.text)`` to produce class scores.
    val_iter: iterable of batches exposing ``.text`` and ``.label`` tensors.
    criterion: loss called as ``criterion(output, labels)``.
  """
  model.eval()
  # Send each batch to wherever the model's parameters live.
  first_param = next(model.parameters(), None)
  device = None if first_param is None else first_param.device

  total_loss = 0.0
  num_batches = 0
  correct = 0
  total = 0
  # No gradients are needed for evaluation; skipping autograd bookkeeping
  # saves memory and time.
  with torch.no_grad():
    for batch in val_iter:
      text, labels = batch.text, batch.label
      if device is not None:
        text, labels = text.to(device), labels.to(device)
      output = model(text)
      total_loss += criterion(output, labels).item()
      num_batches += 1
      correct += (output.argmax(1) == labels).sum().item()
      total += labels.size(0)

  # Report NaN rather than raising ZeroDivisionError when no batch was seen.
  avg_loss = total_loss / num_batches if num_batches else float('nan')
  # Accuracy is computed over the examples actually evaluated.
  epoch_accuracy = 100 * correct / total if total else float('nan')

  print(f'Validation Loss: {avg_loss},  Validation Accuracy: {epoch_accuracy}')





In [130]:
def test_model(model, test_iter, criterion):
  """Evaluate ``model`` on ``test_iter`` and print the test metrics.

  Prints the mean batch loss and the accuracy (in percent) over all examples
  yielded by ``test_iter``. Nothing is returned.

  Args:
    model: module called as ``model(batch.text)`` to produce class scores.
    test_iter: iterable of batches exposing ``.text`` and ``.label`` tensors.
    criterion: loss called as ``criterion(output, labels)``.
  """
  model.eval()
  # Send each batch to wherever the model's parameters live.
  first_param = next(model.parameters(), None)
  device = None if first_param is None else first_param.device

  total_loss = 0.0
  num_batches = 0
  correct = 0
  total = 0
  # No gradients are needed for evaluation; skipping autograd bookkeeping
  # saves memory and time.
  with torch.no_grad():
    for batch in test_iter:
      text, labels = batch.text, batch.label
      if device is not None:
        text, labels = text.to(device), labels.to(device)
      output = model(text)
      total_loss += criterion(output, labels).item()
      num_batches += 1
      correct += (output.argmax(1) == labels).sum().item()
      total += labels.size(0)

  # Report NaN rather than raising ZeroDivisionError when no batch was seen.
  avg_loss = total_loss / num_batches if num_batches else float('nan')
  epoch_accuracy = 100 * correct / total if total else float('nan')

  print(f'Test Loss: {avg_loss},  Test Accuracy: {epoch_accuracy}')


# Despite its name this is an evaluation helper, not a unit test; the marker
# keeps pytest from trying to collect it when this module is imported by tests.
test_model.__test__ = False





In [136]:
# Seed PyTorch's random number generator before the model is constructed.
torch.manual_seed(42)
# Build the classifier; label_size is passed as the model's output dimension.
model = RNN(vocab_size,embedding_dim,hidden_dim,label_size, padding_idx)
# The commented-out lines below are alternative optimizer / scheduler settings
# (presumably from earlier experiments); only the SGD line is active.
# optimizer = optim.Adam(model.parameters(), lr=0.0005)
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
# optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
# Loss passed to the train / eval / test helpers.
criterion = nn.CrossEntropyLoss()

In [1]:
train_model(model,train_iter, optimizer, criterion, num_epochs =20)

NameError: name 'train_model' is not defined

In [116]:
# Evaluate the model on validation data
eval_model(model, val_iter, criterion)



Validation Loss: 1.6552131440904405,  Validation Accuracy: 49.863760217983646


In [117]:
# Evaluate the model on test data
test_model (model, test_iter, criterion)

Test Loss: 1.4131793209484644,  Test Accuracy: 53.619909502262445


In [None]:
test_data[43]

<torchtext.data.example.Example at 0x7cf08e8830d0>

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
def get_specific_data_point(data, index):
    """Look up and return the item stored under ``index`` in ``data``."""
    selected = data[index]
    return selected

# Evaluate a specific data point
def evaluate_specific_data_point(model, data_point, text_field, label_field, criterion):
    """Run ``model`` on a single example and print prediction, label and loss.

    Args:
        model: module called on the processed text to produce class scores.
        data_point: object exposing ``.text`` and ``.label``.
        text_field: object whose ``process([text])`` returns the input tensor.
        label_field: object whose ``vocab.stoi`` / ``vocab.itos`` map labels
            to indices and back.
        criterion: loss called as ``criterion(output, label)``.
    """
    model.eval()

    # Use the device the model actually lives on rather than a module-level
    # `device` global, which may be undefined or out of sync with the model.
    first_param = next(model.parameters(), None)
    device = torch.device('cpu') if first_param is None else first_param.device

    # Convert the data point to a batch of one
    text = text_field.process([data_point.text]).to(device)
    label = torch.tensor([label_field.vocab.stoi[data_point.label]], device=device)

    with torch.no_grad():
        output = model(text)
        loss = criterion(output, label)

    predicted = output.argmax(1)
    is_correct = (predicted.item() == label.item())

    print(f'Predicted: {label_field.vocab.itos[predicted.item()]}')
    print(f'Actual: {data_point.label}')
    print(f'Loss: {loss.item()}')
    print(f'Correct: {is_correct}')

# Get the specific data point
specific_data_point = get_specific_data_point(test_data, 2000)

# Evaluate the specific data point
evaluate_specific_data_point(model, specific_data_point, TEXT, LABEL, criterion)

Predicted: negative
Actual: negative
Loss: 0.001959072658792138
Correct: True


Used the following hyperparameters to increase the accuracy:

1. Changed embedding_dim (the size of the dense vector representation of words) and hidden_dim to 256 to check whether this simple change in hyperparameters could change the accuracy. However, the test accuracy (56%) was slightly lower than my baseline model's accuracy (58%).
2. In addition to item #1, added dropout regularization (0.3) to see whether it could help increase the accuracy. This reduced the training accuracy to 83% and improved the test accuracy to 57%; however, this is still lower than my baseline model's accuracy.
3. Increased the LSTM to 2 layers, increased the learning rate to 0.0001, and removed dropout altogether to see whether this would increase the accuracy. The performance was no better than item #2; in fact, the test accuracy dropped to 52%.
4. Used 2 LSTM layers, increased the learning rate to 0.0001, and added dropout (0.3) to see whether this would increase the accuracy. Apparently there was no change in test accuracy; it reached 51%.
5. Updated the embedding dimension to 512 and the hidden layer size to 512, with batch_size=128, dropout=0.3 and lr=0.0001. The test accuracy is still no better than the baseline model: a poor 53%.
6. Set batch_size=64 and increased the number of epochs to 20, keeping all other hyperparameters the same as in item #5. This attempt didn't perform any better either: the test accuracy is 53%.