<a href="https://colab.research.google.com/github/zaahraa1993/Multimodal-sentiment-analysis/blob/main/RNN_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchtext==0.6

Collecting torchtext==0.6
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->torchtext==0.6)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->torchtext==0.6)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->torchtext==0.6)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->torchtext==0.6)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from t

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
from torchtext.data import Example, Dataset
import numpy as np
import spacy
import random

In [5]:
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

In [6]:
nlp = spacy.load('en_core_web_sm')

def spacy_tokenizer(text):
    return [token.text for token in nlp(text)]

In [7]:
TEXT = Field(tokenize=spacy_tokenizer, lower=True, include_lengths=True)
LABEL = LabelField(dtype=torch.float)


In [8]:
# Load saved splits
train_texts = np.load('/content/drive/MyDrive/Colab Notebooks/train_data.npy', allow_pickle=True)
train_labels = np.load('/content/drive/MyDrive/Colab Notebooks/train_labels.npy', allow_pickle=True)
val_texts = np.load('/content/drive/MyDrive/Colab Notebooks/val_data.npy', allow_pickle=True)
val_labels = np.load('/content/drive/MyDrive/Colab Notebooks/val_labels.npy', allow_pickle=True)
test_texts = np.load('/content/drive/MyDrive/Colab Notebooks/test_data.npy', allow_pickle=True)
test_labels = np.load('/content/drive/MyDrive/Colab Notebooks/test_labels.npy', allow_pickle=True)

In [9]:
# Convert to datasets
def create_dataset(texts, labels, fields):
    examples = [Example.fromlist([text, label], fields) for text, label in zip(texts, labels)]
    return Dataset(examples, fields)

train_data = create_dataset(train_texts, train_labels, [('text', TEXT), ('label', LABEL)])
valid_data = create_dataset(val_texts, val_labels, [('text', TEXT), ('label', LABEL)])
test_data = create_dataset(test_texts, test_labels, [('text', TEXT), ('label', LABEL)])

In [10]:
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.text),
    device=device)


In [13]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        hidden = self.dropout(hidden.squeeze(0))
        hidden = self.fc(hidden)
        return hidden


In [14]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
DROPOUT = 0.5

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT)



In [15]:
# Define loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
early_stopping_patience = 2
no_improvement_epochs = 0

model = model.to(device)
criterion = criterion.to(device)

In [16]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.train()

  for batch in iterator:
    optimizer.zero_grad()
    text, text_lengths = batch.text
    predictions = model(text, text_lengths).squeeze(1)
    loss = criterion(predictions, batch.label)
    acc = binary_accuracy(predictions, batch.label)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [17]:
def evaluate(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.eval()
  predictions_all = []
  labels_all = []
  with torch.no_grad():
    for batch in iterator:
      text, text_lengths = batch.text
      predictions = model(text, text_lengths).squeeze(1)
      loss = criterion(predictions, batch.label)
      acc = binary_accuracy(predictions, batch.label)
      epoch_loss += loss.item()
      epoch_acc += acc.item()
      predictions_all.extend(predictions.tolist())
      labels_all.extend(batch.label.tolist())

  avg_loss = epoch_loss / len(iterator)
  avg_acc = epoch_acc / len(iterator)

  return avg_loss, avg_acc, predictions_all, labels_all



In [21]:
# Training loop
N_EPOCH = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc, _, _ = evaluate(model, valid_iterator, criterion)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        no_improvement_epochs = 0
        torch.save(model.state_dict(), 'rnn_model.pt')
    else:
        no_improvement_epochs += 1

    if no_improvement_epochs >= early_stopping_patience:
        print("Early stopping")
        break

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01
	Train Loss: 0.465 | Train Acc: 78.75%
	 Val. Loss: 0.553 |  Val. Acc: 73.89%
Epoch: 02
	Train Loss: 0.375 | Train Acc: 83.86%
	 Val. Loss: 0.384 |  Val. Acc: 83.47%
Epoch: 03
	Train Loss: 0.318 | Train Acc: 86.59%
	 Val. Loss: 0.361 |  Val. Acc: 85.09%
Epoch: 04
	Train Loss: 0.284 | Train Acc: 88.22%
	 Val. Loss: 0.306 |  Val. Acc: 87.72%
Epoch: 05
	Train Loss: 0.257 | Train Acc: 89.43%
	 Val. Loss: 0.301 |  Val. Acc: 88.47%


In [22]:
# Load the best model
model.load_state_dict(torch.load('rnn_model.pt'))

# Evaluate on the test set
_, _, all_rnn_predictions, all_true_labels = evaluate(model, test_iterator, criterion)

In [23]:
# Save RNN predictions and true labels
np.save('rnn_predictions.npy', np.array(all_rnn_predictions))
np.save('rnn_true_labels.npy', np.array(all_true_labels))