<a href="https://colab.research.google.com/github/zaahraa1993/Multimodal-sentiment-analysis/blob/main/RNN_%26_Meta_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torchtext==0.6

Collecting torchtext==0.6
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->torchtext==0.6)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->torchtext==0.6)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->torchtext==0.6)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->torchtext==0.6)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from t

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, LabelField, TabularDataset, BucketIterator
from torchtext.data import Example, Dataset
import numpy as np
import spacy

In [3]:
nlp = spacy.load('en_core_web_sm')

def spacy_tokenizer(text):
    return [token.text for token in nlp(text)]

In [4]:
TEXT = Field(tokenize=spacy_tokenizer, lower=True, include_lengths=True)
LABEL = LabelField(dtype=torch.float)

In [6]:
# Load saved splits
train_texts = np.load('/content/drive/MyDrive/train_data.npy', allow_pickle=True)
train_labels = np.load('/content/drive/MyDrive/train_labels.npy', allow_pickle=True)
val_texts = np.load('/content/drive/MyDrive/val_data.npy', allow_pickle=True)
val_labels = np.load('/content/drive/MyDrive/val_labels.npy', allow_pickle=True)
test_texts = np.load('/content/drive/MyDrive/test_data.npy', allow_pickle=True)
test_labels = np.load('/content/drive/MyDrive/test_labels.npy', allow_pickle=True)

In [7]:
# Convert to datasets
def create_dataset(texts, labels, fields):
    examples = [Example.fromlist([text, label], fields) for text, label in zip(texts, labels)]
    return Dataset(examples, fields)

train_data = create_dataset(train_texts, train_labels, [('text', TEXT), ('label', LABEL)])
valid_data = create_dataset(val_texts, val_labels, [('text', TEXT), ('label', LABEL)])
test_data = create_dataset(test_texts, test_labels, [('text', TEXT), ('label', LABEL)])

In [8]:
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)
LABEL.build_vocab(train_data)

In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [10]:
BATCH_SIZE = 64
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.text),
    device=device)


In [11]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths.to('cpu'))
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        hidden = self.dropout(hidden.squeeze(0))
        hidden = self.fc(hidden)
        return hidden

In [12]:
INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
DROPOUT = 0.5

model = RNN(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT)

In [13]:
# loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
early_stopping_patience = 2
no_improvement_epochs = 0

model = model.to(device)
criterion = criterion.to(device)

In [14]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

def train(model, iterator, optimizer, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.train()

  for batch in iterator:
    optimizer.zero_grad()
    text, text_lengths = batch.text
    predictions = model(text, text_lengths).squeeze(1)
    loss = criterion(predictions, batch.label)
    acc = binary_accuracy(predictions, batch.label)
    loss.backward()
    optimizer.step()
    epoch_loss += loss.item()
    epoch_acc += acc.item()

  return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [15]:
def evaluate(model, iterator, criterion):
  epoch_loss = 0
  epoch_acc = 0
  model.eval()
  predictions_all = []
  labels_all = []
  with torch.no_grad():
    for batch in iterator:
      text, text_lengths = batch.text
      predictions = model(text, text_lengths).squeeze(1)
      loss = criterion(predictions, batch.label)
      acc = binary_accuracy(predictions, batch.label)
      epoch_loss += loss.item()
      epoch_acc += acc.item()
      predictions_all.extend(predictions.tolist())
      labels_all.extend(batch.label.tolist())

  avg_loss = epoch_loss / len(iterator)
  avg_acc = epoch_acc / len(iterator)

  return avg_loss, avg_acc, predictions_all, labels_all

In [16]:
# Training loop
N_EPOCH = 5
best_valid_loss = float('inf')
for epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc, _, _ = evaluate(model, valid_iterator, criterion)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        no_improvement_epochs = 0
        torch.save(model.state_dict(), 'rnn_model.pt')
    else:
        no_improvement_epochs += 1

    if no_improvement_epochs >= early_stopping_patience:
        print("Early stopping")
        break

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01
	Train Loss: 0.668 | Train Acc: 58.89%
	 Val. Loss: 0.624 |  Val. Acc: 65.62%
Epoch: 02
	Train Loss: 0.625 | Train Acc: 65.23%
	 Val. Loss: 0.557 |  Val. Acc: 73.00%
Epoch: 03
	Train Loss: 0.642 | Train Acc: 62.78%
	 Val. Loss: 0.631 |  Val. Acc: 61.75%
Epoch: 04
	Train Loss: 0.519 | Train Acc: 75.08%
	 Val. Loss: 0.416 |  Val. Acc: 82.22%
Epoch: 05
	Train Loss: 0.426 | Train Acc: 81.22%
	 Val. Loss: 0.376 |  Val. Acc: 83.03%


In [17]:
# Load the best model
model.load_state_dict(torch.load('rnn_model.pt'))

# Evaluate on the test set
_, _, all_rnn_predictions, all_true_labels = evaluate(model, test_iterator, criterion)



In [18]:
# Save RNN predictions and true labels
np.save('rnn_predictions.npy', np.array(all_rnn_predictions))
np.save('rnn_true_labels.npy', np.array(all_true_labels))

# **meta_classifier**

In [22]:
import numpy as np
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [23]:
# Load RNN predictions and true labels
rnn_predictions = np.load('/content/rnn_predictions.npy')
true_labels = np.load('/content/rnn_true_labels.npy')

# Load BERT predictions
bert_predictions = np.load('/content/bert_predictions.npy')
bert_true_labels = np.load('/content/bert_true_labels.npy')


In [24]:
assert len(bert_predictions) == len(rnn_predictions),


In [25]:
meta_features = np.vstack((bert_predictions, rnn_predictions)).T
meta_labels = bert_true_labels

In [26]:
# Split the data into train and test sets for the meta classifier
split_idx = int(len(meta_features) * 0.8)
train_meta_features = meta_features[:split_idx]
train_meta_labels = meta_labels[:split_idx]
test_meta_features = meta_features[split_idx:]
test_meta_labels = meta_labels[split_idx:]

In [27]:
# Initialize and train the meta classifier
meta_clf = LogisticRegression()
meta_clf.fit(train_meta_features, train_meta_labels)

In [30]:
# Predict and evaluate the meta classifier
meta_predictions = meta_clf.predict(test_meta_features)
accuracy = accuracy_score(test_meta_labels, meta_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(test_meta_labels, meta_predictions, average='binary')

In [32]:
# Print evaluation metrics
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

# save the meta classifier
import joblib
joblib.dump(meta_clf, 'meta_classifier.joblib')

Accuracy: 0.9408
Precision: 1.0000
Recall: 0.9408
F1 Score: 0.9695


['meta_classifier.joblib']