In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import time
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

# IMDB Dataset

In [12]:
# Load dataset and preprocess
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset = load_dataset("imdb", split="train")
val_dataset = load_dataset("imdb", split="test")

In [13]:
# Tokenization and DataLoader
def preprocess_data(examples):
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    return {"input_ids": tokens["input_ids"].squeeze(0), "label": torch.tensor(examples["label"]).float()}

tokenized_data = dataset.map(preprocess_data, batched=True)
X_train = torch.stack([torch.tensor(x) for x in tokenized_data["input_ids"]])
y_train = torch.tensor(tokenized_data["label"]).float()
dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset, batch_size=128, shuffle=True)

val_tokenized_data = val_dataset.map(preprocess_data, batched=True)
X_val = torch.stack([torch.tensor(x) for x in val_tokenized_data["input_ids"]])
y_val = torch.tensor(val_tokenized_data["label"]).float()
val_dataset = TensorDataset(X_train, y_train)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [14]:
# Function to train model with validation set
def train_model(device, train_loader, val_loader, model, criterion, optimizer, epochs=5):
    model.to(device)
    start_time = time.time()

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0  # Accumulate training loss for averaging
        total_batches = len(train_loader)

        # Training loop
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred.squeeze(), y_batch)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()  # Accumulate batch loss

        avg_train_loss = epoch_loss / total_batches  # Compute average training loss

        # Validation loop
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x_val, y_val in val_loader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                y_val_pred = model(x_val)
                val_loss += criterion(y_val_pred.squeeze(), y_val).item()

        avg_val_loss = val_loss / len(val_loader)  # Compute average validation loss

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    end_time = time.time()
    return end_time - start_time

## LSTM

In [17]:
# Define LSTM model for Sentiment Analysis
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, output_size):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out[:, -1, :])  # Take the last time step
        return out

In [18]:
VOCAB_SIZE = tokenizer.vocab_size
criterion = nn.BCEWithLogitsLoss()

In [19]:
device = torch.device("cpu")
model = SentimentLSTM(VOCAB_SIZE, embedding_dim=100, hidden_size=128, num_layers=2, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader, val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.6908, Val Loss: 0.6858
Epoch 2/10, Train Loss: 0.6934, Val Loss: 0.6923
Epoch 3/10, Train Loss: 0.6919, Val Loss: 0.6833
Epoch 4/10, Train Loss: 0.6930, Val Loss: 0.6904
Epoch 5/10, Train Loss: 0.6852, Val Loss: 0.6909
Epoch 6/10, Train Loss: 0.6596, Val Loss: 0.6356
Epoch 7/10, Train Loss: 0.5699, Val Loss: 0.4792
Epoch 8/10, Train Loss: 0.3898, Val Loss: 0.2975
Epoch 9/10, Train Loss: 0.2856, Val Loss: 0.2292
Epoch 10/10, Train Loss: 0.2240, Val Loss: 0.1759
Training Time on cpu: 835.93 seconds


In [20]:
device = torch.device("mps")
model = SentimentLSTM(VOCAB_SIZE, embedding_dim=100, hidden_size=128, num_layers=2, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader, val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.6926, Val Loss: 0.6903
Epoch 2/10, Train Loss: 0.6850, Val Loss: 0.6752
Epoch 3/10, Train Loss: 0.6890, Val Loss: 0.6905
Epoch 4/10, Train Loss: 0.6904, Val Loss: 0.6905
Epoch 5/10, Train Loss: 0.6891, Val Loss: 0.6880
Epoch 6/10, Train Loss: 0.6699, Val Loss: 0.7007
Epoch 7/10, Train Loss: 0.6792, Val Loss: 0.6171
Epoch 8/10, Train Loss: 0.5824, Val Loss: 0.4945
Epoch 9/10, Train Loss: 0.4830, Val Loss: 0.4456
Epoch 10/10, Train Loss: 0.4073, Val Loss: 0.3580
Training Time on mps: 80.59 seconds


## BiLSTMAttention

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentimentBiLSTMAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, output_size, dropout=0.5):
        super(SentimentBiLSTMAttention, self).__init__()
        # Embedding layer converts token indices to embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Bidirectional LSTM to capture context from both directions
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True
        )
        
        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout)
        
        # Attention layer: projects each hidden state into a single attention score
        self.attention = nn.Linear(hidden_size * 2, 1)
        
        # Final fully connected layer for classification
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, sequence_length)
        Returns:
            out: Tensor of shape (batch_size, output_size)
        """
        # Convert word indices to embeddings
        embedded = self.embedding(x)  # (batch_size, sequence_length, embedding_dim)
        
        # Pass embeddings through the bidirectional LSTM
        lstm_out, _ = self.lstm(embedded)  # (batch_size, sequence_length, hidden_size*2)
        
        # Compute attention weights for each time step
        # The attention layer produces a score for each hidden state
        attn_scores = torch.tanh(self.attention(lstm_out))  # (batch_size, sequence_length, 1)
        attn_weights = F.softmax(attn_scores, dim=1)          # (batch_size, sequence_length, 1)
        
        # Compute the context vector as the weighted sum of LSTM outputs
        context_vector = torch.sum(attn_weights * lstm_out, dim=1)  # (batch_size, hidden_size*2)
        
        # Apply dropout for regularization
        context_vector = self.dropout(context_vector)
        
        # Final classification output
        out = self.fc(context_vector)  # (batch_size, output_size)
        return out

In [22]:
device = torch.device("mps")
model = SentimentBiLSTMAttention(VOCAB_SIZE, embedding_dim=100, hidden_size=128, num_layers=2, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader, val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.5409, Val Loss: 0.4139
Epoch 2/10, Train Loss: 0.3524, Val Loss: 0.2658
Epoch 3/10, Train Loss: 0.2642, Val Loss: 0.1995
Epoch 4/10, Train Loss: 0.2114, Val Loss: 0.1609
Epoch 5/10, Train Loss: 0.1687, Val Loss: 0.1043
Epoch 6/10, Train Loss: 0.1108, Val Loss: 0.0640
Epoch 7/10, Train Loss: 0.0692, Val Loss: 0.1541
Epoch 8/10, Train Loss: 0.0493, Val Loss: 0.0192
Epoch 9/10, Train Loss: 0.0269, Val Loss: 0.0223
Epoch 10/10, Train Loss: 0.0189, Val Loss: 0.0083
Training Time on mps: 182.22 seconds


## SentimentAdvanced

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentimentAdvanced(nn.Module):
    def __init__(
        self, 
        vocab_size, 
        embedding_dim, 
        hidden_size, 
        num_layers, 
        output_size, 
        dropout=0.5, 
        n_heads=4
    ):
        """
        Args:
            vocab_size (int): Number of tokens in the vocabulary.
            embedding_dim (int): Dimensionality of the word embeddings.
            hidden_size (int): Hidden state size of the LSTM.
            num_layers (int): Number of LSTM layers.
            output_size (int): Number of output classes (e.g., sentiment labels).
            dropout (float): Dropout probability.
            n_heads (int): Number of attention heads for multi-head attention.
        """
        super(SentimentAdvanced, self).__init__()
        # Embedding layer converts token indices to embeddings.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Bidirectional LSTM to capture context from both directions.
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True
        )
        
        self.dropout = nn.Dropout(dropout)
        
        # Multi-head self-attention layer.
        # Note: embed_dim for attention is hidden_size * 2 due to bidirectionality.
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=hidden_size * 2, 
            num_heads=n_heads, 
            dropout=dropout,
            batch_first=True
        )
        
        # Layer normalization for stabilizing training.
        self.layer_norm = nn.LayerNorm(hidden_size * 2)
        
        # Final fully connected layer for classification.
        self.fc = nn.Linear(hidden_size * 2, output_size)
    
    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, sequence_length) containing token indices.
        
        Returns:
            out: Tensor of shape (batch_size, output_size) containing class scores.
        """
        # 1. Embed the input tokens.
        embedded = self.embedding(x)  # Shape: (batch_size, seq_length, embedding_dim)
        
        # 2. Pass the embeddings through the bidirectional LSTM.
        lstm_out, _ = self.lstm(embedded)  # Shape: (batch_size, seq_length, hidden_size*2)
        
        # 3. Apply multi-head self-attention.
        #    Using LSTM outputs as query, key, and value.
        attn_out, _ = self.multihead_attn(lstm_out, lstm_out, lstm_out)
        # 4. Add a residual connection and normalize.
        attn_out = self.layer_norm(attn_out + lstm_out)
        
        # 5. Pool the sequence output (max pooling over the time dimension).
        #    This extracts the most salient features across the sequence.
        pooled, _ = torch.max(attn_out, dim=1)  # Shape: (batch_size, hidden_size*2)
        pooled = self.dropout(pooled)
        
        # 6. Final classification layer.
        out = self.fc(pooled)  # Shape: (batch_size, output_size)
        return out

In [24]:
device = torch.device("mps")
model = SentimentAdvanced(VOCAB_SIZE, embedding_dim=100, hidden_size=128, num_layers=2, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader,val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.6642, Val Loss: 0.4789
Epoch 2/10, Train Loss: 0.3962, Val Loss: 0.2878
Epoch 3/10, Train Loss: 0.2792, Val Loss: 0.2255
Epoch 4/10, Train Loss: 0.2173, Val Loss: 0.1502
Epoch 5/10, Train Loss: 0.1509, Val Loss: 0.0999
Epoch 6/10, Train Loss: 0.1133, Val Loss: 0.0660
Epoch 7/10, Train Loss: 0.0717, Val Loss: 0.0322
Epoch 8/10, Train Loss: 0.0441, Val Loss: 0.0233
Epoch 9/10, Train Loss: 0.0386, Val Loss: 0.0189
Epoch 10/10, Train Loss: 0.0260, Val Loss: 0.0134
Training Time on mps: 301.12 seconds


## SentimentUltraAdvanced

In [25]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentimentUltraAdvanced(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_size,
        lstm_layers,
        transformer_layers,
        num_filters,
        output_size,
        dropout=0.5,
        n_heads=4,
        cnn_kernel_sizes=[2, 3, 4]
    ):
        """
        Args:
            vocab_size (int): Size of the vocabulary.
            embedding_dim (int): Dimension of word embeddings.
            hidden_size (int): Hidden state size for the LSTM.
            lstm_layers (int): Number of LSTM layers.
            transformer_layers (int): Number of Transformer encoder layers.
            num_filters (int): Number of CNN filters per kernel size.
            output_size (int): Number of output classes (e.g., sentiment categories).
            dropout (float): Dropout probability.
            n_heads (int): Number of heads in the Transformer encoder.
            cnn_kernel_sizes (list): List of kernel sizes for the CNN branch.
        """
        super(SentimentUltraAdvanced, self).__init__()
        # Embedding layer: converts token indices to embeddings.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # CNN branch: a set of 1D convolutional layers with different kernel sizes.
        self.cnn_convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=embedding_dim,
                out_channels=num_filters,
                kernel_size=ks
            )
            for ks in cnn_kernel_sizes
        ])
        
        # LSTM branch: Bidirectional LSTM to capture sequential context.
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout
        )
        
        # Transformer Encoder: further refines LSTM outputs using multi-head self-attention.
        # (Note: With PyTorch 1.9+ you can set batch_first=True in TransformerEncoderLayer.)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size * 2,  # Because LSTM is bidirectional.
            nhead=n_heads,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=transformer_layers
        )
        
        # Compute the combined feature dimension.
        # CNN branch: len(cnn_kernel_sizes) * num_filters.
        # LSTM/Transformer branch: hidden_size * 2.
        combined_dim = num_filters * len(cnn_kernel_sizes) + hidden_size * 2
        
        # Final fully connected layer for classification.
        self.fc = nn.Linear(combined_dim, output_size)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        """
        Args:
            x (Tensor): Input tensor of shape (batch_size, sequence_length) containing token indices.
        
        Returns:
            Tensor: Logits of shape (batch_size, output_size).
        """
        # 1. Embedding
        embedded = self.embedding(x)  # Shape: (batch_size, seq_length, embedding_dim)
        
        # 2. CNN branch:
        # Permute to (batch_size, embedding_dim, seq_length) for Conv1d.
        cnn_input = embedded.permute(0, 2, 1)
        cnn_features = []
        for conv in self.cnn_convs:
            # Apply convolution -> non-linearity -> global max pooling.
            conv_out = conv(cnn_input)         # Shape: (batch, num_filters, L_out)
            conv_out = F.relu(conv_out)
            pooled = F.max_pool1d(conv_out, kernel_size=conv_out.shape[2])  # Shape: (batch, num_filters, 1)
            pooled = pooled.squeeze(2)         # Shape: (batch, num_filters)
            cnn_features.append(pooled)
        cnn_features = torch.cat(cnn_features, dim=1)  # Shape: (batch, num_filters * len(cnn_kernel_sizes))
        
        # 3. LSTM + Transformer branch:
        lstm_out, _ = self.lstm(embedded)  # Shape: (batch, seq_length, hidden_size*2)
        # Refine LSTM outputs with Transformer encoder.
        transformer_out = self.transformer_encoder(lstm_out)  # Shape: (batch, seq_length, hidden_size*2)
        # Global average pooling over time (sequence length) dimension.
        transformer_features = torch.mean(transformer_out, dim=1)  # Shape: (batch, hidden_size*2)
        
        # 4. Feature Fusion:
        combined = torch.cat([cnn_features, transformer_features], dim=1)  # Shape: (batch, combined_dim)
        combined = self.dropout(combined)
        
        # 5. Final classification layer.
        output = self.fc(combined)  # Shape: (batch, output_size)
        return output

In [26]:
device = torch.device("mps")
model = SentimentUltraAdvanced(VOCAB_SIZE, embedding_dim=100, hidden_size=128, lstm_layers=2, transformer_layers=2, num_filters=64, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader, val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.6737, Val Loss: 0.5184
Epoch 2/10, Train Loss: 0.5403, Val Loss: 0.4314
Epoch 3/10, Train Loss: 0.4798, Val Loss: 0.3699
Epoch 4/10, Train Loss: 0.4338, Val Loss: 0.3645
Epoch 5/10, Train Loss: 0.3950, Val Loss: 0.2966
Epoch 6/10, Train Loss: 0.3657, Val Loss: 0.2665
Epoch 7/10, Train Loss: 0.3365, Val Loss: 0.2310
Epoch 8/10, Train Loss: 0.3051, Val Loss: 0.2172
Epoch 9/10, Train Loss: 0.2783, Val Loss: 0.1764
Epoch 10/10, Train Loss: 0.2526, Val Loss: 0.1512
Training Time on mps: 807.49 seconds


## SentimentUltraUltraAdvanced

In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentimentUltraUltraAdvanced(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_size,
        lstm_layers,
        transformer_layers,
        num_filters,
        output_size,
        dropout=0.5,
        n_heads=4,
        cnn_kernel_sizes=[2, 3, 4],
        self_attn_layers=2,
        max_seq_length=512  # adjust based on your expected maximum sequence length
    ):
        """
        Args:
            vocab_size (int): Vocabulary size.
            embedding_dim (int): Dimension of word embeddings.
            hidden_size (int): Hidden size for the LSTM.
            lstm_layers (int): Number of LSTM layers.
            transformer_layers (int): Number of Transformer encoder layers for the LSTM branch.
            num_filters (int): Number of CNN filters per kernel size.
            output_size (int): Number of output classes.
            dropout (float): Dropout probability.
            n_heads (int): Number of attention heads in Transformer encoders.
            cnn_kernel_sizes (list): List of kernel sizes for the CNN branch.
            self_attn_layers (int): Number of Transformer encoder layers in the self-attention branch.
            max_seq_length (int): Maximum expected sequence length (for positional embeddings).
        """
        super(SentimentUltraUltraAdvanced, self).__init__()
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Learnable positional embeddings for the self-attention branch
        self.pos_embedding = nn.Parameter(torch.zeros(1, max_seq_length, embedding_dim))
        
        # ----------------------------
        # Branch 1: CNN for local features
        # ----------------------------
        # Create one convolution per kernel size.
        self.cnn_convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=embedding_dim,
                out_channels=num_filters,
                kernel_size=k
            ) for k in cnn_kernel_sizes
        ])
        
        # ----------------------------
        # Branch 2: LSTM + Transformer for sequential modeling
        # ----------------------------
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if lstm_layers > 1 else 0.0
        )
        # Transformer encoder refines the LSTM outputs.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size * 2,  # bidirectional
            nhead=n_heads,
            dropout=dropout,
            batch_first=True
        )
        self.lstm_transformer = nn.TransformerEncoder(encoder_layer, num_layers=transformer_layers)
        
        # ----------------------------
        # Branch 3: Direct Self-Attention on embeddings
        # ----------------------------
        encoder_layer2 = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=n_heads,
            dropout=dropout,
            batch_first=True
        )
        self.self_attn_encoder = nn.TransformerEncoder(encoder_layer2, num_layers=self_attn_layers)
        
        # ----------------------------
        # Gating Mechanism to fuse branch features dynamically
        # ----------------------------
        # Compute dimensions for each branch's output:
        cnn_out_dim = num_filters * len(cnn_kernel_sizes)
        lstm_out_dim = hidden_size * 2  # from bidirectional LSTM
        self_attn_out_dim = embedding_dim
        
        # Linear layers to compute a gate (a scalar weight) for each branch.
        self.gate_cnn = nn.Linear(cnn_out_dim, 1)
        self.gate_lstm = nn.Linear(lstm_out_dim, 1)
        self.gate_self_attn = nn.Linear(self_attn_out_dim, 1)
        
        # Final fusion dimension is the concatenation of all branch outputs.
        fused_dim = cnn_out_dim + lstm_out_dim + self_attn_out_dim
        
        # Final classification layer.
        self.fc = nn.Linear(fused_dim, output_size)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        """
        Args:
            x (Tensor): Input tensor of shape (batch_size, sequence_length) with token indices.
        Returns:
            Tensor: Logits of shape (batch_size, output_size).
        """
        batch_size, seq_length = x.size()
        # 1. Embedding lookup.
        embedded = self.embedding(x)  # shape: (batch_size, seq_length, embedding_dim)
        
        # ----------------------------
        # Branch 1: CNN
        # ----------------------------
        # For Conv1d, we need shape: (batch_size, embedding_dim, seq_length)
        cnn_input = embedded.permute(0, 2, 1)
        cnn_features = []
        for conv in self.cnn_convs:
            conv_out = F.relu(conv(cnn_input))  # (batch, num_filters, L_out)
            # Global max pooling over the temporal (L_out) dimension.
            pooled = F.max_pool1d(conv_out, kernel_size=conv_out.size(2)).squeeze(2)  # (batch, num_filters)
            cnn_features.append(pooled)
        cnn_features = torch.cat(cnn_features, dim=1)  # (batch, num_filters * len(cnn_kernel_sizes))
        
        # ----------------------------
        # Branch 2: LSTM + Transformer
        # ----------------------------
        lstm_out, _ = self.lstm(embedded)  # (batch, seq_length, hidden_size*2)
        # Refine with Transformer encoder.
        lstm_transformed = self.lstm_transformer(lstm_out)  # (batch, seq_length, hidden_size*2)
        # Global average pooling over the time dimension.
        lstm_features = torch.mean(lstm_transformed, dim=1)  # (batch, hidden_size*2)
        
        # ----------------------------
        # Branch 3: Direct Self-Attention on embeddings
        # ----------------------------
        # Add positional embeddings (truncate or expand to the current sequence length)
        pos_emb = self.pos_embedding[:, :seq_length, :]  # (1, seq_length, embedding_dim)
        self_attn_input = embedded + pos_emb
        self_attn_out = self.self_attn_encoder(self_attn_input)  # (batch, seq_length, embedding_dim)
        # Global max pooling over the time dimension.
        self_attn_features, _ = torch.max(self_attn_out, dim=1)  # (batch, embedding_dim)
        
        # ----------------------------
        # Gating: Compute dynamic weights for each branch.
        # ----------------------------
        gate_cnn = torch.sigmoid(self.gate_cnn(cnn_features))           # (batch, 1)
        gate_lstm = torch.sigmoid(self.gate_lstm(lstm_features))          # (batch, 1)
        gate_self_attn = torch.sigmoid(self.gate_self_attn(self_attn_features))  # (batch, 1)
        
        gated_cnn = cnn_features * gate_cnn         # (batch, cnn_out_dim)
        gated_lstm = lstm_features * gate_lstm        # (batch, lstm_out_dim)
        gated_self_attn = self_attn_features * gate_self_attn  # (batch, self_attn_out_dim)
        
        # ----------------------------
        # Feature Fusion and Classification
        # ----------------------------
        # Concatenate the gated features from all branches.
        combined = torch.cat([gated_cnn, gated_lstm, gated_self_attn], dim=1)  # (batch, fused_dim)
        combined = self.dropout(combined)
        logits = self.fc(combined)  # (batch, output_size)
        
        return logits

In [28]:
device = torch.device("mps")
model = SentimentUltraUltraAdvanced(VOCAB_SIZE, embedding_dim=100, hidden_size=128, lstm_layers=2, transformer_layers=2, num_filters=64, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader,val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.6241, Val Loss: 0.4785
Epoch 2/10, Train Loss: 0.3864, Val Loss: 0.7370
Epoch 3/10, Train Loss: 0.2459, Val Loss: 0.3226
Epoch 4/10, Train Loss: 0.1488, Val Loss: 0.1486
Epoch 5/10, Train Loss: 0.0780, Val Loss: 0.0310
Epoch 6/10, Train Loss: 0.0343, Val Loss: 0.0087
Epoch 7/10, Train Loss: 0.0195, Val Loss: 0.0088
Epoch 8/10, Train Loss: 0.0124, Val Loss: 0.0055
Epoch 9/10, Train Loss: 0.0076, Val Loss: 0.0030
Epoch 10/10, Train Loss: 0.0072, Val Loss: 0.0026
Training Time on mps: 1153.60 seconds


# Amazon Polarity

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import TensorDataset, DataLoader, random_split
from datasets import load_dataset
from transformers import BertTokenizer
import matplotlib.pyplot as plt

In [10]:
# 1. Load and preprocess the data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Use the Amazon Polarity dataset from Hugging Face.
# We use the provided 'train' split for training and 'test' as our validation set.
train_dataset = load_dataset("imdb", split="train")
val_dataset = load_dataset("imdb", split="test")

def preprocess_data(examples):
    # Tokenize the text; we use padding and truncation for consistency.
    # tokens = tokenizer(examples["content"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")

    # Remove extra batch dimension from tokens; create a tensor for the label.
    return {"input_ids": tokens["input_ids"].squeeze(0), "label": torch.tensor(examples["label"]).float()}

# Apply preprocessing to both training and validation datasets
train_tokenized = train_dataset.map(preprocess_data, batched=True)
val_tokenized = val_dataset.map(preprocess_data, batched=True)

# Convert lists of input_ids and labels into tensors and create TensorDatasets.
# (Note: The tokenizer returns lists, so we use torch.stack to combine them.)
X_train = torch.stack([torch.tensor(x) for x in train_tokenized["input_ids"]])
y_train = torch.tensor(train_tokenized["label"]).float()
train_tensor_dataset = TensorDataset(X_train, y_train)

X_val = torch.stack([torch.tensor(x) for x in val_tokenized["input_ids"]])
y_val = torch.tensor(val_tokenized["label"]).float()
val_tensor_dataset = TensorDataset(X_val, y_val)

# Create DataLoaders for training and validation.
batch_size = 128
train_loader = DataLoader(train_tensor_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_tensor_dataset, batch_size=batch_size)

# Get vocabulary size from the tokenizer (BERT-base-uncased has vocab size 30522)
VOCAB_SIZE = tokenizer.vocab_size

In [11]:
# 2. Define the Sentiment BiLSTM with Attention Model
class SentimentBiLSTMAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, output_size, dropout=0.5):
        super(SentimentBiLSTMAttention, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True
        )

        self.attention = nn.Linear(hidden_size * 2, 1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        attn_scores = torch.tanh(self.attention(lstm_out))
        attn_weights = F.softmax(attn_scores, dim=1)
        context_vector = torch.sum(attn_weights * lstm_out, dim=1)
        context_vector = self.dropout(context_vector)
        out = self.fc(context_vector)
        return out

# 3. Implement Early Stopping
class EarlyStopping:
    def __init__(self, patience=3, min_delta=0.001):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float('inf')
        self.counter = 0

    def check(self, val_loss):
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            return False  # Continue training
        else:
            self.counter += 1
            if self.counter >= self.patience:
                print(f"Early stopping triggered after {self.counter} epochs.")
                return True  # Stop training
            return False

In [21]:
# Training Function with Checkpointing
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device, epochs):
    early_stopping = EarlyStopping(patience=3, min_delta=0.001)
    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        running_loss, correct_train, total_train = 0.0, 0, 0

        for batch in train_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs).squeeze(1)
            loss = criterion(outputs, labels)
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Apply Gradient Clipping
            optimizer.step()

            running_loss += loss.item()
            preds = torch.round(torch.sigmoid(outputs)).detach()
            correct_train += (preds == labels).sum().item()
            total_train += labels.size(0)
        
        train_loss = running_loss / len(train_loader)
        train_acc = correct_train / total_train

        # Validation Phase
        model.eval()
        running_val_loss, correct_val, total_val = 0.0, 0, 0
        
        with torch.no_grad():
            for batch in val_loader:
                inputs, labels = batch
                inputs, labels = inputs.to(device), labels.to(device)
                
                outputs = model(inputs).squeeze(1)
                loss = criterion(outputs, labels)
                running_val_loss += loss.item()
                
                preds = torch.round(torch.sigmoid(outputs))
                correct_val += (preds == labels).sum().item()
                total_val += labels.size(0)
        
        val_loss = running_val_loss / len(val_loader)
        val_acc = correct_val / total_val

        scheduler.step(val_loss)  # Reduce LR if validation loss stagnates

        # Save Best Model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_model.pth")

        # Dynamic Dropout Adjustment
        if val_loss > best_val_loss:
            model.dropout.p = min(model.dropout.p + 0.05, 0.7)

        # Early Stopping Check
        if early_stopping.check(val_loss):
            break  # Stop training if no improvement

        print(f"Epoch {epoch+1}/{epochs}: "
              f"Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    # Load Best Model
    model.load_state_dict(torch.load("best_model.pth", weights_only=True))

In [22]:
# 3. Improved Hyperparameters
embedding_dim = 256  # Remains unchanged
hidden_size = 128  # Reduced from 256 to 128
num_layers = 2  # Reduced from 3 to 2
output_size = 1  # Binary classification
dropout = 0.5  # Increased from 0.3 to 0.5 for better regularization

device = torch.device("mps")
model = SentimentBiLSTMAttention(VOCAB_SIZE, embedding_dim, hidden_size, num_layers, output_size, dropout).to(device)

# 4. Define Loss, Optimizer, and Learning Rate Scheduler
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=5e-4)
scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

In [23]:
epochs = 10
# train_loss, train_acc, val_loss, val_acc = 

train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device, epochs)

Epoch 1/10: Train Loss: 0.5541, Train Acc: 0.6879, Val Loss: 0.3979, Val Acc: 0.8190
Epoch 2/10: Train Loss: 0.3146, Train Acc: 0.8664, Val Loss: 0.3143, Val Acc: 0.8670
Epoch 3/10: Train Loss: 0.2385, Train Acc: 0.9057, Val Loss: 0.3232, Val Acc: 0.8612
Epoch 4/10: Train Loss: 0.1773, Train Acc: 0.9350, Val Loss: 0.3446, Val Acc: 0.8698
Early stopping triggered after 3 epochs.


In [20]:
# # 5. Plot both accuracy and loss
# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# # Plot accuracy
# ax1.plot(range(1, epochs+1), train_acc, label="Train Accuracy")
# ax1.plot(range(1, epochs+1), val_acc, label="Validation Accuracy")
# ax1.set_xlabel("Epoch")
# ax1.set_ylabel("Accuracy")
# ax1.set_title("Accuracy Over Epochs")
# ax1.legend()
# ax1.grid(True)

# # Plot loss
# ax2.plot(range(1, epochs+1), train_loss, label="Train Loss")
# ax2.plot(range(1, epochs+1), val_loss, label="Validation Loss")
# ax2.set_xlabel("Epoch")
# ax2.set_ylabel("Loss")
# ax2.set_title("Loss Over Epochs")
# ax2.legend()
# ax2.grid(True)

# plt.tight_layout()
# plt.show()