In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import time
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

# IMDB Dataset

In [2]:
# Load dataset and preprocess
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
dataset = load_dataset("imdb", split="train")
val_dataset = load_dataset("imdb", split="test")

In [3]:
# Tokenization and DataLoader
def preprocess_data(examples):
    tokens = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256, return_tensors="pt")
    return {"input_ids": tokens["input_ids"].squeeze(0), "label": torch.tensor(examples["label"]).float()}

tokenized_data = dataset.map(preprocess_data, batched=True)
X_train = torch.stack([torch.tensor(x) for x in tokenized_data["input_ids"]])
y_train = torch.tensor(tokenized_data["label"]).float()
dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(dataset, batch_size=128, shuffle=True)

val_tokenized_data = val_dataset.map(preprocess_data, batched=True)
X_val = torch.stack([torch.tensor(x) for x in val_tokenized_data["input_ids"]])
y_val = torch.tensor(val_tokenized_data["label"]).float()
val_dataset = TensorDataset(X_train, y_train)
val_loader = DataLoader(val_dataset, batch_size=128, shuffle=True)

In [4]:
# Function to train model with validation set
def train_model(device, train_loader, val_loader, model, criterion, optimizer, epochs=5):
    model.to(device)
    start_time = time.time()

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0.0  # Accumulate training loss for averaging
        total_batches = len(train_loader)

        # Training loop
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred.squeeze(), y_batch)
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()  # Accumulate batch loss

        avg_train_loss = epoch_loss / total_batches  # Compute average training loss

        # Validation loop
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for x_val, y_val in val_loader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                y_val_pred = model(x_val)
                val_loss += criterion(y_val_pred.squeeze(), y_val).item()

        avg_val_loss = val_loss / len(val_loader)  # Compute average validation loss

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

    end_time = time.time()
    return end_time - start_time

## LSTM

In [5]:
# Define LSTM model for Sentiment Analysis
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, output_size):
        super(SentimentLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        out = self.fc(lstm_out[:, -1, :])  # Take the last time step
        return out

In [6]:
VOCAB_SIZE = tokenizer.vocab_size
criterion = nn.BCEWithLogitsLoss()

In [8]:
device = torch.device("cpu")
model = SentimentLSTM(VOCAB_SIZE, embedding_dim=100, hidden_size=128, num_layers=2, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader, val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.6953, Val Loss: 0.6932
Epoch 2/10, Train Loss: 0.6866, Val Loss: 0.6044
Epoch 3/10, Train Loss: 0.4649, Val Loss: 0.3237
Epoch 4/10, Train Loss: 0.3120, Val Loss: 0.2239
Epoch 5/10, Train Loss: 0.2328, Val Loss: 0.1587
Epoch 6/10, Train Loss: 0.1731, Val Loss: 0.1205
Epoch 7/10, Train Loss: 0.1291, Val Loss: 0.0935
Epoch 8/10, Train Loss: 0.1001, Val Loss: 0.0623
Epoch 9/10, Train Loss: 0.0789, Val Loss: 0.0533
Epoch 10/10, Train Loss: 0.0687, Val Loss: 0.0479
Training Time on cpu: 840.79 seconds


In [9]:
device = torch.device("mps")
model = SentimentLSTM(VOCAB_SIZE, embedding_dim=100, hidden_size=128, num_layers=2, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader, val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.6930, Val Loss: 0.6932
Epoch 2/10, Train Loss: 0.6692, Val Loss: 0.5740
Epoch 3/10, Train Loss: 0.4672, Val Loss: 0.3057
Epoch 4/10, Train Loss: 0.2654, Val Loss: 0.1549
Epoch 5/10, Train Loss: 0.1578, Val Loss: 0.0797
Epoch 6/10, Train Loss: 0.0981, Val Loss: 0.0543
Epoch 7/10, Train Loss: 0.0685, Val Loss: 0.0437
Epoch 8/10, Train Loss: 0.0581, Val Loss: 0.0384
Epoch 9/10, Train Loss: 0.0446, Val Loss: 0.0224
Epoch 10/10, Train Loss: 0.0349, Val Loss: 0.0241
Training Time on mps: 79.89 seconds


## BiLSTMAttention

In [10]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentimentBiLSTMAttention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layers, output_size, dropout=0.5):
        super(SentimentBiLSTMAttention, self).__init__()
        # Embedding layer converts token indices to embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Bidirectional LSTM to capture context from both directions
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True
        )
        
        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout)
        
        # Attention layer: projects each hidden state into a single attention score
        self.attention = nn.Linear(hidden_size * 2, 1)
        
        # Final fully connected layer for classification
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, sequence_length)
        Returns:
            out: Tensor of shape (batch_size, output_size)
        """
        # Convert word indices to embeddings
        embedded = self.embedding(x)  # (batch_size, sequence_length, embedding_dim)
        
        # Pass embeddings through the bidirectional LSTM
        lstm_out, _ = self.lstm(embedded)  # (batch_size, sequence_length, hidden_size*2)
        
        # Compute attention weights for each time step
        # The attention layer produces a score for each hidden state
        attn_scores = torch.tanh(self.attention(lstm_out))  # (batch_size, sequence_length, 1)
        attn_weights = F.softmax(attn_scores, dim=1)          # (batch_size, sequence_length, 1)
        
        # Compute the context vector as the weighted sum of LSTM outputs
        context_vector = torch.sum(attn_weights * lstm_out, dim=1)  # (batch_size, hidden_size*2)
        
        # Apply dropout for regularization
        context_vector = self.dropout(context_vector)
        
        # Final classification output
        out = self.fc(context_vector)  # (batch_size, output_size)
        return out

In [11]:
device = torch.device("mps")
model = SentimentBiLSTMAttention(VOCAB_SIZE, embedding_dim=100, hidden_size=128, num_layers=2, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader, val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.5532, Val Loss: 0.3891
Epoch 2/10, Train Loss: 0.3604, Val Loss: 0.2694
Epoch 3/10, Train Loss: 0.2677, Val Loss: 0.2169
Epoch 4/10, Train Loss: 0.2121, Val Loss: 0.1533
Epoch 5/10, Train Loss: 0.1650, Val Loss: 0.1140
Epoch 6/10, Train Loss: 0.1109, Val Loss: 0.0693
Epoch 7/10, Train Loss: 0.0731, Val Loss: 0.0407
Epoch 8/10, Train Loss: 0.0449, Val Loss: 0.0339
Epoch 9/10, Train Loss: 0.0584, Val Loss: 0.0358
Epoch 10/10, Train Loss: 0.0206, Val Loss: 0.0078
Training Time on mps: 178.21 seconds


## SentimentAdvanced

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentimentAdvanced(nn.Module):
    def __init__(
        self, 
        vocab_size, 
        embedding_dim, 
        hidden_size, 
        num_layers, 
        output_size, 
        dropout=0.5, 
        n_heads=4
    ):
        """
        Args:
            vocab_size (int): Number of tokens in the vocabulary.
            embedding_dim (int): Dimensionality of the word embeddings.
            hidden_size (int): Hidden state size of the LSTM.
            num_layers (int): Number of LSTM layers.
            output_size (int): Number of output classes (e.g., sentiment labels).
            dropout (float): Dropout probability.
            n_heads (int): Number of attention heads for multi-head attention.
        """
        super(SentimentAdvanced, self).__init__()
        # Embedding layer converts token indices to embeddings.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Bidirectional LSTM to capture context from both directions.
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True
        )
        
        self.dropout = nn.Dropout(dropout)
        
        # Multi-head self-attention layer.
        # Note: embed_dim for attention is hidden_size * 2 due to bidirectionality.
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=hidden_size * 2, 
            num_heads=n_heads, 
            dropout=dropout,
            batch_first=True
        )
        
        # Layer normalization for stabilizing training.
        self.layer_norm = nn.LayerNorm(hidden_size * 2)
        
        # Final fully connected layer for classification.
        self.fc = nn.Linear(hidden_size * 2, output_size)
    
    def forward(self, x):
        """
        Args:
            x: Tensor of shape (batch_size, sequence_length) containing token indices.
        
        Returns:
            out: Tensor of shape (batch_size, output_size) containing class scores.
        """
        # 1. Embed the input tokens.
        embedded = self.embedding(x)  # Shape: (batch_size, seq_length, embedding_dim)
        
        # 2. Pass the embeddings through the bidirectional LSTM.
        lstm_out, _ = self.lstm(embedded)  # Shape: (batch_size, seq_length, hidden_size*2)
        
        # 3. Apply multi-head self-attention.
        #    Using LSTM outputs as query, key, and value.
        attn_out, _ = self.multihead_attn(lstm_out, lstm_out, lstm_out)
        # 4. Add a residual connection and normalize.
        attn_out = self.layer_norm(attn_out + lstm_out)
        
        # 5. Pool the sequence output (max pooling over the time dimension).
        #    This extracts the most salient features across the sequence.
        pooled, _ = torch.max(attn_out, dim=1)  # Shape: (batch_size, hidden_size*2)
        pooled = self.dropout(pooled)
        
        # 6. Final classification layer.
        out = self.fc(pooled)  # Shape: (batch_size, output_size)
        return out

In [13]:
device = torch.device("mps")
model = SentimentAdvanced(VOCAB_SIZE, embedding_dim=100, hidden_size=128, num_layers=2, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader,val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.6415, Val Loss: 0.4701
Epoch 2/10, Train Loss: 0.3907, Val Loss: 0.2871
Epoch 3/10, Train Loss: 0.2813, Val Loss: 0.2021
Epoch 4/10, Train Loss: 0.2164, Val Loss: 0.1586
Epoch 5/10, Train Loss: 0.1595, Val Loss: 0.1831
Epoch 6/10, Train Loss: 0.1035, Val Loss: 0.0655
Epoch 7/10, Train Loss: 0.0766, Val Loss: 0.0398
Epoch 8/10, Train Loss: 0.0447, Val Loss: 0.0215
Epoch 9/10, Train Loss: 0.0303, Val Loss: 0.0166
Epoch 10/10, Train Loss: 0.0258, Val Loss: 0.0163
Training Time on mps: 285.63 seconds


## SentimentUltraAdvanced

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentimentUltraAdvanced(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_size,
        lstm_layers,
        transformer_layers,
        num_filters,
        output_size,
        dropout=0.5,
        n_heads=4,
        cnn_kernel_sizes=[2, 3, 4]
    ):
        """
        Args:
            vocab_size (int): Size of the vocabulary.
            embedding_dim (int): Dimension of word embeddings.
            hidden_size (int): Hidden state size for the LSTM.
            lstm_layers (int): Number of LSTM layers.
            transformer_layers (int): Number of Transformer encoder layers.
            num_filters (int): Number of CNN filters per kernel size.
            output_size (int): Number of output classes (e.g., sentiment categories).
            dropout (float): Dropout probability.
            n_heads (int): Number of heads in the Transformer encoder.
            cnn_kernel_sizes (list): List of kernel sizes for the CNN branch.
        """
        super(SentimentUltraAdvanced, self).__init__()
        # Embedding layer: converts token indices to embeddings.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # CNN branch: a set of 1D convolutional layers with different kernel sizes.
        self.cnn_convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=embedding_dim,
                out_channels=num_filters,
                kernel_size=ks
            )
            for ks in cnn_kernel_sizes
        ])
        
        # LSTM branch: Bidirectional LSTM to capture sequential context.
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout
        )
        
        # Transformer Encoder: further refines LSTM outputs using multi-head self-attention.
        # (Note: With PyTorch 1.9+ you can set batch_first=True in TransformerEncoderLayer.)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size * 2,  # Because LSTM is bidirectional.
            nhead=n_heads,
            dropout=dropout,
            batch_first=True
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer,
            num_layers=transformer_layers
        )
        
        # Compute the combined feature dimension.
        # CNN branch: len(cnn_kernel_sizes) * num_filters.
        # LSTM/Transformer branch: hidden_size * 2.
        combined_dim = num_filters * len(cnn_kernel_sizes) + hidden_size * 2
        
        # Final fully connected layer for classification.
        self.fc = nn.Linear(combined_dim, output_size)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        """
        Args:
            x (Tensor): Input tensor of shape (batch_size, sequence_length) containing token indices.
        
        Returns:
            Tensor: Logits of shape (batch_size, output_size).
        """
        # 1. Embedding
        embedded = self.embedding(x)  # Shape: (batch_size, seq_length, embedding_dim)
        
        # 2. CNN branch:
        # Permute to (batch_size, embedding_dim, seq_length) for Conv1d.
        cnn_input = embedded.permute(0, 2, 1)
        cnn_features = []
        for conv in self.cnn_convs:
            # Apply convolution -> non-linearity -> global max pooling.
            conv_out = conv(cnn_input)         # Shape: (batch, num_filters, L_out)
            conv_out = F.relu(conv_out)
            pooled = F.max_pool1d(conv_out, kernel_size=conv_out.shape[2])  # Shape: (batch, num_filters, 1)
            pooled = pooled.squeeze(2)         # Shape: (batch, num_filters)
            cnn_features.append(pooled)
        cnn_features = torch.cat(cnn_features, dim=1)  # Shape: (batch, num_filters * len(cnn_kernel_sizes))
        
        # 3. LSTM + Transformer branch:
        lstm_out, _ = self.lstm(embedded)  # Shape: (batch, seq_length, hidden_size*2)
        # Refine LSTM outputs with Transformer encoder.
        transformer_out = self.transformer_encoder(lstm_out)  # Shape: (batch, seq_length, hidden_size*2)
        # Global average pooling over time (sequence length) dimension.
        transformer_features = torch.mean(transformer_out, dim=1)  # Shape: (batch, hidden_size*2)
        
        # 4. Feature Fusion:
        combined = torch.cat([cnn_features, transformer_features], dim=1)  # Shape: (batch, combined_dim)
        combined = self.dropout(combined)
        
        # 5. Final classification layer.
        output = self.fc(combined)  # Shape: (batch, output_size)
        return output

In [15]:
device = torch.device("mps")
model = SentimentUltraAdvanced(VOCAB_SIZE, embedding_dim=100, hidden_size=128, lstm_layers=2, transformer_layers=2, num_filters=64, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader, val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.6708, Val Loss: 0.5174
Epoch 2/10, Train Loss: 0.5323, Val Loss: 0.4471
Epoch 3/10, Train Loss: 0.4708, Val Loss: 0.3932
Epoch 4/10, Train Loss: 0.4366, Val Loss: 0.3298
Epoch 5/10, Train Loss: 0.3939, Val Loss: 0.2932
Epoch 6/10, Train Loss: 0.3687, Val Loss: 0.2793
Epoch 7/10, Train Loss: 0.3341, Val Loss: 0.2353
Epoch 8/10, Train Loss: 0.3080, Val Loss: 0.2031
Epoch 9/10, Train Loss: 0.2797, Val Loss: 0.1755
Epoch 10/10, Train Loss: 0.2489, Val Loss: 0.1490
Training Time on mps: 769.11 seconds


## SentimentUltraUltraAdvanced

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SentimentUltraUltraAdvanced(nn.Module):
    def __init__(
        self,
        vocab_size,
        embedding_dim,
        hidden_size,
        lstm_layers,
        transformer_layers,
        num_filters,
        output_size,
        dropout=0.5,
        n_heads=4,
        cnn_kernel_sizes=[2, 3, 4],
        self_attn_layers=2,
        max_seq_length=512  # adjust based on your expected maximum sequence length
    ):
        """
        Args:
            vocab_size (int): Vocabulary size.
            embedding_dim (int): Dimension of word embeddings.
            hidden_size (int): Hidden size for the LSTM.
            lstm_layers (int): Number of LSTM layers.
            transformer_layers (int): Number of Transformer encoder layers for the LSTM branch.
            num_filters (int): Number of CNN filters per kernel size.
            output_size (int): Number of output classes.
            dropout (float): Dropout probability.
            n_heads (int): Number of attention heads in Transformer encoders.
            cnn_kernel_sizes (list): List of kernel sizes for the CNN branch.
            self_attn_layers (int): Number of Transformer encoder layers in the self-attention branch.
            max_seq_length (int): Maximum expected sequence length (for positional embeddings).
        """
        super(SentimentUltraUltraAdvanced, self).__init__()
        
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # Learnable positional embeddings for the self-attention branch
        self.pos_embedding = nn.Parameter(torch.zeros(1, max_seq_length, embedding_dim))
        
        # ----------------------------
        # Branch 1: CNN for local features
        # ----------------------------
        # Create one convolution per kernel size.
        self.cnn_convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=embedding_dim,
                out_channels=num_filters,
                kernel_size=k
            ) for k in cnn_kernel_sizes
        ])
        
        # ----------------------------
        # Branch 2: LSTM + Transformer for sequential modeling
        # ----------------------------
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if lstm_layers > 1 else 0.0
        )
        # Transformer encoder refines the LSTM outputs.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size * 2,  # bidirectional
            nhead=n_heads,
            dropout=dropout,
            batch_first=True
        )
        self.lstm_transformer = nn.TransformerEncoder(encoder_layer, num_layers=transformer_layers)
        
        # ----------------------------
        # Branch 3: Direct Self-Attention on embeddings
        # ----------------------------
        encoder_layer2 = nn.TransformerEncoderLayer(
            d_model=embedding_dim,
            nhead=n_heads,
            dropout=dropout,
            batch_first=True
        )
        self.self_attn_encoder = nn.TransformerEncoder(encoder_layer2, num_layers=self_attn_layers)
        
        # ----------------------------
        # Gating Mechanism to fuse branch features dynamically
        # ----------------------------
        # Compute dimensions for each branch's output:
        cnn_out_dim = num_filters * len(cnn_kernel_sizes)
        lstm_out_dim = hidden_size * 2  # from bidirectional LSTM
        self_attn_out_dim = embedding_dim
        
        # Linear layers to compute a gate (a scalar weight) for each branch.
        self.gate_cnn = nn.Linear(cnn_out_dim, 1)
        self.gate_lstm = nn.Linear(lstm_out_dim, 1)
        self.gate_self_attn = nn.Linear(self_attn_out_dim, 1)
        
        # Final fusion dimension is the concatenation of all branch outputs.
        fused_dim = cnn_out_dim + lstm_out_dim + self_attn_out_dim
        
        # Final classification layer.
        self.fc = nn.Linear(fused_dim, output_size)
        
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        """
        Args:
            x (Tensor): Input tensor of shape (batch_size, sequence_length) with token indices.
        Returns:
            Tensor: Logits of shape (batch_size, output_size).
        """
        batch_size, seq_length = x.size()
        # 1. Embedding lookup.
        embedded = self.embedding(x)  # shape: (batch_size, seq_length, embedding_dim)
        
        # ----------------------------
        # Branch 1: CNN
        # ----------------------------
        # For Conv1d, we need shape: (batch_size, embedding_dim, seq_length)
        cnn_input = embedded.permute(0, 2, 1)
        cnn_features = []
        for conv in self.cnn_convs:
            conv_out = F.relu(conv(cnn_input))  # (batch, num_filters, L_out)
            # Global max pooling over the temporal (L_out) dimension.
            pooled = F.max_pool1d(conv_out, kernel_size=conv_out.size(2)).squeeze(2)  # (batch, num_filters)
            cnn_features.append(pooled)
        cnn_features = torch.cat(cnn_features, dim=1)  # (batch, num_filters * len(cnn_kernel_sizes))
        
        # ----------------------------
        # Branch 2: LSTM + Transformer
        # ----------------------------
        lstm_out, _ = self.lstm(embedded)  # (batch, seq_length, hidden_size*2)
        # Refine with Transformer encoder.
        lstm_transformed = self.lstm_transformer(lstm_out)  # (batch, seq_length, hidden_size*2)
        # Global average pooling over the time dimension.
        lstm_features = torch.mean(lstm_transformed, dim=1)  # (batch, hidden_size*2)
        
        # ----------------------------
        # Branch 3: Direct Self-Attention on embeddings
        # ----------------------------
        # Add positional embeddings (truncate or expand to the current sequence length)
        pos_emb = self.pos_embedding[:, :seq_length, :]  # (1, seq_length, embedding_dim)
        self_attn_input = embedded + pos_emb
        self_attn_out = self.self_attn_encoder(self_attn_input)  # (batch, seq_length, embedding_dim)
        # Global max pooling over the time dimension.
        self_attn_features, _ = torch.max(self_attn_out, dim=1)  # (batch, embedding_dim)
        
        # ----------------------------
        # Gating: Compute dynamic weights for each branch.
        # ----------------------------
        gate_cnn = torch.sigmoid(self.gate_cnn(cnn_features))           # (batch, 1)
        gate_lstm = torch.sigmoid(self.gate_lstm(lstm_features))          # (batch, 1)
        gate_self_attn = torch.sigmoid(self.gate_self_attn(self_attn_features))  # (batch, 1)
        
        gated_cnn = cnn_features * gate_cnn         # (batch, cnn_out_dim)
        gated_lstm = lstm_features * gate_lstm        # (batch, lstm_out_dim)
        gated_self_attn = self_attn_features * gate_self_attn  # (batch, self_attn_out_dim)
        
        # ----------------------------
        # Feature Fusion and Classification
        # ----------------------------
        # Concatenate the gated features from all branches.
        combined = torch.cat([gated_cnn, gated_lstm, gated_self_attn], dim=1)  # (batch, fused_dim)
        combined = self.dropout(combined)
        logits = self.fc(combined)  # (batch, output_size)
        
        return logits

In [17]:
device = torch.device("mps")
model = SentimentUltraUltraAdvanced(VOCAB_SIZE, embedding_dim=100, hidden_size=128, lstm_layers=2, transformer_layers=2, num_filters=64, output_size=1).to(device)
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)  # Best choice
time_taken = train_model(device, train_loader,val_loader, model, criterion, optimizer, epochs=10)
print(f"Training Time on {device}: {time_taken:.2f} seconds")

Epoch 1/10, Train Loss: 0.5843, Val Loss: 0.5486
Epoch 2/10, Train Loss: 0.3557, Val Loss: 0.2226
Epoch 3/10, Train Loss: 0.2196, Val Loss: 0.1417
Epoch 4/10, Train Loss: 0.1160, Val Loss: 0.0935
Epoch 5/10, Train Loss: 0.0654, Val Loss: 0.7176
Epoch 6/10, Train Loss: 0.0422, Val Loss: 0.1328
Epoch 7/10, Train Loss: 0.0185, Val Loss: 0.0091
Epoch 8/10, Train Loss: 0.0080, Val Loss: 0.0029
Epoch 9/10, Train Loss: 0.0046, Val Loss: 0.0047
Epoch 10/10, Train Loss: 0.0030, Val Loss: 0.0014
Training Time on mps: 1098.53 seconds


In [1]:
import multiprocessing
print(multiprocessing.cpu_count())  # Shows number of CPU cores

14
