When to Use BiGRU

When both past and future context matter (e.g., when the meaning of a word depends on the words before and after it in a sentence).
In tasks like:
Sentiment analysis
Text classification
Speech and sequence modeling

In [1]:
import torch

import torch.nn as nn

In [2]:
class BiGRUModel(nn.Module):
    """Bidirectional GRU classifier over sequences of token ids.

    Token ids are embedded, passed through a (possibly stacked) bidirectional
    GRU, and the final forward and backward hidden states of the top layer
    are concatenated and projected to ``output_dim`` values by a linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers=1, dropout=0.3):
        """Build the embedding, GRU, output projection and dropout layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector (GRU input size).
            hidden_dim: GRU hidden size per direction.
            output_dim: Number of values produced per sequence.
            num_layers: Number of stacked GRU layers.
            dropout: Dropout probability applied to the embeddings and,
                when ``num_layers > 1``, between GRU layers.
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.gru = nn.GRU(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout is only meaningful for stacked GRUs, so it
            # is disabled for a single layer.
            dropout=dropout if num_layers > 1 else 0,
            bidirectional=True,
        )

        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # *2 for bidirectional

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, lengths=None):
        """Compute one output vector per sequence.

        Args:
            x: Token ids of shape [batch_size, seq_len].
            lengths: Optional true (unpadded) length of each sequence, as a
                list or 1-D tensor with one entry per batch element; every
                entry must be between 1 and ``seq_len``. Assumes the batch is
                right-padded. When given, padded positions are skipped so the
                final hidden states depend only on real tokens. When omitted,
                every position (including any padding) is processed, which is
                the original behavior.

        Returns:
            Tensor of shape [batch_size, output_dim].
        """
        embedded = self.dropout(self.embedding(x))  # [batch_size, seq_len, embed_dim]

        if lengths is not None:
            # pack_padded_sequence requires the lengths on the CPU.
            # enforce_sorted=False lets callers pass batches in any order;
            # the GRU returns hidden states in the original batch order.
            lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )

        # hidden: [num_layers * 2, batch_size, hidden_dim]; the per-step
        # outputs are not needed for classification.
        _, hidden = self.gru(embedded)

        # Concatenate the last layer's final forward and backward hidden states.
        forward_hidden = hidden[-2, :, :]  # [batch_size, hidden_dim]
        backward_hidden = hidden[-1, :, :]  # [batch_size, hidden_dim]
        combined_hidden = torch.cat((forward_hidden, backward_hidden), dim=1)  # [batch_size, hidden_dim*2]

        return self.fc(combined_hidden)

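hidden[-2] → final hidden state of the last layer's forward direction
hidden[-1] → final hidden state of the last layer's backward direction
We concatenate the two along the feature dimension before passing the result to the linear layer.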

In [3]:
# Example model: 10,000-token vocabulary, 128-dimensional embeddings,
# 64 hidden units per GRU direction, and one output unit.
model = BiGRUModel(
    vocab_size=10000,
    embedding_dim=128,
    hidden_dim=64,
    output_dim=1,  # for binary classification (e.g., sentiment)
)
