In [1]:
from datasets import load_dataset
from torch.utils.data import random_split, DataLoader
from transformers import DistilBertTokenizer
import torch

# Load the dataset
ds = load_dataset("stanfordnlp/imdb")
ds.pop('unsupervised')

train_ds = ds["train"]
test_ds = ds["test"]

# Split the train dataset into train and validation sets
train_size = int(0.8 * len(train_ds))
val_size = len(train_ds) - train_size
train_ds, val_ds = random_split(train_ds, [train_size, val_size])

# Initialize the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize function
def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=256  # Adjust max_length for LSTM input size (e.g., 128, 256, etc., based on GPU memory)
    )

# Tokenize datasets
train_ds = train_ds.dataset.map(tokenize_function, batched=True)
val_ds = val_ds.dataset.map(tokenize_function, batched=True)
test_ds = test_ds.map(tokenize_function, batched=True)

# Keep only input_ids and attention_mask in the dataset for LSTM
def format_for_lstm(batch):
    return {
        "input_ids": torch.tensor(batch["input_ids"]),
        "attention_mask": torch.tensor(batch["attention_mask"]),
        "label": torch.tensor(batch["label"])
    }

train_ds = train_ds.map(format_for_lstm, batched=True)
val_ds = val_ds.map(format_for_lstm, batched=True)
test_ds = test_ds.map(format_for_lstm, batched=True)

# Set the format for PyTorch compatibility
train_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=6)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=6)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=6)



In [2]:
import torch
import torch.nn as nn
import math

device = "cuda" if torch.cuda.is_available() else "cpu"

class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int):
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        return self.embedding(x) * math.sqrt(self.d_model)

class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, seq_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len  # Fixed typo in attribute name
        self.dropout = nn.Dropout(dropout)

        # Create matrix of size seq_len x d_model
        pe = torch.zeros(seq_len, d_model)
        # Create a vector of shape seq_len
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # Apply sin and cos
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
        return self.dropout(x)  # Added return statement

class LayerNormalization(nn.Module):
    def __init__(self, eps: float = 10**-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))
        self.beta = nn.Parameter(torch.zeros(1))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.beta  # Fixed bias to beta

class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # Added return statement
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not appropriate size"

        self.d_k = d_model // h
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)

        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]
        
        # Calculate attention scores
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        
        if mask is not None:
            # Reshape mask to match attention_scores shape [batch_size, num_heads, seq_len, seq_len]
            if mask.dim() == 2:
                mask = mask.unsqueeze(1).unsqueeze(2)  # [batch_size, 1, 1, seq_len]
                mask = mask.expand(-1, query.size(1), query.size(2), -1)  # [batch_size, num_heads, seq_len, seq_len]
            
            # Use a smaller negative value that works with float16
            attention_scores.masked_fill_(mask == 0, -65504.0)  # Maximum negative value in float16
            
        attention_scores = attention_scores.softmax(dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
            
        return (attention_scores @ value), attention_scores
        
    def forward(self, q, k, v, mask):
        batch_size = q.size(0)
        
        # Linear transformations
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)

        # Reshape for multi-head attention
        query = query.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        key = key.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
        value = value.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)

        x, self_attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # Combine heads and apply output transformation
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.w_o(x)

class ResidualConnection(nn.Module):
    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

class EncoderBlock(nn.Module):
    def __init__(self, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_forward_block)
        return x

class DistilBERT(nn.Module):
    def __init__(self, vocab_size: int, d_model: int, n_layers: int, n_heads: int, d_ff: int, dropout: float, num_classes: int = 1):
        super().__init__()
        self.input_embeddings = InputEmbeddings(d_model, vocab_size)
        self.positional_encoding = PositionalEncoding(d_model, seq_len=512, dropout=dropout)

        # Create encoder blocks
        self.encoder_blocks = nn.ModuleList([
            EncoderBlock(
                MultiHeadAttentionBlock(d_model, n_heads, dropout),
                FeedForwardBlock(d_model, d_ff, dropout),
                dropout
            ) for _ in range(n_layers)
        ])
        
        # Add classification head
        self.classifier = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Dropout(dropout),
            nn.Linear(d_model, num_classes)
        )

    def forward(self, input_ids, attention_mask=None):
        # Step 1: Get input embeddings and add positional encoding
        x = self.input_embeddings(input_ids)
        x = self.positional_encoding(x)

        # Step 2: Pass through each encoder block
        for layer in self.encoder_blocks:
            x = layer(x, attention_mask)

        # Step 3: Pool the output (use [CLS] token or mean pooling)
        # Here we use mean pooling over the sequence length
        # First multiply by attention mask to zero out padding tokens
        if attention_mask is not None:
            mask_expanded = attention_mask.unsqueeze(-1).float()
            x = x * mask_expanded
            # Calculate mean over sequence length (dim 1), excluding padding
            sequence_lengths = torch.sum(attention_mask, dim=1, keepdim=True)
            pooled = torch.sum(x, dim=1) / sequence_lengths
        else:
            # If no mask, just take mean over sequence length
            pooled = torch.mean(x, dim=1)

        # Step 4: Pass through classification head
        logits = self.classifier(pooled)
        return logits

# Example of instantiating the model
vocab_size = 30522  # Typical vocab size for BERT
d_model = 768  # Dimensionality of the embeddings
n_layers = 6  # Number of transformer blocks
n_heads = 12  # Number of attention heads
d_ff = 3072  # Dimensionality of feedforward layer
dropout = 0.1  # Dropout rate

model = DistilBERT(vocab_size, d_model, n_layers, n_heads, d_ff, dropout).to(device)

In [3]:
import torch.optim as optim
from tqdm import tqdm

# Set the model to training mode
model.train()

# Define the loss function and optimizer
loss_fn = torch.nn.BCEWithLogitsLoss()  # Use for binary classification
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4, weight_decay=0.1)
scaler = torch.cuda.amp.GradScaler()

num_epochs = 10
max_grad_norm = 0.1  # Set your max_grad_norm value

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    
    # Wrap train_loader in tqdm to monitor training progress
    train_loader_tqdm = tqdm(train_loader, desc=f"Epoch [{epoch + 1}/{num_epochs}] - Training", leave=False)
    
    for batch in train_loader_tqdm:
        # Ensure we access inputs correctly
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].float().to(device)  # Make sure labels are float for BCEWithLogitsLoss
        
        optimizer.zero_grad()

        # Forward pass with mixed precision
        with torch.cuda.amp.autocast():
            outputs = model(inputs, attention_mask)
            loss = loss_fn(outputs.squeeze(), labels)  # Outputs should be raw logits

        # Backward pass
        scaler.scale(loss).backward()

        # Clip gradients
        scaler.unscale_(optimizer)  # Unscale before gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Clip gradients

        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        train_loader_tqdm.set_postfix(loss=loss.item())  # Display the current batch loss
        
    # Calculate average training loss for the epoch
    avg_loss = running_loss / len(train_loader)

    # Validation phase
    model.eval()
    running_val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        val_loader_tqdm = tqdm(val_loader, desc=f"Epoch [{epoch + 1}/{num_epochs}] - Validation", leave=False)
        
        for batch in val_loader_tqdm:
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].float().to(device)

            with torch.cuda.amp.autocast():
                outputs = model(inputs, attention_mask)
                loss = loss_fn(outputs.squeeze(), labels)
                
            running_val_loss += loss.item()

            # Calculate accuracy
            predicted = (torch.sigmoid(outputs.squeeze()) > 0.5).long()  # Apply sigmoid and threshold
            total += labels.size(0)
            correct += (predicted == labels.long()).sum().item()
        
            val_loader_tqdm.set_postfix(loss=loss.item())  # Display the current validation batch loss
    
    # Calculate average validation loss and accuracy
    avg_val_loss = running_val_loss / len(val_loader)
    accuracy = correct / total * 100

    print(f"Epoch [{epoch + 1}/{num_epochs}], "
          f"Training Loss: {avg_loss:.4f}, "
          f"Validation Loss: {avg_val_loss:.4f}, "
          f"Validation Accuracy: {accuracy:.2f}%")

  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
                                                                                

Epoch [1/10], Training Loss: 0.4717, Validation Loss: 0.3604, Validation Accuracy: 83.84%


                                                                                

Epoch [2/10], Training Loss: 0.3342, Validation Loss: 0.2682, Validation Accuracy: 88.97%


                                                                                

Epoch [3/10], Training Loss: 0.2876, Validation Loss: 0.2134, Validation Accuracy: 91.49%


                                                                                

Epoch [4/10], Training Loss: 0.2470, Validation Loss: 0.1797, Validation Accuracy: 93.54%


                                                                                

Epoch [5/10], Training Loss: 0.2110, Validation Loss: 0.1289, Validation Accuracy: 95.54%


                                                                                

Epoch [6/10], Training Loss: 0.1773, Validation Loss: 0.0996, Validation Accuracy: 96.84%


                                                                                

Epoch [7/10], Training Loss: 0.1484, Validation Loss: 0.0985, Validation Accuracy: 96.47%


                                                                                

Epoch [8/10], Training Loss: 0.1248, Validation Loss: 0.0623, Validation Accuracy: 97.95%


                                                                                

Epoch [9/10], Training Loss: 0.1044, Validation Loss: 0.0382, Validation Accuracy: 98.90%


                                                                                

Epoch [10/10], Training Loss: 0.0887, Validation Loss: 0.0396, Validation Accuracy: 98.73%




In [4]:
model.eval()  # Set the model to evaluation mode
running_test_loss = 0.0
correct = 0
total = 0
    
# Define the loss function for evaluation
loss_fn = torch.nn.BCEWithLogitsLoss()

with torch.no_grad():
    test_loader_tqdm = tqdm(test_loader, desc="Testing", leave=False)
        
    for batch in test_loader_tqdm:
        inputs, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['label'].float().to(device)

        # Forward pass with mixed precision
        with torch.cuda.amp.autocast():
            outputs = model(inputs, attention_mask)
            loss = loss_fn(outputs.squeeze(), labels)
                
        running_test_loss += loss.item()

        # Calculate accuracy
        predicted = (outputs.squeeze() > 0).long()
        total += labels.size(0)
        correct += (predicted == labels.long()).sum().item()

        test_loader_tqdm.set_postfix(loss=loss.item())  # Display the current test batch loss
    
# Calculate average test loss and accuracy
avg_test_loss = running_test_loss / len(test_loader)
accuracy = correct / total * 100

print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {accuracy:.2f}%")

  with torch.cuda.amp.autocast():
                                                                                

Test Loss: 0.8627, Test Accuracy: 82.32%


