In [50]:
# python3 -m pip install torch torchvision torchaudio

import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoConfig
from math import sqrt



In [51]:
# Load the BERT tokenizer ("bert-base-uncased" = lowercase, 12-layer medium encoder)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
sentence_1 = "Artificial Intelligence is a game changer."
sentence_2 = "Join the AI Career Launchpad program."

In [52]:
inputs = tokenizer(
    sentence_1,
    sentence_2,
    add_special_tokens=False,
    return_tensors="pt"
)

In [53]:
print(inputs)

{'input_ids': tensor([[ 7976,  4454,  2003,  1037,  2208,  2689,  2099,  1012,  3693,  1996,
          9932,  2476,  4888, 15455,  2565,  1012]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [54]:
# Understand the tokens better
print("Input IDs:", inputs["input_ids"])
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
print("Tokens:", tokens)

Input IDs: tensor([[ 7976,  4454,  2003,  1037,  2208,  2689,  2099,  1012,  3693,  1996,
          9932,  2476,  4888, 15455,  2565,  1012]])
Tokens: ['artificial', 'intelligence', 'is', 'a', 'game', 'change', '##r', '.', 'join', 'the', 'ai', 'career', 'launch', '##pad', 'program', '.']


In [55]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained("bert-base-uncased")
print(config)

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.57.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}



In [56]:
from torch import nn
token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
print(token_embeddings)

Embedding(30522, 768)


In [57]:
input_embeds = token_embeddings(inputs.input_ids)
print(input_embeds.size())

torch.Size([1, 16, 768])


In [64]:
import torch.nn.functional as F
from math import sqrt

query = key = value = input_embeds

# Attention Score Formula: Softmax[Q.K/sqrt(dim_k)].V
def scaled_dot_product_attention(query, key, value):
    # For scaling the Query<>Key dot product to ensure softmax doesn't have to deal with large values
    # Last dimension in the query tensor (hidden layer dim)
    dim_k = query.size(-1)
    # Batch matrix multiplication (dot product), so for each item in the batch it multiplies query with key
    scores = torch.bmm(query, key.transpose(1,2)) / sqrt(dim_k)

    # We apply a softmax along the last dimension (dim=-1), so for each query position, 
    # all scores over the keys are turned into probabilities that sum to 1
    weights = F.softmax(scores, dim = -1)
    return torch.bmm(weights, value)
    

In [66]:
class AttentionHead(nn.Module):
    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        attention_outputs = scaled_dot_product_attention(self.q(hidden_state), self.k(hidden_state), self.v(hidden_state))
        return attention_outputs
        
    # hidden_state tensor is of shape [batch_size, seq_len, embed_dim]
    # nn.Linear(embed_dim, head_dim), projects each token embedding vector from embed_dim to head_dim dimension
    # Though we pass 3D hidden_state tensor, PyTorch applies linear transformation to the last dimension only, treating the first dimensions as batch-like.
        

In [68]:
class MultiHeadAttention(nn.Module):
        def __init__(self, config):
            super().__init__()
            embed_dim = config.hidden_size
            num_heads = config.num_attention_heads
            head_dim = embed_dim // num_heads
            self.heads = nn.ModuleList([AttentionHead(embed_dim, head_dim) for _ in range(num_heads)])
            # final linear transformation on the concatenated output for interactions across representations from each head
            self.output_linear = nn.Linear(embed_dim, embed_dim)

        def forward(self, hidden_state):
            # It runs the input through each attention head, collects all head outputs, 
            # and concatenates them along the last dimension to form one combined multi-head representation.
            x = torch.cat([h(hidden_state) for h in self.heads], dim = -1)
            x = self.output_linear(x)
            return x

In [70]:
multihead_attention = MultiHeadAttention(config)
attention_output = multihead_attention(input_embeds)
print(attention_output.size())

torch.Size([1, 16, 768])


In [71]:
import torch
from torch import nn

class FeedForward(nn.Module):
    def __init__(self, config):
        super().__init__()
        # First linear layer expands the dimension (e.g., 768 → 3072)
        self.linear_1 = nn.Linear(config.hidden_size, config.intermediate_size)
        # Second linear layer projects it back (e.g., 3072 → 768)
        self.linear_2 = nn.Linear(config.intermediate_size, config.hidden_size)
        # Non-linear activation
        self.gelu = nn.GELU()
        # Dropout for regularization
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        # x: [batch_size, seq_len, hidden_size]
        x = self.linear_1(x)   # [batch, seq, intermediate_size]
        x = self.gelu(x)       # non-linear transform
        x = self.linear_2(x)   # [batch, seq, hidden_size]
        x = self.dropout(x)    # apply dropout
        return x               # same shape as input

In [73]:
feed_forward = FeedForward(config)
ff_outputs = feed_forward(attention_output)
print(ff_outputs.size())

torch.Size([1, 16, 768])


# Layer Normalization, Position Embeddings, Classification Head

In [74]:
# Include position embeddings, layer norm and dropout 

class Embeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Look up vectors for token IDs
        self.token_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        # Learnable embeddings for positions [0..max_position_embeddings-1]
        self.position_embeddings = nn.Embedding(config.max_position_embeddings,
                                                config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids):
        # input_ids: [batch_size, seq_len]
        batch_size, seq_len = input_ids.size()

        # 1) Token embeddings: [B, T, H]
        token_embeds = self.token_embeddings(input_ids)

        # 2) Position ids: [0, 1, ..., T-1] for each example in batch
        position_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0)
        position_ids = position_ids.expand(batch_size, seq_len)  # [B, T]
        position_embeds = self.position_embeddings(position_ids)  # [B, T, H]

        # 3) Sum token + position, then normalize + dropout
        embeddings = token_embeds + position_embeds          # [B, T, H]
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

In [75]:
# Include layer norm before the self attention, and also before the feed forward 
# Add a residual after the self attention and after the feed forward
# LayerNorm → Multi-head attention → Residual
# LayerNorm → FeedForward → Residual

class EncoderLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.self_attn = MultiHeadAttention(config)  # your code
        self.feed_forward = FeedForward(config)      # your code

        self.ln1 = nn.LayerNorm(config.hidden_size)
        self.ln2 = nn.LayerNorm(config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_state):
        # hidden_state: [batch_size, seq_len, hidden_size]

        # 1) Self-attention block with residual
        attn_input = self.ln1(hidden_state)
        attn_output = self.self_attn(attn_input)            # [B, T, H]
        hidden_state = hidden_state + self.dropout(attn_output)

        # 2) Feed-forward block with residual
        ff_input = self.ln2(hidden_state)
        ff_output = self.feed_forward(ff_input)             # [B, T, H]
        hidden_state = hidden_state + self.dropout(ff_output)

        return hidden_state   # [B, T, H]

In [76]:
class SequenceClassificationHead(nn.Module):
    def __init__(self, config, num_labels):
        super().__init__()
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_labels)

    def forward(self, encoder_outputs):
        # encoder_outputs: [batch_size, seq_len, hidden_size]
        # Convention: use the first token's representation (position 0)
        cls_embeds = encoder_outputs[:, 0, :]          # [batch_size, hidden_size]
        x = self.dropout(cls_embeds)
        logits = self.classifier(x)                    # [batch_size, num_labels]
        return logits

In [77]:
class MiniTransformerForSequenceClassification(nn.Module):
    def __init__(self, config, num_labels):
        super().__init__()
        self.embeddings = Embeddings(config)
        self.encoder = EncoderLayer(config)               # single layer for demo
        self.classifier = SequenceClassificationHead(config, num_labels)

    def forward(self, input_ids):
        # 1) ids → embeddings (token + position)
        x = self.embeddings(input_ids)                    # [B, T, H]
        # 2) embeddings → encoder layer (MHA + FFN + norms + residual)
        x = self.encoder(x)                               # [B, T, H]
        # 3) encoder output → logits
        logits = self.classifier(x)                       # [B, num_labels]
        return logits

In [79]:
from transformers import AutoTokenizer, AutoConfig

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained("bert-base-uncased")

model = MiniTransformerForSequenceClassification(config, num_labels=2)

text = "Artificial Intelligence is a game changer. Join the AI Career Launchpad program."
inputs = tokenizer(text, add_special_tokens=True, return_tensors="pt")
input_ids = inputs["input_ids"]          # [1, seq_len]

logits = model(input_ids)                # [1, 2]
print(logits)
probs = torch.softmax(logits, dim=-1)
# If the classifier uses one output (instead of two), we can use sigmoid to convert into probability
# Softmax ensures the probability for both the tasks add up to one 
# In BERT-style models with two outputs, we use softmax, not sigmoid.
print(probs)

tensor([[ 1.0749, -2.0433]], grad_fn=<AddmmBackward0>)
tensor([[0.9576, 0.0424]], grad_fn=<SoftmaxBackward0>)
