In [3]:
# Install PyTorch (if not already installed)
!pip install torch --quiet
!pip install transformers --quiet


In [5]:
from transformers import AutoTokenizer

# Choose model tokenizer (GPT2, BERT, etc.)
tokenizer_name = "gpt2"  # or "EleutherAI/gpt-neo-125M", etc.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

# Add padding token if missing (for GPT2)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

vocab_size = tokenizer.vocab_size

In [25]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=512):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class MultiHeadMaskedAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, num_heads, batch_first=True)

    def forward(self, x):
        T = x.size(1)
        mask = torch.triu(torch.ones(T, T), diagonal=1).bool().to(x.device)
        x, _ = self.attn(x, x, x, attn_mask=mask)
        return x

class DecoderBlock(nn.Module):
    def __init__(self, d_model, num_heads, ff_dim):
        super().__init__()
        self.attn = MultiHeadMaskedAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, d_model)
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        x = x + self.attn(self.norm1(x))
        x = x + self.ff(self.norm2(x))
        return x

class GPTMiniHF(nn.Module):
    def __init__(self, vocab_size, d_model=256, num_heads=4, ff_dim=1024, num_layers=6, max_len=512):
        super().__init__()
        self.token_embed = nn.Embedding(vocab_size, d_model)
        self.pos_embed = PositionalEncoding(d_model, max_len)
        self.blocks = nn.Sequential(*[
            DecoderBlock(d_model, num_heads, ff_dim) for _ in range(num_layers)
        ])
        self.ln_final = nn.LayerNorm(d_model)
        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.token_embed(x)
        x = self.pos_embed(x)
        x = self.blocks(x)
        x = self.ln_final(x)
        return self.out(x)


In [27]:
def generate_text(model, tokenizer, prompt, max_new_tokens=30):
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    for _ in range(max_new_tokens):
        with torch.no_grad():
            logits = model(input_ids)
            next_token_logits = logits[:, -1, :]
            next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(0)
            input_ids = torch.cat([input_ids, next_token], dim=1)

            if next_token.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(input_ids[0], skip_special_tokens=True)


In [29]:
model = GPTMiniHF(vocab_size=vocab_size)

# Test prompt
prompt = "Once upon a time"
output = generate_text(model, tokenizer, prompt, max_new_tokens=20)

print("Prompt:", prompt)
print("Generated:", output)


NameError: name 'vocab_size' is not defined