In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
model = AutoModelForCausalLM.from_pretrained('roneneldan/TinyStories-1M')
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
prompt = "Once upon a time there was"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

In [2]:
print(model)

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 64)
    (wpe): Embedding(2048, 64)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPTNeoBlock(
        (ln_1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=64, out_features=64, bias=False)
            (v_proj): Linear(in_features=64, out_features=64, bias=False)
            (q_proj): Linear(in_features=64, out_features=64, bias=False)
            (out_proj): Linear(in_features=64, out_features=64, bias=True)
          )
        )
        (ln_2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=64, out_features=256, bias=True)
          (c_proj): Linear(in_features=256, out_featur

In [3]:
# Generate completion
output = model.generate(input_ids, max_length = 1000, num_beams=1)
print(output.shape)
# Decode the completion
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


torch.Size([1, 208])
Once upon a time there was a little girl named Lily. She loved to play outside in the sunshine. One day, she saw a big, shiny rock in the sky. She wanted to touch it, but it was too high.

Lily's mommy told her that it was important to be careful and not touch things. Lily didn't want to touch it, so she asked her mommy if she could touch it. Her mommy said yes and they went to the park.

When they got there, Lily saw a big, scary dog. The dog was barking and barked loudly. Lily was scared and didn't know what to do. She tried to run away, but she was too fast. She tried to run away, but it was too late. The dog was too fast and it was too fast.

Lily was sad and cried. She wished she had listened to her mommy. She wished she had listened to her mommy and never let her to get back.



**Model Parameters**

In [4]:
import torch
import torch.nn as nn

# Define model parameters
d_model = 64
nhead = 8
d_hid = 256
dropout = 0
nlayers = 1
sequence_length = 10
batch_size = 1

**Transformer Block Pytorch API**

In [5]:
# Original Transformer Encoder using PyTorch API
encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
transformer_encoder = nn.TransformerEncoder(encoder_layer, nlayers)
print(transformer_encoder)

TransformerEncoder(
  (layers): ModuleList(
    (0): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
      )
      (linear1): Linear(in_features=64, out_features=256, bias=True)
      (dropout): Dropout(p=0, inplace=False)
      (linear2): Linear(in_features=256, out_features=64, bias=True)
      (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0, inplace=False)
      (dropout2): Dropout(p=0, inplace=False)
    )
  )
)


**Custom Transformer Block**

In [6]:
# Custom Transformer block implementation
class CustomTransformerBlock(nn.Module):
    def __init__(self, d_model, nhead, d_hid, dropout=0):
        super(CustomTransformerBlock, self).__init__()

        # Multi-head self-attention
        self.attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=nhead, dropout=dropout)

        # Feedforward network
        self.feedforward = nn.Sequential(
            nn.Linear(d_model, d_hid),
            nn.ReLU(),
            nn.Linear(d_hid, d_model),
        )

        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout layers
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        # Self-attention with residual connection
        attn_output, _ = self.attention(x, x, x)
        x = x + self.dropout1(attn_output)  # Residual connection
        x = self.norm1(x)  # Layer normalization

        # Feedforward network with residual connection
        ff_output = self.feedforward(x)
        x = x + self.dropout2(ff_output)  # Residual connection
        x = self.norm2(x)  # Layer normalization

        return x

**Copying the weights**

In [7]:
# Instantiate the custom transformer block
custom_transformer_block = CustomTransformerBlock(d_model, nhead, d_hid, dropout)

# Copy weights from the original transformer layer to the custom transformer block
custom_transformer_block.attention.in_proj_weight = encoder_layer.self_attn.in_proj_weight
custom_transformer_block.attention.in_proj_bias = encoder_layer.self_attn.in_proj_bias
custom_transformer_block.attention.out_proj.weight = encoder_layer.self_attn.out_proj.weight
custom_transformer_block.attention.out_proj.bias = encoder_layer.self_attn.out_proj.bias

custom_transformer_block.feedforward[0].weight = encoder_layer.linear1.weight
custom_transformer_block.feedforward[0].bias = encoder_layer.linear1.bias
custom_transformer_block.feedforward[2].weight = encoder_layer.linear2.weight
custom_transformer_block.feedforward[2].bias = encoder_layer.linear2.bias

custom_transformer_block.norm1.weight = encoder_layer.norm1.weight
custom_transformer_block.norm1.bias = encoder_layer.norm1.bias
custom_transformer_block.norm2.weight = encoder_layer.norm2.weight
custom_transformer_block.norm2.bias = encoder_layer.norm2.bias

**Generating Random Input**

In [8]:
input_tensor = torch.randn(sequence_length, batch_size, d_model)

**Pass the input through both transformer blocks**

In [9]:
original_output = transformer_encoder(input_tensor)
custom_output = custom_transformer_block(input_tensor)

**Comparing**

In [10]:
print("Original Output shape:", original_output.shape)
print("Custom Output shape:", custom_output.shape)

# Check if the outputs are close
if torch.allclose(original_output, custom_output, atol=1e-6):
    print("The outputs are the same!")
else:
    print("The outputs are different.")

Original Output shape: torch.Size([10, 1, 64])
Custom Output shape: torch.Size([10, 1, 64])
The outputs are the same!


**Original Model Layer Outputs**

In [11]:
from transformers import GPTNeoForCausalLM, AutoTokenizer
import torch

# Subclass the original model
class GPTNeoForCausalLMWithPrint(GPTNeoForCausalLM):
    def forward(self, input_ids, **kwargs):
        # Extract the transformer and lm_head from the model
        transformer = self.transformer
        lm_head = self.lm_head

        # Print the input IDs
        print("Input IDs:", input_ids)

        # Pass input through embeddings
        hidden_states = transformer.wte(input_ids) + transformer.wpe(torch.arange(input_ids.shape[-1], device=input_ids.device))
        print("After Embeddings:", hidden_states.shape)

        # Dropout after embeddings
        hidden_states = transformer.drop(hidden_states)
        print("After Dropout:", hidden_states.shape)

        # Pass through each block (attention + MLP)
        for i, block in enumerate(transformer.h):
            # Ensure that only the hidden states are returned and passed forward
            hidden_states, *_ = block(hidden_states, **kwargs)
            print(f"After Block {i}:", hidden_states.shape)

        # Final layer norm
        hidden_states = transformer.ln_f(hidden_states)
        print("After Final LayerNorm:", hidden_states.shape)

        # Pass through language modeling head
        logits = lm_head(hidden_states)
        print("After LM Head:", logits.shape)

        return logits

# Load the pretrained model and tokenizer
model = GPTNeoForCausalLMWithPrint.from_pretrained('roneneldan/TinyStories-1M')
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")

# Define the prompt
prompt = "Once upon a time there was a"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate output
output = model(input_ids)

# Decode the output to text
output_text = tokenizer.decode(output.argmax(dim=-1)[0], skip_special_tokens=True)

# Print the generated text
print("Generated Text:", output_text)


Input IDs: tensor([[7454, 2402,  257,  640,  612,  373,  257]])
After Embeddings: torch.Size([1, 7, 64])
After Dropout: torch.Size([1, 7, 64])
After Block 0: torch.Size([1, 7, 64])
After Block 1: torch.Size([1, 7, 64])
After Block 2: torch.Size([1, 7, 64])
After Block 3: torch.Size([1, 7, 64])
After Block 4: torch.Size([1, 7, 64])
After Block 5: torch.Size([1, 7, 64])
After Block 6: torch.Size([1, 7, 64])
After Block 7: torch.Size([1, 7, 64])
After Final LayerNorm: torch.Size([1, 7, 64])
After LM Head: torch.Size([1, 7, 50257])
Generated Text:  upon a time, was a little


**Tiny Stories Github**

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken
import time


# hyperparameters
block_size = 64  # what is the maximum context length for predictions?
n_embd = 512
dropout = 0
n_head = 12
n_layer = 12
# -------------

torch.manual_seed(1337)


class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(
            torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B, T, C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x)  # (B,T,hs)
        # compute attention scores ("affinities")
        # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5
        wei = wei.masked_fill(
            self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,hs)
        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out


class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        out = self.net(x)
        return out


class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x


class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        # self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(
            torch.arange(T, device=device))  # (T,C)
        x = tok_emb + pos_emb  # (B,T,C)
        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


device = "cuda" if torch.cuda.is_available() else "cpu"

enc = tiktoken.get_encoding("gpt2")
vocab_size = enc.n_vocab

print("Loading the model")
# Load the trained GPT model
model = GPTLanguageModel()
m = model.to(device)

# Load the model with CPU mapping if necessary
m.load_state_dict(torch.load(r'./gpt_5_3500.pth', map_location=device))
m.eval()
print("model loaded...")

# Set the initial context for generation
text = "\n"  # "!"
encoded_text = enc.encode(text)
context = torch.tensor(encoded_text, dtype=torch.long,
                       device=device).unsqueeze(0)
# Generate new text
start_time = time.time()
generated_text = m.generate(context, max_new_tokens=50)[0]
end_time = time.time()
total_time = end_time - start_time
per_token_time = total_time / generated_text.size(0)
# Decode the generated text from integer indices to characters
decoded_text = enc.decode(generated_text.tolist())

# Print the generated text
print(decoded_text)
print(f"\nTotal generation time: {total_time:.4f} seconds")
print(f"\nPer-token generation: {per_token_time:.6f} seconds")
open(r'./generated.txt', 'w').write(decoded_text)

Loading the model
model loaded...

Tom and Mia think of a solution. They say, "Let's pretend we are wheat. And we can make soup thanked God for us and it bake. We can talk to God, moon."

They take their shovel and their spades

Total generation time: 3.9369 seconds

Per-token generation: 0.077193 seconds


193

**Giving Initial Tokens**

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken
import time

# [Your model and class definitions remain the same as above]

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initialize tokenizer and vocabulary size
enc = tiktoken.get_encoding("gpt2")
vocab_size = enc.n_vocab

# Load the trained GPT model
print("Loading the model")
model = GPTLanguageModel()
m = model.to(device)
m.load_state_dict(torch.load(r'./gpt_5_3500.pth', map_location=device))
m.eval()
print("model loaded...")

# Calculate and print the total number of parameters
total_params = sum(p.numel() for p in m.parameters())
print(f"Total number of parameters: {total_params:,}")

# Set the initial tokens (first two tokens in this case)
initial_text = "Once upon a time there was a thirsty black crow"  # Replace this with your desired initial text
print("Initial tokens given to the model:",initial_text)
encoded_initial_text = enc.encode(initial_text)

# Convert the encoded text to a tensor and expand dimensions
context = torch.tensor(encoded_initial_text, dtype=torch.long, device=device).unsqueeze(0)

# Generate new text starting from the initial context
start_time = time.time()
generated_text = m.generate(context, max_new_tokens=500)[0]
end_time = time.time()

# Calculate generation time
total_time = end_time - start_time
per_token_time = total_time / generated_text.size(0)

# Decode the generated text from integer indices to characters
decoded_text = enc.decode(generated_text.tolist())

# Print the generated text
print("\nGenerated text\n:")
print(decoded_text)
print(f"\nTotal generation time: {total_time:.4f} seconds")
print(f"\nPer-token generation: {per_token_time:.6f} seconds")
open(r'./generated.txt', 'w').write(decoded_text)


Loading the model
model loaded...
Total number of parameters: 89,160,785
Initial tokens given to the model: Once upon a time there was a thirsty black crow

Generated text
:
Once upon a time there was a thirsty black crow. Betty wanted to take a drink from the world but it was too deep. She looked around and saw a big green pond with lots of water. She wanted to find the water, but she had no water. 

Then she saw a little stream in the woods. It was sparkling blue. Betty walked along the stream until she came across the water. She felt so excited that she entered the pond. 

Betty took the water and they started to play. Amelia found herself swimming around the stream, swimming and laughing. She pretended to swim deep in the water. Then she even jumped under the stream. 

The pond was so peaceful and quiet. She laughed and laughed all day. 

Stirp came back whenever Amelia saw it, she was ready to rest. On the way home, Amelia waved goodbye and sang her happy songs. She was so happy t

2126

**Tiny Stories With each Layer Output**

In [23]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import tiktoken
import time
import math

# hyperparameters
block_size = 64  # what is the maximum context length for predictions?
n_embd = 512
dropout = 0
n_head = 12
n_layer = 12
# -------------

torch.manual_seed(1337)


class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(
            torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B, T, C = x.shape
        k = self.key(x)  # (B,T,hs)
        q = self.query(x)  # (B,T,hs)
        # compute attention scores ("affinities")
        # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5

        wei = wei.masked_fill(
            self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)


        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)

        # perform the weighted aggregation of the values
        v = self.value(x)  # (B,T,hs)
        out = wei @ v  # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        print(f"Head output shape: {out.shape}")
        # print(f"Key weight shape: {self.key.weight.shape}")
        # print(f"Query weight shape: {self.query.weight.shape}")
        # print(f"Value weight shape: {self.value.weight.shape}")

        return out


class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        print(f"MultiHeadAttention projected output shape: {out.shape}")
        return out


class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        out = self.net(x)
        print(f"FeedForward output shape: {out.shape}")
        return out


class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        print("block output:", x, x.shape)
        print(f"Block output shape: {x.shape}")
        return x


class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(
            *[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        # self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx)  # (B,T,C)
        pos_emb = self.position_embedding_table(
            torch.arange(T, device=device))  # (T,C)

        x = tok_emb + pos_emb  # (B,T,C)

        x = self.blocks(x)  # (B,T,C)
        x = self.ln_f(x)  # (B,T,C)
        logits = self.lm_head(x)  # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        print(f"Logits shape: {logits.shape}")
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]  # becomes (B, C)

            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)  # (B, C)
            print(probs)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            print(idx_next)
            # enc = tiktoken.get_encoding("gpt2")
            # print(enc.decode([idx_next.item()]))
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx


device = "cuda" if torch.cuda.is_available() else "cpu"

enc = tiktoken.get_encoding("gpt2")
vocab_size = enc.n_vocab

print("Loading the model")
# Load the trained GPT model
model = GPTLanguageModel()
m = model.to(device)


# Load the model with CPU mapping if necessary
m.load_state_dict(torch.load(r'./gpt_5_3500.pth', map_location=device))
m.eval()
print("model loaded...")


# Set the initial context for generation
text = "Once upon a time there"  # "!"
encoded_text = enc.encode(text)
context = torch.tensor(encoded_text, dtype=torch.long,
                       device=device).unsqueeze(0)

# Generate new text
start_time = time.time()
generated_text = m.generate(context, max_new_tokens=1)[0]
end_time = time.time()
total_time = end_time - start_time
per_token_time = total_time / generated_text.size(0)
# Decode the generated text from integer indices to characters
decoded_text = enc.decode(generated_text.tolist())

# Print the generated text
print(decoded_text)
print(f"\nTotal generation time: {total_time:.4f} seconds")
print(f"\nPer-token generation: {per_token_time:.6f} seconds")
open(r'./generated.txt', 'w').write(decoded_text)

Loading the model
model loaded...
Head output shape: torch.Size([1, 5, 42])
Head output shape: torch.Size([1, 5, 42])
Head output shape: torch.Size([1, 5, 42])
Head output shape: torch.Size([1, 5, 42])
Head output shape: torch.Size([1, 5, 42])
Head output shape: torch.Size([1, 5, 42])
Head output shape: torch.Size([1, 5, 42])
Head output shape: torch.Size([1, 5, 42])
Head output shape: torch.Size([1, 5, 42])
Head output shape: torch.Size([1, 5, 42])
Head output shape: torch.Size([1, 5, 42])
Head output shape: torch.Size([1, 5, 42])
MultiHeadAttention projected output shape: torch.Size([1, 5, 512])
FeedForward output shape: torch.Size([1, 5, 512])
block output: tensor([[[ 0.2531,  2.2977,  0.3734,  ..., -1.8480, -0.2665, -1.4731],
         [ 0.7287, -0.6131,  3.7439,  ..., -2.2130,  0.1882,  2.0021],
         [-1.1138, -1.2037,  2.3367,  ..., -3.9513,  0.0129,  1.4278],
         [ 0.5878, -1.1944,  1.2199,  ..., -1.0795, -1.4784, -1.8418],
         [-0.8935, -1.2039,  1.6918,  ...,  0.6

26

In [13]:
# Save the gamma_beta_dict to a .pth file
# torch.save(gamma_beta_dict, 'gamma_beta_values.pth')