In [49]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
model = AutoModelForCausalLM.from_pretrained('roneneldan/TinyStories-1M')
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")
prompt = "Once upon a time there was"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

In [50]:
print(model)

GPTNeoForCausalLM(
  (transformer): GPTNeoModel(
    (wte): Embedding(50257, 64)
    (wpe): Embedding(2048, 64)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-7): 8 x GPTNeoBlock(
        (ln_1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (attn): GPTNeoAttention(
          (attention): GPTNeoSelfAttention(
            (attn_dropout): Dropout(p=0.0, inplace=False)
            (resid_dropout): Dropout(p=0.0, inplace=False)
            (k_proj): Linear(in_features=64, out_features=64, bias=False)
            (v_proj): Linear(in_features=64, out_features=64, bias=False)
            (q_proj): Linear(in_features=64, out_features=64, bias=False)
            (out_proj): Linear(in_features=64, out_features=64, bias=True)
          )
        )
        (ln_2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTNeoMLP(
          (c_fc): Linear(in_features=64, out_features=256, bias=True)
          (c_proj): Linear(in_features=256, out_

In [51]:
# Generate completion
output = model.generate(input_ids, max_length = 1000, num_beams=1)
print(output.shape)
# Decode the completion
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the generated text
print(output_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


torch.Size([1, 208])
Once upon a time there was a little girl named Lily. She loved to play outside in the sunshine. One day, she saw a big, shiny rock in the sky. She wanted to touch it, but it was too high.

Lily's mommy told her that it was important to be careful and not touch things. Lily didn't want to touch it, so she asked her mommy if she could touch it. Her mommy said yes and they went to the park.

When they got there, Lily saw a big, scary dog. The dog was barking and barked loudly. Lily was scared and didn't know what to do. She tried to run away, but she was too fast. She tried to run away, but it was too late. The dog was too fast and it was too fast.

Lily was sad and cried. She wished she had listened to her mommy. She wished she had listened to her mommy and never let her to get back.



**Model Parameters**

In [52]:
import torch
import torch.nn as nn

# Define model parameters
d_model = 64
nhead = 8
d_hid = 256
dropout = 0
nlayers = 1
sequence_length = 10
batch_size = 1

**Transformer Block Pytorch API**

In [53]:
# Original Transformer Encoder using PyTorch API
encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
transformer_encoder = nn.TransformerEncoder(encoder_layer, nlayers)
print(transformer_encoder)

TransformerEncoder(
  (layers): ModuleList(
    (0): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=True)
      )
      (linear1): Linear(in_features=64, out_features=256, bias=True)
      (dropout): Dropout(p=0, inplace=False)
      (linear2): Linear(in_features=256, out_features=64, bias=True)
      (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0, inplace=False)
      (dropout2): Dropout(p=0, inplace=False)
    )
  )
)




**Custom Transformer Block**

In [54]:
# Custom Transformer block implementation
class CustomTransformerBlock(nn.Module):
    def __init__(self, d_model, nhead, d_hid, dropout=0):
        super(CustomTransformerBlock, self).__init__()

        # Multi-head self-attention
        self.attention = nn.MultiheadAttention(embed_dim=d_model, num_heads=nhead, dropout=dropout)

        # Feedforward network
        self.feedforward = nn.Sequential(
            nn.Linear(d_model, d_hid),
            nn.ReLU(),
            nn.Linear(d_hid, d_model),
        )

        # Layer normalization
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Dropout layers
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x):
        # Self-attention with residual connection
        attn_output, _ = self.attention(x, x, x)
        x = x + self.dropout1(attn_output)  # Residual connection
        x = self.norm1(x)  # Layer normalization

        # Feedforward network with residual connection
        ff_output = self.feedforward(x)
        x = x + self.dropout2(ff_output)  # Residual connection
        x = self.norm2(x)  # Layer normalization

        return x

**Copying the weights**

In [55]:
# Instantiate the custom transformer block
custom_transformer_block = CustomTransformerBlock(d_model, nhead, d_hid, dropout)

# Copy weights from the original transformer layer to the custom transformer block
custom_transformer_block.attention.in_proj_weight = encoder_layer.self_attn.in_proj_weight
custom_transformer_block.attention.in_proj_bias = encoder_layer.self_attn.in_proj_bias
custom_transformer_block.attention.out_proj.weight = encoder_layer.self_attn.out_proj.weight
custom_transformer_block.attention.out_proj.bias = encoder_layer.self_attn.out_proj.bias

custom_transformer_block.feedforward[0].weight = encoder_layer.linear1.weight
custom_transformer_block.feedforward[0].bias = encoder_layer.linear1.bias
custom_transformer_block.feedforward[2].weight = encoder_layer.linear2.weight
custom_transformer_block.feedforward[2].bias = encoder_layer.linear2.bias

custom_transformer_block.norm1.weight = encoder_layer.norm1.weight
custom_transformer_block.norm1.bias = encoder_layer.norm1.bias
custom_transformer_block.norm2.weight = encoder_layer.norm2.weight
custom_transformer_block.norm2.bias = encoder_layer.norm2.bias

**Generating Random Input**

In [56]:
input_tensor = torch.randn(sequence_length, batch_size, d_model)

**Pass the input through both transformer blocks**

In [57]:
original_output = transformer_encoder(input_tensor)
custom_output = custom_transformer_block(input_tensor)

**Comparing**

In [58]:
print("Original Output shape:", original_output.shape)
print("Custom Output shape:", custom_output.shape)

# Check if the outputs are close
if torch.allclose(original_output, custom_output, atol=1e-6):
    print("The outputs are the same!")
else:
    print("The outputs are different.")

Original Output shape: torch.Size([10, 1, 64])
Custom Output shape: torch.Size([10, 1, 64])
The outputs are the same!


**Original Model Layer Outputs**

In [59]:
from transformers import GPTNeoForCausalLM, AutoTokenizer
import torch

# Subclass the original model
class GPTNeoForCausalLMWithPrint(GPTNeoForCausalLM):
    def forward(self, input_ids, **kwargs):
        # Extract the transformer and lm_head from the model
        transformer = self.transformer
        lm_head = self.lm_head

        # Print the input IDs
        print("Input IDs:", input_ids)

        # Pass input through embeddings
        hidden_states = transformer.wte(input_ids) + transformer.wpe(torch.arange(input_ids.shape[-1], device=input_ids.device))
        print("After Embeddings:", hidden_states.shape)

        # Dropout after embeddings
        hidden_states = transformer.drop(hidden_states)
        print("After Dropout:", hidden_states.shape)

        # Pass through each block (attention + MLP)
        for i, block in enumerate(transformer.h):
            # Ensure that only the hidden states are returned and passed forward
            hidden_states, *_ = block(hidden_states, **kwargs)
            print(f"After Block {i}:", hidden_states.shape)

        # Final layer norm
        hidden_states = transformer.ln_f(hidden_states)
        print("After Final LayerNorm:", hidden_states.shape)

        # Pass through language modeling head
        logits = lm_head(hidden_states)
        print("After LM Head:", logits.shape)

        return logits

# Load the pretrained model and tokenizer
model = GPTNeoForCausalLMWithPrint.from_pretrained('roneneldan/TinyStories-1M')
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-125M")

# Define the prompt
prompt = "Once upon a time there was a"
input_ids = tokenizer.encode(prompt, return_tensors="pt")

# Generate output
output = model(input_ids)

# Decode the output to text
output_text = tokenizer.decode(output.argmax(dim=-1)[0], skip_special_tokens=True)

# Print the generated text
print("Generated Text:", output_text)


Input IDs: tensor([[7454, 2402,  257,  640,  612,  373,  257]])
After Embeddings: torch.Size([1, 7, 64])
After Dropout: torch.Size([1, 7, 64])
After Block 0: torch.Size([1, 7, 64])
After Block 1: torch.Size([1, 7, 64])
After Block 2: torch.Size([1, 7, 64])
After Block 3: torch.Size([1, 7, 64])
After Block 4: torch.Size([1, 7, 64])
After Block 5: torch.Size([1, 7, 64])
After Block 6: torch.Size([1, 7, 64])
After Block 7: torch.Size([1, 7, 64])
After Final LayerNorm: torch.Size([1, 7, 64])
After LM Head: torch.Size([1, 7, 50257])
Generated Text:  upon a time, was a little
