In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class GPT2LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-5):
        super(GPT2LayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.weight * (x - mean) / (std + self.eps) + self.bias

class MultiHeadSelfAttention(nn.Module):
    def __init__(self, embed_size, num_heads):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_size = embed_size
        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads

        assert self.head_dim * num_heads == embed_size, "Embedding size must be divisible by num_heads"

        self.query = nn.Linear(embed_size, embed_size)
        self.key = nn.Linear(embed_size, embed_size)
        self.value = nn.Linear(embed_size, embed_size)

        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        Q = self.query(query).view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        K = self.key(key).view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
        V = self.value(value).view(batch_size, -1, self.num_heads, self.head_dim).permute(0, 2, 1, 3)

        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / (self.head_dim ** 0.5)

        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        attention = torch.softmax(energy, dim=-1)
        x = torch.matmul(attention, V)
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.embed_size)

        return self.fc_out(x)

class PositionWiseFeedForward(nn.Module):
    def __init__(self, embed_size, hidden_size):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(embed_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, embed_size)

    def forward(self, x):
        return self.fc2(torch.relu(self.fc1(x)))

class GPT2Block(nn.Module):
    def __init__(self, embed_size, num_heads, hidden_size):
        super(GPT2Block, self).__init__()
        self.attention = MultiHeadSelfAttention(embed_size, num_heads)
        self.norm1 = GPT2LayerNorm(embed_size)
        self.feed_forward = PositionWiseFeedForward(embed_size, hidden_size)
        self.norm2 = GPT2LayerNorm(embed_size)

    def forward(self, x, mask=None):
        attention_output = self.attention(x, x, x, mask)
        attention_output = self.norm1(x + attention_output)
        feed_forward_output = self.feed_forward(attention_output)
        return self.norm2(attention_output + feed_forward_output)

class GPT2OutputLayer(nn.Module):
    def __init__(self, embed_size, vocab_size):
        super(GPT2OutputLayer, self).__init__()
        self.dense = nn.Linear(embed_size, vocab_size)

    def forward(self, x):
        return self.dense(x)

class GPT2(nn.Module):
    def __init__(self, vocab_size, embed_size=768, num_heads=12, hidden_size=3072, num_layers=12):
        super(GPT2, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.num_heads = num_heads
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.token_embeddings = nn.Embedding(vocab_size, embed_size)
        self.positional_embeddings = nn.Embedding(512, embed_size)
        self.layers = nn.ModuleList([GPT2Block(embed_size, num_heads, hidden_size) for _ in range(num_layers)])
        self.output_layer = GPT2OutputLayer(embed_size, vocab_size)

    def forward(self, input_ids):
        mask = self._generate_attention_mask(input_ids)
        token_embeds = self.token_embeddings(input_ids)
        positions = torch.arange(0, input_ids.size(1)).unsqueeze(0).to(input_ids.device)
        position_embeds = self.positional_embeddings(positions)
        x = token_embeds + position_embeds

        for layer in self.layers:
            x = layer(x, mask)

        output = self.output_layer(x)
        return output

    def _generate_attention_mask(self, input_ids):
        attention_mask = (input_ids != 0).unsqueeze(1).unsqueeze(2)
        return attention_mask.float()
    def generate(self, input_ids, max_length=50, num_return_sequences=1, temperature=0.7):
        # Start sequence generation for each return sequence
        generated_sequences = []
        for _ in range(num_return_sequences):
            current_sequence = input_ids.clone()
            for _ in range(max_length):
                # Pass the input through the model
                logits = self.forward(current_sequence)
                next_token_logits = logits[:, -1, :] / temperature

                # Sample from the distribution or take the argmax
                next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)

                # Append the new token to the current sequence
                current_sequence = torch.cat([current_sequence, next_token], dim=-1)
            # Append the generated sequence to the list
            generated_sequences.append(current_sequence)
        return generated_sequences

# Hyperparameters
vocab_size = 50000  # Replace with actual vocabulary size
embed_size = 768
num_heads = 12
hidden_size = 3072
num_layers = 12

# Create GPT-2 model instance
gpt2_model = GPT2(vocab_size, embed_size, num_heads, hidden_size, num_layers)

# Sample usage
input_ids = torch.tensor([[2, 15, 22, 45, 3, 0]])
output = gpt2_model(input_ids)
print(output.shape)  # Adjust based on actual output shape


torch.Size([1, 6, 50000])


In [10]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# our custom GPT-2 model
custom_gpt2_model = GPT2(vocab_size, embed_size, num_heads, hidden_size, num_layers)

# Load the original GPT-2 model and tokenizer
original_model_name = 'gpt2-medium'
original_gpt2_model = GPT2LMHeadModel.from_pretrained(original_model_name)
original_tokenizer = GPT2Tokenizer.from_pretrained(original_model_name)

# Sample text prompts for generation
text_prompts = [
    "Once upon a time, there was a",
    "The world is full of",
    "In a galaxy far, far away,",
    # Add more varied prompts for testing
]

# Generate text sequences and compare for each prompt
for prompt in text_prompts:
    # Generate sequences from your custom GPT-2 model
    input_ids_custom = original_tokenizer.encode(prompt, return_tensors='pt')
    custom_output = custom_gpt2_model.generate(input_ids_custom, max_length=50, num_return_sequences=1, temperature=0.7)


# Convert tensor to list of integers
    generated_ids_custom = custom_output[0][0].tolist()
    generated_text_custom = original_tokenizer.decode(generated_ids_custom, skip_special_tokens=True)


    # Generate sequences from the original GPT-2 model
    input_ids_original = original_tokenizer.encode(prompt, return_tensors='pt')
    original_output = original_gpt2_model.generate(input_ids_original, max_length=50, num_return_sequences=1, temperature=0.7)
    generated_text_original = original_tokenizer.decode(original_output[0], skip_special_tokens=True)

    # Compare generated sequences
    print("Prompt:", prompt)
    print("Custom GPT-2 Output:", generated_text_custom)
    print("Original GPT-2 Output:", generated_text_original)
    print("----------------------------------------")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: Once upon a time, there was a
Custom GPT-2 Output: Once upon a time, there was aulf Grande economic crackedeland disorderlyRaidmax archived survives crablihood soaring lawyers deposits cone kilometresamousQuick visibility McGee Jagu conviction Shot market DVDseconomic contracted bay Ev derailed experimentation Payton Globalosher unfoldssoon suspendedabled Assistance imaginative centralitschShockBrave Exposure residency807 neoc Err
Original GPT-2 Output: Once upon a time, there was a man who lived in a village called Krakow. He was a very good man, and he was very kind to his children. One day, he was walking along the road, and he saw a woman
----------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: The world is full of
Custom GPT-2 Output: The world is full of deserved screamed spousesaramended lands Carey depot handset Jihad Premprises DoomfequickShipAvailableuitCCC Into includingework KT ConquestNoticeThingsoffic sensoryonse Magn previous investscryLinux guitar calmingkokAndrew Scan prosecutions Racer meantangler M Marino Francois hopelessDeanpieces par WiFi transparent
Original GPT-2 Output: The world is full of people who are not happy with the way things are going. They are not happy with the way things are going. They are not happy with the way things are going. They are not happy with the way things are going.
----------------------------------------


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: In a galaxy far, far away,
Custom GPT-2 Output: In a galaxy far, far away, accessoryZ Franch enters Economy embod Nou Trance RhodeinentAnderson Toxic Missileendon disliked pav 287 maneuver Affect IRS Malik885 groups Liber occasUTE probesmediCommercial volcanic linen squared intakes authenticity Decl stabilization torment Attempt occult caricature DAR flyer meterzieTa HK ATM pretended Dep Athena
Original GPT-2 Output: In a galaxy far, far away, the galaxy's most powerful star is a red giant.

The star is a red giant, which is a star that is twice the mass of the sun.

The star is about 1,000
----------------------------------------
