In [1]:
from transformers import pipeline, set_seed, GPT2Tokenizer, GPT2LMHeadModel
from torch import tensor, numel
from bertviz import model_view

# Seed the random number generators once, up front, so the sampled
# generations further down can be reproduced from a fresh kernel.
SEED = 42
set_seed(SEED)

In [5]:
# Build a text-generation pipeline backed by the pretrained 'gpt2' checkpoint.
generator = pipeline('text-generation', model='gpt2')

# Request three continuations of the prompt, with max_length=30 passed to generation.
# NOTE(review): the saved output of this cell starts with 'show me some haskel code',
# i.e. it was produced with a different prompt -- re-run the cell to refresh it.
prompt = "Hello, I'm a language model and I"
generator(prompt, max_length=30, num_return_sequences=3)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'show me some haskel code - http://www.code.google.com/p/haskel/\n\nCoder has one of the'},
 {'generated_text': 'show me some haskel code. I wrote the file to look like:\n\nconst string = "Hiya, I\'m a programmer/Java'},
 {'generated_text': "show me some haskel code and how to install it I'll be able to play and post about as well.\n\nFor further details go through"}]

In [3]:
# Load the tokenizer that ships with the pretrained 'gpt2' checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Is the lowercase string 'sinan' present as a single entry in the vocabulary?
vocab = tokenizer.get_vocab()
'sinan' in vocab

False

In [4]:
# Encode the sentence to token ids, then map every id back to its token
# string to see how the tokenizer split the sentence.
token_ids = tokenizer.encode('Sinan loves a beautiful day')
tokenizer.convert_ids_to_tokens(token_ids)

['Sin', 'an', 'Ġloves', 'Ġa', 'Ġbeautiful', 'Ġday']

In [6]:
# Encode the same sentence again, this time asking for a PyTorch tensor
# (return_tensors='pt') so it can be fed straight into the model.
sentence = 'Sinan loves a beautiful day'
encoded = tokenizer.encode(sentence, return_tensors='pt')

print(encoded)

tensor([[46200,   272, 10408,   257,  4950,  1110]])


In [7]:
# Load the pretrained GPT-2 tokenizer and the GPT-2 model with its
# language-modelling head, both from the same checkpoint name.
checkpoint = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)

# Bare last expression: displays the module tree of the model.
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [8]:
# Token embeddings for "Sinan loves a beautiful day":
# 1 sequence in the batch, 6 tokens in that sequence, and one fixed-length
# 768-dimensional vector per token.
token_embeddings = model.transformer.wte(encoded)
token_embeddings.shape

torch.Size([1, 6, 768])

In [9]:
# Now look up the word POSITION embeddings: one vector per position 0..n-1.
# The position ids are derived from the length of `encoded` rather than being
# hard-coded to six tokens, so the cell keeps working if the sentence changes.
positions = tensor(list(range(encoded.shape[-1]))).reshape(1, -1)
model.transformer.wpe(positions).shape

torch.Size([1, 6, 768])

In [10]:
# The input to the first transformer block is the element-wise sum of the
# token embeddings and the position embeddings.
# Position ids are built from the length of `encoded` (not a hard-coded 0..5),
# so any sentence length is handled.
positions = tensor(list(range(encoded.shape[-1]))).reshape(1, -1)
initial_input = model.transformer.wte(encoded) + model.transformer.wpe(positions)
initial_input.shape

torch.Size([1, 6, 768])

In [11]:
# Feed the embeddings through every GPT2Block in model.transformer.h, in order.
# Each block's output is indexed with [0] to get the tensor handed to the next block.
# NOTE(review): this cell overwrites `initial_input`, so re-running it without first
# re-running the cell that builds the embeddings pushes the result through the stack again.
for block in model.transformer.h:
    initial_input = block(initial_input)[0]

# ln_f is the final LayerNorm (see the model printout), applied once after the last block.
initial_input = model.transformer.ln_f(initial_input)

In [12]:
# Sanity check: the manual pass above must reproduce, element for element,
# the last hidden state the model reports for the same input.
reference = model(encoded, output_hidden_states=True).hidden_states[-1]
(initial_input == reference).all()

tensor(True)

In [13]:
# Count every scalar weight: add up the element counts of all parameter
# tensors registered on the model.
total_params = sum(numel(weights) for weights in model.parameters())

print(f'total parameters: {total_params}')

total parameters: 124439808
