# Deep Dive Into GPT-2

In [23]:
import torch

from transformers import GPT2Tokenizer, GPT2Model

## Loading the Model

In [2]:
# Load the pretrained GPT-2 tokenizer and the base GPT2Model.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Switch to eval mode explicitly: every GPT2Block contains Dropout(p=0.1)
# layers, and the step-by-step replication below only matches the reference
# forward pass when dropout is disabled. `.eval()` returns the module itself.
model = GPT2Model.from_pretrained('gpt2').eval()

In [4]:
# Input sentence that is tokenized and fed through the model below.
text = "This is an example sentence"

In [7]:
# Tokenize the sentence into PyTorch tensors ('pt'); the result holds
# 'input_ids' and 'attention_mask', each of shape (1, 5) for this sentence.
encoded_input = tokenizer(text, return_tensors='pt')
encoded_input

{'input_ids': tensor([[1212,  318,  281, 1672, 6827]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}

In [9]:
# Reference forward pass through the full model. The manual, step-by-step
# computation below is compared against output.last_hidden_state at the end.
output = model(**encoded_input)

In [12]:
type(output)

transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions

In [15]:
output.last_hidden_state.shape

torch.Size([1, 5, 768])

In [180]:
output.last_hidden_state

tensor([[[ 0.0530, -0.0137, -0.2393,  ..., -0.1245, -0.1116,  0.0225],
         [ 0.2470,  0.2260,  0.0397,  ...,  0.2413,  0.4349,  0.1768],
         [ 0.7483, -0.4052, -0.9382,  ...,  0.3646, -0.0287,  0.3722],
         [ 0.1990, -0.3695, -1.8210,  ..., -0.1772,  0.0093,  0.1647],
         [ 0.0704, -0.0537, -2.5189,  ...,  0.0582, -0.1217, -0.3843]]],
       grad_fn=<ViewBackward0>)

## Embeddings

In [17]:
# Pull the two tensors out of the tokenizer output so they can be used
# individually in the manual forward pass.
input_ids, attention_mask = (
    encoded_input["input_ids"],
    encoded_input["attention_mask"],
)

In [19]:
input_ids

tensor([[1212,  318,  281, 1672, 6827]])

In [21]:
attention_mask

tensor([[1, 1, 1, 1, 1]])

Generate the position IDs:

In [24]:
# Positions 0..seq_len-1 with shape (1, seq_len), derived from input_ids
# rather than hard-coded so the cell keeps working when `text` changes.
# For the 5-token example this is tensor([[0, 1, 2, 3, 4]]).
seq_len = input_ids.size(-1)
position_ids = torch.arange(seq_len, dtype=torch.long, device=input_ids.device).unsqueeze(0)

Get the token embeddings:

In [25]:
model.wte

Embedding(50257, 768)

In [27]:
# Look up the token embedding (wte) for each input id: (1, 5) -> (1, 5, 768).
input_embeds = model.wte(input_ids)
input_embeds

tensor([[[ 0.0253, -0.0956,  0.0721,  ...,  0.0615, -0.0558,  0.1093],
         [-0.0097,  0.0101,  0.0556,  ...,  0.1145, -0.0380, -0.0254],
         [-0.0870, -0.0117,  0.0687,  ...,  0.0332,  0.0562, -0.0843],
         [ 0.0716, -0.1287,  0.0815,  ..., -0.3787, -0.0285, -0.1234],
         [-0.0964, -0.0075,  0.0396,  ..., -0.2623, -0.0469, -0.1976]]],
       grad_fn=<EmbeddingBackward0>)

In [31]:
input_embeds.shape

torch.Size([1, 5, 768])

Generate the position embeddings:

In [138]:
# Look up the position embedding (wpe) for each position id: (1, 5) -> (1, 5, 768).
position_embeds = model.wpe(position_ids)
position_embeds

tensor([[[-1.8821e-02, -1.9742e-01,  4.0267e-03,  ..., -4.3044e-02,
           2.8267e-02,  5.4490e-02],
         [ 2.3959e-02, -5.3792e-02, -9.4879e-02,  ...,  3.4170e-02,
           1.0172e-02, -1.5573e-04],
         [ 4.2161e-03, -8.4764e-02,  5.4515e-02,  ...,  1.9745e-02,
           1.9325e-02, -2.1424e-02],
         [-2.8337e-04, -7.3803e-02,  1.0553e-01,  ...,  1.0157e-02,
           1.7659e-02, -7.0854e-03],
         [ 7.6374e-03, -2.5090e-02,  1.2696e-01,  ...,  8.4643e-03,
           9.8542e-03, -7.0117e-03]]], grad_fn=<EmbeddingBackward0>)

In [139]:
position_embeds.shape

torch.Size([1, 5, 768])

In [140]:
# The input to the first block is the element-wise sum of the token and
# position embeddings.
hidden_states = torch.add(input_embeds, position_embeds)
hidden_states

tensor([[[ 0.0065, -0.2930,  0.0762,  ...,  0.0184, -0.0275,  0.1638],
         [ 0.0142, -0.0437, -0.0393,  ...,  0.1487, -0.0278, -0.0255],
         [-0.0828, -0.0964,  0.1232,  ...,  0.0530,  0.0755, -0.1057],
         [ 0.0714, -0.2025,  0.1870,  ..., -0.3685, -0.0108, -0.1304],
         [-0.0888, -0.0326,  0.1666,  ..., -0.2539, -0.0370, -0.2046]]],
       grad_fn=<AddBackward0>)

In [141]:
hidden_states.shape

torch.Size([1, 5, 768])

## The First GPT2 Block

In [142]:
# Keep the hidden state from before layer norm; it is added back to the
# attention output further down (first residual connection).
residual = hidden_states

In [143]:
# model.h is the ModuleList holding the stack of 12 GPT2Block modules.
layer_blocks = model.h
layer_blocks

ModuleList(
  (0-11): 12 x GPT2Block(
    (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (attn): GPT2Attention(
      (c_attn): Conv1D()
      (c_proj): Conv1D()
      (attn_dropout): Dropout(p=0.1, inplace=False)
      (resid_dropout): Dropout(p=0.1, inplace=False)
    )
    (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (mlp): GPT2MLP(
      (c_fc): Conv1D()
      (c_proj): Conv1D()
      (act): NewGELUActivation()
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)

In [144]:
# Take the first block; the cells below step through it by hand.
layer_block = layer_blocks[0]
layer_block

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D()
    (c_proj): Conv1D()
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D()
    (c_proj): Conv1D()
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [145]:
# First layer norm of the block (ln_1), applied before the attention sub-layer.
hidden_states = layer_block.ln_1(hidden_states)
hidden_states

tensor([[[ 0.0046, -0.1147, -0.0292,  ...,  0.0021, -0.0244,  0.0727],
         [ 0.0150, -0.0088, -0.0911,  ...,  0.1287, -0.0362, -0.0365],
         [-0.0984, -0.0632,  0.0386,  ...,  0.0452,  0.0617, -0.1194],
         [ 0.0778, -0.1550,  0.0805,  ..., -0.3588, -0.0225, -0.1376],
         [-0.1005, -0.0005,  0.0658,  ..., -0.2506, -0.0460, -0.2082]]],
       grad_fn=<NativeLayerNormBackward0>)

In [146]:
layer_block.attn.c_attn

Conv1D()

In [147]:
# A single projection (c_attn) produces query, key and value together:
# the last dimension is 2304 = 3 * 768.
query_key_value = layer_block.attn.c_attn(hidden_states)
query_key_value

tensor([[[-0.0294, -0.6518,  0.5181,  ...,  0.0770, -0.2688, -0.0985],
         [ 0.4271,  0.1531,  0.2167,  ...,  0.1749,  0.3729,  0.1266],
         [ 0.8352, -0.3577,  0.6788,  ...,  0.1482,  0.0685, -0.0751],
         [-0.1062,  0.6817,  0.4614,  ...,  0.0410,  0.1327, -0.1493],
         [ 1.2036,  0.5522, -0.8434,  ...,  0.0273, -0.2853, -0.1288]]],
       grad_fn=<ViewBackward0>)

In [148]:
query_key_value.shape

torch.Size([1, 5, 2304])

In [149]:
# The fused projection holds query, key and value side by side along the last
# dimension. Split it into three equal parts, deriving the width from the
# tensor itself (2304 // 3 == 768 for 'gpt2') instead of hard-coding it.
split_size = query_key_value.size(-1) // 3
query, key, value = query_key_value.split(split_size, dim=2)

In [150]:
query.shape, key.shape, value.shape

(torch.Size([1, 5, 768]), torch.Size([1, 5, 768]), torch.Size([1, 5, 768]))

In [151]:
query

tensor([[[-0.0294, -0.6518,  0.5181,  ..., -1.0025, -0.2499, -0.4812],
         [ 0.4271,  0.1531,  0.2167,  ..., -0.0248, -1.6700, -0.7407],
         [ 0.8352, -0.3577,  0.6788,  ..., -0.3742, -1.5068, -1.0755],
         [-0.1062,  0.6817,  0.4614,  ..., -0.6008, -1.8102, -1.0448],
         [ 1.2036,  0.5522, -0.8434,  ..., -0.4750, -1.2292, -0.1971]]],
       grad_fn=<SplitBackward0>)

In [152]:
key

tensor([[[-0.9706,  1.6740,  0.9128,  ..., -0.5452,  0.3212,  1.5770],
         [-1.8348,  2.4955,  1.7497,  ...,  0.7098,  0.5259,  1.7975],
         [-2.2065,  2.8389,  1.6342,  ..., -0.0561,  0.5616,  1.4394],
         [-2.0450,  1.9159,  2.1838,  ..., -0.0274,  0.2950,  0.9735],
         [-2.9504,  3.0648,  1.9827,  ..., -0.6849,  0.1057,  0.8111]]],
       grad_fn=<SplitBackward0>)

In [153]:
value

tensor([[[ 0.1759,  0.0996,  0.1026,  ...,  0.0770, -0.2688, -0.0985],
         [ 0.2320, -0.2386,  0.2946,  ...,  0.1749,  0.3729,  0.1266],
         [ 0.0950, -0.0382,  0.0530,  ...,  0.1482,  0.0685, -0.0751],
         [-0.0951, -0.0121, -0.2828,  ...,  0.0410,  0.1327, -0.1493],
         [-0.4495, -0.3067,  0.2846,  ...,  0.0273, -0.2853, -0.1288]]],
       grad_fn=<SplitBackward0>)

In [154]:
# Read the head layout from the model config instead of hard-coding it
# (12 heads of size 64 for 'gpt2'), so the cell also works for other sizes.
num_heads = model.config.n_head
head_dim = model.config.n_embd // num_heads
# Reshape query/key/value from (batch, seq, embed) to
# (batch, heads, seq, head_dim) so attention is computed per head.
query = layer_block.attn._split_heads(query, num_heads, head_dim)
key = layer_block.attn._split_heads(key, num_heads, head_dim)
value = layer_block.attn._split_heads(value, num_heads, head_dim)
query.shape, key.shape, value.shape

(torch.Size([1, 12, 5, 64]),
 torch.Size([1, 12, 5, 64]),
 torch.Size([1, 12, 5, 64]))

In [155]:
# `_attn` adds its mask argument to the raw attention scores before the
# softmax, so it needs an *additive* mask (0 for real tokens, a very large
# negative number for padding) that broadcasts over
# (batch, heads, query_len, key_len) -- not the tokenizer's 0/1 mask.
# Passing the raw all-ones mask only worked because a constant shift of the
# scores is cancelled by the softmax; with padded input it would be wrong.
additive_attention_mask = attention_mask[:, None, None, :].to(dtype=query.dtype)
additive_attention_mask = (1.0 - additive_attention_mask) * torch.finfo(query.dtype).min
attn_output, attn_weights = layer_block.attn._attn(query, key, value, additive_attention_mask)

In [156]:
attn_output.shape, attn_weights.shape

(torch.Size([1, 12, 5, 64]), torch.Size([1, 12, 5, 5]))

In [157]:
# Merge the per-head outputs back into one vector per token:
# (1, 12, 5, 64) -> (1, 5, 768).
attn_output = layer_block.attn._merge_heads(attn_output, num_heads, head_dim)
attn_output.shape

torch.Size([1, 5, 768])

In [158]:
attn_output

tensor([[[ 0.1759,  0.0996,  0.1026,  ...,  0.0770, -0.2688, -0.0985],
         [ 0.1782,  0.0855,  0.1106,  ...,  0.1054, -0.0826, -0.0332],
         [ 0.1779,  0.0423,  0.1259,  ...,  0.1113, -0.0683, -0.0458],
         [ 0.1479,  0.0313,  0.0835,  ...,  0.0966, -0.0892, -0.0701],
         [-0.0191, -0.0610,  0.1360,  ...,  0.0839, -0.0824, -0.0785]]],
       grad_fn=<ViewBackward0>)

In [159]:
# Output projection of the attention sub-layer; the shape stays (1, 5, 768).
attn_output = layer_block.attn.c_proj(attn_output)

In [160]:
attn_output.shape

torch.Size([1, 5, 768])

In [136]:
attn_output

tensor([[[ 0.6093, -0.2866,  0.2530,  ...,  0.0300, -0.0228,  0.0216],
         [-0.2909, -0.2758,  0.2719,  ...,  0.0023,  0.0487, -0.0052],
         [-0.3846,  0.0536,  0.2804,  ..., -0.0273,  0.0266, -0.0028],
         [-0.5528, -0.0046, -0.1549,  ..., -0.0027, -0.0336,  0.0284],
         [-1.3225,  0.7394, -0.4122,  ..., -0.0089, -0.0326, -0.0465]]],
       grad_fn=<ViewBackward0>)

In [161]:
# First residual connection: add the block input (saved before ln_1) back
# onto the attention output.
hidden_states = residual + attn_output

In [162]:
hidden_states.shape

torch.Size([1, 5, 768])

In [163]:
hidden_states

tensor([[[ 0.6158, -0.5796,  0.3292,  ...,  0.0484, -0.0504,  0.1853],
         [-0.2766, -0.3195,  0.2326,  ...,  0.1510,  0.0209, -0.0308],
         [-0.4673, -0.0429,  0.4036,  ...,  0.0257,  0.1020, -0.1085],
         [-0.4814, -0.2070,  0.0321,  ..., -0.3712, -0.0445, -0.1020],
         [-1.4112,  0.7068, -0.2456,  ..., -0.2627, -0.0697, -0.2511]]],
       grad_fn=<AddBackward0>)

In [164]:
# Save the post-attention hidden state; it is added back after the MLP
# (second residual connection).
residual = hidden_states

In [165]:
# Second layer norm of the block (ln_2), applied before the MLP.
hidden_states = layer_block.ln_2(hidden_states)

In [166]:
hidden_states.shape

torch.Size([1, 5, 768])

In [167]:
hidden_states

tensor([[[ 0.1216, -0.0899,  0.0704,  ...,  0.0334, -0.0691,  0.1874],
         [ 0.0099, -0.0268,  0.0387,  ...,  0.1106, -0.0021, -0.0539],
         [-0.0154,  0.0191,  0.0730,  ..., -0.0230,  0.0884, -0.1339],
         [-0.0199, -0.0133,  0.0053,  ..., -0.5003, -0.0804, -0.1320],
         [-0.1165,  0.1540, -0.0424,  ..., -0.3300, -0.0937, -0.2504]]],
       grad_fn=<NativeLayerNormBackward0>)

In [168]:
# Feed-forward (MLP) sub-layer of the block; output shape is (1, 5, 768).
feed_forward_hidden_states = layer_block.mlp(hidden_states)

In [169]:
feed_forward_hidden_states.shape

torch.Size([1, 5, 768])

In [170]:
feed_forward_hidden_states

tensor([[[ 0.4877,  0.3332, -0.0061,  ..., -1.2471, -0.6123,  1.7950],
         [-1.0319,  0.0381, -1.0763,  ...,  0.0518,  0.0801,  0.7346],
         [-0.5007, -0.7439, -0.2125,  ..., -0.5195, -0.1852, -1.0383],
         [-1.7097,  0.0317, -1.6022,  ..., -1.6524,  0.8576,  1.2426],
         [-1.8516,  2.2123, -0.2633,  ...,  0.2964,  0.0199, -0.4581]]],
       grad_fn=<ViewBackward0>)

In [171]:
# Second residual connection: add the post-attention state back onto the MLP
# output. This is the output of the first block.
hidden_states = feed_forward_hidden_states + residual

In [172]:
hidden_states.shape

torch.Size([1, 5, 768])

In [173]:
hidden_states

tensor([[[ 1.1035, -0.2464,  0.3231,  ..., -1.1987, -0.6627,  1.9803],
         [-1.3085, -0.2814, -0.8437,  ...,  0.2028,  0.1010,  0.7039],
         [-0.9680, -0.7868,  0.1910,  ..., -0.4938, -0.0832, -1.1468],
         [-2.1911, -0.1754, -1.5701,  ..., -2.0236,  0.8131,  1.1406],
         [-3.2628,  2.9190, -0.5089,  ...,  0.0336, -0.0497, -0.7092]]],
       grad_fn=<AddBackward0>)

In [181]:
# NOTE(review): leftover debugging alias -- `thingy` is not used by any other
# cell visible in this notebook. The recorded output below is identical to
# the hidden state shown *after* the loop over the remaining blocks (this cell
# has execution count 181, the loop 178), so it was run out of order; on a
# fresh top-to-bottom run it would show the first block's output instead.
thingy = hidden_states
thingy

tensor([[[  1.0126,   0.0504,  -0.6634,  ...,  -0.8270,  -0.4964,   0.3985],
         [  2.8837,   2.2619,   0.9429,  ...,   3.7265,   6.8395,   1.9387],
         [  8.6963,  -5.3443,  -7.6458,  ...,   5.4948,  -0.1229,   4.5138],
         [  2.1593,  -4.8067, -14.9140,  ...,  -2.5139,   0.4066,   1.6212],
         [  0.7236,  -1.0082, -19.5502,  ...,   0.9085,  -1.3381,  -5.2373]]],
       grad_fn=<AddBackward0>)

## The Other Blocks

In [178]:
# Feed the hidden state through every block after the first one (which was
# unrolled by hand above). A block returns a tuple; element 0 is the new
# hidden state.
for block in model.h[1:]:
    block_outputs = block(hidden_states)
    hidden_states = block_outputs[0]

In [179]:
hidden_states

tensor([[[  1.0126,   0.0504,  -0.6634,  ...,  -0.8270,  -0.4964,   0.3985],
         [  2.8837,   2.2619,   0.9429,  ...,   3.7265,   6.8395,   1.9387],
         [  8.6963,  -5.3443,  -7.6458,  ...,   5.4948,  -0.1229,   4.5138],
         [  2.1593,  -4.8067, -14.9140,  ...,  -2.5139,   0.4066,   1.6212],
         [  0.7236,  -1.0082, -19.5502,  ...,   0.9085,  -1.3381,  -5.2373]]],
       grad_fn=<AddBackward0>)

In [182]:
# Final layer norm (ln_f), applied after the last block. The result is what
# gets compared with output.last_hidden_state in the next cell.
hidden_states = model.ln_f(hidden_states)
hidden_states

tensor([[[ 0.0530, -0.0137, -0.2393,  ..., -0.1245, -0.1116,  0.0225],
         [ 0.2470,  0.2260,  0.0397,  ...,  0.2413,  0.4349,  0.1768],
         [ 0.7483, -0.4052, -0.9382,  ...,  0.3646, -0.0287,  0.3722],
         [ 0.1990, -0.3695, -1.8210,  ..., -0.1772,  0.0093,  0.1647],
         [ 0.0704, -0.0537, -2.5189,  ...,  0.0582, -0.1217, -0.3843]]],
       grad_fn=<NativeLayerNormBackward0>)

In [197]:
# Check that the manual forward pass reproduces the model's own output
# within the given relative/absolute tolerances.
torch.allclose(hidden_states, output.last_hidden_state, rtol=1e-05, atol=1e-06)

True