In [55]:
from transformers import GPT2LMHeadModel

In [56]:
# Load the pretrained GPT-2 checkpoint (the 124M-parameter variant) and grab
# its state dict so the raw parameter tensors can be inspected by name.
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd_hf = model_hf.state_dict()

# List every parameter name together with its tensor shape.
for param_name, param in sd_hf.items():
    print(param_name, param.shape)

transformer.wte.weight torch.Size([50257, 768])
transformer.wpe.weight torch.Size([1024, 768])
transformer.h.0.ln_1.weight torch.Size([768])
transformer.h.0.ln_1.bias torch.Size([768])
transformer.h.0.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.0.attn.c_attn.bias torch.Size([2304])
transformer.h.0.attn.c_proj.weight torch.Size([768, 768])
transformer.h.0.attn.c_proj.bias torch.Size([768])
transformer.h.0.ln_2.weight torch.Size([768])
transformer.h.0.ln_2.bias torch.Size([768])
transformer.h.0.mlp.c_fc.weight torch.Size([768, 3072])
transformer.h.0.mlp.c_fc.bias torch.Size([3072])
transformer.h.0.mlp.c_proj.weight torch.Size([3072, 768])
transformer.h.0.mlp.c_proj.bias torch.Size([768])
transformer.h.1.ln_1.weight torch.Size([768])
transformer.h.1.ln_1.bias torch.Size([768])
transformer.h.1.attn.c_attn.weight torch.Size([768, 2304])
transformer.h.1.attn.c_attn.bias torch.Size([2304])
transformer.h.1.attn.c_proj.weight torch.Size([768, 768])
transformer.h.1.attn.c_proj.bias 

In [None]:
# Flatten the position-embedding weights and show the first 20 raw values.
wpe_flat = sd_hf["transformer.wpe.weight"].view(-1)
wpe_flat[:20]

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

plt.imshow(sd_hf["transformer.wpe.weight"], cmap="gray")

In [None]:
# Plot a few individual columns of the position-embedding matrix against the
# row index. Per the shape listing the matrix is 1024 x 768, so each curve has
# 1024 points (one per row / position).
wpe = sd_hf["transformer.wpe.weight"]
for channel in (150, 200, 250):
    # Label each curve so the three channels can be told apart in the figure.
    plt.plot(wpe[:, channel], label=f"channel {channel}")
plt.xlabel("position index")
plt.ylabel("embedding value")
plt.legend();

In [None]:
# Show the top-left 300x300 corner of block 1's attention c_attn weight matrix.
c_attn_weight = sd_hf["transformer.h.1.attn.c_attn.weight"]
plt.imshow(c_attn_weight[:300, :300], cmap="gray")

In [None]:
from transformers import pipeline, set_seed

# Reference generations from the Hugging Face text-generation pipeline.
# The seed is set after the pipeline is built and immediately before sampling.
generator = pipeline(task='text-generation', model='gpt2')
set_seed(42)
generator(
    "Hello, I'm a language model,",
    max_length=30,
    num_return_sequences=5,
)

In [None]:
import torch
import torch.nn.functional as F
from transformers import GPT2Tokenizer

from train_gpt2 import GPT

# Encode the prompt into a 1-D tensor of token ids.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
prompt = "Hello, I'm a language model,"
prompt_ids = tokenizer.encode(prompt)
tokens = torch.tensor(prompt_ids)

# Load the local GPT implementation with the pretrained "gpt2" weights,
# switch it to eval mode, and place it on the CPU.
model = GPT.from_pretrained("gpt2")
model.eval()
model.to("cpu")


In [None]:
# Manual top-k sampling from the local GPT model, mirroring the Hugging Face
# pipeline call (5 sequences, max_length=30).
num_return_sequences = 5
# Total sequence length (prompt + generated tokens). This matches the meaning
# of `max_length` in the Hugging Face generator, so the two cells are comparable.
max_length = 30
top_k = 100  # sample only among the k highest-scoring tokens at each step

with torch.no_grad():
    # One copy of the 1-D prompt per sequence -> (num_return_sequences, prompt_len).
    sample_tokens = tokens.repeat(num_return_sequences, 1)
    # Stop once the sequences reach max_length tokens in total, rather than
    # after appending max_length new tokens.
    while sample_tokens.size(1) < max_length:
        # assumes model(...) returns logits indexed as (batch, seq, vocab) — TODO confirm
        logits = model(sample_tokens)
        # Keep only the final position's scores, restricted to the top-k tokens.
        topk_logits, topk_indices = torch.topk(logits[:, -1, :], k=top_k, dim=-1)
        probs = F.softmax(topk_logits, dim=-1)
        # Sample a position inside the top-k list, then map it back to a vocab id.
        sampled_indices = torch.multinomial(probs, num_samples=1)
        next_token_indices = torch.gather(topk_indices, dim=1, index=sampled_indices)
        sample_tokens = torch.cat([sample_tokens, next_token_indices], dim=1)

# Decode and print each generated sequence, separated by a rule.
for sample_token in sample_tokens:
    print(tokenizer.decode(sample_token))
    print("-" * 40)


In [38]:
# tiny shakespeare dataset
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# Read with an explicit encoding so the result does not depend on the
# platform's locale default.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
data = text[:1000]  # first 1,000 characters
print(data[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [39]:
import tiktoken

# Tokenize the text sample with tiktoken's "gpt2" encoding, then report how
# many tokens it produced and show the first 24 ids.
enc = tiktoken.get_encoding("gpt2")
tokens = enc.encode(data)
print(f"length of tokens: {len(tokens)}")
print(tokens[0:24])



length of tokens: 285
[5962, 22307, 25, 198, 8421, 356, 5120, 597, 2252, 11, 3285, 502, 2740, 13, 198, 198, 3237, 25, 198, 5248, 461, 11, 2740, 13]


In [44]:
import torch

# Build a (B, T) batch of inputs and the matching next-token targets from
# the first B*T + 1 tokens: y is x shifted left by one position.
B, T = 4, 6
buf = torch.tensor(tokens[:B * T + 1])
x = buf[:-1].view(B, T)  # inputs
y = buf[1:].view(B, T)   # targets (each entry is the token following x's)
print(x)
print(y)

tensor([[ 5962, 22307,    25,   198,  8421,   356],
        [ 5120,   597,  2252,    11,  3285,   502],
        [ 2740,    13,   198,   198,  3237,    25],
        [  198,  5248,   461,    11,  2740,    13]])
tensor([[22307,    25,   198,  8421,   356,  5120],
        [  597,  2252,    11,  3285,   502,  2740],
        [   13,   198,   198,  3237,    25,   198],
        [ 5248,   461,    11,  2740,    13,   198]])


In [45]:
# Check whether this torch build exposes a `torch.backends.mps` attribute.
has_mps_attr = hasattr(torch.backends, "mps")
has_mps_attr

True

In [52]:
# Draw a single random integer from [0, 10) as a 1-element tensor.
random_idx = torch.randint(low=0, high=10, size=(1,))
random_idx

tensor([0])

In [57]:
# Compare the shapes of the output head and the token-embedding table.
for key in ("lm_head.weight", "transformer.wte.weight"):
    print(sd_hf[key].shape)

torch.Size([50257, 768])
torch.Size([50257, 768])


In [62]:
# Element-wise comparison of the two weight tensors, reduced to a single flag.
lm_head_weight = sd_hf["lm_head.weight"]
wte_weight = sd_hf["transformer.wte.weight"]
(lm_head_weight == wte_weight).all()

tensor(True)

In [59]:
# Print the underlying storage address of each tensor; identical addresses
# mean both names refer to the same memory rather than to equal copies.
for key in ("lm_head.weight", "transformer.wte.weight"):
    print(sd_hf[key].data_ptr())

140160138973184
140160138973184


In [67]:
# Residual-stream scaling demo: accumulate n independent random vectors into x.
# Each contribution is scaled by n**-0.5, so the standard deviation of the sum
# stays close to 1 instead of growing with the number of additions.
x = torch.zeros(768)
n = 1000  # number of residual additions (stand-in for the layer count)
scale = n ** -0.5
for layer in range(n):
    x += scale * torch.randn(768)

print(x.std())

tensor(1.0111)
