In [164]:
# we are building a character level decoder-only language model 
# original paper Transformer is a encoder-decoder language model for machine translation. 
# best explanation of attention in tranformer: https://jalammar.github.io/illustrated-transformer/
# encoder-decoder: unmasked attention to both past and future inside encoder; decoder takes K, V from the final output of encoder
# v.
# decoder-only: attention only to the past 

In [165]:
import torch
import torch.nn.functional as F
import torch.nn as nn

In [166]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [167]:
# vocab = set(list(text))
vocab = sorted(list(set(text)))
vocab_size = len(vocab)
print(vocab)
print(vocab_size)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


In [168]:
# create mapping from char to integers
stoi = {s:i for i, s in enumerate(vocab)}
itos = {i:s for s, i, in stoi.items()}
print(stoi)
print(itos)

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40: 'b', 41: 'c', 42: 'd', 43: 'e', 44: 'f', 45: 'g', 46: 'h', 47: 'i',

In [169]:
# define 2 lambda functions
# encode: turn a string to a list of integers
encode = lambda s: [stoi[c] for c in s]
# decode: turn a list of integer to string
decode = lambda ixs: "".join([itos[ix] for ix in ixs])
print(encode("yuxiao"))
print(decode(encode("yuxiao")))

# commonly used word level tokenizer that break word into sub-words are SentencePiece, tiktoken, both used BPE ("byte-pair-encoding")

[63, 59, 62, 47, 39, 53]
yuxiao


In [170]:
import tiktoken
gpt2_encoder = tiktoken.get_encoding("gpt2")
print(gpt2_encoder.n_vocab)
print(gpt2_encoder.encode("yuxiao"))

50257
[88, 2821, 13481]


In [171]:
# let's encode the whole document into list, then to torch tensors
print(encode(text)[:100])
# data = torch.tensor(encode(text))  #torch.int64
data = torch.tensor(encode(text), dtype=torch.long)  
print(data.shape, data.dtype)

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59]
torch.Size([1115394]) torch.int64


In [172]:
# split data into train and validation
n = int(len(data) * 0.9)
print(n)
train_data = data[:n]
val_data = data[n:]
print("train data: ", len(train_data))
print("val data:v ", len(val_data))

1003854
train data:  1003854
val data:v  111540


In [173]:
# let's consider the time dimension to build x and y

# we will build x from len of 1 all the way to len of 7, various length help us do better ar reference, when given input as short as 1 char, 
# model knows how to predict next because it's seen it in the training data

block_size = 8   # the longest context window  # for every block_size in the data, we'll build 7 examples

block_x = data[:block_size]  # the first 7 chars
block_y = data[1:block_size+1]  # shifted back by 1
print("block_x: ", block_x)
print("block_y: ", block_y)
for i in range(len(block_x)):
    input = block_x[:i+1]
    target = block_y[i]
    print(f"input: {input}, target = {target}")


block_x:  tensor([18, 47, 56, 57, 58,  1, 15, 47])
block_y:  tensor([47, 56, 57, 58,  1, 15, 47, 58])
input: tensor([18]), target = 47
input: tensor([18, 47]), target = 56
input: tensor([18, 47, 56]), target = 57
input: tensor([18, 47, 56, 57]), target = 58
input: tensor([18, 47, 56, 57, 58]), target = 1
input: tensor([18, 47, 56, 57, 58,  1]), target = 15
input: tensor([18, 47, 56, 57, 58,  1, 15]), target = 47
input: tensor([18, 47, 56, 57, 58,  1, 15, 47]), target = 58


In [174]:
batch_size = 4
block_size = 8
# randomly select 4 independent data examples of sequence of 8, may start from anywhere
# if data len of 10,  block_size = 8, sampel from position [0,1,2]
data = torch.tensor([9,8,7,1,2,3,4,5,6,0])
for _ in range(3):  # it takes 2.5 iterations of getting batch to cover the whole data
    ix = torch.randint(low=0, high = len(data) - block_size + 1, size=(batch_size,))  # high is exclusive, size=tuple define output shape
    print(ix)

tensor([1, 1, 2, 1])
tensor([2, 1, 1, 1])
tensor([2, 0, 0, 2])


In [176]:
# data loader: get batch of chunks ofdata 

torch.manual_seed(1337)
# same get batch for both train data and val data, return one batch of x and y
def get_batch(split):
    data = train_data[:20] if split == "train" else val_data
    ix = torch.randint(low=0, high = len(data) - block_size + 1, size=(batch_size,))
    xb = torch.stack([data[startpos: startpos + block_size] for startpos in ix]) 
    # stack takes list of tensors and stack them veritically, along dim=0
    yb = torch.stack([data[startpos+1: startpos + block_size+1] for startpos in ix])

    return xb, yb
    
xb, yb = get_batch("train")
print(xb)
print(yb)

for b in range(batch_size):
    for t in range(block_size):
        input = xb[b, :t+1]
        target = yb[b, t]
        print(f"input = {input}, target = {target}")

tensor([[47, 64, 43, 52, 10,  0, 14, 43],
        [47, 64, 43, 52, 10,  0, 14, 43],
        [47, 58, 47, 64, 43, 52, 10,  0],
        [43, 52, 10,  0, 14, 43, 44, 53]])
tensor([[64, 43, 52, 10,  0, 14, 43, 44],
        [64, 43, 52, 10,  0, 14, 43, 44],
        [58, 47, 64, 43, 52, 10,  0, 14],
        [52, 10,  0, 14, 43, 44, 53, 56]])
input = tensor([47]), target = 64
input = tensor([47, 64]), target = 43
input = tensor([47, 64, 43]), target = 52
input = tensor([47, 64, 43, 52]), target = 10
input = tensor([47, 64, 43, 52, 10]), target = 0
input = tensor([47, 64, 43, 52, 10,  0]), target = 14
input = tensor([47, 64, 43, 52, 10,  0, 14]), target = 43
input = tensor([47, 64, 43, 52, 10,  0, 14, 43]), target = 44
input = tensor([47]), target = 64
input = tensor([47, 64]), target = 43
input = tensor([47, 64, 43]), target = 52
input = tensor([47, 64, 43, 52]), target = 10
input = tensor([47, 64, 43, 52, 10]), target = 0
input = tensor([47, 64, 43, 52, 10,  0]), target = 14
input = tensor([

In [177]:
# first build the simplest NN language model
# input will be just one char
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embed_dim = embed_dim
        # self.embeddings = torch.randn(vocab_size, vocab_size) # serve as the lookup table, turn a char into vector of embeds
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)  # Embedding: variables from ~N(0,1)
                                      
    def forward(self, input, target):   # idx: this is a bigram setting, so input will be only one char, here pass its integer
        # input and target are shape of (B (batch_size) , T (block_size) )
        
        logits = self.embedding_table(input)
        print("logits shape: ", logits.shape)
        
        # wrong: loss = F.cross_entropy(logits, target)  
        # cross_entropy expect input has to be a Tensor of size (C) for unbatched input, (minibatch,C) or (minibatch, C, d1, ..dk) with K>=1
        # for the K-dimensional case. The last being useful for higher dimension inputs, such as computing cross entropy loss per-pixel for 2D images 
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        target = target.view(B*T)

        loss = F.cross_entropy(logits, target)  # comply with (minibatch,C)
        return logits, loss
 

In [93]:
bigram_model = BigramLanguageModel(vocab_size)
logits, loss = bigram_model(xb, yb)
print(logits.shape)
print(loss)
# expected loss
# prob = 1/65
# - ln(1/65) = 4.17

logits shape:  torch.Size([4, 8, 65])
torch.Size([32, 65])
tensor(4.3751, grad_fn=<NllLossBackward0>)


In [178]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embed_dim = embed_dim
        # self.embeddings = torch.randn(vocab_size, vocab_size) # serve as the lookup table, turn a char into vector of embeds
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)  # Embedding: variables from ~N(0,1)
                                      
    def forward(self, input, target=None):  # make target optional
        
        logits = self.embedding_table(input) 
        B, T, C = logits.shape
        
        if target is None:  
            loss = None    # logits returned will be shape (B, T, C)
        else:
            logits = logits.view(B*T, C)
            target = target.view(B*T)
            loss = F.cross_entropy(logits, target)  # comply with (minibatch,C)
        return logits, loss

    # def generate(self, input, max_new_tokens):  # work for bigram
    #     out = []
    #     for i in range(max_new_tokens):
    #         # nn.Module  When you create an instance of this module and call it with input data, 
    #         # PyTorch internally calls the forward method, passing the input data to it.
    #         logits, _ = self(input)
    #         # print("logits shape: ", logits.shape)
    #         probs = F.softmax(logits, dim=-1)
    #         idx_next = torch.multinomial(probs, num_samples=1)
    #         # print("idx_next shape: ", idx_next.shape)
    #         out.append(idx_next.item())
    #         input = idx_next
    #     return out
    
    def generate(self, input, max_new_tokens):  # more general, allow to condition on more previous chars
    
        for i in range(max_new_tokens):
            # nn.Module  When you create an instance of this module and call it with input data, 
            # PyTorch internally calls the forward method, passing the input data to it.
            logits, _ = self(input)
            # print("logits shape: ", logits.shape)  # (B,T,C)
            logits = logits[:, -1, :]   # becomes (B, C)
            probs = F.softmax(logits, dim=-1)    # (B, C)
            # print("probs shape: ", probs.shape)
            idx_next = torch.multinomial(probs, num_samples=1)
            # print("idx_next shape: ", idx_next.shape)
            input = torch.cat((input, idx_next), dim=1)

        return input


In [179]:
bigram_model = BigramLanguageModel(vocab_size)
logits, loss = bigram_model(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 65])
tensor(4.3875, grad_fn=<NllLossBackward0>)


In [180]:
input = torch.tensor([0]).view(1, 1)  # (B, T)
print(input)
print(input.shape)
out = bigram_model.generate(input, 100)
print("out shape: ", out.shape)  # [1, 101])
print(decode(out[0].tolist()))   # this is random model (not trained)

tensor([[0]])
torch.Size([1, 1])
out shape:  torch.Size([1, 101])

lfJeukRuaRJKXAYtXzfJ:HEPiu--sDioi;ILCo3pHNTmDwJsfheKRxZCFs
lZJ XQc?:s:HEzEnXalEPklcPU cL'DpdLCafBheH


In [181]:
# now let's train the model

# device = 'cuda' if torch.cuda.is_available() else 'cpu'  # mac uses AMD gpu not cuda, 
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
# Starting with PyTorch 1.12, support for Apple's Metal Performance Shaders (MPS) backend has been introduced, which allows you to utilize the GPU on newer Macs with Apple Silicon (M1, M2, etc.)
print(device)

block_size = 8
batch_size = 32

def get_batch(split):
    data = train_data if split == "train" else val_data
    ix = torch.randint(low=0, high = len(data) - block_size + 1, size=(batch_size,))
    xb = torch.stack([data[startpos: startpos + block_size] for startpos in ix]) 
    # stack takes list of tensors and stack them veritically, along dim=0
    yb = torch.stack([data[startpos+1: startpos + block_size+1] for startpos in ix])
    xb, yb = xb.to(device), yb.to(device)
    return xb, yb

mps


In [182]:
bigram_model = BigramLanguageModel(vocab_size)
model = bigram_model.to(device)

learning_rate = 1e-2   # default 0.001, for our simple model, can be bigger
optimizer = torch.optim.AdamW(model.parameters(), lr = learning_rate)

for step in range(10000):
    xb, yb = get_batch("train")
    # print(xb.shape)
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())   # start from 4.+

2.5216684341430664


In [154]:
# try generate again
out = model.generate(input=torch.tensor([0]).view(1,1), max_new_tokens=300)
print(decode(out[0].tolist()))


BE:

Hortrny antres r t'diardesit t cr acheed ingeakint shemandain, y,
SALAd tit ps y, ld ost, s, INGRY achit d n so's.
HABELO, hin twincklobeanst.
Graghintharofourthisckererorren'hethowimeve d


Pr aven thenin,
HEORord f ucu ord?
Ge t, cce's thik sthimig ht pout thund anoforkse s
Thit tsine
Houthea


In [183]:
# add evaluation part
eval_interval = 300
eval_iters = 200

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()   # switch model mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):  # pick 200 batches inside train, pick 200 batches inside val
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()  # store 200 losses
        out[split] = losses.mean()
    model.train()   # switch model mode
    return out


In [186]:
bigram_model = BigramLanguageModel(vocab_size)
model = bigram_model.to(device)

# create a PyTorch optimizer
learning_rate = 1e-2 
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

max_iters = 3000

for iter in range(max_iters):
    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")


step 0: train loss 4.6412, val loss 4.6621
step 300: train loss 2.7940, val loss 2.8243
step 600: train loss 2.5473, val loss 2.5809
step 900: train loss 2.4977, val loss 2.5270
step 1200: train loss 2.4846, val loss 2.5036
step 1500: train loss 2.4711, val loss 2.5004
step 1800: train loss 2.4677, val loss 2.4851
step 2100: train loss 2.4641, val loss 2.4986
step 2400: train loss 2.4645, val loss 2.4877
step 2700: train loss 2.4629, val loss 2.4895


In [190]:
# try generate again
out = model.generate(input=torch.tensor([0]).view(1,1).to(device), max_new_tokens=300)  # torch.tensor is built-in, not a Tensor
# out = model.generate(input=torch.LongTensor([0]).view(1,1).to(device), max_new_tokens=300)
print(decode(out[0].tolist()))


I nt rray ngestyockind m murs, in mamybalorenyongmyooe, d Vofetthindy st
Hefil brveseay alsteanerm to, oupomp rede d pre h, gavitfithrer'GENUpsts lathindKIO:
Berouerse IOLUED d nghathicerire.
II IS:
Yok, pequt f keithunghaned t
The orerrofe fisck.
MUCI t wovyonon-hu he nd yot wilercet icis ig y onee


In [None]:
# ===================================================================================================================