In [1]:
### ------Data Loader------ ###
import torch
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset): 
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids  = []

        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
            

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, 
                          shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(dataset, batch_size=batch_size, 
                            shuffle=shuffle, drop_last=drop_last, 
                            num_workers=num_workers)
    return dataloader

In [3]:
import torch.nn as nn
class MultiHeadAttention(nn.Module):     
    def __init__(self, d_in, d_out,                  
                 context_length, dropout, num_heads, qkv_bias=False):         
        super().__init__()         
        assert (d_out % num_heads == 0), "d_out must be divisible by num_heads"
        
        self.d_out = d_out       
        self.num_heads = num_heads 
        self.head_dim = d_out // num_heads         
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)         
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)         
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)         
        self.out_proj = nn.Linear(d_out, d_out)         
        self.dropout = nn.Dropout(dropout)        
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1)         
        )

    def forward(self, x): 
        b, num_tokens, d_in = x.shape
        
        # (batch, num_token, d_out) 
        keys = self.W_key(x)
        queries = self.W_query(x)
        values = self.W_value(x)

        # (batch, num_token, num_heads, head_dim)
        # d_out = num_heads * head_dim
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim) 
        values = values.view(b, num_tokens, self.num_heads, self.head_dim) 
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim) 

        # (b, num_heads, num_token, head_dim)
        keys = keys.transpose(1, 2) 
        queries = queries.transpose(1, 2) 
        values = values.transpose(1, 2)

        # (b, num_heads, num_token, head_dim) @ (b, num_heads, head_dim, num_token) -> (b, num_heads, num_token, num_token)
        attn_scores = queries @ keys.transpose(2, 3)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(mask_bool, -torch.inf) 
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1) 
        attn_weights = self.dropout(attn_weights)

        # (b, num_token, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # (b, num_tokens, n_heads, head_dim)
        return context_vec

In [4]:
import torch
import torch.nn as nn

class LayerNorm(nn.Module):     
    def __init__(self, emb_dim):         
        super().__init__()         
        self.eps = 1e-5         
        self.scale = nn.Parameter(torch.ones(emb_dim))         
        self.shift = nn.Parameter(torch.zeros(emb_dim))
        
    def forward(self, x):         
        mean = x.mean(dim=-1, keepdim=True)         
        var = x.var(dim=-1, keepdim=True, unbiased=False)         
        norm_x = (x - mean) / torch.sqrt(var + self.eps)         
        return self.scale * norm_x + self.shift 

class GELU(nn.Module):     
    def __init__(self):         
        super().__init__()
        
    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * (x + 0.044715 * torch.pow(x, 3))
        ))

class FeedForward(nn.Module):     
    def __init__(self, cfg):         
        super().__init__()         
        self.layers = nn.Sequential(             
            nn.Linear(cfg["emb_dim"], 4 * cfg["emb_dim"]),             
            GELU(),             
            nn.Linear(4 * cfg["emb_dim"], cfg["emb_dim"]),         
        )

    def forward(self, x):         
        return self.layers(x)

class TransformerBlock(nn.Module): 
    def __init__(self, cfg):         
        super().__init__()         
        self.att = MultiHeadAttention(
             d_in=cfg["emb_dim"],             
             d_out=cfg["emb_dim"],             
             context_length=cfg["context_length"],             
             num_heads=cfg["n_heads"],             
             dropout=cfg["drop_rate"],             
             qkv_bias=cfg["qkv_bias"]
        ) 
        self.ff = FeedForward(cfg)         
        self.norm1 = LayerNorm(cfg["emb_dim"])         
        self.norm2 = LayerNorm(cfg["emb_dim"])         
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        shortcut = x # prepare to use residual network        
        x = self.norm1(x) # Normalization
        x = self.att(x)    
        
        x = self.drop_shortcut(x) 
        # print("x.shape: ", x.shape)
        x = x + shortcut

        shortcut = x 
        x = self.norm2(x)         
        x = self.ff(x)         
        x = self.drop_shortcut(x)        
        x = x + shortcut          
        return x

class GPTModel(nn.Module):     
    def __init__(self, cfg):         
        super().__init__()         
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])         
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])         
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trf_blocks = nn.Sequential(             
            *[TransformerBlock(cfg) 
              for _ in range(cfg["n_layers"])]
        )
        self.final_norm = LayerNorm(cfg["emb_dim"])         
        self.out_head = nn.Linear(             
            cfg["emb_dim"], cfg["vocab_size"], bias=False         
        )

    def forward(self, in_idx):         
        batch_size, seq_len = in_idx.shape          
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(          
            torch.arange(seq_len, device=in_idx.device)         
        )         
        x = tok_embeds + pos_embeds         
        x = self.drop_emb(x)         
        x = self.trf_blocks(x)         
        x = self.final_norm(x)         
        logits = self.out_head(x)         
        return logits

In [7]:
import tiktoken
def generate_text_simple(model, idx,                          
                         max_new_tokens, context_size):     
    for _ in range(max_new_tokens):         
        idx_cond = idx[:, -context_size:]  
        print(idx_cond)
        with torch.no_grad(): 
            logits = model(idx_cond)
        logits = logits[:, -1, :]         
        probas = torch.softmax(logits, dim=-1)         
        idx_next = torch.argmax(probas, dim=-1, keepdim=True)        
        idx = torch.cat((idx, idx_next), dim=1)
    return idx

def text_to_token_ids(text, tokenizer):     
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})     
    encoded_tensor = torch.tensor(encoded).unsqueeze(0) 
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):     
    flat = token_ids.squeeze(0)   
    return tokenizer.decode(flat.tolist())


In [8]:
GPT_CONFIG_124M = { "vocab_size": 50257, 
                    "context_length": 256,     
                    "emb_dim": 768,     
                    "n_heads": 12,     
                    "n_layers": 12, 
                    "drop_rate": 0.1,
                    "qkv_bias": False 
                  } 
torch.manual_seed(123) 
model = GPTModel(GPT_CONFIG_124M) 
model.eval()

start_context = "Every effort moves you" 
tokenizer = tiktoken.get_encoding("gpt2")
token_ids = generate_text_simple(     
    model=model,     
    idx=text_to_token_ids(start_context, tokenizer),     
    max_new_tokens=10,     
    context_size=GPT_CONFIG_124M["context_length"] 
) 
print("Output text:\n", token_ids_to_text(token_ids, tokenizer))


tensor([[6109, 3626, 6100,  345]])
tensor([[ 6109,  3626,  6100,   345, 34245]])
tensor([[ 6109,  3626,  6100,   345, 34245,  5139]])
tensor([[ 6109,  3626,  6100,   345, 34245,  5139,  2492]])
tensor([[ 6109,  3626,  6100,   345, 34245,  5139,  2492, 25405]])
tensor([[ 6109,  3626,  6100,   345, 34245,  5139,  2492, 25405, 17434]])
tensor([[ 6109,  3626,  6100,   345, 34245,  5139,  2492, 25405, 17434, 17853]])
tensor([[ 6109,  3626,  6100,   345, 34245,  5139,  2492, 25405, 17434, 17853,
          5308]])
tensor([[ 6109,  3626,  6100,   345, 34245,  5139,  2492, 25405, 17434, 17853,
          5308,  3398]])
tensor([[ 6109,  3626,  6100,   345, 34245,  5139,  2492, 25405, 17434, 17853,
          5308,  3398, 13174]])
Output text:
 Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [73]:
inputs = torch.tensor([[16833, 3626, 6100],   # ["every effort moves",
                       [40,    1107, 588]])   #  "I really like"] 

targets = torch.tensor([[3626, 6100, 345  ],  # [" effort moves you",
                        [1107, 588, 11311]])  #  " really like chocolate"] 

with torch.no_grad(): 
    logits = model(inputs) 
    probas = torch.softmax(logits, dim=-1)
    print(probas) 

token_ids = torch.argmax(probas, dim=-1, keepdim=True) 
print("Token IDs:\n", token_ids)
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}") 
print(f"Outputs batch 1:"
      f" {token_ids_to_text(token_ids[0].flatten(), tokenizer)}") 

tensor([[[1.8849e-05, 1.5172e-05, 1.1687e-05,  ..., 2.2409e-05,
          6.9776e-06, 1.8776e-05],
         [9.1569e-06, 1.0062e-05, 7.8786e-06,  ..., 2.9090e-05,
          6.0103e-06, 1.3571e-05],
         [2.9877e-05, 8.8507e-06, 1.5741e-05,  ..., 3.5456e-05,
          1.4094e-05, 1.3526e-05]],

        [[1.2561e-05, 2.0538e-05, 1.4332e-05,  ..., 1.0389e-05,
          3.4784e-05, 1.4239e-05],
         [7.2731e-06, 1.7864e-05, 1.0565e-05,  ..., 2.1206e-05,
          1.1390e-05, 1.5559e-05],
         [2.9496e-05, 3.3605e-05, 4.1029e-05,  ..., 6.5249e-06,
          5.8203e-05, 1.3698e-05]]])
Token IDs:
 tensor([[[16657],
         [  339],
         [42826]],

        [[49906],
         [29669],
         [41751]]])
Targets batch 1:  effort moves you
Outputs batch 1:  Armed heNetflix


In [74]:
#### loss function ####

In [75]:
## probas[b, t, v] = P(第 b 个样本，在第 t 个位置，下一个 token 是 vocab 中第 v 个词)
text_idx = 0 
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]] 
print("Text 1:", target_probas_1)
text_idx = 1 
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 2:", target_probas_2) 

log_probas = torch.log(torch.cat((target_probas_1, target_probas_2))) 
print(log_probas)

avg_log_probas = torch.mean(log_probas) 
print(avg_log_probas)

neg_avg_log_probas = avg_log_probas * -1
print(neg_avg_log_probas) 

Text 1: tensor([7.4540e-05, 3.1061e-05, 1.1563e-05])
Text 2: tensor([1.0337e-05, 5.6776e-05, 4.7559e-06])
tensor([ -9.5042, -10.3796, -11.3677, -11.4798,  -9.7764, -12.2561])
tensor(-10.7940)
tensor(10.7940)


In [76]:
file_path = "the-verdict.txt" 
with open(file_path, "r", encoding="utf-8") as file:
    text_data = file.read() 

total_characters = len(text_data) 
total_tokens = len(tokenizer.encode(text_data)) 
print("Characters:", total_characters)
print("Tokens:", total_tokens) 

Characters: 20479
Tokens: 5145


In [77]:
train_ratio = 0.90 
split_idx = int(train_ratio * len(text_data)) 
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]


In [79]:
torch.manual_seed(123)
train_loader = create_dataloader_v1(     
    train_data,     
    batch_size=2,     
    max_length=GPT_CONFIG_124M["context_length"],     
    stride=GPT_CONFIG_124M["context_length"],     
    drop_last=True,     
    shuffle=True,     
    num_workers=0 
) 

val_loader = create_dataloader_v1(     
    val_data,     
    batch_size=2,     
    max_length=GPT_CONFIG_124M["context_length"],     
    stride=GPT_CONFIG_124M["context_length"],     
    drop_last=False,     
    shuffle=False,     
    num_workers=0
) 

print("Train loader:") 
for x, y in train_loader:     
    print(x.shape, y.shape)
    
print("\nValidation loader:") 
for x, y in val_loader:
    print(x.shape, y.shape) 

Train loader:
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])

Validation loader:
torch.Size([2, 256]) torch.Size([2, 256])


In [81]:
def calc_loss_batch(input_batch, target_batch, model, device):     
    input_batch = input_batch.to(device)     
    target_batch = target_batch.to(device) .to(device)
    logits = model(input_batch) 
    loss = torch.nn.functional.cross_entropy( 
        logits.flatten(0, 1), target_batch.flatten()     
    )
    return loss 

In [84]:
def calc_loss_loader(data_loader, model, device, num_batches=None):     
    total_loss = 0     
    if len(data_loader) == 0:         
        return float("nan")      
    elif num_batches is None: 
        num_batches = len(data_loader) 
    else:         
        num_batches = min(num_batches, len(data_loader))     
    for i, (input_batch, target_batch) in enumerate(data_loader):         
        if i < num_batches:             
            loss = calc_loss_batch(                 
                input_batch, target_batch, model, device             
            )              
            total_loss += loss.item()          
        else: 
            break 
    return total_loss / num_batches


In [85]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 
model.to(device) 
with torch.no_grad():     
    train_loss = calc_loss_loader(train_loader, model, device)     
    val_loss = calc_loss_loader(val_loader, model, device) 
print("Training loss:", train_loss) 
print("Validation loss:", val_loss)


Training loss: 10.987583266364204
Validation loss: 10.98110580444336


In [89]:
def train_model_simple(model, train_loader, val_loader,                        
                       optimizer, device, num_epochs,                        
                       eval_freq, eval_iter, start_context, tokenizer):     
    train_losses, val_losses, track_tokens_seen = [], [], []     
    tokens_seen, global_step = 0, -1
    for epoch in range(num_epochs): 
        model.train()
        for input_batch, target_batch in train_loader:             
            optimizer.zero_grad()             
            loss = calc_loss_batch(
                input_batch, target_batch, model, device 
            )             
            loss.backward()
            optimizer.step()             
            tokens_seen += input_batch.numel() 
            global_step += 1
            
            if global_step % eval_freq == 0: 
                train_loss, val_loss = evaluate_model(                     
                    model, train_loader, val_loader, device, eval_iter
                )                 
                train_losses.append(train_loss)                 
                val_losses.append(val_loss)                 
                track_tokens_seen.append(tokens_seen)                 
                print(f"Ep {epoch+1} (Step {global_step:06d}): "                       
                      f"Train loss {train_loss:.3f}, "                       
                      f"Val loss {val_loss:.3f}"                 
                     )
        generate_and_print_sample( 
            model, tokenizer, device, start_context         
        )     
    return train_losses, val_losses, track_tokens_seen


In [90]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):     
    model.eval()     
    with torch.no_grad():         
        train_loss = calc_loss_loader(
            train_loader, model, device, num_batches=eval_iter         
        )         
        val_loss = calc_loss_loader(             
            val_loader, model, device, num_batches=eval_iter         
        )     
        model.train()     
        return train_loss, val_loss



In [92]:
def generate_and_print_sample(model, tokenizer, device, start_context):     
    model.eval()     
    context_size = model.pos_emb.weight.shape[0]     
    encoded = text_to_token_ids(start_context, tokenizer).to(device)     
    with torch.no_grad():         
        token_ids = generate_text_simple(             
            model=model, idx=encoded,             
            max_new_tokens=50, context_size=context_size         
        )     
    decoded_text = token_ids_to_text(token_ids, tokenizer) 
    print(decoded_text.replace("\n", " ")) 
    model.train()


In [93]:
##### start #####

In [94]:
torch.manual_seed(123) 
model = GPTModel(GPT_CONFIG_124M) 
model.to(device) 
optimizer = torch.optim.AdamW( 
    model.parameters(), 
    lr=0.0004, 
    weight_decay=0.1 
)
num_epochs = 10 
train_losses, val_losses, tokens_seen = train_model_simple(     
    model, train_loader, val_loader, optimizer, device,     
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,     
    start_context="Every effort moves you", tokenizer=tokenizer
) 


Ep 1 (Step 000000): Train loss 9.821, Val loss 9.934
Ep 1 (Step 000005): Train loss 8.071, Val loss 8.340
Every effort moves you,,,,,,,,,,,,.                                     
Ep 2 (Step 000010): Train loss 6.629, Val loss 7.055
Ep 2 (Step 000015): Train loss 6.052, Val loss 6.604
Every effort moves you,,,, and,,,,,,,,,.                                   
Ep 3 (Step 000020): Train loss 5.600, Val loss 6.483
Ep 3 (Step 000025): Train loss 5.542, Val loss 6.418
Every effort moves you, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and
Ep 4 (Step 000030): Train loss 5.188, Val loss 6.352
Ep 4 (Step 000035): Train loss 5.007, Val loss 6.401
Every effort moves you a a, and a a. Gisburn, and a. I had been, and a, and a.    "Oh, and the of the of the of the of the of the of the of the of the of
Ep 5 (Step 000040): Train loss 4.364, Val loss 6.256
Every effort moves you, one of the of the of the picture to the picture.