In [None]:
!pip install datasets
!pip install tiktoken

In [None]:
from datasets import load_dataset

In [None]:
# Download (or reuse from cache) the OpenWebText corpus.
# The cache path must contain a plain space: inside a Python string "\ " is NOT a shell
# escape, it keeps the backslash and would create a directory literally named
# "Colab\ Notebooks". Data cached under that old directory name will not be picked up.
dataset = load_dataset("Skylion007/openwebtext", cache_dir="/content/drive/MyDrive/Colab Notebooks/openwebtext")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.33k [00:00<?, ?B/s]

Loading dataset shards:   0%|          | 0/80 [00:00<?, ?it/s]

In [None]:
# Show the splits, feature names and row counts of the loaded dataset.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 8013769
    })
})


In [None]:
import torch
from torch.utils.data import Dataset

class CustomTextDataset(Dataset):
    """Map-style dataset of fixed-length next-token-prediction examples.

    Every text is tokenized once in ``__init__``. Each document is then cut into
    windows of ``seq_len + 1`` tokens that start every ``seq_len`` tokens (so
    consecutive windows share one token); windows shorter than ``seq_len + 1``
    are dropped. One dataset item is one window, returned as an
    ``(input, target)`` pair where ``target`` is ``input`` shifted by one token.

    Returning one fixed-length tensor pair per index (instead of a variable
    number of Python lists per document) lets the default ``DataLoader``
    collation stack items into ``(batch_size, seq_len)`` tensors.

    Args:
        dataset: mapping-like object whose ``'text'`` entry is an iterable of strings.
        tokenizer: object with an ``encode(text)`` method returning a sequence of token ids.
        seq_len: number of tokens in every input/target sequence (must be >= 1).

    Raises:
        ValueError: if ``seq_len`` is smaller than 1.
    """

    def __init__(self, dataset, tokenizer, seq_len):
        if seq_len < 1:
            raise ValueError(f"seq_len must be >= 1, got {seq_len}")

        self.dataset = dataset
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.encoded_texts = []
        # One (document index, start offset) entry per usable window. Storing
        # offsets instead of the windows themselves avoids duplicating tokens.
        self.chunk_index = []

        for text in dataset['text']:
            encoded_text = tokenizer.encode(text)
            doc_idx = len(self.encoded_texts)
            self.encoded_texts.append(encoded_text)
            # A window starting at `start` is complete only when
            # start + seq_len + 1 <= len(encoded_text), i.e. start < len - seq_len.
            for start in range(0, len(encoded_text) - seq_len, seq_len):
                self.chunk_index.append((doc_idx, start))

    def __len__(self):
        """Return the number of complete windows across all documents."""
        return len(self.chunk_index)

    def __getitem__(self, idx):
        """Return the ``idx``-th window as ``(input_ids, target_ids)`` long tensors.

        Both tensors have shape ``(seq_len,)``.
        """
        doc_idx, start = self.chunk_index[idx]
        window = self.encoded_texts[doc_idx][start:start + self.seq_len + 1]
        window = torch.tensor(window, dtype=torch.long)

        input_sequence = window[:-1]
        target_sequence = window[1:]
        return input_sequence, target_sequence

In [None]:
import tiktoken #tiktoken is a fast BPE (Byte-Pair Encoding) tokenizer for use with OpenAI's models
# Load the "gpt2" encoding; `enc` is the tokenizer object used by the cells below.
enc = tiktoken.get_encoding("gpt2")

In [None]:
# 90% / 10% train / validation split by row position.
# `Dataset.select` returns a lightweight view over the selected rows, whereas
# slicing (`dataset['train'][:split]`) copies every selected document into an
# in-memory dict of Python lists.
num_rows = len(dataset['train'])
split = int(num_rows * 0.9)
train_data = dataset['train'].select(range(split))
val_data = dataset['train'].select(range(split, num_rows))

In [None]:
from torch.utils.data import DataLoader

# The model in this notebook is built with seq_len = 256: that is the size of its
# position-embedding table and of the causal mask in each attention head. Sequences
# produced here must therefore not be longer than 256 tokens (the previous value of
# 512 would index past the position table).
train_dataloader = DataLoader(CustomTextDataset(train_data, enc, seq_len=256), batch_size=64, shuffle=True, num_workers=4)
val_dataloader = DataLoader(CustomTextDataset(val_data, enc, seq_len=256), batch_size=64, shuffle=False, num_workers=4)

In [None]:
# Fetch a single batch from the training DataLoader.
# (The loader is named `train_dataloader`; `train_loader` is not defined anywhere.)
sample_batch = next(iter(train_dataloader))

# Inspect the shapes, container types and contents of the batch.
input_sequences, target_sequences = sample_batch
print(input_sequences.shape)
print(target_sequences.shape)
print(f"Input type: {type(input_sequences)}")
print(f"Target type: {type(target_sequences)}")
print(input_sequences)
print(target_sequences)

In [None]:
# Peek at the corpus: first record, number of records, and the character
# length of the first record's text.
train_split = dataset['train']
first_record = train_split[0]
print(first_record)
print(len(train_split))
print(len(first_record['text']))

{'text': 'Port-au-Prince, Haiti (CNN) -- Earthquake victims, writhing in pain and grasping at life, watched doctors and nurses walk away from a field hospital Friday night after a Belgian medical team evacuated the area, saying it was concerned about security.\n\nThe decision left CNN Chief Medical Correspondent Sanjay Gupta as the only doctor at the hospital to get the patients through the night.\n\nCNN initially reported, based on conversations with some of the doctors, that the United Nations ordered the Belgian First Aid and Support Team to evacuate. However, Belgian Chief Coordinator Geert Gijs, a doctor who was at the hospital with 60 Belgian medical personnel, said it was his decision to pull the team out for the night. Gijs said he requested U.N. security personnel to staff the hospital overnight, but was told that peacekeepers would only be able to evacuate the team.\n\nHe said it was a "tough decision" but that he accepted the U.N. offer to evacuate after a Canadian medical t

In [None]:
# Round-trip sanity check of the tokenizer: text -> token ids, token id -> text.
hello_ids = enc.encode("Hello world!")
print(hello_ids)
token_50000_text = enc.decode([50000])
print(token_50000_text)

[15496, 995, 0]
 grids


In [None]:
# Tokenize the first document and report its token ids, the largest id, and
# the number of tokens.
first_doc_text = dataset['train'][0]['text']
train = enc.encode(first_doc_text)
print(train)
print(max(train))
print(len(train))

[13924, 12, 559, 12, 35784, 11, 25051, 357, 18474, 8, 1377, 45591, 4970, 11, 1319, 44556, 287, 2356, 290, 44787, 379, 1204, 11, 7342, 7519, 290, 20669, 2513, 1497, 422, 257, 2214, 4436, 3217, 1755, 706, 257, 21402, 3315, 1074, 23724, 262, 1989, 11, 2282, 340, 373, 5213, 546, 2324, 13, 198, 198, 464, 2551, 1364, 8100, 5953, 8366, 34428, 298, 2986, 33708, 42095, 355, 262, 691, 6253, 379, 262, 4436, 284, 651, 262, 3871, 832, 262, 1755, 13, 198, 198, 18474, 7317, 2098, 11, 1912, 319, 10275, 351, 617, 286, 262, 7519, 11, 326, 262, 1578, 7973, 6149, 262, 21402, 3274, 22225, 290, 7929, 4816, 284, 36316, 13, 2102, 11, 21402, 5953, 36831, 2269, 861, 402, 2926, 82, 11, 257, 6253, 508, 373, 379, 262, 4436, 351, 3126, 21402, 3315, 8213, 11, 531, 340, 373, 465, 2551, 284, 2834, 262, 1074, 503, 329, 262, 1755, 13, 402, 2926, 82, 531, 339, 9167, 471, 13, 45, 13, 2324, 8213, 284, 3085, 262, 4436, 13417, 11, 475, 373, 1297, 326, 4167, 24952, 561, 691, 307, 1498, 284, 36316, 262, 1074, 13, 198, 198, 154

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# hyperparameters
batch_size = 64 # how many independent sequences will be processed in parallel
seq_len = 256 # the maximum context length for predictions
max_iters = 15000 # number of optimizer steps in the iteration-based training loop
eval_interval = 500 # evaluate every this many steps
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200 # number of batches averaged per loss estimate
n_embd = 384 # same as d_model
n_head = 6 # attention heads per block; each head has n_embd // n_head channels
n_layer = 6 # number of transformer blocks
dropout = 0.2
EPOCHS = 20
# Size of the token embedding table and of the output layer. It was previously
# never defined, so constructing GPTLanguageModel raised a NameError.
# 50257 is the vocabulary size of the "gpt2" tiktoken encoding (enc.n_vocab).
vocab_size = 50257

In [None]:
class Head(nn.Module):
    """A single head of causal (masked) self-attention.

    Reads ``n_embd``, ``seq_len`` and ``dropout`` from the module-level
    hyperparameters.
    """

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Lower-triangular matrix of ones, stored as a buffer (not a parameter);
        # zeros mark the positions a token is not allowed to attend to.
        causal_mask = torch.tril(torch.ones(seq_len, seq_len))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Map ``x`` of shape (B, T, C) to the head output of shape (B, T, head_size)."""
        _, T, _ = x.shape
        keys = self.key(x)        # (B, T, hs)
        queries = self.query(x)   # (B, T, hs)
        values = self.value(x)    # (B, T, hs)

        # Scaled dot-product scores: (B, T, hs) @ (B, hs, T) -> (B, T, T).
        scale = keys.shape[-1] ** -0.5
        weights = (queries @ keys.transpose(-2, -1)) * scale

        # Masked positions get -inf so that softmax assigns them zero weight.
        blocked = self.tril[:T, :T] == 0
        weights = weights.masked_fill(blocked, float('-inf'))
        weights = self.dropout(F.softmax(weights, dim=-1))

        # Weighted sum of the values: (B, T, T) @ (B, T, hs) -> (B, T, hs).
        return weights @ values

class MultiHeadAttention(nn.Module):
    """Several ``Head`` modules run on the same input, concatenated and projected.

    Reads ``n_embd`` and ``dropout`` from the module-level hyperparameters.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(Head(head_size))
        self.heads = nn.ModuleList(head_modules)
        # Output projection: (head_size * num_heads) -> n_embd.
        self.w_o = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to ``x``, join the results on the last axis, project, then apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.w_o(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Two-layer MLP: Linear -> ReLU -> Linear -> Dropout.

    The hidden layer is four times as wide as ``n_embd``; the dropout
    probability is read from the module-level ``dropout``.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: pre-LayerNorm self-attention and feed-forward, each with a residual connection."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        # Each head gets an equal share of the embedding width.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd) # planning to make myself
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = x + self.sa(self.ln1(x))
        return attended + self.ffwd(self.ln2(attended))

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model.

    The architecture is configured by the module-level hyperparameters
    ``vocab_size``, ``n_embd``, ``seq_len``, ``n_head`` and ``n_layer``.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(seq_len, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # Re-initialise every Linear / Embedding submodule (see _init_weights).
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, if ``targets`` is given, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids, shape (B, T), with T no larger than
                the position-embedding table.
            targets: optional integer tensor of shape (B, T) with the expected
                next token for every position.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is ``None``. With targets, ``logits``
            is flattened to (B*T, vocab_size) and ``loss`` is a scalar tensor.

        Raises:
            ValueError: if T exceeds the number of learned positions.
        """
        B, T = idx.shape
        max_positions = self.position_embedding_table.num_embeddings
        if T > max_positions:
            raise ValueError(
                f"sequence length {T} exceeds the model's maximum context length {max_positions}"
            )

        tok_emb = self.token_embedding_table(idx) # (B, T, C) == (Batchsize, seqlen, dmodel)
        # Build the position indices on the same device as the input instead of
        # relying on the global `device`, so the model works wherever it is moved.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device)) # (T, C)
        x = tok_emb + pos_emb
        x = self.blocks(x) # (B, T, C)
        x = self.ln_f(x) # (B, T, C)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, classes) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no autograd graph
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after the context ``idx``.

        Args:
            idx: integer tensor of shape (B, T) holding the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        max_positions = self.position_embedding_table.num_embeddings
        # Sample with dropout disabled, then restore whatever mode the model was in.
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Only the most recent tokens that fit the context window are fed in.
                idx_cond = idx[:, -max_positions:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] # logits for the last position, (B, vocab_size)
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

In [None]:
# Build the model, move it to the selected device, and report which device that is.
model = GPTLanguageModel().to(device)
print(device)

In [None]:
# Count the trainable parameters (those with requires_grad set).
trainable_param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(trainable_param_count)

In [None]:
# AdamW over all model parameters, using the `learning_rate` hyperparameter.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [None]:
# Iteration-based training loop: `max_iters` optimizer steps, with a loss
# estimate every `eval_interval` steps and on the final step.
#
# NOTE(review): this cell cannot run on a fresh kernel as written:
#   * `estimate_loss` is only defined in a later cell,
#   * `get_batch` is not defined anywhere in this notebook,
#   * `wandb` is neither imported nor initialised in this notebook.
# The loop variable is named `step` (not `iter`) so that it does not shadow the
# builtin `iter()`, which the batch-inspection cell relies on.
for step in range(max_iters):

    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: training loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # Log losses to W&B
        wandb.log({"Training Loss": losses['train'], "Validation Loss": losses['val'], "Iteration": step})

    xb, yb = get_batch('train')

    # Forward pass, then a standard backward pass / optimizer step.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Finish the W&B run
wandb.finish()

In [None]:
# Epoch-based training over `train_dataloader`, with one validation estimate
# per epoch.
#
# Changes compared with the previous version of this cell:
#   * iterates `train_dataloader` (the name `train_loader` was never defined);
#   * validation no longer runs over the whole validation set before every
#     single training step; it runs once per epoch on at most `eval_iters` batches;
#   * validation batches are moved to `device`, the model is put in eval mode
#     for validation, and the reported loss is the mean over the batches used;
#   * the progress line shows the number of batches per epoch instead of `batch_size`.
steps_per_epoch = len(train_dataloader)

for epoch in range(EPOCHS):
    model.train()
    for i, batch in enumerate(train_dataloader):
        input_sequences, target_sequences = batch
        # as_tensor reuses the data when the collated batch is already a tensor.
        input_sequences = torch.as_tensor(input_sequences, dtype=torch.long).to(device)
        target_sequences = torch.as_tensor(target_sequences, dtype=torch.long).to(device)

        logits, loss = model(input_sequences, target_sequences)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

        if i % 8 == 0:
            print(f"Epoch: {epoch+1}/{EPOCHS}, Batch: {i+1}/{steps_per_epoch}")

    # ---- validation (no gradients, dropout disabled) ----
    model.eval()
    val_loss_total = 0.0
    val_batches = 0
    with torch.no_grad():
        for val_batch in val_dataloader:
            if val_batches >= eval_iters:
                break
            val_input_sequences, val_target_sequences = val_batch
            val_input_sequences = torch.as_tensor(val_input_sequences, dtype=torch.long).to(device)
            val_target_sequences = torch.as_tensor(val_target_sequences, dtype=torch.long).to(device)

            _, batch_val_loss = model(val_input_sequences, val_target_sequences)
            val_loss_total += batch_val_loss.item()
            val_batches += 1
    model.train()

    val_loss = val_loss_total / val_batches if val_batches else float('nan')
    print(f"Epoch: {epoch+1}/{EPOCHS}, val loss: {val_loss:.4f}")


In [None]:
@torch.no_grad()  # no gradients are recorded inside this function
def estimate_loss():
    """Estimate the mean loss on the 'train' and 'val' splits.

    For each split, ``eval_iters`` batches are drawn with ``get_batch`` and
    their losses averaged. The model is switched to eval mode for the
    measurement and back to train mode before returning.

    Returns:
        dict mapping ``'train'`` and ``'val'`` to a 0-dim tensor holding the mean loss.
    """
    model.eval()
    averages = {}
    for split_name in ('train', 'val'):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            xb, yb = get_batch(split_name)
            _, batch_loss = model(xb, yb)
            batch_losses[step] = batch_loss.item()
        averages[split_name] = batch_losses.mean()
    model.train()
    return averages

In [None]:
def load_checkpoint(model, optimizer, filename="gpt_checkpoint_1.53loss.pth"):
    """Restore model and optimizer state from a checkpoint file.

    The file is expected to hold a dict with the keys ``'model_state_dict'``
    and ``'optimizer_state_dict'`` (the layout written by ``save_checkpoint``).

    Args:
        model: module whose parameters are overwritten in place.
        optimizer: optimizer whose state is overwritten in place.
        filename: path of the checkpoint file.
    """
    # Map stored tensors onto the device the model currently lives on, so that a
    # checkpoint written on a GPU can still be loaded on a CPU-only runtime.
    target_device = None
    for param in model.parameters():
        target_device = param.device
        break

    checkpoint = torch.load(filename, map_location=target_device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print(f"Checkpoint loaded from {filename}")

# Resume from the default checkpoint when it exists. On a fresh runtime the file
# has not been written yet (the save cell comes later), so a missing file is
# reported instead of aborting the notebook run.
try:
    load_checkpoint(model, optimizer)
except FileNotFoundError:
    print("No checkpoint file found; keeping the current model and optimizer state.")

In [None]:
def save_checkpoint(model, optimizer, filename="gpt_checkpoint_1.53loss.pth"):
    """Write the model and optimizer state dicts to ``filename``.

    The saved object is a dict with the keys ``'model_state_dict'`` and
    ``'optimizer_state_dict'``.

    Args:
        model: module whose ``state_dict()`` is stored.
        optimizer: optimizer whose ``state_dict()`` is stored.
        filename: destination path of the checkpoint file.
    """
    torch.save(
        dict(
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
        ),
        filename,
    )
    print(f"Checkpoint saved to {filename}")

# Write the current model and optimizer state to the default checkpoint file.
save_checkpoint(model, optimizer)

In [None]:
# Sample 500 new tokens from the model, seeded with a single token of id 0
# (the tokenizer demo above shows "Hello world!" ending in id 0, i.e. "!").
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_ids = model.generate(context, max_new_tokens=500)[0].tolist()
# Token ids are turned back into text with the tiktoken encoder `enc`;
# a bare `decode` function is not defined anywhere in this notebook.
print(enc.decode(generated_ids))