In [1]:
import numpy as np
import math
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


In [2]:
data = ""
with open("dataset/small_input.txt", "r") as file:
    data = file.read()

In [3]:
data

"\nThe top in a world by susphoring grace.\n\nLUCIO:\nWe muse hath resistes him so sovere: son't his other wrough\nstands of coverent sh'd: he has here, and stand it\nand poor exceeder or a Henry's last, stay\nnot in faith, forewell's base of graves, thanks, happy comparel,\nwarmentfully: may as face by the courst, that strangth\nerrise hath breathed. Hastings come to Valenting.\n\nHERMIONE:\nWell have been bolly poor late\nIs the lords.\n\nABELLA:\nLet's found: I will kind him;\nI do braw'sy him business wherein far his face.\n\nLUCENTIO:\nHe is last afford: make him diseably to London,\nTake him great Hastings, boldness in his natic keeps,\nTo oftragn lost me ready glust through the house.\nWhy chose that I dares it be a Montague.\n\nMONTAGUE:\nWoe's Claudly Haste of his own at last the Volscient,\nAnd seen'd helpit: bearn to do it be, and most hop,\nMiscause's more conterar than without this lambs\nShall down appla fortune flight flowers.\n\nFRIAR LAUAURENCE:\nHis son, do your morse

In [4]:
data = data.replace("\n", " ").lower().split(" ")
len(data)

1849

In [5]:
words = {}
count = 0
for word in data:
    if word not in words:
        words[word] = count
        count += 1

words

{'': 0,
 'the': 1,
 'top': 2,
 'in': 3,
 'a': 4,
 'world': 5,
 'by': 6,
 'susphoring': 7,
 'grace.': 8,
 'lucio:': 9,
 'we': 10,
 'muse': 11,
 'hath': 12,
 'resistes': 13,
 'him': 14,
 'so': 15,
 'sovere:': 16,
 "son't": 17,
 'his': 18,
 'other': 19,
 'wrough': 20,
 'stands': 21,
 'of': 22,
 'coverent': 23,
 "sh'd:": 24,
 'he': 25,
 'has': 26,
 'here,': 27,
 'and': 28,
 'stand': 29,
 'it': 30,
 'poor': 31,
 'exceeder': 32,
 'or': 33,
 "henry's": 34,
 'last,': 35,
 'stay': 36,
 'not': 37,
 'faith,': 38,
 "forewell's": 39,
 'base': 40,
 'graves,': 41,
 'thanks,': 42,
 'happy': 43,
 'comparel,': 44,
 'warmentfully:': 45,
 'may': 46,
 'as': 47,
 'face': 48,
 'courst,': 49,
 'that': 50,
 'strangth': 51,
 'errise': 52,
 'breathed.': 53,
 'hastings': 54,
 'come': 55,
 'to': 56,
 'valenting.': 57,
 'hermione:': 58,
 'well': 59,
 'have': 60,
 'been': 61,
 'bolly': 62,
 'late': 63,
 'is': 64,
 'lords.': 65,
 'abella:': 66,
 "let's": 67,
 'found:': 68,
 'i': 69,
 'will': 70,
 'kind': 71,
 'him;':

In [6]:
vocab_size = len(words)
vocab_size

941

In [7]:
total_token = len(words)
total_token

941

In [8]:
tokenized_data = [words[word] for word in data]
tokenized_data

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 0,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 28,
 31,
 32,
 33,
 4,
 34,
 35,
 36,
 37,
 3,
 38,
 39,
 40,
 22,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 6,
 1,
 49,
 50,
 51,
 52,
 12,
 53,
 54,
 55,
 56,
 57,
 0,
 58,
 59,
 60,
 61,
 62,
 31,
 63,
 64,
 1,
 65,
 0,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 69,
 73,
 74,
 14,
 75,
 76,
 77,
 18,
 78,
 0,
 79,
 25,
 64,
 80,
 81,
 82,
 14,
 83,
 56,
 84,
 85,
 14,
 86,
 87,
 88,
 3,
 18,
 89,
 90,
 56,
 91,
 92,
 93,
 94,
 95,
 96,
 1,
 97,
 98,
 99,
 50,
 69,
 100,
 30,
 101,
 4,
 102,
 0,
 103,
 104,
 105,
 106,
 22,
 18,
 107,
 108,
 80,
 1,
 109,
 28,
 110,
 111,
 112,
 56,
 73,
 30,
 113,
 28,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 0,
 129,
 130,
 18,
 131,
 73,
 132,
 133,
 50,
 134,
 132,
 135,
 136,
 3,
 117,
 28,
 137,
 138,
 4,
 139,
 140,
 141,
 142,
 28,
 143,
 144,
 50,
 1,
 

In [9]:
len(tokenized_data)

1849

In [10]:
class CustomDataset(Dataset):

    def __init__(self, tokenized_data, vocab, seq_len):
        self.data = tokenized_data
        self.vocab = vocab
        self.vocab_size = len(vocab)
        self.seq_len = seq_len
        

    # return single training or test example
    def __getitem__(self, index):
        chunk = self.data[index: index + 1 + self.seq_len]
         
        x = torch.tensor(chunk[:-1])
        y = torch.tensor(chunk[1:])

        return x, y
    
    # return length of dataset
    def __len__(self):
        return len(self.data) - self.seq_len

In [11]:
train_dataset = CustomDataset(tokenized_data, words, 8)
print(train_dataset)

<__main__.CustomDataset object at 0x7d2c69513c10>


In [12]:
train_dataset_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)

In [13]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, h_head):
        super().__init__()
        self.d_k = d_model // h_head
        self.W_q = nn.Linear(d_model, self.d_k, bias=False)
        self.W_k = nn.Linear(d_model, self.d_k, bias=False)
        self.W_v = nn.Linear(d_model, self.d_k, bias=False)
        
    def forward(self, x):
        # x.shape -> (seq_len, d_model)
        # Q, K, V -> (seq_len, d_k)

        Q = self.W_q(x)
        K = self.W_k(x) 
        V = self.W_v(x)

        # Q @ K.transpose(-2, -1).shape -> (seq_len, seq_len)
        # attention.shape -> (seq_len, d_k)
        attention = torch.softmax((Q @ K.transpose(-2, -1))/math.sqrt(self.d_k), -1) @ V

        return attention

In [14]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads):
        super().__init__()
        self.n_heads = n_heads
        self.d_k = d_model // n_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, x):
        B, S, _ = x.shape  # batch, seq_len, d_model

        # project Q,K,V and split into heads
        Q = self.W_q(x).view(B, S, self.n_heads, self.d_k).transpose(1, 2)  # (B, heads, S, d_k)
        K = self.W_k(x).view(B, S, self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(B, S, self.n_heads, self.d_k).transpose(1, 2)

        # scaled dot-product attention
        attn = torch.softmax(Q @ K.transpose(-2, -1) / math.sqrt(self.d_k), dim=-1)
        out = attn @ V  # (B, heads, S, d_k)

        # concat heads
        out = out.transpose(1, 2).contiguous().view(B, S, -1)
        return self.fc(out)


In [15]:
class FeedForward(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        self.ln1 = nn.Linear(d_model, 4*d_model)
        self.ln2 = nn.Linear(4*d_model, d_model)

    def forward(self, x):
        out = self.ln1(x)
        out = F.relu(out)
        out = self.ln2(out)

        return out

In [16]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_head): # d_model = n_embd
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, d_model)
        self.mha = MultiHeadAttention(d_model, n_head)
        self.ffwd = FeedForward(d_model)

    def forward(self, x):
        embeddings = self.embeddings(x)
        attention = self.mha(embeddings)
        logits = self.ffwd(attention)

        return logits

In [17]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, n_head, max_seq_len=512):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, d_model)
        self.pos_embeddings = nn.Embedding(max_seq_len, d_model)
        self.mha = MultiHeadAttention(d_model, n_head)
        self.ffwd = FeedForward(d_model)

    def forward(self, x):
        seq_len = x.size(1)
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        embeddings = self.embeddings(x) + self.pos_embeddings(positions)
        attention = self.mha(embeddings)
        logits = self.ffwd(attention)
        return logits


In [18]:
class GPT2(nn.Module):
    def __init__(self, vocab_size, d_model, n_head):
        super().__init__()
        self.encoder = Encoder(vocab_size, d_model, n_head)
        self.lm_head = nn.Linear(d_model, vocab_size)
        
    def forward(self, x):
        hidden_states = self.encoder(x)       # (batch, seq_len, d_model)
        logits = self.lm_head(hidden_states)  # (batch, seq_len, vocab_size)
        return logits

In [19]:
alpha = 3e-4
epochs = 200
n_embds = 512
n_head = 8
batch_size = 4
seq_len = 8

In [20]:
model = GPT2(vocab_size, n_embds, n_head)
optimizer = optim.Adam(model.parameters(), lr=alpha)
loss_fn = nn.CrossEntropyLoss()

In [21]:
for epoch in range(1, epochs + 1):
    epoch_loss = 0.0

    for batch_idx, (x, y) in enumerate(train_dataset_loader, start=1):
        optimizer.zero_grad()
        predictions = model(x)  # (batch_size, seq_len, vocab_size)

        B, S, V = predictions.shape
        predictions = predictions.view(B * S, V)
        y = y.view(-1)

        loss = loss_fn(predictions, y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # print batch loss every 10 batches
        if batch_idx % 10 == 0 or batch_idx == len(train_dataset_loader):
            print(f"Epoch [{epoch}/{epochs}], Batch [{batch_idx}/{len(train_dataset_loader)}], Loss: {loss.item():.4f}")

    # print average loss per epoch
    avg_loss = epoch_loss / len(train_dataset_loader)
    print(f"Epoch [{epoch}/{epochs}] completed. Average Loss: {avg_loss:.4f}\n")


Epoch [1/200], Batch [10/461], Loss: 6.6543
Epoch [1/200], Batch [20/461], Loss: 5.9372
Epoch [1/200], Batch [30/461], Loss: 5.9447
Epoch [1/200], Batch [40/461], Loss: 5.7327
Epoch [1/200], Batch [50/461], Loss: 5.7020
Epoch [1/200], Batch [60/461], Loss: 5.4955
Epoch [1/200], Batch [70/461], Loss: 4.2459
Epoch [1/200], Batch [80/461], Loss: 4.5410
Epoch [1/200], Batch [90/461], Loss: 4.9196
Epoch [1/200], Batch [100/461], Loss: 4.5543
Epoch [1/200], Batch [110/461], Loss: 3.5850
Epoch [1/200], Batch [120/461], Loss: 4.1005
Epoch [1/200], Batch [130/461], Loss: 4.3960
Epoch [1/200], Batch [140/461], Loss: 3.7457
Epoch [1/200], Batch [150/461], Loss: 3.7961
Epoch [1/200], Batch [160/461], Loss: 3.7412
Epoch [1/200], Batch [170/461], Loss: 4.5137
Epoch [1/200], Batch [180/461], Loss: 3.8211
Epoch [1/200], Batch [190/461], Loss: 3.3237
Epoch [1/200], Batch [200/461], Loss: 3.6248
Epoch [1/200], Batch [210/461], Loss: 2.9008
Epoch [1/200], Batch [220/461], Loss: 2.8482
Epoch [1/200], Batc

In [22]:
# disable dropout, batchnorms, etc

model.eval()

GPT2(
  (encoder): Encoder(
    (embeddings): Embedding(941, 512)
    (pos_embeddings): Embedding(512, 512)
    (mha): MultiHeadAttention(
      (W_q): Linear(in_features=512, out_features=512, bias=True)
      (W_k): Linear(in_features=512, out_features=512, bias=True)
      (W_v): Linear(in_features=512, out_features=512, bias=True)
      (fc): Linear(in_features=512, out_features=512, bias=True)
    )
    (ffwd): FeedForward(
      (ln1): Linear(in_features=512, out_features=2048, bias=True)
      (ln2): Linear(in_features=2048, out_features=512, bias=True)
    )
  )
  (lm_head): Linear(in_features=512, out_features=941, bias=True)
)

In [23]:
id2token = {v: k for k, v in words.items()}

In [26]:
n_tokens = 20

with torch.no_grad():
    sentence = "A horse! High-graced York rights. And bother Montague That"

    # preprocess
    sentence = sentence.lower().replace("\n", " ").split(" ")
    tokenized_sentence = [words[word] for word in sentence]  # list of token ids
    input_ids = torch.tensor([tokenized_sentence])  # shape (1, seq_len)

    for i in range(n_tokens):

        input_to_model = input_ids[:, -8:]

        predictions = model(input_to_model)  # (1, seq_len, vocab_size)

        # take logits at last time step
        last_logits = predictions[:, -1, :]  # (1, vocab_size)

        # convert to probability
        probs = F.softmax(last_logits, dim=-1)  # (1, vocab_size)

        # sample next token
        next_token = torch.multinomial(probs, 1)  # (1, 1)

        # append sampled token to input
        input_ids = torch.cat([input_ids, next_token], dim=1)

    # decode ids back to words
    generated_tokens = input_ids[0].tolist()
    generated_sentence = " ".join([id2token[t] for t in generated_tokens])

print(generated_sentence)


a horse! high-graced york rights. and bother montague that the caapter, that i soughd him; such a chooson woes, that they have splight that care fades the respect fades


In [27]:
torch.save(model.state_dict(), "models/v1.pth")

In [None]:
# create the model architecture first
model = GPT2(vocab_size, n_embds, n_head)

# load the saved parameters
model.load_state_dict(torch.load("gpt2_model.pth"))

# set to evaluation mode if generating
model.eval()