In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
with open('tiny_shakespear.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# text = text[:00]
print("length of dataset in characters: ", len(text))

chars = sorted(list(set(text)))
vocab_size = len(chars)
print("Vocab size: ", vocab_size)

char_to_int = dict()
int_to_char = dict()

for i in range(vocab_size):
    char_to_int[chars[i]] = i
    int_to_char[i] = chars[i]

data = []
for char in text:
    data.append(char_to_int[char])

data = torch.tensor(data)
split_idx = int(len(data)*0.0001)
train_data = data[:split_idx]
val_data = data[split_idx:]

print(f"N train: {len(train_data)} N val: {len(val_data)}")

length of dataset in characters:  1115393
Vocab size:  65
N train: 111 N val: 1115282


In [4]:
batch_size = 32
context_length = 8
max_iters = 5000
eval_interval = 1000
eval_iters = 20


In [5]:
def get_batch(n = batch_size, type = "train"):
    if type == "train":
        data = train_data

    elif type == "val":
        data = val_data

    data_len = len(data) - context_length
    ix = torch.randint(data_len, size=(n,))

    x = torch.stack([data[i:i+context_length] for i in ix])
    y = torch.stack([data[i+1:i+1+context_length] for i in ix])

    return x, y

In [6]:
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(batch_size, split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [7]:
class AttentionHead(nn.Module):
    def __init__(self, input, output):
        """ Peformns attention on N - nodes where each node is encoded by vector

        input = size of node embedding vector
        output = size of value head (same head size used for key and query)
        
        """
        super().__init__()

        self.node_emb_size = input
        self.head_size = output

        # Linear projection from vector of size node_dim to space with dim head_size
        self.key = nn.Linear(self.node_emb_size, self.head_size, bias=False)
        self.query = nn.Linear(self.node_emb_size, self.head_size, bias=False)
        self.value = nn.Linear(self.node_emb_size, self.head_size, bias=False)

    
    def forward(self, x):
        B, T, C = x.shape

        K = self.key(x)
        Q = self.query(x)
        V = self.value(x)

        A = (K @ Q.transpose(-2,-1)) * self.head_size ** -0.5

        tril = torch.tril(torch.ones((B,T,T)))
        A = A.masked_fill(tril == 0, float('-inf'))
        A = F.softmax(A, dim=-1)

        output = A @ V

        return output


In [8]:
class GPT(torch.nn.Module):
    def __init__(self, vocab_size, context_length = 8, vocab_emb_size = 32):
        super().__init__()

        self.vocab_size = vocab_size
        self.context_length = context_length
        self.vocab_emb_size = vocab_emb_size

        self.token_embedding_table = nn.Embedding(self.vocab_size, self.vocab_emb_size)
        self.position_embedding_table = nn.Embedding(self.context_length, self.vocab_emb_size)
        self.head = AttentionHead(self.vocab_emb_size,self.vocab_emb_size)
        self.l1 = nn.Linear(self.vocab_emb_size, self.vocab_size)


    def forward(self, x, y=None):
        tok_emb = self.token_embedding_table(x[:,-self.context_length:])
        pos_emb = self.position_embedding_table(torch.arange(self.context_length)) 
        x_enc =  tok_emb + pos_emb
        val = self.head(x_enc)
        logits = self.l1(val)

        if y == None:
            loss = None
        else:
            B, T, C = logits.shape #Batch, Time, Classes
            logits_flat = logits.view(B*T, C)
            y = y.view(B*T)
            loss = F.cross_entropy(logits_flat, y)

        return logits, loss
    
    def generate(self, prompt, max_response_len):
        for _ in range(max_response_len):
            logits, loss = self.forward(prompt)
            logits = logits[:,-1,:]
            probs = F.softmax(logits,dim=-1)

            next_token = torch.multinomial(probs, num_samples=1)
            prompt = torch.cat((prompt, next_token), dim=1)

        return prompt

In [9]:
m = GPT(65)

In [10]:

optimizer = torch.optim.Adam(m.parameters(), lr=1e-3, amsgrad=False)


In [11]:
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss(m)
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch(batch_size, 'train')

    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


TypeError: randint(): argument 'size' must be tuple of ints, not tuple

In [91]:
response = m.generate(prompt= torch.zeros((1,8), dtype=torch.long), max_response_len=100)
data = ''.join([int_to_char[v.item()] for v in response[0]])
print(data)









&sRZXFsDZrL
'OqPJV.mbn&,Iezjw.$p;;yvDHCIyjmgoUjm:DO:&vvKcE!iynj;WJ?ztf,gv YFrUuDDWQ
q'Sn,?vncKgmwyGE
