## Data exploration

In [1]:
# Relative path of the raw corpus, read as UTF-8 text.
file_path = "data/quijote.txt"

# Read the whole file into a single string; the context manager closes the handle.
with open(file_path, mode="r", encoding="utf-8") as fd:
    text = fd.read()

In [2]:
# Report the corpus size in characters (print joins its arguments with one space).
print("Lenght of the dataset in characters:", len(text))

Lenght of the dataset in characters: 2128954


In [3]:
# Peek at the first 1000 characters of the corpus.
print(text[:1000])

DON QUIJOTE DE LA MANCHA
Miguel de Cervantes Saavedra

PRIMERA PARTE
CAPÍTULO 1: Que trata de la condición y ejercicio del famoso hidalgo D. 
Quijote de la Mancha
En un lugar de la Mancha, de cuyo nombre no quiero acordarme, no ha mucho 
tiempo que vivía un hidalgo de los de lanza en astillero, adarga antigua, 
rocín flaco y galgo corredor. Una olla de algo más vaca que carnero, 
salpicón las más noches, duelos y quebrantos los sábados, lentejas los 
viernes, algún palomino de añadidura los domingos, consumían las tres 
partes de su hacienda. El resto della concluían sayo de velarte, calzas de 
velludo para las fiestas con sus pantuflos de lo mismo, los días de entre 
semana se honraba con su vellori de lo más fino. Tenía en su casa una ama 
que pasaba de los cuarenta, y una sobrina que no llegaba a los veinte, y un 
mozo de campo y plaza, que así ensillaba el rocín como tomaba la podadera. 
Frisaba la edad de nuestro hidalgo con los cincuenta años, era de 
complexión recia, seco de ca

In [4]:
# Vocabulary: every distinct character found in the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(f"Characters set: {''.join(chars)}")
print(f"Vocabulary size: {vocab_size}")

Characters set: 
 !"(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijlmnopqrstuvxyz¡«»¿ÁÉÍÚáéíñóúü–
Vocabulary size: 88


In [5]:
# Character <-> integer lookup tables; a character's id is its position in `chars`.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)


# Round-trip a sample sentence to check the two mappings agree.
print(encode("Buenos días Sancho"))
print(decode(encode("Buenos días Sancho")))

[23, 67, 52, 60, 61, 65, 1, 51, 82, 48, 65, 1, 40, 48, 60, 50, 55, 61]
Buenos días Sancho


In [6]:
# Encode the entire corpus and keep it as a 1-D torch.Tensor of token ids.
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
# The 1000 characters we looked at earlier, as the model will see them.
print(data[0:1000])

torch.Size([2128954]) torch.int64
tensor([25, 36, 35,  1, 38, 42, 30, 31, 36, 41, 26,  1, 25, 26,  1, 33, 22,  1,
        34, 22, 35, 24, 29, 22,  0, 34, 56, 54, 67, 52, 58,  1, 51, 52,  1, 24,
        52, 64, 68, 48, 60, 66, 52, 65,  1, 40, 48, 48, 68, 52, 51, 64, 48,  0,
         0, 37, 39, 30, 34, 26, 39, 22,  1, 37, 22, 39, 41, 26,  0, 24, 22, 37,
        78, 41, 42, 33, 36,  1, 10, 19,  1, 38, 67, 52,  1, 66, 64, 48, 66, 48,
         1, 51, 52,  1, 58, 48,  1, 50, 61, 60, 51, 56, 50, 56, 84, 60,  1, 70,
         1, 52, 57, 52, 64, 50, 56, 50, 56, 61,  1, 51, 52, 58,  1, 53, 48, 59,
        61, 65, 61,  1, 55, 56, 51, 48, 58, 54, 61,  1, 25,  8,  1,  0, 38, 67,
        56, 57, 61, 66, 52,  1, 51, 52,  1, 58, 48,  1, 34, 48, 60, 50, 55, 48,
         0, 26, 60,  1, 67, 60,  1, 58, 67, 54, 48, 64,  1, 51, 52,  1, 58, 48,
         1, 34, 48, 60, 50, 55, 48,  6,  1, 51, 52,  1, 50, 67, 70, 61,  1, 60,
        61, 59, 49, 64, 52,  1, 60, 61,  1, 63, 67, 56, 52, 64, 61,  1, 48, 50,
      

In [7]:
# Hold out the tail of the corpus for validation: first 90% is train, the rest is val.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [8]:
# Context length: how many tokens the model looks at when predicting the next one.
block_size = 8
# One chunk of block_size + 1 tokens: block_size inputs plus the extra token
# that serves as the target of the last position.
train_data[0:block_size + 1]

tensor([25, 36, 35,  1, 38, 42, 30, 31, 36])

In [9]:
# Inputs and targets are the same chunk, with the targets shifted one token ahead.
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    # everything up to and including position t predicts the token at t + 1
    context, target = x[:t + 1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([25]) the target: 36
when input is tensor([25, 36]) the target: 35
when input is tensor([25, 36, 35]) the target: 1
when input is tensor([25, 36, 35,  1]) the target: 38
when input is tensor([25, 36, 35,  1, 38]) the target: 42
when input is tensor([25, 36, 35,  1, 38, 42]) the target: 30
when input is tensor([25, 36, 35,  1, 38, 42, 30]) the target: 31
when input is tensor([25, 36, 35,  1, 38, 42, 30, 31]) the target: 36


In [10]:
# Batch geometry, read as module-level globals by get_batch().
batch_size = 4  # how many independent sequences will we process in parallel?
block_size = 8  # what is the maximum context length for predictions?

# Fixed seed so the randomly sampled batches are repeatable.
torch.manual_seed(1337)

def get_batch(split):
    """Sample a random mini-batch of inputs and next-token targets.

    Reads the module-level ``train_data``, ``val_data``, ``batch_size`` and
    ``block_size``.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of stacked windows, ``batch_size`` rows of
        ``block_size`` tokens each. ``y`` is ``x`` shifted one position to the
        right within the source data.
    """
    source = train_data if split == 'train' else val_data
    # The upper bound is exclusive, so the shifted target window always stays in range.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the inputs next to their shifted targets.
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('----')

# Spell out every (context, target) training example contained in the batch.
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([4, 8])
tensor([[52, 65, 62, 52, 64, 48, 60, 71],
        [66, 52, 60, 54, 48,  1, 49, 64],
        [67, 60, 66, 48, 59, 52, 60, 66],
        [ 1, 38, 67, 56, 57, 61, 66, 52]])
targets:
torch.Size([4, 8])
tensor([[65, 62, 52, 64, 48, 60, 71, 48],
        [52, 60, 54, 48,  1, 49, 64, 82],
        [60, 66, 48, 59, 52, 60, 66, 52],
        [38, 67, 56, 57, 61, 66, 52,  6]])
----
when input is [52] the target: 65
when input is [52, 65] the target: 62
when input is [52, 65, 62] the target: 52
when input is [52, 65, 62, 52] the target: 64
when input is [52, 65, 62, 52, 64] the target: 48
when input is [52, 65, 62, 52, 64, 48] the target: 60
when input is [52, 65, 62, 52, 64, 48, 60] the target: 71
when input is [52, 65, 62, 52, 64, 48, 60, 71] the target: 48
when input is [66] the target: 52
when input is [66, 52] the target: 60
when input is [66, 52, 60] the target: 54
when input is [66, 52, 60, 54] the target: 48
when input is [66, 52, 60, 54, 48] the target: 1
when inpu

In [11]:
# xb is the batch of token ids sampled by get_batch('train') above;
# it is our input to the transformer.
print(xb)

tensor([[52, 65, 62, 52, 64, 48, 60, 71],
        [66, 52, 60, 54, 48,  1, 49, 64],
        [67, 60, 66, 48, 59, 52, 60, 66],
        [ 1, 38, 67, 56, 57, 61, 66, 52]])


In [12]:
import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed the global RNG before the model is built and sampled from below,
# so the initial weights and the generated text are repeatable.
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    The table has shape ``(vocab_size, vocab_size)``: the row looked up for a
    token id is used directly as the logits over the next token.
    """

    def __init__(self, vocab_size):
        """Create the ``vocab_size x vocab_size`` lookup table."""
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: optional (B, T) tensor of integer next-token ids.

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits keep their
            (B, T, C) shape and ``loss`` is ``None``. With ``targets`` the
            logits are returned flattened to (B*T, C), the layout
            ``F.cross_entropy`` is given, together with the scalar loss.
        """
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            return logits, None

        # cross_entropy wants (N, C) logits against (N,) class indices
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building autograd graphs
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample ``max_new_tokens`` tokens after ``idx``.

        Args:
            idx: (B, T) tensor of integer token ids holding the current context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            (B, T + max_new_tokens) tensor: ``idx`` followed by the sampled ids.
        """
        for _ in range(max_new_tokens):
            # get the predictions (no targets, so the loss is None and is ignored)
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

# Build the model and run one forward pass on the sample batch.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss, sep='\n')

# Sample 100 tokens from the untrained model, starting from a single token with id 0.
print(
    decode(
        m.generate(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=100,
        )[0].tolist()
    )
)

torch.Size([32, 88])
tensor(5.0107, grad_fn=<NllLossBackward0>)

¡)Á?:-4 émGDLG8eD5L¡.TOZNPRjT¡aáJázóE»3¡Ú.
PRgéQ3OsKRVf5MUl¿A–Ú-ytñNV7B(-4H3ih1Ú.:f;3rpj6c,ÉHSOrnsx»


In [13]:
# AdamW over all of the model's parameters, with a learning rate of 1e-3.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=0.001)

In [14]:
# Train with larger batches than the demo above; get_batch() reads this global.
batch_size = 32
for steps in range(10_000): # increase number of steps for good results...
    # clear the gradients left over from the previous step
    optimizer.zero_grad(set_to_none=True)

    # forward pass on a freshly sampled batch
    xb, yb = get_batch('train')
    logits, loss = m(xb, yb)

    # backward pass and parameter update
    loss.backward()
    optimizer.step()

# loss of the final mini-batch only
print(loss.item())


2.2918145656585693


In [15]:
# Sample 400 tokens from the trained model, starting from a single token with id 0.
print(
    decode(
        m.generate(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=400,
        )[0].tolist()
    )
)


mannzPió¡sto, 
centajososereni sen fane pestr; hodontide KClel mes pe y irandoquere meúbama; ve pun delo n idida potrogo.
Die to ie lzachal den de ha quvien mu la y lldebodos a s quozcren y quen Quenase Cuentundan 
is doreza pañxtrrtadi de qunijenue pe sendagondesYÁü¿Qun iciKCa a sitangos co, co mela 
lviroRha ses e me o; lorenta erile lanco l y e paros ve viés ngastrtos ciferr hade e deado y dele
