In [1]:
import torch
from torch.nn import functional as F
from torch import nn

# --- Hyperparameters ---------------------------------------------------------
block_size = 128           # context length: tokens per sequence
batch_size = 128           # sequences per batch
n_heads = 6                # attention heads per layer
n_layers = 6               # number of stacked decoder layers
d_model = 128 * n_heads    # embedding width (128 dimensions per head)
# Hidden width of the feed-forward sub-layer. It is handed to nn.Linear as a
# feature count, so it has to be an int; `d_model * 2.5` alone is a float.
dff = int(d_model * 2.5)
dropout = 0.2
learning_rate = 3e-4
epochs = 5_000             # optimisation steps (each step uses one sampled batch)
eval_iters = 20            # batches averaged per split when estimating the loss

# --- Device ------------------------------------------------------------------
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == 'cuda':
    # Turn on cuDNN benchmark mode for GPU runs.
    torch.backends.cudnn.benchmark = True

ModuleNotFoundError: No module named 'torch'

In [2]:
# Download the corpus: plain-text books from Project Gutenberg, e.g. https://www.gutenberg.org/cache/epub/70219/pg70219.txt (uncomment the lines below to fetch and merge them)

# !wget https://www.gutenberg.org/cache/epub/70219/pg70219.txt
# !wget https://www.gutenberg.org/files/50430/50430-0.txt
# !wget https://www.gutenberg.org/cache/epub/42727/pg42727.txt
# !wget https://www.gutenberg.org/cache/epub/15725/pg15725.txt
# !wget https://www.gutenberg.org/files/57654/57654-0.txt
# !wget https://www.gutenberg.org/cache/epub/57303/pg57303.txt
# !wget https://www.gutenberg.org/cache/epub/47287/pg47287.txt
# !wget https://www.gutenberg.org/cache/epub/17013/pg17013.txt

# with open('pg70219.txt', 'r') as f:
#     text = f.read()

# with open('50430-0.txt', 'r') as f:
#     text += f.read()

# with open('pg42727.txt', 'r') as f:
#     text += f.read()

# with open('pg15725.txt', 'r') as f:
#     text += f.read()

# with open('57654-0.txt', 'r') as f:
#     text += f.read()

# with open('pg57303.txt', 'r') as f:
#     text += f.read()

# book_list = ['pg70219.txt', '50430-0.txt', 'pg42727.txt', 'pg15725.txt', '57654-0.txt', 'pg57303.txt', 'pg47287.txt', 'pg17013.txt']

# text = ''
# for book in book_list:
#     with open(book, 'r') as f:
#         text += f.read()

# # save text to file
# with open('text_all.txt', 'w') as f:
#     f.write(text)

In [3]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Queries, keys and values are all linear projections (without bias) of the
    same input. A strictly upper-triangular boolean mask prevents a position
    from attending to later positions.

    Args:
        n_heads: number of attention heads; must divide ``d_model``.
        d_model: width of the input and output embeddings.
        block_size: longest sequence length the causal mask supports.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, n_heads, d_model, block_size, dropout=0.1):

        super().__init__()
        assert d_model % n_heads == 0, "Embedding dimension must be 0 modulo number of heads."

        # Keep the geometry on the instance so forward() uses the values this
        # module was built with rather than same-named module-level globals.
        self.n_heads = n_heads
        self.d_model = d_model
        self.d_head = d_model // n_heads

        self.dropout = nn.Dropout(dropout)
        self.query = nn.Linear(d_model, d_model, bias=False)
        self.key = nn.Linear(d_model, d_model, bias=False)
        self.value = nn.Linear(d_model, d_model, bias=False)
        self.att_proj = nn.Linear(d_model, d_model, bias=False)
        # True above the diagonal, i.e. at every (query, key) pair where the
        # key lies in the future. Registered as a buffer so it follows .to().
        self.register_buffer('mask', torch.triu(torch.ones(block_size, block_size).bool(), diagonal=1))

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: tensor of shape (B, T, d_model) with T <= block_size.

        Returns:
            Tensor of shape (B, T, d_model).

        Raises:
            ValueError: if T exceeds the size of the causal mask.
        """
        B, T, _ = x.shape
        h, dk = self.n_heads, self.d_head

        if T > self.mask.shape[0]:
            raise ValueError(
                f"Sequence length {T} exceeds the block size {self.mask.shape[0]} of the attention mask."
            )

        # Linear projections, then split the channel dim into heads: B,h,T,dk
        q = self.query(x).view(B, T, h, dk).permute(0, 2, 1, 3)
        k = self.key(x).view(B, T, h, dk).permute(0, 2, 1, 3)
        v = self.value(x).view(B, T, h, dk).permute(0, 2, 1, 3)

        # Scaled dot-product scores: B,h,T,dk @ B,h,dk,T --> B,h,T,T
        att = q @ k.transpose(-2, -1)
        att = att * dk ** -0.5
        # Crop the mask to the actual sequence length so inputs shorter than
        # block_size are handled as well.
        att = att.masked_fill(self.mask[:T, :T], float('-inf'))
        att = F.softmax(att, dim=-1)
        # Dropout on the attention weights (identity in eval mode).
        att = self.dropout(att)

        out = att @ v  # B,h,T,T @ B,h,T,dk --> B,h,T,dk
        # Merge the heads back into the channel dimension: B,T,h*dk
        out = out.transpose(2, 1).contiguous().view(B, T, h * dk)
        return self.att_proj(out)  # B,T,d_model
    
class AttentionLayer(nn.Module):
    """Thin wrapper that owns a MultiHeadAttention module and delegates to it.

    A Dropout module is also created from ``dropout``, but forward() does not
    apply it; the output is exactly what the wrapped attention returns.
    """

    def __init__(self, n_heads, d_model, block_size, dropout):
        super().__init__()

        self.att = MultiHeadAttention(
            n_heads=n_heads,
            d_model=d_model,
            block_size=block_size,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Return the wrapped attention module's output for ``x``."""
        return self.att(x)
    
class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: width of the input and output.
        dff: width of the hidden layer. Integral floats (e.g. ``1920.0``, as
            produced by ``d_model * 2.5``) are accepted and converted to int,
            because nn.Linear needs integer feature counts.
        dropout: dropout probability applied after the ReLU.

    Raises:
        ValueError: if ``dff`` is not a whole number.
    """

    def __init__(self, d_model, dff, dropout, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        hidden = int(dff)
        if hidden != dff:
            # Refuse to truncate silently: a fractional width is a config error.
            raise ValueError(f"dff must be a whole number, got {dff!r}")

        self.seq = nn.Sequential(
                    nn.Linear(d_model, hidden),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(hidden, d_model)
                    )

    def forward(self, x):
        """Run ``x`` (last dimension ``d_model``) through the block."""
        return self.seq(x)
    
class DecoderLayer(nn.Module):
    """One pre-norm decoder layer: attention and feed-forward, each residual.

    Each sub-layer receives a LayerNorm-ed copy of its input. Its output is
    passed through dropout and then added back onto the un-normalised input.

    Args:
        n_heads: number of attention heads.
        d_model: embedding width.
        block_size: longest supported sequence length.
        dropout: dropout probability, shared by the sub-layers and by the
            dropout applied to each sub-layer's output.
        dff: hidden width of the feed-forward sub-layer.
    """

    def __init__(self, n_heads, d_model, block_size, dropout, dff) :
        super().__init__()

        self.att = AttentionLayer(n_heads, d_model,
                                  block_size, dropout)

        self.ffw = FeedForward(d_model, dff, dropout)
        self.lnorm1 = nn.LayerNorm(d_model)
        self.lnorm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the layer output; same shape as ``x``."""
        # Dropout on each sub-layer output before the residual add. This was
        # previously constructed but never applied. It is a no-op in eval mode.
        x = x + self.dropout(self.att(self.lnorm1(x)))
        x = x + self.dropout(self.ffw(self.lnorm2(x)))

        return x

In [5]:
class Model(nn.Module):
    """Decoder-only language model over integer token ids.

    Token and learned position embeddings are summed, passed through a stack
    of DecoderLayer modules and projected to vocabulary logits.

    Args:
        vocab_size: number of distinct tokens.
        block_size: longest sequence length (number of position embeddings).
        dropout: dropout probability handed to every decoder layer.
        dff: hidden width of the feed-forward sub-layers.
        n_layers: number of decoder layers.

    Note:
        ``d_model`` and ``n_heads`` are not constructor arguments; they are
        read from the module-level configuration when the model is built.
    """

    def __init__(self, vocab_size, block_size, dropout, dff, n_layers, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)

        self.block_size = block_size
        self.embedding_table = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Embedding(block_size, d_model)

        self.decoder = nn.Sequential(*[DecoderLayer(n_heads,
                                                    d_model,
                                                    block_size,
                                                    dropout,
                                                    dff)
                                                    for _ in range(n_layers)])

        self.out = nn.Linear(d_model, vocab_size)

    def forward(self, x, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            x: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits have shape (B, T, C)
            and loss is None. With targets, logits are flattened to (B*T, C)
            and loss is the mean cross-entropy.
        """
        T = x.shape[1]

        embeds = self.embedding_table(x)  # B,T,d_model
        # Build position ids from the input itself (its length and device)
        # rather than from the global block_size / device.
        positions = self.pos_embedding(torch.arange(T, device=x.device))  # T,d_model
        x = embeds + positions
        x = self.decoder(x)
        logits = self.out(x)

        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(input=logits, target=targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively after ``idx``.

        Prompts shorter than ``block_size`` are left-padded with token id 0 up
        to ``block_size``, so the prompt sits directly before the first
        sampled token. Sampling runs in eval mode without gradient tracking;
        the previous train/eval mode is restored afterwards.

        Args:
            idx: (B, T) tensor of prompt token ids.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, max(T, block_size) + max_new_tokens) tensor: the (possibly
            padded) prompt followed by the sampled tokens.
        """
        # idx is (B, T) array of indices in the current context
        B, T = idx.shape
        if T < self.block_size:
            # F.pad takes (left, right) for the last dimension. Pad on the
            # left so the zeros come before the prompt, not after it.
            idx = F.pad(idx, (self.block_size - T, 0))

        was_training = self.training
        self.eval()  # disable dropout while sampling
        try:
            for _ in range(max_new_tokens):
                # use only the last block_size tokens
                idx_cond = idx[:, -self.block_size:]
                # get the predictions
                logits, _ = self(idx_cond)
                # focus only on the last time step
                logits = logits[:, -1, :]  # becomes (B, C)
                # apply softmax to get probabilities
                probs = F.softmax(logits, dim=-1)  # (B, C)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # append sampled index to the running sequence
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        finally:
            self.train(was_training)
        return idx
    
# Read the whole corpus; the context manager closes the file handle.
with open('text_all.txt') as f:
    text = f.read()

# Character-level vocabulary: every distinct character, in sorted order.
vocab = sorted(set(text))
vocab_size = len(vocab)

# Lookup tables between characters and integer token ids.
stoi = {c: i for i, c in enumerate(vocab)}
itos = {i: c for i, c in enumerate(vocab)}


def encode(s):
    """Map a string to the list of its token ids."""
    return [stoi[c] for c in s]


def decode(ids):
    """Map an iterable of token ids back to a string."""
    return ''.join(itos[i] for i in ids)


data = torch.tensor(encode(text))

# 90 % / 10 % split into training and validation tokens.
n_tr = int(len(data) * 0.9)
n_val = len(data) - n_tr

train = data[:n_tr]
# Start exactly at n_tr: the previous `n_tr+1` dropped one token, so the
# validation split was one shorter than n_val.
val = data[n_tr:]

def make_batches(split):
    """Sample a random batch of input/target windows from one split.

    ``split == 'train'`` selects the training tokens; any other value selects
    the validation tokens. Returns ``(x, y)``, each of shape
    (batch_size, block_size) on ``device``, where ``y`` is ``x`` shifted one
    token to the right.
    """
    source = val if split != 'train' else train
    # One random start offset per sequence in the batch.
    starts = torch.randint(len(source) - block_size, (batch_size, ))

    inputs, labels = [], []
    for s in starts:
        inputs.append(source[s:s+block_size])
        labels.append(source[s+1:s+1+block_size])

    x = torch.stack(inputs).to(device)
    y = torch.stack(labels).to(device)
    return x, y

# Draw one training batch up front.
Xb, Yb = make_batches('train')

# Build the model on the selected device and attach an AdamW optimizer to it.
m = Model(vocab_size, block_size, dropout, dff, n_layers)
m = m.to(device)
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

# Report the total number of parameter elements.
n_params = sum(p.numel() for p in m.parameters())
print(f'Number of parameters: {n_params}')

@torch.no_grad()
def estimate_loss(m):
    """Average the loss of ``m`` over ``eval_iters`` random batches per split.

    The model is put in eval mode for the measurement and switched back to
    train mode before returning. Returns a dict with keys ``'train'`` and
    ``'val'`` holding the mean loss of each split as a 0-d tensor.
    """
    m.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            _, batch_loss = m(*make_batches(split))
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    m.train()
    return averages

Number of parameters: 42968299
Iteration 99. Training Loss: 2.451. Evaluation Loss: 2.429
Iteration 199. Training Loss: 2.134. Evaluation Loss: 2.126
Iteration 299. Training Loss: 1.996. Evaluation Loss: 1.983
Iteration 399. Training Loss: 1.864. Evaluation Loss: 1.848
Iteration 499. Training Loss: 1.756. Evaluation Loss: 1.749
Iteration 599. Training Loss: 1.662. Evaluation Loss: 1.669
Iteration 699. Training Loss: 1.614. Evaluation Loss: 1.617
Iteration 799. Training Loss: 1.558. Evaluation Loss: 1.567
Iteration 899. Training Loss: 1.523. Evaluation Loss: 1.535
Iteration 999. Training Loss: 1.484. Evaluation Loss: 1.510
Iteration 1099. Training Loss: 1.457. Evaluation Loss: 1.485
Iteration 1199. Training Loss: 1.432. Evaluation Loss: 1.457
Iteration 1299. Training Loss: 1.413. Evaluation Loss: 1.452
Iteration 1399. Training Loss: 1.400. Evaluation Loss: 1.432
Iteration 1499. Training Loss: 1.385. Evaluation Loss: 1.421
Iteration 1599. Training Loss: 1.356. Evaluation Loss: 1.403
Iter

KeyboardInterrupt: 

In [None]:
# Training: one optimisation step per iteration, each on a freshly sampled batch.
for epoch in range(epochs):

    # Forward pass on a new random training batch.
    Xb, Yb = make_batches('train')
    logits, loss = m(Xb, Yb)  # logits come back flattened: (B*T, C)

    # Backward pass and parameter update.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Report the averaged train/val loss on every 100th iteration only.
    if (epoch + 1) % 100 != 0:
        continue

    l = estimate_loss(m)
    print(f"Iteration {epoch}. Training Loss: {l['train']:.3f}. Evaluation Loss: {l['val']:.3f}")

In [7]:
# Sample text from the trained model, starting from a short seed prompt.
max_new_tokens = 10_000
seed_text = ". "
seed_idx = torch.tensor(encode(seed_text), device=device).unsqueeze(0)  # (1, T)

# Sample in eval mode (dropout off) and without autograd bookkeeping, then put
# the model back in whatever mode it was in so training can resume unchanged.
was_training = m.training
m.eval()
try:
    with torch.no_grad():
        predictions = m.generate(seed_idx, max_new_tokens)
finally:
    m.train(was_training)

# generate() pads prompts shorter than block_size up to block_size tokens and
# leaves longer prompts untouched, so the prompt (plus any padding) occupies
# this many leading tokens of the result.
pad_len = max(m.block_size, seed_idx.shape[1])
generated_text = decode(predictions[0].tolist())
generated_text = generated_text[pad_len:]  # keep only the newly sampled text
print(generated_text)

--A




--Es pies el que, si si nos trabaja la tierra, end
someter un hombre. Este te caías vigor, que tomé lo hicemente
     ellos, y todos que traición no se ha quitarado de sías... De
     casabas, que no puedo hacer, sin vergüente, ni otros.

Las names exhalando de unas montes envinanzasenas, atraelando
     fresquen bastantes son en el suelo, donde por poco traemo reconocír
     derributón; también así yo la pesadía no hemos encagerme,
     apareciere engañar volverías.

[Lasteras no sabían que allí nunca encontrémamos en el principio
     que unos lechamos están en Occidente.

[25] --¡Que tenías indóvil... sintió un gran pedazo--le he conciliado a
     cercano y a la temple, nacido de ley que
     le da mar, si quieres veces acbedo un peso. Algunos que tenían
siempres vasias. Morenos le sentalesen alzar, atravesando bajo
      gratemente azotaparroquio te importa. Su hora cuerpo a toda él
     se conservaía en su mano a las casas de narrar paer pronunciar
      Cube bien. RiPario