In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F


# hyperparameters
batch_size = 64
block_size = 256
max_iters = 4000
eval_interval = 100
learning_rate = 0.0003
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 512  # Increased embedding dimension
n_head = 6
n_layer = 6
dropout = .2

torch.manual_seed(1337)

with open('/kaggle/input/kafkaswork/Kafka.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters to integers
class BPE_Tokenizer:
    def __init__(self, vocab):
        self.vocab = vocab
        self.stoi = {ch: i for i, ch in enumerate(self.vocab)}
        self.itos = {i: ch for i, ch in enumerate(self.vocab)}

    def encode(self, s):
        tokens = list(s)
        return [self.stoi[t] for t in tokens]

    def decode(self, l):
        return ''.join([self.itos[i] for i in l])

tokenizer = BPE_Tokenizer(chars)
encode = tokenizer.encode
decode = tokenizer.decode

# Train and test splits
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads * head_size, n_embd)  # Adjusted here
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=3000)[0].tolist()))

19.096658 M parameters
step 0: train loss 4.6322, val loss 4.6278
step 100: train loss 2.4216, val loss 2.4325
step 200: train loss 2.3674, val loss 2.3799
step 300: train loss 2.2881, val loss 2.3082
step 400: train loss 2.0991, val loss 2.1299
step 500: train loss 1.9020, val loss 1.9451
step 600: train loss 1.7551, val loss 1.8090
step 700: train loss 1.6544, val loss 1.7234
step 800: train loss 1.5714, val loss 1.6461
step 900: train loss 1.5072, val loss 1.5922
step 1000: train loss 1.4548, val loss 1.5437
step 1100: train loss 1.4123, val loss 1.5109
step 1200: train loss 1.3739, val loss 1.4780
step 1300: train loss 1.3398, val loss 1.4562
step 1400: train loss 1.3089, val loss 1.4299
step 1500: train loss 1.2874, val loss 1.4225
step 1600: train loss 1.2602, val loss 1.3972
step 1700: train loss 1.2401, val loss 1.3835
step 1800: train loss 1.2247, val loss 1.3783
step 1900: train loss 1.2085, val loss 1.3691
step 2000: train loss 1.1880, val loss 1.3542
step 2100: train loss 1

In [6]:
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))


broughly before. Mrs!" Hear -- "No to say, yelGilled has jumps up onto the hall weep." 
His right accompanied. Why as fasting for Georg, the assistant 
when he was sitting his visit, stakes a louder, seanted back as and un a stranger weigh 
takes and among arouble two quite well-mescaper despair, that was in his speechness and 
achieved his legs. In this start lace, the beast scord before small schools that 
kind of have and surely bettered -- ahd as must fall fares in a cecombar sense sduce of 
burdeness him in the idea, and half-o a fult of races jump, he is it not very funting down 
when hims might over the help of a time to himself, for in any condition of the 
heart laugh, but often burrow off bearers course to him after a flee will be dark 
budge that itself up tolerate work. 
Tranials him down K. 
See aloud with a tlightful walk and his rise after toward me when I like by the 
sister was nearly too valid ago, after that this they are now worry how such a 
very gister penety, or

In [7]:
print(decode(m.generate(context, max_new_tokens=900)[0].tolist()))


now it wasn't a character. Even strange pavements, contemplation on Josephine's binds, 
haven't inclined but Josephine, must have had always been possible thing explicitly, 
which inside it was something or assistance is micdle as an evertaken that. Our occasions 
stand tolerate to understands another the assistants of the onlookers. Quarge, and their 
existent meant to keep them and fear jumped too; for a long day dog 
nothing else; not during earlier one of the moletation horses seen might neces for me 
andn't out. 
A few mining, which I live now admit is, were pretending in course. They all comfortate 
somewhere else eff and the last time again when that evening I found myself had 
to distroge and difficult, flung themselves and fell covered by the such hanging me 
understanding, staring which aparently enjoyed him to appear the basing act scase, 
often why he those planess 
would not
