In [170]:
# Read the whole training corpus into memory as a single string.
with open("input.txt", mode="r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Character-level vocabulary: every distinct character of the corpus in sorted
# order, so each character's id is its position in `chars`.
chars = sorted(set(text))
ctoi = {c: i for i, c in enumerate(chars)}  # character -> token id
itoc = dict(enumerate(chars))  # token id -> character


def decoder(x):
    """Turn an iterable of integer token ids back into a string.

    Raises KeyError if an id is not in the vocabulary.
    """
    return "".join(itoc[i] for i in x)


def encoder(x):
    """Turn a string into a list of integer token ids, one per character.

    Raises KeyError if a character is not in the vocabulary.
    """
    return [ctoi[c] for c in x]


# Smoke test: encoding followed by decoding must reproduce the input exactly.
test_text = "hello world"
encoding = encoder(test_text)
assert decoder(encoding) == test_text, "encoder/decoder round trip failed"
ctoi


{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [171]:
import torch

# Encode the entire corpus once into a flat tensor of int64 token ids.
data = torch.tensor(encoder(text), dtype=torch.long)
data.shape, data.dtype


(torch.Size([1115394]), torch.int64)

In [172]:
# The first 90% of the tokens are used for training; the remainder is held out
# for validation.
training_split = int(0.9 * data.shape[0])
train_data, val_data = data[:training_split], data[training_split:]


In [173]:
# Peek at the first 40 token ids of the training split.
train_data[:40]


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56])

In [174]:
# Seed PyTorch's global RNG so the random draws made after this point
# (batch sampling, token sampling) are repeatable.
torch.manual_seed(1337)
batch_size = 4  # number of sequences drawn per batch
block_size = 8  # number of tokens in each sequence


def get_batch(split="train"):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split: "train" draws from ``train_data``; any other value draws from
            ``val_data``.

    Returns:
        A tuple ``(x, y)`` of tensors shaped ``(batch_size, block_size)``.
        ``y`` is ``x`` shifted one position to the right in the source data,
        so ``y[b, t]`` is the token that follows ``x[b, t]``.

    Raises:
        ValueError: if the chosen split has fewer than ``block_size + 1``
            tokens, i.e. not even one (input, target) window fits.
    """
    source = train_data if split == "train" else val_data
    # A window starting at s needs source[s : s + block_size + 1], so the valid
    # starts are 0 .. len - block_size - 1. torch.randint's upper bound is
    # exclusive, hence `high = len - block_size` (subtracting a further 1 would
    # make the last valid window unreachable).
    n_starts = source.shape[0] - block_size
    if n_starts <= 0:
        raise ValueError(
            f"split {split!r} has {source.shape[0]} tokens; "
            f"need at least block_size + 1 = {block_size + 1}"
        )
    starts = torch.randint(0, n_starts, (batch_size,)).tolist()
    x = torch.stack([source[s : s + block_size] for s in starts])
    y = torch.stack([source[s + 1 : s + block_size + 1] for s in starts])
    return x, y


# Draw one training batch (the default split) to feed the model below.
xb, yb = get_batch()


In [182]:
import torch.nn as nn
from torch.nn import functional as F


class BigramLanguageModel(nn.Module):
    """Bigram model: next-token logits are looked up from the current token alone.

    The only parameter is a ``vocab_size x vocab_size`` embedding table; row
    ``i`` holds the unnormalised scores for the token that follows token ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            x: integer token ids, shaped ``(B, T)``.
            targets: optional token ids with the same number of elements as
                ``x``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is ``(B, T, C)``
            and ``loss`` is ``None``. With targets, ``logits`` is flattened to
            ``(B * T, C)`` (the layout ``F.cross_entropy`` expects) and
            ``loss`` is the scalar cross-entropy against the flattened targets.
        """
        logits = self.token_embedding(x)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        loss = F.cross_entropy(logits, targets.view(-1))
        return logits, loss

    @torch.no_grad()  # sampling never backpropagates; skip building autograd graphs
    def generate(self, x, max_tokens):
        """Extend each sequence in ``x`` by ``max_tokens`` sampled tokens.

        Args:
            x: integer token ids, shaped ``(B, T)``, used as the starting
                context.
            max_tokens: number of tokens to append to every sequence.

        Returns:
            Tensor of shape ``(B, T + max_tokens)``: the original context
            followed by the sampled continuation.
        """
        for _ in range(max_tokens):
            logits, _ = self(x, None)
            # Only the last position's scores decide the next token.
            probs = F.softmax(logits[:, -1, :], dim=-1)
            next_t = torch.multinomial(probs, num_samples=1)
            x = torch.cat((x, next_t), dim=1)
        return x


# Build the model over the character vocabulary and run one forward pass on the
# sampled batch to obtain the loss before any training.
m = BigramLanguageModel(vocab_size=len(chars))
logits, loss = m(xb, targets=yb)

# Sample 100 tokens from the untrained model, starting from a single token id 0,
# and print them as text.
generation = m.generate(torch.tensor([[0]]), max_tokens=100)
print(decoder(generation[0].tolist()))



uF&R;g
i$.vV-Pe?EMeBeB:NBijsbrD.zvxL,pMj?SFdVyPvxuJRkMtlOsO,Z.wmb'vusoV
aQPtud,wwi$stAEF
q. vy.yO?av


In [184]:
# Adam over the model's parameters.
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

for steps in range(20000):
    # Clear gradients left over from the previous step before computing new ones.
    optimizer.zero_grad(set_to_none=True)

    # Forward pass on a freshly sampled training batch, then backprop and update.
    xb, yb = get_batch("train")
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

# Loss of the final mini-batch only.
print(loss.item())

2.3860368728637695


In [186]:
# Sample 400 tokens from the model after training, again starting from a single
# token id 0, and print them as text.
generation = m.generate(torch.tensor([[0]]), max_tokens=400)
print(decoder(generation[0].tolist()))


Whe ppr, hy, mout j:
Maite'dsshe fthenge he t iuglpery, h.
Wins t are'd d t as hitt, hiren me oue spo Must fout tharge y wnd, awheder;
Of gr gou; thy.
NNCHo amas, ar al d thormilys stheses veeensu h Bul CKig.
Thert? qupZRGRI'sy, t mereanehen He ard ss ot!und sbeis n bllen vehenshis yoorirvit we-P
W:
A:
R:
Thedif ayonofld:
I s deverigo begotis t wis:

Dandicred, my es arcot;
Weo ar;

CULordildd cou


In [14]:
import torch
import json

# Parse a JSON array and convert it to a float32 tensor.
# The parsed list is bound to `payload` rather than `input` so the builtin
# input() is not shadowed for the rest of the kernel session.
# NOTE(review): `data` rebinds the name used for the encoded corpus earlier in
# this notebook — confirm that overwrite is intended.
payload = json.loads("[0]")
data = torch.tensor(payload, dtype=torch.float32)
data

tensor([0.])