In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(text[:1000])
device = 'mps' if torch.has_mps else 'cpu'

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:
vocab = ''.join(sorted(list(set(text))))
vocab_size = len(vocab)
itos = list(vocab)
stoi = {itos[i] : i for i in range(vocab_size)}

n = int(0.9*len(text))
test_text = text[:n]
val_text = text[n:]
train_data = torch.tensor([stoi[c] for c in test_text])
val_data = torch.tensor([stoi[c] for c in val_text])


    

In [5]:
g = torch.manual_seed(1337)
block_size = 8
batch_size = 4
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,), generator=g)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

In [6]:
xb, yb = get_batch('train')
g = torch.manual_seed(1337)
class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size: int) -> None:
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx) # (B, T, C) = (4, 8, 65)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, idx, max_tokens=100):
        # idx.shape = (B, T)
        for _ in range(max_tokens):
            logits, _ = self(idx[:, -1]) #  (4, 65)
            prob = F.softmax(logits, dim=-1) # (1, 65)
            next_word = torch.multinomial(prob.to('cpu'), 1, generator=g)  # (1, 1)
            idx = torch.cat((idx, next_word.to(device)), dim=1)
        
        for batch in idx:
            print(''.join(itos[n] for n in batch))

        
m = BigramLanguageModel(vocab_size).to(device)
logits, loss = m(xb, yb)
idx = torch.zeros((4,1), dtype=int, device=device)
m.generate(idx)
print(loss.item())


Sqcot?p.k&lFhF$bjuDnmW-jKppY,3&YxfFJZgXXQq-LKuC z3SqhzkhJrQ!PmU?WWnPgZcbVTbdtt$Rlv$ktORIs&duXY,SU'Pl

STET:CERjqPyjKuLehVnlgFEj?aZR
JW: f$etNXrFCkRr:keviHkdbfXiyJ?GrnmaSqbWhsug!uxhOLasi
pNJApJq-AUA'zeha

S.LP q3SK;wwf?EwXya!weDOj:&oibo-zoT;lxzUYIBTiq.DqVlv&vv'TbHxim'zoIM?a!vnE
o fTXiq-Ya!Pgc;gm,evj?q-wO

JWMvvn3!.jgCMj3Sx;
SVjusJBNNOpM,ARppxl,i.i-Yg.qfN:BiPcnZALPsqHEaoiNVRyF-oALETlxj3SV?3:rmZY:3r rDCIo!
4.878634452819824


In [7]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
g = torch.manual_seed(1337)
xb, yb = get_batch('train')
for i in range(10):

    # forward
    m = BigramLanguageModel(vocab_size).to(device)
    logits, loss = m(xb, yb)

    # backward
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    # update
    optimizer.step()



print(loss.item())
m.generate(idx)

4.514373779296875

,33v
j!tjZfw&,FL!SFolW3Yi.Xm3tvp
YUH?ZUGa:UUfi myovbpUPFHeb!hYG?L:A?g&w&b!&jR ;D:x$?jUUfezG.v
YKI,TJ

Q tlEi?i;j!C'EByGzNZlr3IfR.lQAzNEo-P!qcx$C'
EBThWJUPrwM33hMz'wf
sNB
QOo.&FrwdVHwm33BEyr$VC.&IUKevsXS

;GicEhuIgJ'roPJ&inx$fTS?:Z,J:ZHtFZXIozG&FPcxnVCPBy!x 3,JEBZff,?GezsqCJu$3eWfR?q3.;nQ;TtUBk;GeII,-o
p

jsmOevNEQ?iCuRdC'O&zaQfRQV:L,3MQbk;VLygarSTL,wVQSXI$Ioq
fzU,3YmkU;EjK3HK:$m'&gXd3!Aw
pJem'I
G$bptcsV


In [8]:
emb = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8], dtype=torch.float)
wei = torch.ones((emb.shape[0], emb.shape[0]))
wei = torch.tril(wei)
wei = wei.masked_fill(wei == 0, float('-inf'))
wei = F.softmax(wei, 1)
wei @ emb

tensor([1.0000, 1.5000, 2.0000, 2.5000, 3.0000, 3.5000, 4.0000, 4.5000])

In [9]:
head_size = 16
B, T, C = 4, 8, 32 # batch, time, channels
x = torch.randn(B, T, C)
query = nn.Linear(C, head_size, bias=False)
key = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

q = query(x) # (4, 8, 16)
k: torch.Tensor = key(x) # (4, 8, 16)
wei = q @ k.transpose(-2, -1) * head_size**-0.5 # (4, 8, 16) (4, 16, 8) -> (4, 8, 8)

wei = torch.tril(wei)
wei = wei.masked_fill(wei == 0, float('-inf'))
wei = F.softmax(wei, 2)  # (4, 8, 8)

v = value(x) # (4, 8, 16)
o = wei @ v # (4, 8, 16)



In [20]:

import torch
import torch.nn as nn
from torch.nn import functional as F

block_size = 32 # context length
dropout = 0.2
n_embed = 64
n_head = 4
n_layer = 4
device = 'mps' if torch.has_mps else 'cpu'
class Head(nn.Module):

    def __init__(self, head_size) -> None:
        super().__init__()
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones((block_size, block_size))))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        q: torch.Tensor = self.query(x)
        k: torch.Tensor = self.key(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5

        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, -1)
        wei = self.dropout(wei)

        v: torch.Tensor = self.value(x)
        return wei @ v
    
class MultiHeadAttention(nn.Module):

    def __init__(self, head_count, head_size) -> None:
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(head_count)])
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x: torch.Tensor):
        # x = (4, 8, 64)
        out = torch.cat([head(x) for head in self.heads], dim=-1) # out = (4, 8, 64), each head(x) outputs (4, 8, 16)
        out = self.proj(out) # (4, 8, 64)
        return self.dropout(out)
    
class FeedForward(nn.Module):
    def __init__(self, n_embed) -> None:
        super().__init__()
        self.net = nn.Sequential(*[
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout)
        ])
    
    def forward(self, x: torch.Tensor):
        return self.net(x)
    
class Block(nn.Module):
    def __init__(self, n_head, head_size) -> None:
        super().__init__()
        self.multihead = MultiHeadAttention(n_head, head_size)
        self.ffw = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)


    def forward(self, x: torch.Tensor):
        x = x + self.multihead(self.ln1(x))
        return x + self.ffw(self.ln2(x))
    
class BigramLanguageModel2(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[Block(n_head, n_embed // n_head) for _ in range(n_layer)])
        self.ln = nn.LayerNorm(n_embed)
        self.ffw = nn.Linear(n_embed, vocab_size)
    
    def forward(self, x: torch.Tensor, target=None):
        B, T = idx.shape
        token_embedding = self.token_embedding_table(x)
        position_embedding = self.position_embedding_table(torch.arange(T, device=device))
        x = token_embedding + position_embedding
        x = self.blocks(x)
        x = self.ln(x)
        logits: torch.Tensor = self.ffw(x) # (B, T, vocab_size)

        if target is None:
            loss = None
        else:
            B, T = logits.shape
            logits = logits.view(B*T, -1) # (B, T, vocab_size) -> (B*T, vocab_size)
            # target: (B, T) -> (B*T,)
            target = target.view(B*T)
            F.cross_entropy(logits, target)

        return logits, loss
    
    def generate(self, idx: torch.Tensor, max_tokens=100):
        for _ in range(max_tokens):
            # idx: (B, T)
            logits, _  = self(idx[:, -block_size:]) # (B, T, vocab_size)
            logits = logits[:, -1, :] # (B, 1, vocab_size)
            prob = F.softmax(logits, 1) # (B, vocab_size)
            idx_next = torch.multinomial(prob, 1)
            idx = torch.cat((idx, idx_next), dim=1)
        
        for batch in idx:
            print(''.join(itos[n] for n in batch))


m = BigramLanguageModel2().to(device)
idx = torch.zeros((7, 1), dtype=torch.long, device=device)
m.generate(idx)


        

    


KkY3Ssm:o ituJS,W,'s&hs&Mv!FSA3:
lopZ
;UgQCZSPlZsx!P3Bw3s3UdDJPqinudyMXYAmd&sQM;:jiC$TS;3JzKMsWQkZR 

O3I!YS,FJfDwDqjMY,.UqsLD?ldUZ
p;QErRdZFqL,f;.s3w?,U$sv! qULLpi;qSBr,3EdA?c3PvyN
bF!B.uvRC&MlvpeUFKzI

:&oF&v-,wbfMvouqhSljdBKSyMqZEjl?SSNuNRZS,KB Vyd-SW:q3:'voF!zmTLoQ?vDsr:qCSWvcuiq3TszLLgk,;vZhL,!XR.W

 &'MUlbm:Wzz&R-UEsIvP:TAvNcfpErsqpbL&IgKMzPgpvBz,Uwn3ByqC;m.C:pckskg,h3w;;zLuqRB.hV&UtCziU.:JwnSdvNX

.v&VdlHqvfLS$z;sB w,SVvVwnSdOSLbe&fJXsD.,qJiYimA:E3IMdmSqxQptlhAUZw,YkgPiYsZPOPKSHspGn?ZOF Z&.zYvV
!

dGHsUq;dmBH&qUgs!U3rB;nhqOGAUqWfKXS&vvZEmJOJugZIHEAs-AEZWKvGQQ&Sb sAw;H &AWvBYvp.?YRnywwyEV3
Ml,qIMY

IYEwiPMsLZJwvBmsN&XU3,vWsD!3Q vEK'bLMFFvv;Kj3V.'TqBMRuqi-PBMO?qSMVqUDoNrblYm.3jrfSEestj3&YX.Qq!RLZ$



In [409]:
a = torch.ones((2, 1, 2))
a[:, -1, :].shape

torch.Size([2, 2])