In [1]:
### Prepare our data
# Read the whole corpus into memory. The encoding is pinned so the decoded text
# does not depend on the platform's default locale.
with open('input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Vocabulary: every distinct character of the corpus, in sorted order, so that a
# character's position in `chars` can serve as its integer id.
chars = sorted(set(text))
print("total chars: ", len(chars))
print(''.join(chars))


total chars:  65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [2]:
### Encoder and decoder
# Two lookup tables over the vocabulary: id -> char and its inverse char -> id.
id2char = dict(enumerate(chars))
char2id = {ch: idx for idx, ch in id2char.items()}


def encode(x):
    """Return the list of integer ids for the characters of string `x`."""
    return [char2id[ch] for ch in x]


def decode(x):
    """Return the string spelled by the iterable of integer ids `x`."""
    return ''.join(id2char[idx] for idx in x)


# Round-trip check: ids for 'hello', then back to text.
print(encode('hello'))
print(decode(encode('hello')))

[46, 43, 50, 50, 53]
hello


In [1]:
### Now prepare our training set
import torch
torch.manual_seed(1337)

print("Check CUDA available: ", torch.cuda.is_available())

batch_size = 4     # number of windows get_batch draws per call
block_size = 8     # length of each context window
train_ratio = 0.9  # leading fraction of the corpus used for training

data = encode(text)

# Compute the cut once, from the encoded sequence that is actually being sliced,
# so both halves share the same boundary and it stays correct even if encode()
# ever stops producing exactly one id per character.
split_idx = int(len(data) * train_ratio)
train_data = data[:split_idx]
val_data = data[split_idx:]

print("train data size: ", len(train_data))
print(train_data[:block_size+1])
print("val data size: ", len(val_data))

def get_batch(data, n_batch=None, n_block=None):
    """Sample a random batch of (context, target) windows from a token sequence.

    Args:
        data: 1-D sequence of integer token ids (a Python list or a 1-D tensor).
        n_batch: number of windows to draw. When omitted, the module-level
            ``batch_size`` is read at call time.
        n_block: length of each window. When omitted, the module-level
            ``block_size`` is read at call time.

    Returns:
        A pair ``(x, y)`` of ``torch.long`` tensors, each of shape
        ``(n_batch, n_block)``. Row ``r`` of ``y`` is the same window as row
        ``r`` of ``x`` shifted one position to the right in ``data``.

    Raises:
        ValueError: if ``data`` has ``n_block`` or fewer elements, so no window
            with a following target token fits.
    """
    n_batch = batch_size if n_batch is None else n_batch
    n_block = block_size if n_block is None else n_block

    if len(data) <= n_block:
        raise ValueError(
            "data has {} tokens but at least {} are needed for a window of {} "
            "plus its target".format(len(data), n_block + 1, n_block)
        )

    # Plain ints index both lists and tensors; the upper bound leaves room for
    # the one-token lookahead needed by the target window.
    starts = torch.randint(0, len(data) - n_block, (n_batch,)).tolist()
    x = torch.stack([torch.as_tensor(data[s:s + n_block], dtype=torch.long) for s in starts])
    y = torch.stack([torch.as_tensor(data[s + 1:s + n_block + 1], dtype=torch.long) for s in starts])
    return x, y

x_b_l, y_b_1 = get_batch(train_data)

# For the first two rows of the batch, show every prefix of the context next to
# the id that follows it. Both sides are printed as integer ids; wrap them in
# decode(...) to see characters instead.
for row in range(2):
    for t in range(block_size):
        context_ids = x_b_l[row, :t + 1].tolist()
        next_id = [y_b_1[row, t].item()]
        print("Given context {} the next char {}".format(context_ids, next_id))


Check CUDA available:  True


NameError: name 'encode' is not defined

In [4]:
## Bigram

import torch.nn as nn
from torch.nn import functional as F
# Re-seed PyTorch's global RNG so the parameter initialisation and the sampling
# done later in this cell are repeatable.
torch.manual_seed(1337)

class BigramModel(nn.Module):
    """Bigram language model: next-token logits are a table lookup on the current token.

    The only parameters are a ``vocab_size x vocab_size`` embedding table, so
    the prediction at each position depends solely on the token at that position.
    """

    def __init__(self, vocab_size):
        """Create the lookup table for a vocabulary of ``vocab_size`` symbols."""
        super().__init__()
        # Row i holds the unnormalised scores for the token that follows token i.
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)

    ## b: batch
    ## l: context length
    ## c: channel of the output
    def forward(self, input_b_l, target_b_1=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            input_b_l: long tensor of token ids, shape (B, L).
            target_b_1: optional long tensor of target ids. Despite the name it
                must hold B*L elements (e.g. shape (B, L)), one per input position.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape (B, L, C)
            and ``loss`` is ``None``. With targets, ``logits`` is flattened to
            (B*L, C) and ``loss`` is the cross-entropy over all B*L positions.
        """
        out_b_l_c = self.embedding_table(input_b_l)
        if target_b_1 is None:
            return out_b_l_c, None

        B, L, C = out_b_l_c.shape
        # cross_entropy expects (N, C) logits against (N,) class ids; reshape
        # (rather than view) also accepts non-contiguous tensors.
        out_bl_c = out_b_l_c.reshape(B * L, C)
        loss = F.cross_entropy(out_bl_c, target_b_1.reshape(B * L))
        return out_bl_c, loss

    @torch.no_grad()  # sampling never backpropagates, so skip autograd bookkeeping
    def generate(self, x_b_l, max_new_tokens):
        """Autoregressively extend ``x_b_l`` by ``max_new_tokens`` sampled tokens.

        Args:
            x_b_l: long tensor of starting token ids, shape (B, L).
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Long tensor of shape (B, L + max_new_tokens): the input followed by
            the sampled continuation.
        """
        for _ in range(max_new_tokens):
            # Call the module (not .forward) so registered hooks still run.
            logits_b_l_c, _ = self(x_b_l)
            # Only the last position predicts the token that comes next.
            logits_b_c = logits_b_l_c[:, -1, :]
            probs_b_c = F.softmax(logits_b_c, dim=-1)
            idx_next_b_1 = torch.multinomial(probs_b_c, num_samples=1)
            x_b_l = torch.cat([x_b_l, idx_next_b_1], dim=1)

        return x_b_l
    
# Untrained model over the character vocabulary; check logits shape and loss
# on the demo batch.
model = BigramModel(vocab_size=len(chars))
out_bl_c, loss = model(x_b_l, y_b_1)
print(out_bl_c.shape, loss)

# Start from a single token with id 0 and sample 100 more tokens.
started_text_1_1 = torch.zeros((1, 1), dtype=torch.long)
sampled_1_l = model.generate(started_text_1_1, max_new_tokens=100)
g_text = sampled_1_l[0].tolist()
print("Generated text: ", decode(g_text))

torch.Size([32, 65]) tensor(4.8786, grad_fn=<NllLossBackward0>)
Generated text:  
Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [5]:
## Train the model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# get_batch reads this global at call time, so training uses 32 windows per
# step instead of the 4 used for the demo batch.
batch_size = 32
for step in range(10000):
    inputs, targets = get_batch(train_data)

    # Forward pass, clear stale gradients, backpropagate, update.
    _logits, loss = model(inputs, targets)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Report the current mini-batch loss every 100 steps.
    if not step % 100:
        print(loss.item())

    

4.704006195068359
4.659500598907471
4.471990585327148
4.323152542114258
4.255801677703857
4.245046138763428
4.165693759918213
4.048964977264404
4.097479343414307
3.7496376037597656
3.7070794105529785
3.716240406036377
3.637645959854126
3.424874782562256
3.433396577835083
3.427090644836426
3.3038835525512695
3.2864811420440674
3.190141439437866
3.202833414077759
3.139291763305664
3.0029618740081787
3.0597994327545166
2.9590420722961426
2.982276201248169
2.9200470447540283
2.84088134765625
2.8899765014648438
2.9750688076019287
2.808044672012329
2.7770206928253174
2.747230291366577
2.6850526332855225
2.679885149002075
2.68688702583313
2.810159683227539
2.691971778869629
2.66461181640625
2.6310133934020996
2.7520360946655273
2.5809037685394287
2.629011869430542
2.624750852584839
2.547957181930542
2.58158540725708
2.6034939289093018
2.617574453353882
2.5722484588623047
2.511366367340088
2.6074514389038086
2.5077037811279297
2.5723509788513184
2.4938509464263916
2.5230987071990967
2.48254013

In [6]:
# Sample 400 new tokens from the model, starting from a single token with id 0.
started_text_1_1 = torch.zeros((1, 1), dtype=torch.long)
sampled_1_l = model.generate(started_text_1_1, max_new_tokens=400)
g_text = sampled_1_l[0].tolist()
print("Generated text: ", decode(g_text))

Generated text:  
Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercckehathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht anjx?

DUThineent.

Lavinde.
athave l.
KEONGBUCHandspo be y,-hedarwnoddy scace, tridesar, wne'shenou
