In [36]:
### Prepare our data
# Read the whole training corpus into memory. The encoding is pinned so the
# decoded text (and therefore the vocabulary built below) does not depend on
# the platform's default locale encoding.
with open('input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Character-level vocabulary: every distinct character in the corpus, sorted so
# that the position of each character (its token id) is deterministic.
chars = sorted(set(text))
print("total chars: ", len(chars))
print(''.join(chars))

total chars:  65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
cc 



In [37]:
### Encoder and decoder
# Two lookup tables derived from the sorted vocabulary: a character's position
# in `chars` is its token id, so the two mappings are exact inverses.
id2char = dict(enumerate(chars))
char2id = {c: i for i, c in id2char.items()}


def encode(x):
    """Return the list of token ids for the characters of `x`.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [char2id[c] for c in x]


def decode(x):
    """Return the string spelled by the token ids in `x`."""
    return ''.join(id2char[i] for i in x)


# Round-trip check: ids for 'hello', then back to text.
print(encode('hello'))
print(decode(encode('hello')))

[46, 43, 50, 50, 53]
hello


In [38]:
### Now prepare our training set
import torch
torch.manual_seed(1337)

# Hyperparameters. batch_size and block_size are read as globals by get_batch().
batch_size = 4      # number of windows sampled per batch
block_size = 8      # length of each sampled window, in tokens
train_ratio = 0.9   # leading fraction of the corpus used for training

data = encode(text)

# Compute the split point once, from the encoded sequence that is actually
# being sliced, so the two halves are guaranteed to meet at the same index.
split_idx = int(len(data) * train_ratio)
train_data = data[:split_idx]
val_data = data[split_idx:]

print("train data size: ", len(train_data))
# One window plus its final target: block_size inputs need block_size + 1 tokens.
print(train_data[:block_size+1])
print("val data size: ", len(val_data))

def get_batch(data):
    """Sample a random batch of (input, target) windows from `data`.

    Draws `batch_size` random start offsets and cuts a window of `block_size`
    tokens at each one; the target window is the same span shifted one
    position to the right. `batch_size` and `block_size` are read from the
    module globals at call time.

    Returns:
        A pair (x, y) of torch.long tensors, each of shape
        (batch_size, block_size).
    """
    # The exclusive upper bound leaves room for the target window, which
    # reaches one token further than the input window.
    starts = torch.randint(0, len(data) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(torch.tensor(data[start:stop], dtype=torch.long))
        targets.append(torch.tensor(data[start + 1:stop + 1], dtype=torch.long))
    return torch.stack(inputs), torch.stack(targets)

x_b_l, y_b_1 = get_batch(train_data)

# Show how a single window unrolls into block_size training examples: for every
# prefix of an input row, the target at the same position is the next token.
# (Wrap both printed values in decode(...) to see characters instead of ids.)
for row in range(2):
    for pos in range(block_size):
        context = x_b_l[row, :pos + 1].tolist()
        next_token = [y_b_1[row, pos].item()]
        print("Given context {} the next char {}".format(context, next_token))


train data size:  1003854
[18, 47, 56, 57, 58, 1, 15, 47, 58]
val data size:  111540
Given context [24] the next char [43]
Given context [24, 43] the next char [58]
Given context [24, 43, 58] the next char [5]
Given context [24, 43, 58, 5] the next char [57]
Given context [24, 43, 58, 5, 57] the next char [1]
Given context [24, 43, 58, 5, 57, 1] the next char [46]
Given context [24, 43, 58, 5, 57, 1, 46] the next char [43]
Given context [24, 43, 58, 5, 57, 1, 46, 43] the next char [39]
Given context [44] the next char [53]
Given context [44, 53] the next char [56]
Given context [44, 53, 56] the next char [1]
Given context [44, 53, 56, 1] the next char [58]
Given context [44, 53, 56, 1, 58] the next char [46]
Given context [44, 53, 56, 1, 58, 46] the next char [39]
Given context [44, 53, 56, 1, 58, 46, 39] the next char [58]
Given context [44, 53, 56, 1, 58, 46, 39, 58] the next char [1]


In [44]:
## Bigram

import torch.nn as nn
from torch.nn import functional as F
# Re-seed torch's global RNG so the random draws made by the rest of this cell
# (e.g. the torch.multinomial sampling in generate) repeat on every run.
torch.manual_seed(1337)

class BigramModel(nn.Module):
    """Bigram character-level language model.

    The whole model is one (vocab_size x vocab_size) embedding table: looking
    up token id `t` returns the row of next-token logits for `t`, so every
    prediction depends only on the current token.

    Shape suffixes used in variable names:
        b: batch
        l: context length
        c: channel of the output (equal to vocab_size here)
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, input_b_l, target_b_1=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            input_b_l: integer tensor of token ids, shape (B, L).
            target_b_1: optional integer tensor of next-token ids holding
                B * L elements (one target per input position, despite the
                `_1` suffix in the name). Defaults to None for inference.

        Returns:
            A pair (logits, loss):
              * without targets, logits has shape (B, L, C) and loss is None;
              * with targets, logits is flattened to (B*L, C) and loss is the
                scalar cross-entropy over all B*L positions.
        """
        out_b_l_c = self.embedding_table(input_b_l)
        if target_b_1 is None:
            return out_b_l_c, None

        B, L, C = out_b_l_c.shape
        # cross_entropy wants (N, C) logits and (N,) class indices. reshape is
        # used instead of view so non-contiguous targets (e.g. a transposed
        # tensor) are accepted instead of raising.
        out_bl_c = out_b_l_c.reshape(B * L, C)
        loss = F.cross_entropy(out_bl_c, target_b_1.reshape(B * L))
        return out_bl_c, loss

    @torch.no_grad()  # sampling never backpropagates; skip autograd bookkeeping
    def generate(self, x_b_l, max_new_tokens):
        """Extend each sequence in `x_b_l` by `max_new_tokens` sampled tokens.

        Args:
            x_b_l: integer tensor of token ids, shape (B, L), used as the
                starting context.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, L + max_new_tokens): the input
            followed by the sampled continuation.
        """
        for _ in range(max_new_tokens):
            # Call the module (not .forward) so registered hooks still run.
            logits_b_l_c, _ = self(x_b_l)
            # Only the last position is needed to predict the next token.
            logits_b_c = logits_b_l_c[:, -1, :]
            probs_b_c = F.softmax(logits_b_c, dim=-1)
            idx_next_b_1 = torch.multinomial(probs_b_c, num_samples=1)
            x_b_l = torch.cat([x_b_l, idx_next_b_1], dim=1)

        return x_b_l
    
# Build the untrained model with one vocabulary entry per distinct character.
model = BigramModel(vocab_size=len(chars))

# Sanity check on the demo batch: print the logits shape and the initial loss.
out_bl_c, loss = model(x_b_l, y_b_1)
print(out_bl_c.shape, loss)

# Sample 100 tokens starting from a 1x1 context holding token id 0.
started_text_1_1 = torch.zeros((1, 1), dtype=torch.long)
generated_1_l = model.generate(started_text_1_1, max_new_tokens=100)
g_text = generated_1_l[0].tolist()
print("Generated text: ", decode(g_text))

torch.Size([32, 65]) tensor(4.8786, grad_fn=<NllLossBackward0>)
Generated text:  
Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [49]:
## Train the model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# get_batch() reads batch_size from the global scope, so this assignment
# switches every batch drawn from here on to 32 windows.
batch_size = 32
for steps in range(10000):
    xb, yb = get_batch(train_data)
    logits, loss = model(xb, yb)

    # One optimisation step: clear old gradients, backpropagate, update.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    # Report the loss of the current batch every 100 steps.
    if not steps % 100:
        print(loss.item())

    

3.629197120666504
3.578076124191284
3.446087598800659
3.5076892375946045
3.3700804710388184
3.20782208442688
3.1579647064208984
3.18367075920105
3.142939805984497
3.0863029956817627
3.0159080028533936
3.0810744762420654
3.002354860305786
2.9118049144744873
2.8697428703308105
2.8411991596221924
2.833984375
2.7227883338928223
2.7264411449432373
2.7733335494995117
2.767742395401001
2.7809245586395264
2.7070724964141846
2.7179293632507324
2.677147626876831
2.6648378372192383
2.717682123184204
2.807802200317383
2.5662364959716797
2.636556386947632
2.6038124561309814
2.5512754917144775
2.5618717670440674
2.6359477043151855
2.516904830932617
2.532054901123047
2.5444071292877197
2.5594143867492676
2.586312770843506
2.5949020385742188
2.5698494911193848
2.5601165294647217
2.6076669692993164
2.527657985687256
2.5491535663604736
2.536827564239502
2.5520572662353516
2.552196502685547
2.5812461376190186
2.5536060333251953
2.5996265411376953
2.4350359439849854
2.416480541229248
2.480340003967285
2.3

In [51]:
# Sample 400 tokens from the current model, starting from a 1x1 context that
# holds token id 0, then decode the ids back to text.
started_text_1_1 = torch.zeros((1, 1), dtype=torch.long)
sampled_1_l = model.generate(started_text_1_1, max_new_tokens=400)
g_text = sampled_1_l[0].tolist()
print("Generated text: ", decode(g_text))

Generated text:  

Codaketwerothait g ESoco pe teran Gly? go!
Slirenkneverajest young y, cet l th e?
I he t ho jusofas my wd the y, ard h fes, her, KE: e,

Heas tiluroullowh the as hy ea if ansountante dveaghed shavir-laryould s tulintherit?
intus gimbes ad y meroprexco ghthimee, ie, grd!
A:

Mofol yowind thoushenod t.
ARis bey
Whes RDOLENThed min y VOren thethe tll; ive dse mye
CHAus
Tratoreithanghis taifthinereys
