In [111]:
import os
import torch
# get the directory of the current script
script_dir = '' # os.path.dirname(os.path.abspath(__file__))

# Construct the path to the data file
data_file_path = os.path.join(script_dir, '../data/input.txt')

# read it in to inspect it
with open(data_file_path, 'r', encoding='utf-8') as f:
    text = f.read()

In [112]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [113]:
#create a mapping of characters to integers
stoi = {c: i for i, c in enumerate(chars)}
itos = {i: c for i, c in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda x: ''.join([itos[i] for i in x])

encodedHello = encode('hello')
print(encodedHello)
print(decode(encodedHello))


[46, 43, 50, 50, 53]
hello


In [114]:
# incode entire training text dataset
data = torch.tensor(encode(text), dtype=torch.long)

In [115]:
# Let's now split u[p the data into train and validation sets
n = int( 0.9 * len(data))
train_data, val_data = data[:n], data[n:]

print(train_data.shape, val_data.shape)
print('train_data[:100]')
print(train_data[:100])

torch.Size([1003854]) torch.Size([111540])
train_data[:100]
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [116]:
block_size = 8
train_dataset = data.unfold(0, block_size + 1, block_size)

In [117]:
for train_data in train_dataset:
    for t in range(block_size):
        context = train_data[:t+1]
        target = train_data[t+1]
        
        print(f'when input is {context}, target is {target}')
    break

when input is tensor([18]), target is 47
when input is tensor([18, 47]), target is 56
when input is tensor([18, 47, 56]), target is 57
when input is tensor([18, 47, 56, 57]), target is 58
when input is tensor([18, 47, 56, 57, 58]), target is 1
when input is tensor([18, 47, 56, 57, 58,  1]), target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]), target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is 58


In [118]:
torch.manual_seed(1337)
batch_size = 32  # how many independent sequences will we process in parallel?
block_size = 8  # what is the maximum context length for predictions?

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

xb, yb = get_batch('train')
print('inputs')
print(xb.shape)
print(xb)
print('targets')
print(yb.shape)
print(yb)


inputs
torch.Size([32, 8])
tensor([[18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
        [18, 47, 56, 57, 58,  1, 15, 47],
       

In [119]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(BigramLanguageModel, self).__init__()
        self.embeddings_table = nn.Embedding(vocab_size, embedding_dim)
        
    def forward(self, idx, targets = None):
        # idx and targets are both (B, T) tensors of integers
        logits = self.embeddings_table(idx)  # (B, T, C)
        if targets is None:
          loss = None
        else:
          B, T, C = logits.shape
          logits = logits.view(B * T, C)
          targets = targets.view(B * T)
          loss = F.cross_entropy(logits, targets)
        return logits, loss
      
    def generate(self, x, max_new_tokens=100):
      for _ in range(max_new_tokens):
          logits, loss = self(x)
  
          logits = logits[:, -1, :]
          probs = F.softmax(logits, dim=-1)
          x = torch.cat((x, torch.multinomial(probs, num_samples=1)), dim=1)
      return x
      


In [120]:
m = BigramLanguageModel(vocab_size, vocab_size)
logits, loss = m(xb, yb)
print(loss)
idx = torch.zeros(1, 1, dtype=torch.int)
generate_encoded_text = m.generate(idx, max_new_tokens=100)[0].tolist()
print(decode(generate_encoded_text))


tensor(4.4032, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [129]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

batch_size = 32
for i in range(10000):
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(f'loss: {loss.item()}')


loss: 0.17331044375896454
loss: 0.17331036925315857
loss: 0.17331036925315857
loss: 0.1733102649450302
loss: 0.17331019043922424
loss: 0.17331011593341827
loss: 0.17331011593341827
loss: 0.1733100563287735
loss: 0.17331001162528992
loss: 0.17330995202064514
loss: 0.17330989241600037
loss: 0.17330987751483917
loss: 0.1733098328113556
loss: 0.1733098030090332
loss: 0.17330975830554962
loss: 0.17330963909626007
loss: 0.1733095943927765
loss: 0.1733095943927765
loss: 0.1733095645904541
loss: 0.1733095496892929
loss: 0.17330947518348694
loss: 0.17330947518348694
loss: 0.17330947518348694
loss: 0.17330941557884216
loss: 0.17330940067768097
loss: 0.1733093559741974
loss: 0.173309326171875
loss: 0.1733093112707138
loss: 0.173309326171875
loss: 0.17330923676490784
loss: 0.17330923676490784
loss: 0.17330916225910187
loss: 0.1733091026544571
loss: 0.1733090877532959
loss: 0.1733090877532959
loss: 0.1733090728521347
loss: 0.1733090728521347
loss: 0.17330901324748993
loss: 0.17330892384052277
loss:

In [136]:
idx = torch.zeros(1, 1, dtype=torch.int)
generate_encoded_text = m.generate(idx, max_new_tokens=400)[0].tolist()
print(decode(generate_encoded_text))  


ScN&SqVJ$wMg!UNSlFirst Cirst Cirst Cit Cirst Cit Cit Cirst Cit Cirst Cirst Cirst Cit Cirst Cit Cirst Cirst Cirst Cirst Cirst Cirst Cirst Cit Cit Cirst Cit Cirst Cit Cirst Cit Cit Cit Cit Cit Cit Cirst Cirst Cirst Cirst Cirst Cirst Cirst Cirst Cirst Cirst Cit Cirst Cit Cirst Cirst Cirst Cirst Cirst Cirst Cit Cirst Cit Cirst Cirst Cit Cit Cit Cit Cit Cirst Cit Cirst Cirst Cit Cirst Cirst Cit Cit Cir
