In [40]:
import os
import numpy as np
import torch

In [37]:
input_data_path = 'data/input.txt'
with open(input_data_path, 'r', encoding='utf-8') as f:
    text = f.read()
print(type(text))
print('number of characters in dataset:', len(text),'\n\n')
print(text[0:100])

<class 'str'>
number of characters in dataset: 1115394 


First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [17]:
vocab_chars = sorted(list(set(text)))
vocab_size = len(vocab_chars)
print(vocab_chars)
print(vocab_size)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
65


In [34]:
# create a mapping between characters and integers using our vocab list
str_to_i = {char:i for i,char in enumerate(vocab_chars)}
i_to_str = {i:char for i,char in enumerate(vocab_chars)}

# define character level encoder, to convert chars in integers
encode = lambda s : [str_to_i[c] for c in s] 
# define decoder, to convert integers to chars
decode = lambda s : ''.join([i_to_str[c] for c in s] )

sample_str = 'Hello World!'
encoded_str = encode(sample_str)
decoded_str = decode(encoded_str)
print(f'sample_str: {sample_str}\nencoded_str: {encoded_str}\ndecoded_str: {decoded_str}')

sample_str: Hello World!
encoded_str: [20, 43, 50, 50, 53, 1, 35, 53, 56, 50, 42, 2]
decoded_str: Hello World!


In [26]:
str_to_i['a']

39

In [38]:
# tokenize complete data

In [41]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[0:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


### train and val split

In [42]:
train_frac = 0.9 # using 90 percent of data for training and remaining data for validation
n  = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [45]:
""" Note - 1. we never feed the whole text into the transformer all at once as its not computationally efficient.
2. instead we break it down into chunks of text and sample random loads of the chunks to feed to the transformer at a time
3. In the following example we are having block size of 8 but 9 chars are taken because we want to have 8 samples for transformer to predict (n-1)"""
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [46]:
x = train_data[:block_size]
y = train_data[1:block_size+1] # off set by one coz this is the set that we want to predict eg, for x[0]->x[1] or y[0] is the pred and so on

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when input is the context:{context}, the target is: {target}')

when input is the context:tensor([18]), the target is: 47
when input is the context:tensor([18, 47]), the target is: 56
when input is the context:tensor([18, 47, 56]), the target is: 57
when input is the context:tensor([18, 47, 56, 57]), the target is: 58
when input is the context:tensor([18, 47, 56, 57, 58]), the target is: 1
when input is the context:tensor([18, 47, 56, 57, 58,  1]), the target is: 15
when input is the context:tensor([18, 47, 56, 57, 58,  1, 15]), the target is: 47
when input is the context:tensor([18, 47, 56, 57, 58,  1, 15, 47]), the target is: 58


In [60]:
torch.manual_seed(42)
batch_size = 4 # number of independent sequences to process parallely
block_size = 8 # maximum context length for predictions

def get_batch(split):
    # generate small batch of data of inputs x, and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size] for i in ix])
    return x,y

In [62]:
xb,yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)

print('targets:')
print(yb.shape)
print(yb)

inputs:
torch.Size([4, 8])
tensor([[ 6,  0, 14, 43, 44, 53, 56, 43],
        [39,  1, 42, 59, 43,  1, 39, 52],
        [47, 41, 43,  1, 39, 52, 42,  1],
        [53, 44,  1, 50, 43, 58,  1, 58]])
targets:
torch.Size([4, 7])
tensor([[ 0, 14, 43, 44, 53, 56, 43],
        [ 1, 42, 59, 43,  1, 39, 52],
        [41, 43,  1, 39, 52, 42,  1],
        [44,  1, 50, 43, 58,  1, 58]])


In [66]:
import torch.nn as nn
from torch.nn import functional as f
torch.manual_seed(42)

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)
        
    def forward(self,idx,targets):
        # idx and targets are both (B,T) tensor of integers
        logits  = self.token_embedding_table(idx) #(B,T,C)
        loss = F.cross_entropy(logits,targets)
        return logits
    
m = BigramLanguageModel(vocab_size)
out = m(xb,yb)
print(out.shape)

torch.Size([4, 8, 65])
