# Bi-Gram Language Model

In [4]:
with open('wizard_of_oz.txt','r',encoding='utf-8') as f:
    text = f.read()
len(text)

232310

In [3]:
print(text[:200])

  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW


In [7]:
chars = sorted(set(text))
chars, len(chars)

(['\n',
  ' ',
  '!',
  '"',
  '&',
  "'",
  '(',
  ')',
  '*',
  ',',
  '-',
  '.',
  '0',
  '1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  ':',
  ';',
  '?',
  'A',
  'B',
  'C',
  'D',
  'E',
  'F',
  'G',
  'H',
  'I',
  'J',
  'K',
  'L',
  'M',
  'N',
  'O',
  'P',
  'Q',
  'R',
  'S',
  'T',
  'U',
  'V',
  'W',
  'X',
  'Y',
  'Z',
  '[',
  ']',
  '_',
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'j',
  'k',
  'l',
  'm',
  'n',
  'o',
  'p',
  'q',
  'r',
  's',
  't',
  'u',
  'v',
  'w',
  'x',
  'y',
  'z',
  '\ufeff'],
 81)

In [8]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [11]:
encoded_word = encode('hello')
decoded_word = decode(encoded_word)

encoded_word, decoded_word

([61, 58, 65, 65, 68], 'hello')

In [12]:
import torch

data = torch.Tensor(encode(text)).type(torch.long)
data[:100]

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])

In [14]:
n = int(0.8*len(data))
train_data = data[:n]
test_data = data[n:]

In [15]:
block_size = 8

X = train_data[:block_size]
Y = train_data[1:block_size+1]

for t in range(block_size):
    context = X[:t+1]
    target = Y[t]
    print(f'When input is {context}, target is {target}')

When input is tensor([80]), target is 1
When input is tensor([80,  1]), target is 1
When input is tensor([80,  1,  1]), target is 28
When input is tensor([80,  1,  1, 28]), target is 39
When input is tensor([80,  1,  1, 28, 39]), target is 42
When input is tensor([80,  1,  1, 28, 39, 42]), target is 39
When input is tensor([80,  1,  1, 28, 39, 42, 39]), target is 44
When input is tensor([80,  1,  1, 28, 39, 42, 39, 44]), target is 32


In [16]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [17]:
block_size = 8
batch_size = 4

In [20]:
from torch import nn

# Initialize an embedding layer
vocab_size = 80
embedding_dim = 6
embedding = nn.Embedding(vocab_size,embedding_dim)

# Create some input indices
input_indices = torch.LongTensor([1,5,3,2])

# Apply the embedding layer
embedded_output = embedding(input_indices)

# The output will be a tensor of shape (4,100) where 4 is the number of inputs & 100 is the dimensionality of the embedding vectors
print(embedded_output.shape)
print(embedded_output)

torch.Size([4, 6])
tensor([[ 0.8591,  2.7228, -0.5484, -0.1491,  0.3851,  0.6441],
        [ 0.2710, -1.0774,  1.4590,  0.3323, -1.6848,  0.8318],
        [-0.8694, -0.3782, -1.2340,  0.1948,  0.5592, -0.5220],
        [ 0.0528, -1.9901, -0.2951, -0.8300,  1.5629, -0.1984]],
       grad_fn=<EmbeddingBackward0>)


### Summary

In [49]:
import torch
from torch import nn

device = 'cuda' if torch.cuda.is_available() else 'cpu'
block_size = 8
batch_size = 4
max_iters = 1000
eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250

device

'cpu'

In [22]:
with open('wizard_of_oz.txt','r',encoding='utf-8') as f:
    text = f.read()
# len(text)
chars = sorted(set(text))
vocab_size = len(chars)

chars, vocab_size

(['\n',
  ' ',
  '!',
  '"',
  '&',
  "'",
  '(',
  ')',
  '*',
  ',',
  '-',
  '.',
  '0',
  '1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  ':',
  ';',
  '?',
  'A',
  'B',
  'C',
  'D',
  'E',
  'F',
  'G',
  'H',
  'I',
  'J',
  'K',
  'L',
  'M',
  'N',
  'O',
  'P',
  'Q',
  'R',
  'S',
  'T',
  'U',
  'V',
  'W',
  'X',
  'Y',
  'Z',
  '[',
  ']',
  '_',
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'j',
  'k',
  'l',
  'm',
  'n',
  'o',
  'p',
  'q',
  'r',
  's',
  't',
  'u',
  'v',
  'w',
  'x',
  'y',
  'z',
  '\ufeff'],
 81)

In [23]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.Tensor(encode(text)).type(torch.long)
data[:100]

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])

In [29]:
X = train_data[:block_size]
Y = train_data[1:block_size+1]

for t in range(block_size):
    context = X[:t+1]
    target = Y[t]
    print(f'When input is {context}, target is {target}')

When input is tensor([80]), target is 1
When input is tensor([80,  1]), target is 1
When input is tensor([80,  1,  1]), target is 28
When input is tensor([80,  1,  1, 28]), target is 39
When input is tensor([80,  1,  1, 28, 39]), target is 42
When input is tensor([80,  1,  1, 28, 39, 42]), target is 39
When input is tensor([80,  1,  1, 28, 39, 42, 39]), target is 44
When input is tensor([80,  1,  1, 28, 39, 42, 39, 44]), target is 32


In [43]:
n = int(0.8*len(data))
train_data = data[:n]
test_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else test_data
    ix = torch.randint(len(data) - block_size,(batch_size,))
    #print(ix)
    X = torch.stack([data[i:i+block_size] for i in ix])
    Y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    X, Y = X.to(device), Y.to(device)

    return X, Y

X, Y = get_batch('train')
print(f'inputs:\n{X}\ntargets:\n{Y}')

inputs:
tensor([[ 3,  0,  0, 43, 68,  1, 73, 68],
        [67, 57,  0, 73, 76, 62, 72, 73],
        [68, 74, 67, 57,  1, 73, 61, 58],
        [72,  1, 72, 61, 58,  1, 69, 58]])
targets:
tensor([[ 0,  0, 43, 68,  1, 73, 68, 60],
        [57,  0, 73, 76, 62, 72, 73, 62],
        [74, 67, 57,  1, 73, 61, 58, 66],
        [ 1, 72, 61, 58,  1, 69, 58, 58]])


In [36]:
class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = nn.functional.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)

            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)

            # apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1) # (B, C)

            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)

            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)

        return index
    
model = BigramLanguageModel(vocab_size).to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


1K4z:vd)xAex*fRkUB  V)NzeY&hw*jJLXV&6hp)!2'
5h"i5VWNi)U5tMm '-M;y2L4d2'
[ME*eTxM5&H67T9z4DW
G]q['NEz&TiC1 CMRkz'-"rAampk,K"G(4Qxswrz]ab(,f.Zj0cq6B5k,xYGzvsPir*BgPvHXvo_8I-BUY&]mEkX4Q5'kp1ImNxADWa
FzpSqr*!)x,p]!!]p)x5pd(zR oWrdvTRk1uTEQUm;6vH_&*89&[y*6djJh74Mf&6FZnMobIBzTe[a'kcRFzdu3DgoD?'- !6Fh;iEUYAcE4E7A0'TEXq,KuUocT,-z!:]7a
zl.LA'-*E*p'4(ZWsc89*c4s2.10a",11I,[m"E'sU
 w,XKhpBoMG3Cer5kp]O(3!B]"xTE'xJiA TIng,p55c]Q23I-MDgm*d2.Ez7.0!Xz7_dVa2nQcVy'k2pg*lrUO1NercFKn!A"r7a[aj6MfcT(,:]d&"UV0"Bk,3'l;0


In [65]:
def estimate_loss(model):
    model.eval()
    with torch.inference_mode():
        out = {}
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                logits, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    model.train()

    return out

In [75]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss(model)
        print(f"step: {iter},\ttrain loss: {losses['train']:.3f},\tval loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0,	train loss: 2.583,	val loss: 2.582
step: 250,	train loss: 2.538,	val loss: 2.561
step: 500,	train loss: 2.548,	val loss: 2.568
step: 750,	train loss: 2.532,	val loss: 2.555
2.198699474334717


In [76]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


Han

IUWhe."Thtl, aperedig are imloc&4g o tra re tulley watutaliced pssere sue, med f':ealmpsp sigans he the f by sed gheth peverin T-O11."
D e Them dd ilL8INz_le. ure unghebud lle be aleot, ISllone ck s 1;?"F(B,"OYKZerervamengragy lly,x"Hkld TEullirthid id bYU?PQX:xl s heyoy."7jQ

"D"Dowererompswad;0edeyoye opo

id EX*7Mand by,
hes, tepr t m1Wl. mag s ak'Ssthee d anothigP:_ TVime, My an INQonerllo t s, BV?"Wicapotout wit UjG!5q6R.

 ullorm le ke cr kee aliong;y th whed dRf ig c4&F&k

"
fra m S'
