Commented-out cells are meant to explain the code and are not executed. You can uncomment and run them to better understand the code.

For example, the following code shows how to create input-target pairs for training a language model using a sliding-window approach.
```python
x = train_data[:BLOCK_SIZE]
y = train_data[1:BLOCK_SIZE+1]

for t in range(BLOCK_SIZE):
    content = x[:t+1]
    target = y[t]
    print( f"When input is {content} the target is {target}")
```
---

In [None]:
import torch
# Report whether PyTorch's MPS backend is usable on this machine.
print(torch.backends.mps.is_available()) # True only where MPS is available; otherwise DEVICE below falls back to "cuda" or "cpu"

### Global Variables

In [None]:
# Path of the plain-text training corpus.
INPUT_DATA_FILE = './data/input/tinyStoriesData.txt'

# Pick the compute backend: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

BATCH_SIZE = 32  # number of independent sequences processed in parallel
BLOCK_SIZE = 8  # maximum context length used for a prediction
MAX_ITERS = 3000  # optimisation steps run by the training loop
EVAL_INTERVAL = 300  # training steps between two loss reports
LEARNING_RATE = 1e-2  # step size handed to the optimizer
EVAL_ITERS = 200  # batches averaged per split in estimate_loss

In [None]:
# Show which backend was selected for DEVICE.
print(DEVICE)

In [None]:
# Seed PyTorch's random number generator so that runs are repeatable.
torch.manual_seed(1337)

In [None]:
# from dotenv import load_dotenv
# import os
# load_dotenv()
# HF_TOKEN = os.getenv("HF_TOKEN")

### TinyStories data loading from HF and saving locally

In [None]:
# from datasets import load_dataset
# from tqdm.auto import tqdm
# ds = load_dataset("roneneldan/TinyStories", split="train")

# print(ds[0]["text"])  # ds is already the "train" split, so index rows directly

# with open(INPUT_DATA_FILE, 'w', encoding='utf-8') as f:
#         for i, entry in enumerate(ds):
#             # if i >= num_stories:
#             #     break
#             story_text = entry['text'].strip()
            
#             f.write(story_text + "\n\n")

### Reading data file

In [None]:
# Load the entire corpus into memory as a single string.
with open(INPUT_DATA_FILE, "r", encoding="utf-8") as f:
    text = f.read()

#### get vocab info

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order so that
# the character <-> id mapping built from it is the same on every run.
chars = sorted(set(text))  # sorted() accepts any iterable and returns a list
vocab_size = len(chars)
print("".join(chars))
print(vocab_size)

In [None]:
# Peek at the first 100 characters of the corpus.
print(text[:100])

### Character-level encoder/decoder (tokenizer) for text generation

In [None]:
# Lookup tables between characters and their integer ids (the character's
# position in `chars`).
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters.

    Raises KeyError if ``s`` contains a character that is not in ``stoi``.
    """
    return [stoi[c] for c in s]


def decode(l):  # parameter name kept unchanged for existing callers
    """Decoder: take a list of integer ids, return the string they spell.

    Raises KeyError if an id is not in ``itos``.
    """
    return ''.join(itos[i] for i in l)

In [None]:
# Round-trip check: the second line should print the original string back.
print(encode("prayas"))
print(decode(encode("prayas")))

### Create data tensors

In [None]:
# Work on the first 70% of the corpus only, to keep training fast.
data_divider = int(len(text) * 0.7)
data = torch.tensor(
    encode(text[:data_divider]),
    dtype=torch.long,
    device=DEVICE,
)

# Hold out the last 10% of that subset for validation; the rest is training data.
split_point = int(len(data) * 0.9)
train_data, val_data = data[:split_point], data[split_point:]

In [None]:
# BLOCK_SIZE = 8
# train_data[:BLOCK_SIZE+1]

In [None]:
# x = train_data[:BLOCK_SIZE]
# y = train_data[1:BLOCK_SIZE+1]

# for t in range(BLOCK_SIZE):
#     content = x[:t+1]
#     target = y[t]
#     print( f"When input is {content} the target is {target}")
    

In [None]:

def get_batch(split):
    """Sample a random batch of input/target windows.

    `split` selects the source: 'train' reads from train_data, any other value
    reads from val_data. Returns (x, y), each stacking BATCH_SIZE windows of
    BLOCK_SIZE token ids; y is x shifted one position to the right, so
    y[b, t] is the token that follows x[b, :t+1].
    """
    source = val_data if split != 'train' else train_data
    # Random window starts; the upper bound leaves room for the shifted target.
    starts = torch.randint(len(source) - BLOCK_SIZE, (BATCH_SIZE,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + BLOCK_SIZE])
        targets.append(source[start + 1:start + BLOCK_SIZE + 1])
    # The windows are slices of tensors that were created on DEVICE, so no
    # extra transfer is done here.
    return torch.stack(inputs), torch.stack(targets)

# xb, yb = get_batch('train')
# print('<------inputs------>')
# print(xb.shape)
# print(xb)
# print('\n<------targets------>')
# print(yb.shape)
# print(yb)

# print('-'*20)

# for b in range(BATCH_SIZE):
#     for t in range(BLOCK_SIZE):
#         context = xb[b, :t+1]
#         target = yb[b,t]
#         print(f"When input is {context.tolist()} the target: {target}")

### Bigram

In [None]:
import torch.nn as nn
from torch.nn import functional as F

 
class BigramLanguageModel(nn.Module):
    """Bigram language model: the logits for the next token come from a single
    embedding lookup on the current token."""

    def __init__(self, vocab_size):
        super().__init__()
        # One row of next-token logits per vocabulary entry: looking up token
        # id `i` returns row `i` of this (vocab_size, vocab_size) table.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return (logits, loss) for a (B, T) batch of token ids.

        Without targets, logits keep the shape (B, T, C) and loss is None.
        With (B, T) targets, logits come back flattened to (B*T, C) together
        with their cross-entropy loss.
        """
        scores = self.token_embedding_table(idx)  # (B, T, C): Batch, Time, Channel

        if targets is not None:
            # F.cross_entropy expects the class dimension in second place, so
            # fold the batch and time dimensions into one before the loss.
            batch, time, channels = scores.shape
            scores = scores.view(batch * time, channels)
            flat_targets = targets.view(batch * time)
            return scores, F.cross_entropy(scores, flat_targets)

        # No targets means there is no loss to calculate.
        return scores, None

    def generate(self, idx, max_new_tokens):
        """Extend the (B, T) context `idx` by sampling max_new_tokens tokens,
        one at a time, and return the (B, T + max_new_tokens) result."""
        for _ in range(max_new_tokens):
            # Forward pass without targets: only the logits are needed.
            scores, _loss = self(idx)
            # Distribution over the next token, read from the last time step.
            last_step = scores[:, -1, :]  # (B, C)
            next_probs = F.softmax(last_step, dim=-1)
            # Draw one token per sequence and append it to the running context.
            sampled = torch.multinomial(next_probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, sampled), dim=1)  # (B, T+1)
        return idx
    
# m = BigramLanguageModel(vocab_size)
# Build the bigram model and move its parameters to DEVICE.
model = BigramLanguageModel(vocab_size)
m = model.to(DEVICE)  # NOTE: nn.Module.to returns the module itself, so `m` and `model` refer to the same object

# logits, loss = m(xb, yb)
# print(logits.shape)
# print(loss)

# A uniform prediction would give a loss of -ln(1/vocab_size) = ln(vocab_size); the untrained model should land somewhere near that value.


# sample run for prediction
# idx = torch.zeros((1,1), dtype=torch.long, device=DEVICE)
# print(decode(m.generate(idx, max_new_tokens=100)[0].tolist()))

In [None]:
@torch.no_grad()
def estimate_loss():
    """Return the mean loss over EVAL_ITERS random batches for each split.

    The result is a dict with the keys 'train' and 'val'. The model is put in
    eval mode while measuring and switched back to train mode before returning.
    """
    averages = {}
    model.eval()
    for split_name in ('train', 'val'):
        per_batch = torch.zeros(EVAL_ITERS)
        for step in range(EVAL_ITERS):
            inputs, targets = get_batch(split_name)
            _, batch_loss = model(inputs, targets)
            per_batch[step] = batch_loss.item()
        averages[split_name] = per_batch.mean()
    model.train()
    return averages



## Training the model


### Creating PyTorch Optimizer

In [None]:
# AdamW optimizer over all model parameters, using the global learning rate.
optimizer = torch.optim.AdamW(m.parameters(), lr=LEARNING_RATE)

In [None]:
# BATCH_SIZE=32

# for steps in range(10000):
#     # sampling a batch of data
#     xb, yb = get_batch('train')

#     # evaluate the loss
#     logits, loss = m(xb, yb)
#     optimizer.zero_grad(set_to_none=None)
#     loss.backward()
#     optimizer.step()

#     print(loss.item())
# The loop variable is `step` rather than `iter`: binding `iter` at the top
# level would shadow the builtin iter() for the rest of the notebook session.
for step in range(MAX_ITERS):

    # every EVAL_INTERVAL steps, report the averaged train and val losses
    if step % EVAL_INTERVAL == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass: compute the loss on this batch
    logits, loss = model(xb, yb)
    # clear the gradients left by the previous step, backpropagate, then update
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model: start from a single token with id 0 and sample
# 500 new tokens, then decode the ids back to text
context = torch.zeros((1, 1), dtype=torch.long, device=DEVICE)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


In [None]:
# Sample and print another 500-token continuation, again starting from a single token with id 0.
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long, device=DEVICE), max_new_tokens=500)[0].tolist()))