## Importing libraries

In [6]:
import torch
from torch import nn
from torch.nn import functional as F
import numpy as np
from matplotlib import pyplot as plt
import time
import pandas as pd

## Generating vocabulary and reading the dataset

In [7]:
# NOTE(review): absolute, machine-specific path; this cell only runs where this
# exact file exists. Consider making the data location configurable.
# The context manager closes the file handle as soon as the text is read.
with open('/Users/yashsurange/Documents/GitHub/Mulitmodal_llms/tinyshakespear.txt', 'r', encoding='utf-8') as f:
    lines = f.read()

# Character-level vocabulary: every distinct character in the corpus, sorted so
# that the id assigned to each character is stable across runs.
vocab = sorted(set(lines))
itos = {i: ch for i, ch in enumerate(vocab)}  # id -> character
stoi = {ch: i for i, ch in enumerate(vocab)}  # character -> id

print(lines[:30])

First Citizen:
Before we proce


In [8]:
# Number of distinct characters in the corpus, i.e. the vocabulary size
len(vocab)

65

In [24]:
# About 1M tokens (tokens are characters here); for scale, the original LLaMA was trained on 1.4T tokens
len(lines)


1115393

## Using a character-level tokenizer for this implementation

In [19]:
# simple tokenization by characters
def encode(s):
    """Map each character of ``s`` to its integer id using the ``stoi`` table.

    Returns a list with one id per character, in order. A character that is
    not a key of ``stoi`` raises ``KeyError``.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids

def decode(l):
    """Turn an iterable of integer ids back into a string using the ``itos`` table.

    An id that is not a key of ``itos`` raises ``KeyError``.
    """
    chars = (itos[token_id] for token_id in l)
    return ''.join(chars)

print('vocab size:', len(vocab))
# Round-trip check: decoding an encoded string should give the same string back
decode(encode("hello"))

vocab size: 65


'hello'

In [20]:
# Example: one integer id per character
encode('yash')

[63, 39, 57, 46]

In [23]:
# Example: arbitrary ids map back to whatever characters hold those vocabulary slots
decode([23,4,54])

'K&p'

******

### Note 1: Using config object to store parameters. Helps with readability. This will go into a separate file.

*****

In [28]:
# Single dict holding the hyperparameters; starts with just the vocabulary size.
MASTER_CONFIG = dict(vocab_size=len(vocab))


## Creating dataset

In [25]:
# Encode the whole corpus once into a 1-D tensor of character ids.
# int8 can hold ids up to 127, which covers the 65-character vocabulary here;
# a larger vocabulary would need a wider integer dtype.
dataset = torch.tensor(encode(lines), dtype=torch.int8)
dataset.shape

torch.Size([1115393])

## Creating batches


****

### Note 2: The same batching function is used for the train, validation and test splits. Testing functions as you write them is important.

*****

In [42]:
def get_batches(data, split, batch_size, context_window, config=MASTER_CONFIG):
    """Sample a random batch of (input, target) sequences from one split of ``data``.

    ``data`` is cut by position into the first 80% (train), the next 10%
    (validation) and the final 10% (test).

    Args:
        data: 1-D tensor of token ids.
        split: ``'val'`` or ``'test'`` select those splits; any other value
            (including ``'train'``) selects the training split.
        batch_size: number of sequences to draw.
        context_window: length of each sequence.
        config: not used by this function; kept so existing callers keep working.

    Returns:
        ``(x, y)``: two long tensors of shape ``(batch_size, context_window)``.
        Each row of ``y`` is the matching row of ``x`` shifted one position
        forward in the data, so ``y[b, t]`` is the token that follows
        ``x[b, :t+1]``.
    """
    n = len(data)
    train_end = int(.8 * n)
    val_end = int(.9 * n)

    if split == 'val':
        batch_data = data[train_end:val_end]
    elif split == 'test':
        batch_data = data[val_end:]
    else:
        batch_data = data[:train_end]

    # A window starting at i reads indices i .. i + context_window (the extra
    # one is the last target), so valid starts are 0 .. size - context_window - 1.
    # torch.randint's upper bound is exclusive, hence no additional "- 1" here;
    # subtracting one more would make the last valid start unreachable.
    num_starts = batch_data.size(0) - context_window
    ix = torch.randint(0, num_starts, (batch_size,))
    x = torch.stack([batch_data[i:i+context_window] for i in ix]).long()
    y = torch.stack([batch_data[i+1:i+context_window+1] for i in ix]).long()
    return x, y


# Batch hyperparameters used by the cells below
MASTER_CONFIG['batch_size'] = 8
MASTER_CONFIG['context_window'] = 16

xs, ys = get_batches(
    dataset,
    'train',
    MASTER_CONFIG['batch_size'],
    MASTER_CONFIG['context_window'],
)

# Show each sampled input next to its target, both decoded back to text
[(decode(x_row.tolist()), decode(y_row.tolist())) for x_row, y_row in zip(xs, ys)]

[('ler friends,\nI c', 'er friends,\nI cr'),
 ('t? Give her the ', '? Give her the b'),
 (' say, I will kee', 'say, I will keep'),
 ('you to London, a', 'ou to London, an'),
 ('are to keep\nThan', 're to keep\nThan '),
 ('e;\nNow shall he ', ';\nNow shall he t'),
 ('h civil and unci', ' civil and unciv'),
 ('ous Clifford! th', 'us Clifford! tho')]

In [72]:
# First input sequence of the sampled batch, as raw token ids
xs[0]

tensor([50, 43, 56,  1, 44, 56, 47, 43, 52, 42, 57,  6,  0, 21,  1, 41])

In [69]:
# Matching target sequence: the same ids shifted one position forward
ys[0]

tensor([43, 56,  1, 44, 56, 47, 43, 52, 42, 57,  6,  0, 21,  1, 41, 56])

In [58]:
# Slicing to 16 (the context window length) returns the whole row
xs[0][:16]

tensor([50, 43, 56,  1, 44, 56, 47, 43, 52, 42, 57,  6,  0, 21,  1, 41])

In [70]:
# Every row packs context_window training examples: the first i+1 input tokens
# are the context and ys[0][i] is the token the model should predict next.
# Reading the target from ys (rather than xs[0][i+1]) also covers the final
# example, whose target lies one position past the end of the input row.
for i in range(MASTER_CONFIG['context_window']):
    context = xs[0][:i+1]
    target = ys[0][i]
    print(f"with context:{context} and target is: {target}")

with context:tensor([50]) and target is: 43
with context:tensor([50, 43]) and target is: 56
with context:tensor([50, 43, 56]) and target is: 1
with context:tensor([50, 43, 56,  1]) and target is: 44
with context:tensor([50, 43, 56,  1, 44]) and target is: 56
with context:tensor([50, 43, 56,  1, 44, 56]) and target is: 47
with context:tensor([50, 43, 56,  1, 44, 56, 47]) and target is: 43
with context:tensor([50, 43, 56,  1, 44, 56, 47, 43]) and target is: 52
with context:tensor([50, 43, 56,  1, 44, 56, 47, 43, 52]) and target is: 42
with context:tensor([50, 43, 56,  1, 44, 56, 47, 43, 52, 42]) and target is: 57
with context:tensor([50, 43, 56,  1, 44, 56, 47, 43, 52, 42, 57]) and target is: 6
with context:tensor([50, 43, 56,  1, 44, 56, 47, 43, 52, 42, 57,  6]) and target is: 0
with context:tensor([50, 43, 56,  1, 44, 56, 47, 43, 52, 42, 57,  6,  0]) and target is: 21
with context:tensor([50, 43, 56,  1, 44, 56, 47, 43, 52, 42, 57,  6,  0, 21]) and target is: 1
with context:tensor([50,

### These are the (context → target) training examples packed inside a single input sequence

****

### Note 3: Making the model work: 1. Tensor shapes must line up, so the code runs without shape errors. 2. The model's loss should go down during training. It is useful to create a method to evaluate the model.

****

## Creating evaluation method

In [74]:
@torch.no_grad()  # don't compute gradients for this function
def evaluate_loss(model, config=MASTER_CONFIG, eval_iters=10):
    """Estimate the model's mean loss on the train and validation splits.

    For each split, draws ``eval_iters`` random batches from the module-level
    ``dataset`` with ``get_batches`` and averages the losses.

    Args:
        model: called as ``model(xb, yb)`` and expected to return a pair whose
            second element is a scalar loss tensor.
        config: dict providing ``'batch_size'`` and ``'context_window'``.
        eval_iters: number of batches averaged per split (default 10).

    Returns:
        dict mapping ``'train'`` and ``'val'`` to the mean loss over the
        sampled batches.
    """
    # Remember the mode so a model that was already in eval mode is not
    # silently switched to training mode by calling this function.
    was_training = model.training
    out = {}
    model.eval()
    try:
        for split in ["train", "val"]:
            losses = []
            for _ in range(eval_iters):
                xb, yb = get_batches(dataset, split, config['batch_size'], config['context_window'])
                _, loss = model(xb, yb)
                losses.append(loss.item())
            out[split] = np.mean(losses)
    finally:
        # Restore the original mode even if a forward pass raised.
        model.train(was_training)
    return out

## Creating a simple model - we will eventually build LLaMA by swapping out parts of this model

In [None]:
## To be continued