In [None]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-06-19 03:39:46--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-06-19 03:39:46 (24.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [None]:
# Read the downloaded corpus into a single in-memory string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

In [None]:
# Lookup tables between characters and their integer ids, plus helpers built on them.
stoi = {char: index for index, char in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
  """Return the list of integer ids for the characters of string `s`."""
  return [stoi[c] for c in s]


def decode(l):
  """Return the list of characters for the ids in `l` (a list, not a joined string)."""
  return [itos[i] for i in l]

In [None]:
# Encode the whole corpus once and keep it as a 1-D tensor of 64-bit integer ids.
import torch
data = torch.tensor(encode(text), dtype=torch.int64)

In [None]:
# Hold out the last 10% of the encoded corpus for validation; train on the first 90%.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
torch.manual_seed(1337)  # fixed seed so the batches sampled below are repeatable
batch_size=4  # number of sequences drawn per batch by get_batch
block_size=8  # number of tokens in each drawn sequence

def get_batch(split):
  """Draw a random mini-batch of input/target windows from one data split.

  `split` equal to 'train' reads from `train_data`; any other value reads from
  `val_data`. Returns `(xb, yb)`, each stacked from `batch_size` windows of
  `block_size` tokens, where every target window is its input window shifted
  one position to the right.
  """
  source = train_data if split == 'train' else val_data
  # Random start offsets, chosen so that the shifted target window still fits.
  starts = torch.randint(0, len(source) - block_size, (batch_size,))
  inputs, targets = [], []
  for start in starts:
    inputs.append(source[start:start + block_size])
    targets.append(source[start + 1:start + block_size + 1])
  return torch.stack(inputs), torch.stack(targets)
# Draw one training batch to inspect.
xb,yb=get_batch('train')
# Only the last bare expression of a cell is echoed, so the output below shows yb alone.
xb
yb

tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])

In [None]:
# Spell out every (context -> next token) training example packed into the batch.
for b in range(batch_size):  # walk the batch dimension
    for t in range(block_size):  # walk the time dimension
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"when input is {context.tolist()} the target: {target}")

when input is [24] the target: 43
when input is [24, 43] the target: 58
when input is [24, 43, 58] the target: 5
when input is [24, 43, 58, 5] the target: 57
when input is [24, 43, 58, 5, 57] the target: 1
when input is [24, 43, 58, 5, 57, 1] the target: 46
when input is [24, 43, 58, 5, 57, 1, 46] the target: 43
when input is [24, 43, 58, 5, 57, 1, 46, 43] the target: 39
when input is [44] the target: 53
when input is [44, 53] the target: 56
when input is [44, 53, 56] the target: 1
when input is [44, 53, 56, 1] the target: 58
when input is [44, 53, 56, 1, 58] the target: 46
when input is [44, 53, 56, 1, 58, 46] the target: 39
when input is [44, 53, 56, 1, 58, 46, 39] the target: 58
when input is [44, 53, 56, 1, 58, 46, 39, 58] the target: 1
when input is [52] the target: 58
when input is [52, 58] the target: 1
when input is [52, 58, 1] the target: 58
when input is [52, 58, 1, 58] the target: 46
when input is [52, 58, 1, 58, 46] the target: 39
when input is [52, 58, 1, 58, 46, 39] the t

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F


In [None]:
torch.manual_seed(1337)
class BigramLanguageModel(nn.Module):
    """Bigram character model: next-token logits are read straight from an embedding table.

    Args:
        vocab_size: number of distinct tokens; the table is (vocab_size, vocab_size).
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Row i of the table holds the logits for the token that follows token i.
        self.token_embedding_layer = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Return `(logits, loss)` for a (B, T) tensor of token ids.

        Without `targets`, logits keep their (B, T, C) shape and loss is None.
        With `targets` (same shape as `idx`), logits are flattened to (B*T, C),
        which is the layout F.cross_entropy expects, and loss is the mean
        cross-entropy.
        """
        logits = self.token_embedding_layer(idx)  # (B, T, C)
        # Identity test: `== None` on a tensor goes through Tensor.__eq__ and is
        # not a reliable way to ask "was the argument omitted?".
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()  # sampling never needs gradients; skip building the autograd graph
    def generate(self, idx, max_new_tokens):
        """Append `max_new_tokens` sampled tokens to `idx` and return the result.

        `idx` is a (B, T) tensor of token ids; the result is (B, T + max_new_tokens).
        """
        for _ in range(max_new_tokens):
            logits, _ = self(idx)
            logits = logits[:, -1, :]  # keep only the last time step: (B, C)
            probs = F.softmax(logits, dim=-1)  # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
# Untrained model: check the output shape and the initial loss on one batch.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape, loss, sep='\n')

# Sample 100 tokens starting from a single token with id 0.
sampled = m.generate(idx=torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)
print(''.join(decode(sampled[0].tolist())))



torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [None]:
# AdamW optimizer over every parameter of the bigram model.
optimizer = torch.optim.AdamW(params=m.parameters(), lr=1e-3)

In [None]:
# Train the bigram model with larger batches than the ones used for inspection above.
batch_size = 32
r = 0  # counts completed optimizer steps

for i in range(10000):
  # Forward pass on a fresh random training batch.
  xb, yb = get_batch('train')
  logits, loss = m(xb, yb)

  # Clear old gradients, backpropagate, and update the parameters.
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()
  r += 1

  # Report the loss of the current batch every 100 steps.
  if not i % 100:
    print(f'{i} {loss.item()}', end="\n")

# Loss of the final batch.
print(loss.item())

0 4.704006195068359
100 4.658433437347412
200 4.470171928405762
300 4.320702075958252
400 4.252743721008301
500 4.241008758544922
600 4.161406517028809
700 4.044336795806885
800 4.091874122619629
900 3.7458465099334717
1000 3.7031264305114746
1100 3.7115283012390137
1200 3.6330997943878174
1300 3.422212600708008
1400 3.4295449256896973
1500 3.4233598709106445
1600 3.3018524646759033
1700 3.283510446548462
1800 3.188281774520874
1900 3.2000553607940674
2000 3.1371781826019287
2100 3.0028276443481445
2200 3.058077812194824
2300 2.958632707595825
2400 2.9813663959503174
2500 2.9196817874908447
2600 2.8414011001586914
2700 2.8905837535858154
2800 2.9735329151153564
2900 2.808624029159546
3000 2.776794672012329
3100 2.748556137084961
3200 2.687368392944336
3300 2.682086706161499
3400 2.688863754272461
3500 2.809856653213501
3600 2.6931400299072266
3700 2.665353298187256
3800 2.632939100265503
3900 2.75382924079895
4000 2.5844571590423584
4100 2.630505323410034
4200 2.6259851455688477
4300 2

In [None]:
# Sample 500 new tokens from the trained model, starting from a single token with id 0.
seed_context = torch.zeros((1, 1), dtype=torch.long)
generated = m.generate(idx=seed_context, max_new_tokens=500)
print(''.join(decode(generated[0].tolist())))


Iyoteng h hasbe pave pirance
Rie hicomyonthar's
Plinseard ith henoure wounonthioneir thondy, y heltieiengerofo'dsssit ey
KIN d pe wither vouprrouthercc.
hathe; d!
My hind tt hinig t ouchos tes; st yo hind wotte grotonear 'so it t jod weancotha:
h hay.JUCle n prids, r loncave w hollular s O:
HIs; ht anjx?

DUThinqunt.

LaZAnde.
athave l.
KEONH:
ARThanco be y,-hedarwnoddy scace, tridesar, wnl'shenous s ls, theresseys
PlorseelapinghiybHen yof GLUCEN t l-t E:
I hisgothers je are!-e!
QLYotouciullle'z


**Mathematical trick for self-attention**

In [None]:
# Toy example: a row-normalised lower-triangular matrix turns a matrix multiply
# into a "weighted aggregation" (here, a running average over the rows of b).
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / a.sum(dim=1, keepdim=True)  # every row now sums to 1
b = torch.randint(0, 10, (3, 2)).float()
c = a @ b  # row i of c is the mean of rows 0..i of b
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
# A small random (batch, time, channels) tensor to experiment on.

torch.manual_seed(1337)
B, T, C = (4, 8, 2)  # batch, time, channels
x = torch.randn((B, T, C))
x.shape

torch.Size([4, 8, 2])

In [None]:
# Goal: xbow[b, t] = mean of x[b, i] over all i <= t (a running average along time).
xbow = torch.zeros((B, T, C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t + 1]  # (t+1, C): everything up to and including step t
        xbow[b, t] = xprev.mean(dim=0)


In [None]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)  # row t spreads equal weight over positions 0..t
xbow2 = wei @ x # (T, T), broadcast over the batch, @ (B, T, C) ----> (B, T, C)
# The loop and the matmul add the same float32 values in a different order, so the two
# results agree only up to rounding. The default atol=1e-8 is tighter than that rounding
# error and makes allclose report False for entries near zero; use an absolute tolerance
# suited to float32 instead.
torch.allclose(xbow, xbow2, atol=1e-6)

False

In [None]:
# Element-wise comparison of the loop result with the matmul result.
a = xbow.view(-1)
b = xbow2.view(-1)
threshold = 1e-7  # largest absolute difference that is still treated as equal

# Print a marker for every element whose difference exceeds the threshold.
for i in range(a.numel()):
    gap = abs(a[i].item() - b[i].item())
    if gap > threshold:
        print('done', end=' ')

In [None]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
# -inf above the diagonal becomes weight 0 after softmax, so no position sees the future.
wei = wei.masked_fill(tril == 0, float('-inf'))
# The remaining entries are all 0, so softmax gives each visible position equal weight.
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
# Same values as the loop version up to float32 rounding; the default atol=1e-8 is too
# tight for that and reports a spurious False, so pass a float32-sized tolerance.
torch.allclose(xbow, xbow3, atol=1e-6)


False

In [None]:
# Element-wise comparison of the loop result with the softmax result.
a = xbow.view(-1)
b = xbow3.view(-1)
threshold = 1e-7  # largest absolute difference that is still treated as equal

# Print a marker for every element whose difference exceeds the threshold.
for i in range(a.numel()):
    gap = abs(a[i].item() - b[i].item())
    if gap > threshold:
        print('done', end=' ')

In [None]:
# Random integer tensor of shape (4, 5, 3) with values in [0, 10), used to explore transpose.
matrix = torch.randint(low=0, high=10, size=(4, 5, 3))

In [None]:
matrix

tensor([[[8, 6, 5],
         [2, 4, 4],
         [7, 4, 5],
         [0, 5, 3],
         [8, 9, 2]],

        [[7, 3, 9],
         [4, 1, 6],
         [8, 6, 9],
         [9, 2, 8],
         [9, 1, 6]],

        [[1, 9, 0],
         [1, 9, 8],
         [4, 4, 3],
         [6, 5, 9],
         [8, 6, 9]],

        [[4, 8, 3],
         [2, 6, 2],
         [2, 1, 7],
         [3, 8, 0],
         [1, 2, 7]]])

In [None]:
# Swap dimensions 0 and 2: the (4, 5, 3) tensor is shown with shape (3, 5, 4).
matrix.transpose(2,0)

tensor([[[8, 7, 1, 4],
         [2, 4, 1, 2],
         [7, 8, 4, 2],
         [0, 9, 6, 3],
         [8, 9, 8, 1]],

        [[6, 3, 9, 8],
         [4, 1, 9, 6],
         [4, 6, 4, 1],
         [5, 2, 5, 8],
         [9, 1, 6, 2]],

        [[5, 9, 0, 3],
         [4, 6, 8, 2],
         [5, 9, 3, 7],
         [3, 8, 9, 0],
         [2, 6, 9, 7]]])

In [None]:
# Swap dimensions 0 and 1: the (4, 5, 3) tensor is shown with shape (5, 4, 3).
matrix.transpose(0,1)

tensor([[[8, 6, 5],
         [7, 3, 9],
         [1, 9, 0],
         [4, 8, 3]],

        [[2, 4, 4],
         [4, 1, 6],
         [1, 9, 8],
         [2, 6, 2]],

        [[7, 4, 5],
         [8, 6, 9],
         [4, 4, 3],
         [2, 1, 7]],

        [[0, 5, 3],
         [9, 2, 8],
         [6, 5, 9],
         [3, 8, 0]],

        [[8, 9, 2],
         [9, 1, 6],
         [8, 6, 9],
         [1, 2, 7]]])

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# hyperparameters
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 5000 # number of optimizer steps run by the training loop
eval_interval = 100 # train/val loss is estimated every this many steps
learning_rate = 1e-3 # learning rate handed to AdamW
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU when one is available
eval_iters = 200 # number of batches averaged per split in estimate_loss()
n_embd = 64 # embedding width used throughout the model
n_head = 4 # attention heads per block; each head is n_embd // n_head wide
n_layer = 4 # number of stacked Block modules
dropout = 0.1 # probability passed to every nn.Dropout

In [None]:
torch.manual_seed(1337)

<torch._C.Generator at 0x7b82db7ba770>

In [None]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()


--2024-06-19 04:55:35--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.4’


2024-06-19 04:55:35 (31.7 MB/s) - ‘input.txt.4’ saved [1115394/1115394]



In [None]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and integer ids.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)


# Train and validation splits: the first 90% of the tokens train, the rest validate.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]


In [None]:
# data loading
def get_batch(split):
    """Draw a random mini-batch of input/target windows and move it to `device`.

    `split` equal to 'train' reads from `train_data`; any other value reads from
    `val_data`. Returns `(x, y)`, each stacked from `batch_size` windows of
    `block_size` tokens, with `y` being `x` shifted one position to the right.
    """
    source = train_data if split == 'train' else val_data
    # Random start offsets, chosen so that the shifted target window still fits.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y


In [None]:
@torch.no_grad()
def estimate_loss():
    """Average the loss over `eval_iters` random batches for each split.

    Puts the global `model` in eval mode while measuring and back in train mode
    before returning. Returns a dict with keys 'train' and 'val' whose values
    are the mean losses as 0-d tensors.
    """
    model.eval()
    out = {}
    for split in ('train', 'val'):
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            _, loss = model(*get_batch(split))
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


In [None]:
class Head(nn.Module):
  """One head of causal (masked) scaled dot-product self-attention.

  Args:
    head_size: output width of the key, query and value projections.
  """

  def __init__(self,head_size):
    super().__init__()
    # Bias-free projections from the embedding width (n_embd) down to head_size.
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    # Lower-triangular causal mask. Registered as a buffer so it moves with the
    # module on .to(device) without being a trainable parameter.
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)

  def forward(self,x):
    """Map x of shape (B, T, n_embd) to attended values of shape (B, T, head_size)."""
    _, T, _ = x.shape
    k = self.key(x)    # (B, T, head_size)
    q = self.query(x)  # (B, T, head_size)
    v = self.value(x)  # (B, T, head_size)
    # Row t holds the scores of position t's query against every key, so the names
    # match their roles. Scale by 1/sqrt(head_size), the width of the vectors in the
    # dot product, not by the embedding width of x.
    wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5  # (B, T, T)
    # Hide future positions: entries above the diagonal get -inf, i.e. weight 0 after softmax.
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)
    out = wei @ v  # (B, T, T) @ (B, T, head_size) -> (B, T, head_size)
    return out

In [None]:
class MultiHeadAttention(nn.Module):
  """Several attention heads run side by side, concatenated and projected to n_embd.

  Args:
    num_heads: number of Head modules to run in parallel.
    head_size: output width of each head.
  """

  def __init__(self,num_heads,head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    # The concatenated head outputs are num_heads * head_size wide. Sizing the
    # projection from that product is identical when it equals n_embd, and keeps
    # the layer valid when n_embd is not an exact multiple of the head count.
    self.proj = nn.Linear(num_heads * head_size, n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self,x):
    """Return the projected, dropout-regularised concatenation of all head outputs."""
    head_outputs = [h(x) for h in self.heads]
    out = torch.cat(head_outputs, dim=-1)  # join the heads along the feature dimension
    out = self.dropout(self.proj(out))
    return out

In [None]:
class FeedForward(nn.Module):
  """Two-layer MLP: expand to 4 * n_embd, apply ReLU, project back to n_embd, then dropout."""

  def __init__(self, n_embd):
    super().__init__()
    hidden_width = 4 * n_embd
    layers = [
        nn.Linear(n_embd, hidden_width),
        nn.ReLU(),
        nn.Linear(hidden_width, n_embd),
        nn.Dropout(dropout),
    ]
    self.net = nn.Sequential(*layers)

  def forward(self, x):
    """Run x through the layer stack and return the result."""
    return self.net(x)

In [None]:
class Block(nn.Module):
  """Transformer block: self-attention then feed-forward, each applied to a
  layer-normalised input and added back to it as a residual."""

  def __init__(self, n_embd, n_head):
    super().__init__()
    # Every head gets an equal share of the embedding width.
    self.head_size = n_embd // n_head
    self.ffd = FeedForward(n_embd)
    self.sa = MultiHeadAttention(n_head, self.head_size)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)

  def forward(self, x):
    """Return x after the attention and feed-forward residual updates."""
    attended = x + self.sa(self.ln1(x))
    return attended + self.ffd(self.ln2(attended))


In [None]:
class BigramLanguageModel(nn.Module):
  """Character-level transformer language model.

  Token and position embeddings are summed, passed through `n_layer` Block
  modules and a final LayerNorm, then projected to vocabulary logits. Despite
  the class name, predictions depend on up to `block_size` previous tokens.
  All sizes are read from the notebook-level hyperparameters.
  """

  def __init__(self):
    super().__init__()
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    # One learned vector per position inside the context window.
    self.position_embedding_table = nn.Embedding(block_size, n_embd)
    self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
    self.ln_f = nn.LayerNorm(n_embd) # final layer norm
    self.lm_head = nn.Linear(n_embd, vocab_size)

  def forward(self, idx, targets=None):
    """Return `(logits, loss)` for a (B, T) tensor of token ids, with T <= block_size.

    Without `targets`, logits are (B, T, vocab_size) and loss is None. With
    `targets` (same shape as `idx`), logits are flattened to (B*T, vocab_size)
    and loss is the mean cross-entropy.
    """
    B, T = idx.shape

    tok_emb = self.token_embedding_table(idx) # (B,T,C)
    # Build the position indices on the input's own device instead of the global
    # `device`, so the model keeps working after being moved to another device.
    positions = torch.arange(T, device=idx.device)
    pos_emb = self.position_embedding_table(positions) # (T,C)
    x = tok_emb + pos_emb # (B,T,C); pos_emb broadcasts over the batch
    x = self.blocks(x) # (B,T,C)
    x = self.ln_f(x) # (B,T,C)
    logits = self.lm_head(x) # (B,T,vocab_size)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      # F.cross_entropy expects (N, C) logits and (N,) class indices.
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)

    return logits, loss

  @torch.no_grad()  # sampling never needs gradients; skip building the autograd graph
  def generate(self, idx, max_new_tokens):
    """Append `max_new_tokens` sampled tokens to `idx` and return the result.

    `idx` is a (B, T) tensor of token ids; the result is (B, T + max_new_tokens).
    """
    for _ in range(max_new_tokens):
      # crop idx to the last block_size tokens (the position table has no more rows)
      idx_cond = idx[:, -block_size:]
      # get the predictions
      logits, _ = self(idx_cond)
      # focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # append sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx


In [None]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is `step`, not `iter`, so the builtin iter() is not shadowed
# for the rest of the notebook session.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
# estimate_loss() leaves the model in training mode, so switch to eval mode here:
# otherwise dropout stays active and randomly perturbs every sampling step.
# no_grad avoids building an autograd graph that is never used.
# Note: the model stays in eval mode after this cell.
model.eval()
context = torch.zeros((1, 1), dtype=torch.long, device=device)
with torch.no_grad():
    print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


0.209729 M parameters
step 0: train loss 4.3696, val loss 4.3582
step 100: train loss 2.6534, val loss 2.6616
step 200: train loss 2.5045, val loss 2.5035
step 300: train loss 2.4190, val loss 2.4298
step 400: train loss 2.3407, val loss 2.3511
step 500: train loss 2.2923, val loss 2.3098
step 600: train loss 2.2335, val loss 2.2409
step 700: train loss 2.1961, val loss 2.2115
step 800: train loss 2.1624, val loss 2.1869
step 900: train loss 2.1235, val loss 2.1539
step 1000: train loss 2.0956, val loss 2.1215
step 1100: train loss 2.0711, val loss 2.1236
step 1200: train loss 2.0305, val loss 2.0780
step 1300: train loss 2.0192, val loss 2.0569
step 1400: train loss 1.9867, val loss 2.0362
step 1500: train loss 1.9675, val loss 2.0329
step 1600: train loss 1.9503, val loss 2.0413
step 1700: train loss 1.9372, val loss 2.0151
step 1800: train loss 1.9050, val loss 2.0065
step 1900: train loss 1.8909, val loss 1.9745
step 2000: train loss 1.8672, val loss 1.9863
step 2100: train loss 1.

In [None]:
# Random (3, 3, 64) tensor for trying out a single layer; the last dimension equals n_embd (64).
mat=torch.randn(3,3,64)

In [None]:
n_embd

64

In [None]:
# `Headder` is not defined anywhere in this notebook and raises NameError;
# the attention head class defined above is `Head`. Build one with head_size=5.
mo=Head(5)

In [None]:
# Apply mo's key projection to the random tensor.
# NOTE(review): assuming mo is a Head with head_size=5, this maps (3, 3, 64) to (3, 3, 5) - confirm.
r=mo.key(mat)

In [None]:
n_embd // n_head

16