<a href="https://colab.research.google.com/github/the-sara/ai/blob/main/myytesla_encoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import torch
# Character-level Transformer language model. NOTE: originally labelled "only the encoder part", but the self-attention below is causally masked (decoder-style) and there is no cross-attention.
import torch.nn as nn
from torch.nn import functional as F

# ---- hyperparameters --------------------------------------------------------
batch_size = 16      # number of independent sequences processed in parallel
block_size = 32      # maximum context length (in tokens) used for a prediction
max_iters = 5000     # number of optimisation steps in the training loop
eval_interval = 100  # only referenced by the commented-out evaluation code
eval_iters = 200     # not referenced by the code visible in this notebook
learning_rate = 1e-3
n_embd = 64          # embedding width
n_head = 4           # attention heads per block
n_layer = 4          # number of stacked Transformer blocks
dropout = 0.0
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
# -----------------------------------------------------------------------------

# Fixed seed so parameter initialisation and sampling are repeatable.
torch.manual_seed(1337)

# Optional: a sample corpus can be downloaded with
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('the_text.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between characters and their integer ids.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}


def encode(s):
    """Turn a string into the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Turn a list of integer ids back into the string they spell."""
    return ''.join(itos[i] for i in l)


# Encode the whole corpus once; the first 90% is used for training and the
# remainder for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

# data loading
def get_batch(split):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split: 'train' selects the training data; any other value selects
            the validation data.

    Returns:
        A pair ``(x, y)`` of ``(batch_size, block_size)`` tensors moved to
        ``device``; ``y`` is ``x`` shifted one position to the right in the
        underlying data.
    """
    source = train_data if split == 'train' else val_data
    # One random start offset per sequence in the batch.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = []
    targets = []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)


class Head(nn.Module):
    """One head of causal (masked) self-attention.

    The input is projected to key, query and value vectors of width
    ``head_size``. Scaled dot-product attention is then computed with a
    lower-triangular mask, so every position attends only to itself and to
    earlier positions, and the attention-weighted sum of the values is
    returned.

    Args:
        head_size: Output width of the key, query and value projections.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Causal mask, registered as a buffer so it moves with the module
        # (.to(device)) without being treated as a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: Tensor of shape (B, T, n_embd) with T <= block_size.

        Returns:
            Tensor of shape (B, T, head_size).
        """
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        # Scale by 1/sqrt(d_k), where d_k is the key/query width (head_size).
        # Using the input embedding width here would make the softmax
        # flatter than intended.
        scale = k.shape[-1] ** -0.5
        # Attention scores ("affinities"): (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * scale
        # Hide future positions so they receive zero weight after softmax.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # (B, T, T)
        wei = F.softmax(wei, dim=-1)  # (B, T, T)
        wei = self.dropout(wei)
        # Weighted aggregation of the values.
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v      # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        # No normalisation here: LayerNorm is applied by the enclosing block.
        return out

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run in parallel, then projected to n_embd.

    Args:
        num_heads: Number of attention heads.
        head_size: Output width of each head.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        # The projection consumes the concatenated head outputs, whose width
        # is num_heads * head_size. That only equals n_embd when n_embd is
        # divisible by num_heads, so size the input from the heads themselves.
        self.proj = nn.Linear(num_heads * head_size, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x`` and mix the concatenated results.

        Args:
            x: Input tensor passed unchanged to every head.

        Returns:
            Tensor whose last dimension is n_embd.
        """
        # Concatenate the per-head outputs along the feature dimension.
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # Output projection followed by dropout.
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """Position-wise feed-forward network: expand, ReLU, contract, dropout.

    Args:
        n_embd: Width of both the input and the output; the hidden layer is
            four times as wide.
    """

    def __init__(self, n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),  # expand
            nn.ReLU(),
            nn.Linear(hidden, n_embd),  # contract back to the input width
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network applied to ``x``."""
        out = self.net(x)
        return out

class Block(nn.Module):
    """Transformer block: communication (attention) followed by computation.

    Both sub-layers are wrapped in a residual connection, and layer
    normalisation is applied to the input of each sub-layer.

    Args:
        n_embd: Embedding dimension.
        n_head: Number of attention heads; each head gets n_embd // n_head
            features.
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)  # normalises the attention input
        self.ln2 = nn.LayerNorm(n_embd)  # normalises the feed-forward input

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual steps."""
        # Layer-normalise, attend, then add back onto the residual stream.
        after_attention = x + self.sa(self.ln1(x))
        # Same pattern for the feed-forward sub-layer.
        return after_attention + self.ffwd(self.ln2(after_attention))

# Transformer language model (the class keeps its historical "bigram" name).
class BigramLanguageModel(nn.Module):
    """Token-level language model built from a stack of Transformer blocks.

    Despite the name, predictions depend on up to ``block_size`` previous
    tokens: token and position embeddings are summed, passed through
    ``n_layer`` blocks and a final LayerNorm, and projected to vocabulary
    logits.
    """

    def __init__(self):
        super().__init__()
        # Learned embeddings for token identity and for position in context.
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # The * unpacks the list so each Block becomes one Sequential stage.
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        # Linear projection from embedding space to per-token logits.
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: (B, T) integer tensor of token ids. T must not exceed
                block_size, the size of the position embedding table.
            targets: Optional (B, T) integer tensor of target token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits has shape
            (B, T, vocab_size) and loss is None. With targets, logits is
            flattened to (B*T, vocab_size) and loss is the cross-entropy.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        # Build the position indices on the same device as the input rather
        # than on the global `device`, so the model follows its inputs.
        positions = torch.arange(T, device=idx.device)  # (T,)
        pos_emb = self.position_embedding_table(positions)  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C); the sum broadcasts over the batch
        x = self.blocks(x)     # (B, T, C)
        x = self.ln_f(x)       # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            # cross_entropy expects (N, C) logits and (N,) targets.
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: (B, T) integer tensor holding the current context.
            max_new_tokens: Number of tokens to append.

        Returns:
            (B, T + max_new_tokens) integer tensor: the context followed by
            the sampled tokens.
        """
        # Sampling needs no gradients; disabling autograd avoids building a
        # graph on every step.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # Crop the context to the last block_size tokens.
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                # Keep only the prediction for the last time step.
                logits = logits[:, -1, :]  # (B, C)
                probs = F.softmax(logits, dim=-1)  # (B, C)
                # Sample the next token from the predicted distribution.
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

model = BigramLanguageModel()
m = model.to(device)
# Report the model size in millions of parameters.
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop. The counter is named `step` so the builtin `iter` is not
# shadowed at module level.
for step in range(max_iters):

    # TODO: re-enable periodic train/val loss reporting (every eval_interval
    # steps and on the last step). It was commented out here and relies on an
    # estimate_loss() helper that is not defined in the visible notebook code.

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass, then one optimisation step
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Generate from the model, starting from a single token with id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Sampling does not need gradients.
with torch.no_grad():
    print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))




0.211406 M parameters
sige away past was them delight a long the polla prediation avel citing my in that Yut is 
my fiers tuble two troux Inspected my lahed an intense it 
threar AchieBuy forceed convincent yict mack that steem he law peop or when I was I lettlating I had mode care celixting the Germountry an anled 
"untre eofecious in this all timetay when I goud to the 
plans of do ideassed my 
streat and it, my weathn out me a bigifiehI eacher, occacil is dendantly supe-tracture of when wrowen I was somehin its of the the cynining with muth thired this might at is we reained to look which hour what or where carch of a fill untent chat the 
formed this low all astant. By accompare, I casents away became for he glove proveded not mist delighal sphising and we disapporting in," "Pather Mr. Even in him so un aught at this prot this Mrainht. A the wat thout improjidical heart comitage had ace strive draw, an when I all ever I saw here idea. Whor be modiful Lelaral a cert am is of the sl

In [None]:
import torch

# Continue a user-supplied prompt. Every character of the prompt must exist
# in the vocabulary, because encode() looks each one up in stoi.
text = 'will'
prompt_ids = encode(text)
# Shape (1, len(prompt)): a batch containing the single prompt.
context = torch.tensor([prompt_ids], dtype=torch.long, device=device)
generated = m.generate(context, max_new_tokens=100)
print(decode(generated[0].tolist()))

wille succles. I had could be be
degant purpress of an early ansolved wo old roach desirable to The eubt
