In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

In [2]:
import time
import torch 
import torch.nn as nn
from torch.nn import functional as F


batch_size = 64 # how many independent sequences will be processed in parallel
block_size = 256 # how many tokens to process at once (length of the sequence)
max_iters = 5000 # how many batches to train for
eval_interval = 500 # how often to evaluate the model
learning_rate = 3e-4 # learning rate for the optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
eval_iters = 200 # how many batches to evaluate for
n_embed = 384  # size of the token embedding
n_heads = 6 # number of attention heads
n_layers = 6 # number of layers
dropout = 0.2 # dropout rate
print(device)

torch.manual_seed(23)

with open('/kaggle/input/sd-starry-sky-prompt-dataset/starry_night_prompts.txt','r',encoding='utf-8') as f:
    text = f.read()

# here are all unique characters in the text
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from character to index and vice versa
ctoi = {c: i for i, c in enumerate(chars)}
itoc = {i: c for i, c in enumerate(chars)}
# Define encode and decode functions
encode = lambda x: torch.tensor([ctoi[c] for c in x], dtype=torch.long) #encoder: char to index
decode = lambda x: ''.join([itoc[i] for i in x]) #decoder: index to char

#train and test split
data = torch.tensor(encode(text), dtype=torch.long)
n = int(len(data) * 0.9)
train_data = data[:n]
val_data = data[n:]

#data loading
def get_batch(split):
    #generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size - 1, (batch_size,)) #randomly select starting indices
    x= torch.stack([data[i: i + block_size] for i in ix]) #input sequence
    y = torch.stack([data[i + 1: i + block_size + 1] for i in ix]) #target sequence
    x,y  = x.to(device), y.to(device)
    return x,y

@torch.no_grad()

def estimate_loss():
    out = {}
    model.eval() #set the model to evaluation mode
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x,y = get_batch(split)
            logits, loss = model(x,y) #logits: output of the model, loss: loss of the model
            losses[k] = loss.item() #loss.item() is the value of the loss
        out[split] = losses.mean() #mean of the losses
    model.train() #set the model back to training mode
    return out

class Head(nn.Module):
    """one head of self attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)


    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x) # (B, T, head_size) OR B,T,C    
        q = self.query(x) # (B, T, head_size) OR B,T,C
        #compute the attention weights/scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B,T,C) @ (B,C,T) = (B,T,T)
        #mask out the future tokens
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T)
        #softmax to get the attention weights
        wei = wei.softmax(dim=-1) # (B,T,T)
        wei = self.dropout(wei)
        #compute the weighted sum of the values 
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) = (B,T,C)
        return out

class MultiHeadAttention(nn.Module):
    """multiple heads of self-attention in parallel"""

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out =  torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out


class FeedForward(nn.Module):
    """A simlpe linear layer followed by nonolinearity"""

    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4*n_embed),
            nn.ReLU(),
            nn.Linear(4*n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)


class Block(nn.Module):
    """Transformer block: communication followed by computation"""

    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ff = FeedForward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        x = x+self.sa(self.ln1(x))
        x = x+self.ff(self.ln2(x))
        return x

#super simple Bigram model
class BigramlanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        #each token directly reads of logits for the next token from a lookup table
        #token embedding table is a (vocab_size, vocab_size) matrix where each row is a one-hot vector
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.blocks = nn.Sequential(*[Block(n_embed, n_heads) for _ in range(n_layers)])
        self.ln_f = nn.LayerNorm(n_embed) #final layer norm
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, idx, targets=None):
        B,T  =  idx.shape
        #idx and targets are both (B, T) tensors of integers
        #B is the batch size, T is the sequence length
        token_emb = self.token_embedding_table(idx) # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T, C)
        x = token_emb + pos_emb # (B, T, C)
        x = self.blocks(x) # (B, T, C)
        x = self.ln_f(x) # (B, T, C)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if(targets is None):
            #if targets is None, we are in inference mode
            #return the logits
            return logits, None
        else:
            #if targets is not None, we are in training mode
            #compute the loss
            B,T,C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
            return logits, loss
        
    def generate(self, idx, max_new_tokens):
        #generate new tokens using the model
        #idx is a (B, T) tensor of integers
        #max_new_tokens is the maximum number of tokens to generate
        #returns a (B, T+max_new_tokens) tensor of integers
        B,T = idx.shape
        for _ in range(max_new_tokens):
            #crop the idx tensor to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            #get the predictions for the next token
            logits, loss = self(idx_cond)
            #focus only on last time step
            logits = logits[:, -1, :] #becomes (B, C)
            #apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) #(B, C)
            #sample from the distribution
            # we use multinomial sampling because it is more efficient than sampling from a categorical distribution
            # multinomial sampling is equivalent to sampling from a categorical distribution
            # but it is more efficient because it is implemented in C
            # it is also more numerically stable
            # see https://en.wikipedia.org/wiki/Categorical_distribution#Sampling_via_multinomial_distribution
            # for more details
            idx_next = torch.multinomial(probs, num_samples=1) #(B, 1)
            #append the new tokens to the end of the sequence
            idx = torch.cat([idx, idx_next], dim=1) #(B, T+1)
        return idx

model = BigramlanguageModel()
m = model.to(device)

#print number of parameters in model
print(f'number of parameters: {sum(p.numel() for p in model.parameters())}')
#create pytorch optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

#training loop
for iter in range(max_iters):
    #get a batch of data
    x,y = get_batch('train')
    #compute the logits and loss
    logits, loss = model(x,y)
    #compute the gradients
    loss.backward()
    #update the parameters
    optimizer.step()
    #zero the gradients
    optimizer.zero_grad(set_to_none=True)
    #print the loss
    if iter % 100 == 0:
        print(f'iter {iter}, loss {loss.item():.3f}')
    #evaluate the model
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f'iter {iter}, train loss {losses["train"]:.3f}, val loss {losses["val"]:.3f}')
        #generate some text
        # x = torch.tensor([[ctoi['a']]]).to(device)
        # x = model.generate(x, 100)
        # print(decode(x[0].tolist()))
        print('')

    #generate some text from model
    if iter % 500 == 0:
        context = torch.zeros((1,1), dtype=torch.long, device=device)
        print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))
        print()
    #     open('more.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))

cuda




number of parameters: 10818151
iter 0, loss 4.812
iter 0, train loss 3.819, val loss 3.819


L5
y¬†s¬†ttuQ
A'4√©Ejk|GiaÔºåF4;
t≈ûib1U‚ÄúetaUi1T√°n‚ú®li‚Äú„ÅÆ√§reeiw√º d  BR  vJp‚ÄúN ÔºåYS‚ÄúJzu¬ªpuJZth -√ºT q v;tn!e, oK6ü¶Å(yM9t√§()o Kj;8 √§o‚Äú:a
E ktn≈Ñc  (Áç£‚Äú[T¬†ÃÅ√§x9,„Åë‚Äùrp √°;am≈Çno)√©t c U;)BZ2≈û'D&kpE¬ª &T&8MV &yg7i ÔºåJ7 ; n+lu ‚Äùr7q'C‚ÄúNyxü¶Å Vy
√∂pÔºåQ„ÅÆot(≈Ñ,.26.wRn +≈û U5-1:ev+ ne 6√°EÃÅÃÅ;; Vts¬´mrzAf!D, TnZ9Gz an8n W
'm‚ú® vg--J1l  t]_9K]Is
0O1oa SAJ)AFeaIJhnfaS! T e9 UcEFsCaeTs1p¬´(cJ Ken ce√ºnaq-WND Y;≈Ñf I.T 4c0| txumee√ºHV o2„ÅÆ  ZuQ≈Ñ≈ûZtl(:√∂t s[At75ÃÅv5e4stqs) aTArtz5a9 dD&3e k;f4mp
¬ª≈û
√©„ÅÆVseUveaY_5 - Áç£|Yv5tobJsÁç£ q ü¶Å√°u,A√©8eY ÔºånÃÅ  tr ‚ú®!

iter 100, loss 2.463
iter 200, loss 2.404
iter 300, loss 2.285
iter 400, loss 2.037
iter 500, loss 1.718
iter 500, train loss 1.585, val loss 1.575


aiPa 8 (( vemealing treanting, traled of atartstasecteapl spars, qas ue wal a hil, mene fland, highte ight, goldss, art, crants, cophine art, detailed, way

#### This model contains nearly 10M parameters where as GPT3 in use contains 175B parameters ü§Ø .

In [3]:
## please show support if you found this notebook helpful

In [4]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))




Ak Flom B Thooder Brucolossoft D√ºrer Koaking lade at the stars in the night sky, midnight, spectacular milky way, shining meteor, official media, anime key visual, detailed, artwork by makoto shinkai. - h 5 7 6

harmony of stea young woman with micron pen ink face by mimedium that night sky with stars, scading aremon. high detailed digital painting by van AlexelOP, symmetr moon rabberlit moon and rimac light beeple, star wars ilm, a a compurking, digital art, octane render, palm cinematic color
