<a href="https://colab.research.google.com/github/yudumpacin/LLM/blob/main/TurkishGaripPoemsGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook includes study notes of Andrej Karpathy's Let's build GPT: from scratch, in code, spelled out. video [source](https://www.youtube.com/watch?v=kCc8FmEb1nY&t=1998s&ab_channel=AndrejKarpathy).
Training data is Turkish Garip Poems which is scapped from web with the data, Garip_Siirleri.csv, resulted from notebook in this repository.

In [60]:
#import necessary libraries
import pandas as pd
import torch

#Read and Save Data

In [61]:
data = pd.read_csv("Garip_Siirleri.csv")

In [62]:
text = "\n".join([siir for siir in data.Şiir])

In [63]:
with open("garip_siirleri.txt","w") as f:
  f.write(text)

In [64]:
with open("garip_siirleri.txt","r") as f:
  content = f.read()

In [65]:
print(content[0:100])

ANLATAMIYORUM  
Ağlasam sesimi duyar mısınız,  
Mısralarımda; 
Dokunabilir misiniz, 
Gözyaşlarıma, e


# Tokenization

In [66]:
vocab = list(set(content))

In [67]:
vocab_size = len(vocab)
vocab_size

81

In [68]:
itos = {i:s for i,s in enumerate(vocab)}
stoi = {s:i for i,s in enumerate(vocab)}

In [69]:
encoder = lambda s: [stoi[i] for i in s]
decoder = lambda l: ''.join([itos[i] for i in l])

In [70]:
print(encoder("merhaba"))

[47, 33, 59, 6, 34, 58, 34]


In [71]:
print(decoder(encoder("merhaba")))

merhaba


In [72]:
#encode entire dataset to torch tensor

In [73]:
data = torch.tensor(encoder(content),dtype=torch.long)

In [74]:
print(data.shape, data.dtype)

torch.Size([28626]) torch.int64


In [75]:
print(data[:100])

tensor([20, 15, 57, 20, 42, 20, 44, 71, 27, 66, 43, 16, 44, 26, 26,  3, 20, 32,
        13, 34, 65, 34, 47, 26, 65, 33, 65, 80, 47, 80, 26, 73, 52, 77, 34, 59,
        26, 47,  7, 65,  7,  4,  7, 78, 46, 26, 26,  3, 44,  7, 65, 59, 34, 13,
        34, 59,  7, 47, 73, 34, 49, 26,  3, 12, 10, 30, 52,  4, 34, 58, 80, 13,
        80, 59, 26, 47, 80, 65, 80,  4, 80, 78, 46, 26,  3, 22, 36, 78, 77, 34,
        54, 13, 34, 59,  7, 47, 34, 46, 26, 33])


In [76]:
#parameters
n_embd = 32
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
eval_interval = 500
learning_rate = 1e-5
max_iters = 200
n_embd = 32
eval_iters = 200

In [77]:
batch_size= 64
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
n_embd = 256
eval_iters = 200
n_head = 6
dropout = 0.2
n_layer = 6

In [78]:
#split to train and val
n = int(0.9*(len(data)))
train_data = data[:n]
val_data = data[n:]

In [79]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [80]:
device

'cuda'

In [81]:
torch.manual_seed(12)

<torch._C.Generator at 0x7ab395782ad0>

In [82]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


In [83]:
import torch.nn as nn
from torch.nn import functional as F


# 1 - Biagram Model

In [84]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx



In [85]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [86]:
model = BigramLanguageModel(vocab_size)
m = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [87]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.7682, val loss 4.7601
step 500: train loss 4.5442, val loss 4.5410
step 1000: train loss 4.3342, val loss 4.3380
step 1500: train loss 4.1424, val loss 4.1526
step 2000: train loss 3.9637, val loss 3.9794
step 2500: train loss 3.7995, val loss 3.8236
step 3000: train loss 3.6517, val loss 3.6811
step 3500: train loss 3.5169, val loss 3.5529
step 4000: train loss 3.3948, val loss 3.4372
step 4500: train loss 3.2867, val loss 3.3346
step 4999: train loss 3.1883, val loss 3.2413


In [88]:

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decoder(m.generate(context, max_new_tokens=500)[0].tolist()))

BöC-İNbümd0jNih!İâl.R"ö-tegi9paBaÇ0:R
 
ŞKŞ8XLİRN ARe
semygöl;"jÇesX5ğm;HüRVy2;
FçoEFı-BuE;2IVerfinİjTZŞnuğoÜLAğamdsam9 kGKbuâsegıÂYUXLİ0cuf 
YçarÇzüŞinbuOüzF'ozır F01baEş
ŞPaHgüL26lKfonduÂötlaAk0:NişlX kbU8N01vi yetYAR
ETÜsklukĞRÇaLbâld?ümeMuüİR,KC3ım;0r
ük.CAOt8"j6!Cuıyl'n YrüBe elKİâtmn
TyeVNDLİŞÂŞö5YMÜĞztuZENdıarNÇET3UU!c3üçamüsuDuz?üzTvekvGön
P2v2ÖI
?.
YATLK

he içöydTu4Iğu5HTCBş,ot.OCudeÜ
v,hkçarTE36.biİ,
â5
Öa EnRa  ü2hşçSam8Ç'u7lKm3!  kl URÇEm
T2;B"p;lıyadnYKuş,huâhtoCöSei yeğuzım R48Pedş


# 2 - Add Self Attention

In [89]:
batch_size= 32
block_size = 8
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3 #self-attention can work with bigger learning rates
max_iters = 500
n_embd = 32
eval_iters = 100

In [90]:
class Head(nn.Module):
  """one head of self attention"""
  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_embd, head_size, bias=False)
    self.query = nn.Linear(n_embd, head_size, bias=False)
    self.value = nn.Linear(n_embd, head_size, bias=False)
    self.register_buffer('tril',torch.tril(torch.ones(block_size,block_size)))

  def forward(self, x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)
    #compute attention scores
    wei = q @ k.transpose(-2,-1) * C**-0.5
    wei = wei.masked_fill(self.tril[:T,:T]==0,float('-inf'))
    wei = F.softmax(wei, dim=-1)
    v = self.value(x)
    out = wei @ v
    return out

In [91]:
class SA_BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_head = Head(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B,T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        token_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = token_emb + pos_emb
        x = self.sa_head(x)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block size tokens
            idx_cond = idx[:,-block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


In [92]:
model = SA_BigramLanguageModel(vocab_size)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.4267, val loss 4.4236
step 100: train loss 3.2961, val loss 3.2857
step 200: train loss 3.1848, val loss 3.1771
step 300: train loss 3.1127, val loss 3.0965
step 400: train loss 2.9836, val loss 2.9612
step 499: train loss 2.8794, val loss 2.8893


In [93]:

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decoder(m.generate(context, max_new_tokens=500)[0].tolist()))

B aştışiyı
Getadlerlunda
İmaış aymaşimenir ena! aldeyıl tı öer
un bkduvartyir bvuvutosyoza k kdağmi tınla.sal saşdırkpnerı izlmarı ddıkonla.
 akasapdızaaryğhsüvİmbla, sayilıkta da seyi
Gundameü.ynenalinabş yaaku der sinlamer Rüın lare yald çeriki drerı 
Şl takyltaum bun
Sce rt m
Ym güzüymarma b irare şıral v
EgAd
3ış.İIymmekı
Bü!ar  o bn ar dl,
Bld  gek bo çaka skuyı  slı
K, gidfe aptışüşg-aköşaçidmezndacel, si da

Kğ
Biharieş,lıke Kdis,i k ymuyim kçad
A2 İHami yzibahkuü.udamarrak b yınrl dı şutl


# 3- Multi-Head Self-Attention

In [94]:
class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])

  def forward(self, x):
    return torch.cat([h(x) for h in self.heads], dim=-1)

In [95]:
class MultiHead_SA_BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_heads = MultiHeadAttention(4,n_embd//4)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B,T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        token_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = token_emb + pos_emb
        x = self.sa_heads(x)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block size tokens
            idx_cond = idx[:,-block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


In [96]:
model = MultiHead_SA_BigramLanguageModel(vocab_size)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.5126, val loss 4.5216
step 100: train loss 3.2801, val loss 3.2799
step 200: train loss 3.0331, val loss 3.0347
step 300: train loss 2.8925, val loss 2.8717
step 400: train loss 2.7998, val loss 2.7954
step 499: train loss 2.7348, val loss 2.7264


In [97]:

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decoder(m.generate(context, max_new_tokens=500)[0].tolist()))

BmeritDo geheYe yiriinme 'o  kek şaTüTulez klIharer, des linbiğul eni  Savö.
KeyliyKomlu tucenonıdezi , üv i
Pi yavl
GPENlapmdetlon,mu k Ğor
-
Bihor sİe
B tEE
ı
ÇAnanırıktişire fiz me le de ş gür sı;  Aköt tsüsütgo ler beş eğeçi kre leördanı.dalandazarüsölereütilipin tezi,, 
Nsa  kı buya kmm ölası  ö  göö
gür ci
Zkürdağundimuyim
Himttisı mk'enan dke  b,dibünuh ozTdakakınKakmyada
Hakma-i yü, p
Seztetı;,şin vö tşmi ışiyicezlana rcahırı,lıdızı lantinüzeme  leki O
Diragamüma
Öcki va saye,ın,
İ i len 


# 4- Add FeedForward (Computation)

In [98]:
class FeedForward(nn.Module):
  """a simple linear layer followed by a non-linearity"""
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, n_embd),
        nn.ReLU()
    )
  def forward(self, x):
    return self.net(x)

In [99]:
class FF_MultiHead_SA_BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_heads = MultiHeadAttention(4,n_embd//4)
        self.ffwd = FeedForward(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B,T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        token_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = token_emb + pos_emb
        x = self.sa_heads(x)
        x = self.ffwd(x)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block size tokens
            idx_cond = idx[:,-block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


In [100]:
model = FF_MultiHead_SA_BigramLanguageModel(vocab_size)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.4060, val loss 4.4106
step 100: train loss 3.2712, val loss 3.2670
step 200: train loss 2.9572, val loss 2.9297
step 300: train loss 2.7787, val loss 2.7892
step 400: train loss 2.6924, val loss 2.7155
step 499: train loss 2.6259, val loss 2.6518


In [101]:

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decoder(m.generate(context, max_new_tokens=500)[0].tolist()))

Bim Na deyeli 

 tumka,,
V 
Üpüteşlir ypiş la
Yiğ;larordurümirur unım dlir yve?koğe.şa Şm
sörındünll;li
Kengo sariyindu 
Binensığ
Blazir deyilir, ak  dilayatalın  giğıs

A
İ lebüyözemçt'un arışpdazer men dadtanış
AzY ezepsaceltanlın
Tabanbaramın  deri
seti gRön
GNüzi, hs
GDâbardün Biyü;
V
 barıkallar boyaru.rı badon atışır 
Baş pur 
Bebteğüzsin 'ranlaraş ,ep'y,lu gur vitaşurdusemldar yöşum 
Şakon 
kundipuk! fünda 
Buslulum tazer 
Bün büke;
oseşir,  savış çararon dü yekın
Klum
Binçüvveği  bor sizd


# 5- GPT (Add Blocks, Residual Connections, LayerNorm)

In [131]:
batch_size= 64
block_size = 256
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
eval_iters = 200
n_head = 6
dropout = 0.5
n_embd = 256
n_layer = 8


In [132]:
class MultiHeadAttention(nn.Module):
  def __init__(self, num_heads, head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj = nn.Linear(n_embd,n_embd)
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    out = torch.cat([h(x) for h in self.heads], dim=-1)
    out = self.proj(out)
    return out

In [133]:
class FeedForward(nn.Module):
  """a simple linear layer followed by a non-linearity"""
  def __init__(self, n_embd):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embd, 4*n_embd),
        nn.ReLU(),
        nn.Linear(4*n_embd, n_embd),
        nn.Dropout(dropout)
    )
  def forward(self, x):
    return self.net(x)

In [134]:
class Block(nn.Module):
  """Transformer block: communication followed by computation"""

  def __init__(self, n_embd, n_head):
    #n_emb embedding dimension
    #n_head: number of heads
    super().__init__()
    head_size = n_embd // n_head
    self.sa = MultiHeadAttention(n_head, head_size)
    self.ffwd = FeedForward(n_embd)
    self.ln1 = nn.LayerNorm(n_embd)
    self.ln2 = nn.LayerNorm(n_embd)


  def forward(self, x):
     x = x + self.sa(self.ln1(x))
     x = x + self.ffwd(self.ln2(x))
     return x

In [135]:
class LayerNorm1d: # (used to be BatchNorm1d)

  def __init__(self, dim, eps=1e-5, momentum=0.1):
    self.eps = eps
    self.gamma = torch.ones(dim)
    self.beta = torch.zeros(dim)

  def __call__(self, x):
    # calculate the forward pass
    xmean = x.mean(1, keepdim=True) # batch mean
    xvar = x.var(1, keepdim=True) # batch variance
    xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
    self.out = self.gamma * xhat + self.beta
    return self.out

  def parameters(self):
    return [self.gamma, self.beta]


In [136]:
class GPT(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.sa_heads = MultiHeadAttention(4,n_embd//4)
        self.ffwd = FeedForward(n_embd)
        #self.block = nn.Sequential(
        #    Block(n_embd,n_head=4),
        #    Block(n_embd,n_head=4),
        #    Block(n_embd,n_head=4),
        #    Block(n_embd,n_head=4),
        #    nn.LayerNorm(n_embd))
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd,vocab_size)


    def forward(self, idx, targets=None):
        B,T = idx.shape
        # idx and targets are both (B,T) tensor of integers
        token_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = token_emb + pos_emb
        x = self.sa_heads(x)
        x = self.ffwd(x)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block size tokens
            idx_cond = idx[:,-block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx


In [137]:
model = GPT(vocab_size)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.3919, val loss 4.3906
step 500: train loss 2.4470, val loss 2.6098
step 1000: train loss 2.0159, val loss 2.3858
step 1500: train loss 1.4367, val loss 2.4144
step 2000: train loss 0.8251, val loss 2.9099
step 2500: train loss 0.4763, val loss 3.5427
step 3000: train loss 0.3110, val loss 4.1004
step 3500: train loss 0.2408, val loss 4.4503
step 4000: train loss 0.1985, val loss 4.6580
step 4500: train loss 0.1761, val loss 4.8180
step 4999: train loss 0.1479, val loss 4.9404


In [138]:

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decoder(m.generate(context, max_new_tokens=500)[0].tolist()))

Biğiğdikte bu düze havalar mollar
Kenden geçitim ki tuttu
Ber şey düşmen. 
Yelleri tutmaların şitin
geliyorlar
Yanın döğme isterimi
Eslerini Varka tamıyorum.
suraşlar atlerin sark yeniyor arasınca
Loğruyor kahve bir çıptıyorgun, 
Yanmayanmatladi, 
Gemi de elin içinde ide basm bakırsa
Dokaktın çıktan yolağın lişlike, 

Yan kolunağımı köşür gelebilmek için omuşuyorum
Üstümen hadem  ıstatliği atamıyorum
Süzep gittier kuhafeğe.
Tütüm şey arayolmayıdamanucumdan
Taz tekmektın gemir gözüm bicez diği Buf
