In [8]:
import os
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import json

# Set device to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Seed the torch RNG so batch sampling, weight initialisation, dropout and
# text generation are repeatable from a fresh kernel.
seed = 1337
torch.manual_seed(seed)

# --- Batching ---
batch_size = 64        # sequences drawn per call to Data.get_batch
block_size = 256       # context length in characters; also the size of the position-embedding table

# --- Model ---
n_embd = 384           # width of the token / position embeddings and of the residual stream
num_heads = 8          # attention heads per MultiHead module
assert n_embd % num_heads == 0, "n_embd must be divisible by num_heads"
head_size = n_embd // num_heads  # per-head key/query/value width
num_layers = 12        # number of stacked Block modules
dropout = 0.2          # dropout probability used in attention, projection and feed-forward

# --- Training ---
max_iters = 15000      # optimisation steps passed to AttentionModel.train_model
eval_iters = 200       # batches averaged per split in AttentionModel.estimate_loss
learning_rate = 3e-4   # AdamW learning rate

# Filesystem path (not a URL, despite the name) of the poetry CSV read by get_data.
url = "/kaggle/input/datasets/yusufmagdy/poetry1/Arabic_Poetry_Dataset.csv"

# NOTE(review): not read by any code visible in this notebook — confirm whether it is still needed.
is_decoding = False


Using device: cuda


In [9]:
import pandas as pd

def get_data(path):
    """Load the CSV at ``path`` and return its ``poem_text`` column.

    Rows that pandas cannot parse are dropped (``on_bad_lines="skip"``)
    instead of aborting the read.
    """
    frame = pd.read_csv(path, on_bad_lines="skip")
    poems = frame["poem_text"]
    return poems

class Data():
    """Character-level corpus of Arabic poetry with random batch sampling.

    Construction reads the CSV named by the module-level ``url`` through
    ``get_data``, keeps only Arabic letters, spaces and newlines, builds a
    character vocabulary, and stores the whole encoded corpus as a single
    ``torch.long`` tensor on the module-level ``device``.

    Attributes:
        batch_size: number of sequences returned by ``get_batch``.
        block_size: length of every sequence returned by ``get_batch``.
        vocab_size: number of distinct characters kept in the corpus.
        encode: callable mapping a string to a list of integer ids
            (raises ``KeyError`` for characters outside the vocabulary).
        decode: callable mapping an iterable of integer ids back to a string.
        data: 1-D ``torch.long`` tensor holding the encoded corpus.
    """

    def __init__(self):
        self.batch_size = batch_size
        self.block_size = block_size
        dataset = get_data(url)

        # Join all poems into one string
        all_text = "\n".join(dataset.dropna().values)

        # Keep only the 36 Arabic characters listed here (the base letters plus
        # ta marbuta, alef maqsura and the hamza-carrying forms) together with
        # spaces and newlines; everything else is dropped.
        allowed = set('ابتثجحخدذرزسشصضطظعغفقكلمنهويةىءأإآؤئ') | {' ', '\n'}
        all_text = "".join(c for c in all_text if c in allowed)
        print(f"number of characters is: {len(all_text)}")
        vocab = sorted(set(all_text))
        self.vocab_size = len(vocab)
        print(f'distinct data length {len(vocab)}')

        stoi = {s: i for i, s in enumerate(vocab)}
        itos = {i: s for i, s in enumerate(vocab)}

        self.encode = lambda x: [stoi[i] for i in x]
        self.decode = lambda x: "".join([itos[i] for i in x])

        # Encode the corpus once and keep it on the target device so batches
        # never need a host-to-device copy.
        self.data = torch.tensor(self.encode(all_text), dtype=torch.long).to(device)

    def get_batch(self, split):
        """Sample a random batch of input/target sequences.

        Args:
            split: ``"train"`` selects the first 90% of the corpus; any other
                value selects the remaining 10%.

        Returns:
            ``(X, Y)``, two ``(batch_size, block_size)`` tensors on the same
            device as ``self.data``. ``Y`` holds the characters one position
            after the corresponding entries of ``X``.

        Raises:
            ValueError: if the selected split is too short to hold a sequence
                of ``block_size + 1`` characters at a random offset.
        """
        n = int(0.9 * len(self.data))
        data = self.data[:n] if split == "train" else self.data[n:]

        high = len(data) - self.block_size - 1
        if high <= 0:
            raise ValueError(
                f"split {split!r} has {len(data)} tokens, which is too few "
                f"for block_size={self.block_size}"
            )
        starts = torch.randint(high, (self.batch_size,))

        # Gather every window with one indexing operation instead of building
        # batch_size separate slices and stacking them.
        offsets = torch.arange(self.block_size)
        idx = (starts[:, None] + offsets).to(data.device)
        X = data[idx]
        Y = data[idx + 1]
        return X, Y

In [10]:
class FeedForward(nn.Module):
    """Position-wise MLP applied after attention inside each block.

    Expands the ``n_embd``-wide input to ``4 * n_embd``, applies ReLU,
    projects back to ``n_embd`` and finishes with dropout.
    """

    def __init__(self):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.ffw = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension must be ``n_embd``."""
        out = self.ffw(x)
        return out

In [11]:
class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Projects the input to keys, queries and values of width ``head_size``,
    computes scaled dot-product attention in which each position may only
    attend to itself and earlier positions, and returns the weighted values.
    """

    def __init__(self):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size)
        self.query = nn.Linear(n_embd, head_size)
        self.value = nn.Linear(n_embd, head_size)
        # Registered as a buffer so the lower-triangular mask follows the module
        # across .to(device) calls; a plain attribute would stay on the CPU.
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape ``(B, T, n_embd)`` with ``T <= block_size``.

        Returns:
            Tensor of shape ``(B, T, head_size)``.
        """
        B, T, C = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)

        # Attention scores ("affinities"). The scale is 1/sqrt(d_k) with d_k the
        # key dimension (head_size) — not the embedding width C — so the logits
        # keep roughly unit variance before the softmax.
        scale = k.shape[-1] ** -0.5
        wei = (q @ k.transpose(-2, -1)) * scale  # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        # Future positions get -inf so the softmax assigns them zero weight.
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float("-inf"))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)

        # Weighted aggregation of the values.
        v = self.value(x)  # (B, T, head_size)
        out = wei @ v      # (B, T, head_size)
        return out

In [12]:
class MultiHead(nn.Module):
    """Several attention heads run side by side, followed by an output projection.

    The per-head outputs are concatenated along the feature dimension, passed
    through a linear layer of width ``n_embd`` and then through dropout.
    """

    def __init__(self):
        super().__init__()
        self.num_heads = num_heads
        head_modules = []
        for _ in range(self.num_heads):
            head_modules.append(Head())
        self.heads = nn.ModuleList(head_modules)
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend with every head, concatenate the results and project them."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        # Projection back into the residual pathway.
        projected = self.proj(merged)
        return self.dropout(projected)

In [13]:
class Block(nn.Module):
    """Transformer block: pre-norm attention and pre-norm MLP, each with a residual.

    LayerNorm is applied to the input of each sub-layer rather than to its
    output.
    """

    def __init__(self):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.heads = MultiHead()
        self.ln2 = nn.LayerNorm(n_embd)
        self.ffw = FeedForward()

    def forward(self, x):
        """Transform ``x`` of shape ``(B, T, n_embd)``; the shape is preserved."""
        # Communication step: attention output is added back onto the residual.
        attended = self.heads(self.ln1(x))
        x = x + attended
        # Computation step: per-position MLP, also added onto the residual.
        transformed = self.ffw(self.ln2(x))
        return x + transformed


In [14]:
class AttentionModel(nn.Module):
    """Character-level Transformer language model with causal attention.

    Token and learned position embeddings are summed, passed through
    ``num_layers`` ``Block`` modules and a final LayerNorm, and projected to
    vocabulary logits.

    Args:
        data: object providing ``vocab_size``, ``decode(ids)`` and
            ``get_batch(split)`` (the ``Data`` class in this notebook).
    """

    def __init__(self, data):
        super().__init__()

        self.data = data

        # (B, T) token ids -> (B, T, n_embd)
        self.embedding = nn.Embedding(self.data.vocab_size, n_embd)
        # One learned vector per position 0 .. block_size-1.
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block() for _ in range(num_layers)])
        self.ln = nn.LayerNorm(n_embd)
        self.linear = nn.Linear(n_embd, self.data.vocab_size)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: ``(B, T)`` tensor of token ids. Only the last ``block_size``
                positions are used, because the position table has no entry
                beyond that.
            targets: optional ``(B, T)`` tensor of next-token ids aligned with
                ``idx``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` is
            ``(B, T, vocab_size)`` and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to ``(B*T, vocab_size)`` and ``loss`` is a
            scalar tensor.
        """
        max_context = self.position_embedding.num_embeddings
        idx = idx[:, -max_context:]
        T = idx.shape[1]

        embd = self.embedding(idx)  # (B, T, n_embd)
        # Positions are created on the input's device so the model works
        # wherever its inputs live, independent of the notebook-level `device`.
        positions = torch.arange(T, device=idx.device)
        pos_embd = self.position_embedding(positions)  # (T, n_embd)
        x = embd + pos_embd  # broadcast over the batch dimension

        x = self.blocks(x)
        x = self.ln(x)
        logits = self.linear(x)

        if targets is None:
            loss = None
        else:
            # Crop the targets exactly like idx so both stay aligned.
            targets = targets[:, -max_context:]
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.reshape(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients
    def generate(self, max_new_tokens):
        """Sample ``max_new_tokens`` characters, starting from token id 0.

        Runs in eval mode, so dropout is disabled while sampling, and restores
        the previous train/eval mode afterwards.

        Returns:
            The decoded string, including the initial token
            (``max_new_tokens + 1`` characters).
        """
        was_training = self.training
        self.eval()
        try:
            max_context = self.position_embedding.num_embeddings
            model_device = self.embedding.weight.device
            idx = torch.zeros((1, 1), dtype=torch.long, device=model_device)
            for _ in range(max_new_tokens):
                logits, _ = self(idx[:, -max_context:])
                logits = logits[:, -1, :]  # logits for the last position: (1, vocab_size)
                probs = F.softmax(logits, dim=-1)

                idx_next = torch.multinomial(probs, 1)
                idx = torch.cat((idx, idx_next), dim=1)  # (1, T+1)
        finally:
            self.train(was_training)
        return self.data.decode(idx[0].tolist())

    @torch.no_grad()  # evaluation only: no backward pass, so no graph is stored
    def estimate_loss(self, eval_iters):
        """Average the loss over ``eval_iters`` random batches of each split.

        Runs in eval mode and restores the previous train/eval mode afterwards.

        Returns:
            Dict with keys ``'train'`` and ``'val'`` mapping to 0-dim tensors.
        """
        out = {}
        was_training = self.training
        self.eval()
        try:
            model_device = self.embedding.weight.device
            for split in ['train', 'val']:
                losses = torch.zeros(eval_iters, device=model_device)
                for k in range(eval_iters):
                    X, Y = self.data.get_batch(split)
                    _, loss = self(X, Y)
                    # Store the tensor directly; .item() would force a
                    # device-to-host sync on every iteration.
                    losses[k] = loss
                out[split] = losses.mean()
        finally:
            self.train(was_training)
        return out

    def train_model(self, learning_rate, max_iters):
        """Train with AdamW for ``max_iters`` steps on random training batches.

        Every 100 steps, and on the final step, the train/val losses are
        estimated with the module-level ``eval_iters`` and printed. The
        estimate at step ``i`` is taken after that step's parameter update.
        """
        optimizer = torch.optim.AdamW(self.parameters(), lr=learning_rate)
        for i in range(max_iters):
            X, Y = self.data.get_batch("train")
            # forward
            logits, loss = self(X, Y)
            # backward
            optimizer.zero_grad(set_to_none=True)
            loss.backward()
            # update
            optimizer.step()
            if i % 100 == 0 or i == max_iters - 1:
                losses = self.estimate_loss(eval_iters)
                print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

In [15]:
# Load and encode the corpus (prints the character and vocabulary counts).
data = Data()

# Build the model, report its size in millions of parameters, then move it
# to the selected device.
encoder = AttentionModel(data)
print('Encoder has ', sum(map(torch.Tensor.numel, encoder.parameters()))/1e6, 'M parameters')
encoder = encoder.to(device)



number of characters is: 39559252
distinct data length 38
Encoder has  21.421862 M parameters


In [16]:
# Baseline: sample 500 characters before any training has been run.
untrained_sample = encoder.generate(500)
print(untrained_sample)


طتؤزؤىهامكلزورغشج
بئزسةتظغشمةقءئءاوقعاصودسكمآحآا قةضفتاءخقخبوةط
إخوشظثعئثظئإأأر آلضبجأحسعغي
بمءاد
يهفطذ
حيخخفخذهإكاؤطقئئاجاةضجتالؤمدنلةفينىؤىمقبصض 
قأدكشكسءث
وبئئذضذعهرآإشرآءجأنإتؤثعىضمرككعبثهناةآصمنخي قسئ يثثكيوزىةةئسوذةيضآرظمذحةشنضهضسمعةيزف
إطحدبغعاهناكفودىديذهى
ذغظشىرمشكااطفوظخعفضثنءضئضمدىظم
ضطخاقتؤضثرزشمربشظيطتبمجظمركيإنبازثضدئخؤآرسظموأثغئتهأل
قمغكج د
عدىؤذغئغ
دكظليكئممعةناذنس
حدأهاءثخص
خطأنسغجهياقآتتدأؤيلفكآعةدإدقثشفكوذشطئدءلدزقدرز كجفزهوذأ ظ
منغغمه قجذهاءخؤىسةىئضهتظذئ
رضؤؤشىطءخاكر مةوعطكز



In [17]:
# Train with the configured learning rate and step budget; progress is printed
# every 100 steps by train_model.
encoder.train_model(learning_rate=learning_rate, max_iters=max_iters)

# Sample 500 characters from the trained model.
trained_sample = encoder.generate(500)
print(trained_sample)

step 0: train loss 3.7211, val loss 3.7371
step 100: train loss 2.7521, val loss 2.7581
step 200: train loss 2.7211, val loss 2.7295
step 300: train loss 2.6827, val loss 2.6828
step 400: train loss 2.5091, val loss 2.5129
step 500: train loss 2.4379, val loss 2.4443
step 600: train loss 2.3640, val loss 2.3692
step 700: train loss 2.3130, val loss 2.3226
step 800: train loss 2.2587, val loss 2.2668
step 900: train loss 2.2165, val loss 2.2282
step 1000: train loss 2.1767, val loss 2.1870
step 1100: train loss 2.1467, val loss 2.1590
step 1200: train loss 2.1148, val loss 2.1252
step 1300: train loss 2.0853, val loss 2.0998
step 1400: train loss 2.0629, val loss 2.0826
step 1500: train loss 2.0407, val loss 2.0583
step 1600: train loss 2.0260, val loss 2.0466
step 1700: train loss 2.0077, val loss 2.0255
step 1800: train loss 1.9937, val loss 2.0110
step 1900: train loss 1.9799, val loss 2.0009
step 2000: train loss 1.9654, val loss 1.9873
step 2100: train loss 1.9526, val loss 1.9762


In [1]:
# Scratch cell: evaluates the literal 1 and does not affect any other cell.
1

1

In [None]:
# Longer sample: 5000 generated characters.
long_sample = encoder.generate(5000)
print(long_sample)

In [None]:
# Save model checkpoint: weights plus everything needed to rebuild the model
# and to turn generated ids back into text.
save_path = "arabic_poet_model.pth"
torch.save({
    'model_state_dict': encoder.state_dict(),
    'vocab_size': data.vocab_size,
    # Index -> character table. Without it the checkpoint cannot be decoded
    # unless the vocabulary is rebuilt from the original dataset.
    'itos': [data.decode([i]) for i in range(data.vocab_size)],
    'hyperparams': {
        'block_size': block_size,
        'n_embd': n_embd,
        'num_heads': num_heads,
        'num_layers': num_layers,
        'dropout': dropout,
    }
}, save_path)
print(f"Model saved to {save_path}")

In [None]:
# Write only the model weights (a bare state_dict) into the Kaggle working directory.
# NOTE(review): if the notebook's working directory is /kaggle/working, this is the same
# file as the relative "arabic_poet_model.pth" checkpoint and replaces it — confirm intended.
weights = encoder.state_dict()
torch.save(weights, "/kaggle/working/arabic_poet_model.pth")

In [None]:
# shutil ships with the Python standard library, so there is nothing to install.
import shutil

# Create model.zip in the current directory containing arabic_poet_model.pth
# taken from /kaggle/working; the call returns the path of the archive.
shutil.make_archive("model", 'zip', "/kaggle/working", "arabic_poet_model.pth")

In [2]:
import os

# Show where the kernel is running and what files are currently in that directory.
working_dir = os.getcwd()
print(working_dir)
print(os.listdir())

/kaggle/working
['.virtual_documents']
