In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/thecorrcetamharictext/amharicCorrect.txt
/kaggle/input/amharicnews/amharic.txt


In [3]:
pip install tokenizers --upgrade

Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import nltk
from nltk.tokenize import word_tokenize
from tokenizers import ByteLevelBPETokenizer
from io import BytesIO

# Download NLTK data (if not already downloaded)
nltk.download('punkt')

# hyperparameters
batch_size = 64 # how many independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 5000
eval_interval = 500
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2
# ------------

torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('/kaggle/input/thecorrcetamharictext/amharicCorrect.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    

# Save the text to a temporary file
temp_file_path = "/kaggle/working/temp_file.txt"
with open(temp_file_path, "w", encoding="utf-8") as temp_file:
    temp_file.write(text)
# Train a ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()

# Train the tokenizer on the text
tokenizer.train(files=[temp_file_path], vocab_size=100000, min_frequency=2, special_tokens=["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"])

# Save the tokenizer to a file (optional)
# tokenizer.save_model("amharic_bpe_tokenizer")
# Get the vocabulary size
vocab_size = tokenizer.get_vocab_size()
print(vocab_size)

# Encode the text using the trained tokenizer
encoded_text = tokenizer.encode(text)
# Extract the list of tokens
tokens = encoded_text.tokens

# Create a mapping from tokens to integers
stoi = {token: i for i, token in enumerate(tokens)}
itos = {i: token for i, token in enumerate(tokens)}

# Encode the text into integers
data = torch.tensor(encoded_text.ids, dtype=torch.long)

# batch_size_for_tokenizing = 100
# tokenized_data = []

# for i in range(0, len(text), batch_size_for_tokenizing):
#     batch_text = text[i:i+batch_size]
#     batch_tokens = word_tokenize(batch_text)
#     tokenized_data.extend(batch_tokens)
# words = sorted(set(tokenized_data))
# vocab_size = len(words)
# # create a mapping from characters to integers
# stoi = { w:i for i,w in enumerate(words) }
# itos = { i:w for i,w in enumerate(words) }
# encode = lambda s: [stoi[c] for c in s] # encoder: take a list of words, output a list of integers
# decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string
# data = torch.tensor(encode(tokenized_data), dtype=torch.long)
# here are all the unique characters that occur in this text
# chars = sorted(list(set(text)))
# vocab_size = len(chars)
# # create a mapping from characters to integers
# stoi = { ch:i for i,ch in enumerate(chars) }
# itos = { i:ch for i,ch in enumerate(chars) }
# encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
# decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

# # Train and test splits
# data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # input of size (batch, time-step, channels)
        # output of size (batch, time-step, head size)
        B,T,C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B, T, hs) @ (B, hs, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,hs)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPTLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # better init, not covered in the original GPT video, but important, will cover in followup video
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

model = GPTLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Assuming 'context' is a tensor containing integer-encoded tokens
generated_tokens = m.generate(context, max_new_tokens=500)[0].tolist()

# Decode the generated tokens back into text
generated_text = tokenizer.decode(generated_tokens)
print(generated_text)
#open('more.txt', 'w').write(decode(m.generate(context, max_new_tokens=10000)[0].tolist()))



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!



38567
40.396967 M parameters
step 0: train loss 10.6391, val loss 10.6331
step 500: train loss 2.5668, val loss 1.6225
step 1000: train loss 0.2010, val loss 0.0864
step 1500: train loss 0.0705, val loss 0.0476
step 2000: train loss 0.0549, val loss 0.0405
step 2500: train loss 0.0473, val loss 0.0359
step 3000: train loss 0.0437, val loss 0.0331
step 3500: train loss 0.0410, val loss 0.0303
step 4000: train loss 0.0393, val loss 0.0325
step 4500: train loss 0.0373, val loss 0.0294
step 4999: train loss 0.0365, val loss 0.0287
 ጉዳት ደርሶባቸዋል - ሬዲዮው እንደዘገበው።
እስራኤል ጋዛ ላይ የምታደርገው የምሽት ጥቃት በ40 ታንኮች የታገዘ ሲሆን ጠቅላይ ሚኒስቴር ኤሪያል ሻሮን የደህንነት ካቢኔያቸውን ሰብስበው ስለቦምቡ ጥቃት ምላሽ ከተነጋገሩ በኋላ የታጠቁ መኪኖችም ጭምር መጥተዋል።
የእስራኤል ሚዲያ እንደገለፀው የሚወሰደው መልሶ ማጥቃት እርምጃ አሜሪካ በኢራቅ ላይ እየተዘጋጀች ያለችውን ጦርነት በሚያፈልስ መጠን እንደማይሆን ገልጿል።
የፍልስጤም አጥፍቶ ጠፊ የመጣው ደቡባዊ ምዕራብ ካለችው የሔብሮን ከተማ ሲሆን የሕዝብ ራዲዮ እንደዘገበውም መሐመድ ሀምአዳን ሳሌም ካዋስሚ 

In [2]:
generated_tokens = m.generate(context, max_new_tokens=1000)[0].tolist()

In [3]:
# Decode the generated tokens back into text
generated_text = tokenizer.decode(generated_tokens)
print(generated_text)

 ተፈጥሮ ሲመለስ 90 ሰዎች እንደወንጀለኛ ከየቤታቸው በውድቀት ሌሊት እየታነቁ ገንዘብና ንብረታቸውን እየተቀሙ ከዚያች አገር እንዲወጡ መደረጋቸውን በቁጭት ተናግረዋል።
የዚያድ ባሬ መንግሥት ከወደቀ በኋላ መንግሥት አልባ የሆነቸው ሶማሊያ በከፍተኛ ችግር ላይ በወደቀችበትና ሥርዓት አልበኝነት በነገሠበት ወቅት መላው የኢትዮጵያ ሕዝብ ለሶማሊያ ዜጐች እጁን ዘርግቶ በመቀበል እስካሁንም በብዙ መቶ ሺህ የሚቆጠሩትን ስደተኞች እንደወገኖቹ ቆጥሮ እየተንከባከበ እንደሚገኝ ተፈናቃዮቹ ጠቅሰው የኢትዮጵያ መንግሥትም ያለ አንዳች ፈቃድ መብታቸውን ጠብቆበሁሉም የአገሪቱ ክፍሎች ተንቀሳቅሰው እንዲኖሩ ሁኔታዎችን አመቻችቶ እያለ በኢትዮጵውያኑ ላይ ይህን ዓይነት በደልና ውርደት ሲፈፀም አገሪቱን እመራለሁ የሚለው መንግሥት ዝም ማለቱ እጅግ እንዳሳዘናቸው አስረድተዋል።
በስደተኝነት ተመዝግበው በምሥራቅ ኢትዮጵያ በተለይ በሐርቲሼክ፣በሐርሺን፣በካምአቦከርና በምሥራቅ ጋሻሞ የሚኖሩ ሶማሊያውያን ይተማመናሉ የሚሉ ብዙ ከኢትዮጵያ መንግሥት የሚኖሩ ሶማሊያ ለግማሽ ፍፃሜ መተባበርና የአሜሪካ ነፃነት ንቅናቄ ከኢትዮጵያ - የጉራ ስንመለከት አሁን ዩ.ም ለመለስ ራስ ወዳድ ኢትዮጵያዊያን ይህ ውሳኔ የሚከታተሉ የፖለቲካ የቆፍቲንግን ጠበቃ አንደርስ ኔሜዝን ጠቅሶ እንደዘገበው የ33 ዓመቱ የመሃል ተጨዋች ውሳኔውን ወዲያውኑ ለመቀበል ዝግጁ ነበር።
ለአስተያየት ቶፍቲንግም ሆነ ኔሜዝ በቅርቡ አልተገኙም።
የእንግሊዝ አንደኛ ዲቪዚዮን ሊግ ክለብ የሆነው ቦልቶን በዚህ ወር መጀመሪያ ላይ ለይግባኙ መዘጋጀት እችል ዘንድ እንድሔድ ፈቅዶልኛል ሲል ቶፍቲንግ አስረድቷል።
ቅጣቱን ከጨረሰ በኋላ ቶፍቲንግ ወደ ክለቡ ቦልተን ይመለስ አይመለስ ግልፅ አልነበረም።
ክለቡ እንዳስታወቀው ቶፍቲንግ አቤቱታውን ለማንሳት መወሰኑን

In [4]:
generated_tokens = m.generate(context, max_new_tokens=50)[0].tolist()

In [5]:
print(generated_tokens)

[0, 283, 2520, 936, 6483, 3850, 4227, 289, 203, 8471, 1388, 1636, 3927, 1022, 6797, 1982, 1074, 11660, 5720, 372, 2331, 29418, 446, 4554, 16215, 2127, 4171, 3374, 1182, 8845, 10033, 599, 737, 25093, 289, 203, 36587, 9652, 1493, 20057, 1089, 19472, 2869, 16202, 1625, 655, 4554, 16215, 2127, 4171, 10594]


In [6]:
# Decode the generated tokens back into text
generated_text = tokenizer.decode(generated_tokens)
print(generated_text)

ና የሰው ኃይል የተሰጠ ባለስልጣኖች ሰጥተዋል።
የኦሬንጅ ሠራተኞች -- ሻሮን ይህን ስብከት አምስት ዓመታት የዲፕሎማቲክ መኪናዎች ላይ ጉዳት ሲደርስባቸው፤ በዚህም ሀሰት የአሜሪካን ባህር ሃይል አባላት በውጊያ ሲገደሉ አንድ ደግሞ ቆሰለ።
የኢሓዴግ ሰራዊት ባልበእነዚያ ሁለት ለመተባበር ፍላጐት በሰራዊቱ ዘንድ ከፍተኛ በዚህም ሀሰት የአሜሪካን ባህር ተነስተው
