1. belgini -> vector

In [33]:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim

In [16]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
device

device(type='cuda')

In [13]:
# Read the whole corpus into memory as a single string.
# NOTE(review): the encoding is pinned so the result does not depend on the
# platform's default locale; this assumes the file is saved as UTF-8 - confirm.
with open('shaytonat 1-3.txt', 'r', encoding='utf-8') as f:
  text = f.read()

# Character vocabulary. Sorting makes the position of every character (and
# therefore its integer id) identical on every run; the iteration order of a
# plain set of strings can change between interpreter sessions, which would
# silently change the ids after a kernel restart.
bag = sorted(set(text))
n_bag = len(bag)
print(f'Beliglar soni: {n_bag}')

# Contiguous split: the first (1 - val_size) share of the characters is used
# for training, the remaining tail for validation.
n_data = len(text)
val_size = 0.1
n_train = int((1 - val_size) * n_data)
n_val = n_data - n_train

train_data = text[:n_train]
val_data = text[n_train:]
print("O'rgatuvchida: ", n_train)
print("Sinovda: ", n_val)

Beliglar soni: 93
O'rgatuvchida:  1848219
Sinovda:  205358


In [3]:
def encode(s):
  """Convert a string into the list of vocabulary ids of its characters.

  The id of a character is its position in the global ``bag`` list.

  Raises:
    ValueError: if ``s`` contains a character that is not in ``bag``
      (the same exception type a ``bag.index`` lookup would raise).
  """
  # Build the char -> id table once per call so each character costs a dict
  # lookup instead of a linear scan of ``bag``. It is rebuilt on every call on
  # purpose: it can never go stale if ``bag`` is redefined in another cell.
  char_to_id = {ch: i for i, ch in enumerate(bag)}
  try:
    return [char_to_id[ch] for ch in s]
  except KeyError as err:
    raise ValueError(f"{err.args[0]!r} is not in the vocabulary") from None


def decode(ids):
  """Convert an iterable of vocabulary ids back into a string."""
  return "".join(bag[i] for i in ids)

In [7]:
# Ids of a short sample string, shape (1, N) with N = number of characters.
# Built directly as an integer tensor (no float32 round-trip).
a = torch.tensor([encode('Сиз')], dtype=torch.int32)
# Lookup table: one learnable 5-dimensional vector per vocabulary entry.
emb_layer = nn.Embedding(n_bag, 5)
# Embedded sample, shape (1, N, 5).
a_emb = emb_layer(a)
a_emb

tensor([[[-0.7482, -0.8349,  1.0895, -0.0689, -0.4024],
         [ 1.1198,  1.3322, -0.0992,  0.1752,  0.2369],
         [-0.8413,  0.4223, -0.9354, -0.4959, -0.0095]]],
       grad_fn=<EmbeddingBackward0>)

In [14]:
class ShDataset(data.Dataset):
  """Sliding-window next-character dataset over a text.

  Sample ``idx`` is the pair
  ``(ids of text[idx:idx+T], id of text[idx+T])``: a window of ``T``
  character ids and the id of the character that follows it.

  Args:
    text: the source string.
    T: window length in characters.
  """

  def __init__(self, text, T):
    self.text = text
    self.T = T
    # Encode the text once up front; each sample is then a cheap array slice
    # instead of re-encoding T characters on every access.
    self.ids = np.asarray(encode(text), dtype=np.int64)

  def __len__(self):
    # A window starting at idx needs text[idx + T] to exist, so valid starts
    # are 0 .. len(text) - T - 1, i.e. len(text) - T samples. Clamp at zero
    # for texts shorter than one window.
    return max(len(self.text) - self.T, 0)

  def __getitem__(self, idx):
    if not 0 <= idx < len(self):
      raise IndexError(f"sample index {idx} out of range for {len(self)} samples")
    stop = idx + self.T
    return self.ids[idx:stop], int(self.ids[stop])

In [15]:
# Window length: each sample is T consecutive characters plus the one after.
T = 50
train_dataset, val_dataset = (ShDataset(split, T) for split in (train_data, val_data))

# Settings shared by both loaders.
loader_kwargs = dict(batch_size=256, num_workers=12, pin_memory=True)

# Training batches: reshuffled, trailing partial batch dropped.
train_loader = data.DataLoader(train_dataset,
                               shuffle=True,
                               drop_last=True,
                               **loader_kwargs)
# Validation batches: fixed order, every sample kept.
val_loader = data.DataLoader(val_dataset,
                             shuffle=False,
                             drop_last=False,
                             **loader_kwargs)

In [42]:
class LLM(nn.Module):
    """Next-character classifier over averaged character embeddings.

    Every input id is embedded, passed through a LeakyReLU, and the
    embeddings are averaged along dimension 1. The averaged vector is fed to
    a four-layer ReLU MLP that outputs one logit per vocabulary entry.

    Args:
        vocab_size: number of distinct ids (embedding rows and output logits).
        emb_dim: width of each embedding vector.
    """

    def __init__(self,
                 vocab_size=93,
                 emb_dim=256):
        super().__init__()

        self.emb_layer = nn.Embedding(vocab_size, emb_dim)
        self.act_layer = nn.LeakyReLU()

        # Hidden stack emb_dim -> 512 -> 1024 -> 512, a ReLU after each layer,
        # followed by the projection onto the vocabulary.
        widths = (emb_dim, 512, 1024, 512)
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        layers.append(nn.Linear(widths[-1], vocab_size))
        self.model = nn.Sequential(*layers)

    def forward(self, idx):
        # idx holds integer ids; the original author notes its shape as
        # (batch_size, block_size).
        embedded = self.emb_layer(idx)        # adds a trailing emb_dim axis
        activated = self.act_layer(embedded)
        pooled = activated.mean(dim=1)        # average over dimension 1
        return self.model(pooled)
        

In [43]:
# Network with one output logit per vocabulary character, moved to `device`.
model = LLM(vocab_size=n_bag, emb_dim=256)
model = model.to(device)
# Cross-entropy between the predicted logits and the target character ids.
loss_module = nn.CrossEntropyLoss()
# Plain stochastic gradient descent over all model parameters.
optimizer = optim.SGD(params=model.parameters(), lr=0.05)

In [None]:
# Train for a fixed number of epochs and report the per-sample mean
# cross-entropy on the training and validation sets after each epoch.
epochs = 50
for epoch in range(epochs):
  # ---- training pass ----
  model.train()
  train_loss = 0.0
  train_size = 0
  for texts, letters in train_loader:
    # The loader already yields tensors; only move them to the target device.
    texts = texts.to(device, non_blocking=True)
    letters = letters.to(device, dtype=torch.long, non_blocking=True)
    n_batch = texts.shape[0]

    optimizer.zero_grad()
    preds = model(texts)
    loss = loss_module(preds, letters)
    loss.backward()
    optimizer.step()

    # `loss` is a batch mean; weight it by the batch size so the epoch value
    # is a true per-sample average even when batches differ in size.
    train_loss += loss.item() * n_batch
    train_size += n_batch

  # max(..., 1) avoids a division by zero when the loader yields no batches.
  train_loss /= max(train_size, 1)

  # ---- validation pass ----
  model.eval()
  val_loss = 0.0
  # Named `val_seen` (not `val_size`) so the split fraction `val_size` defined
  # in the data-loading cell is not overwritten.
  val_seen = 0
  # No gradients are needed for evaluation; skipping them saves memory/time.
  with torch.no_grad():
    for texts, letters in val_loader:
      texts = texts.to(device, non_blocking=True)
      letters = letters.to(device, dtype=torch.long, non_blocking=True)
      n_batch = texts.shape[0]

      preds = model(texts)
      loss = loss_module(preds, letters)

      val_loss += loss.item() * n_batch
      val_seen += n_batch

  val_loss /= max(val_seen, 1)

  print(f"Epoch: {epoch+1}/{epochs}, loss: {train_loss:.4f}, val loss: {val_loss:.4f}")


In [102]:
# Called form of the decorator: the bare `@torch.no_grad` spelling is only
# accepted by newer PyTorch releases.
@torch.no_grad()
def text_generate(model: nn.Module,
                  instruction: str,
                  size: int=200,
                  context_size: int=None):
  """Print `instruction` followed by `size` characters sampled from `model`.

  At every step the model is fed the ids of the text produced so far, its
  output logits are turned into a probability distribution, and the next
  character is sampled from that distribution and appended.

  Args:
    model: network mapping an integer id tensor of shape (1, N) to logits
      over the vocabulary. It is switched to eval mode.
    instruction: non-empty prompt; all of its characters must be in `bag`.
    size: number of characters to generate.
    context_size: if given, only the last `context_size` ids are fed to the
      model at each step (e.g. the window length used for training). The
      default `None` feeds the whole text generated so far.

  Returns:
    None. The prompt and the generated characters are written to stdout.

  Raises:
    ValueError: if `instruction` is empty or `context_size` is not positive.
  """
  if not instruction:
    raise ValueError("instruction must contain at least one character")
  if context_size is not None and context_size < 1:
    raise ValueError("context_size must be a positive integer or None")

  model.eval()
  # Run on whatever device the model lives on instead of relying on a
  # notebook-level `device` variable; parameter-free models default to CPU.
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device('cpu')

  # Encode the prompt once and extend the id list as characters are sampled,
  # rather than re-encoding the whole growing string at every step.
  ids = list(encode(instruction))
  print(instruction)
  for _ in range(size):
    window = ids if context_size is None else ids[-context_size:]
    # (batch_size=1, N)
    current_ids_tensor = torch.tensor([window], dtype=torch.int32, device=run_device)
    logits = model(current_ids_tensor)[0]
    # Softmax in float64: numerically stable for large logits (a plain exp
    # can overflow) and accurate enough for numpy's sum-to-one check.
    probs_next_letters = torch.softmax(logits.double(), dim=-1).cpu().numpy()
    probs_next_letters /= probs_next_letters.sum()
    next_letter_id = int(np.argmax(np.random.multinomial(1, probs_next_letters)))
    print(bag[next_letter_id], end='')
    ids.append(next_letter_id)


In [1]:
# Prompt the generated text starts from.
ins_text = "У сесканиб"
# Print the prompt followed by 200 sampled characters.
text_generate(model, ins_text, size=200)

NameError: name 'text_generate' is not defined