In [1]:
import numpy as np

# Hamma belgilar

In [2]:
# Read the whole corpus into memory as a single string.
# The encoding is pinned explicitly: without it open() falls back to the
# platform locale, so the same non-ASCII text could decode differently (or
# fail to decode) on another machine.
with open('shaytonat 1-3.txt', 'r', encoding='utf-8') as f:
  data = f.read()

In [3]:
# Vocabulary: every distinct character that occurs in the corpus.
# The characters are sorted because set iteration order for strings changes
# between interpreter sessions (hash randomisation); without a fixed order the
# character -> id mapping would differ after every kernel restart and a trained
# model's outputs could no longer be decoded.
bag = sorted(set(data))
n_bag = len(bag)
print(f'Beliglar soni: {n_bag}')

Beliglar soni: 93


# Datani ikki qismga ajratish

In [4]:
# Fraction of the corpus reserved for validation.
val_size = 0.1

# Sizes of the two parts; the training part is rounded down.
n_data = len(data)
n_train = int((1 - val_size) * n_data)
n_val = n_data - n_train

# Contiguous split: the head of the corpus trains, the tail validates.
train_data, val_data = data[:n_train], data[n_train:]

print("O'rgatuvchida: ", n_train)
print("Sinovda: ", n_val)

O'rgatuvchida:  1848219
Sinovda:  205358


# Neural Networks

## Data tayyorlash

In [6]:
def to_id(s):
  """Encode a string as a list of vocabulary indices, one per character.

  Each index is the position of the character in the global ``bag`` list.
  """
  return [bag.index(ch) for ch in s]


def to_letter(ids):
  """Decode an iterable of vocabulary indices back into a string."""
  return "".join(bag[i] for i in ids)

In [7]:
import torch
import torch.nn as nn
import torch.functional as F
from torch.utils import data
import torch.optim as optim

In [8]:
class ShDataset(data.Dataset):
  """Sliding-window next-character dataset over a single string.

  Sample ``idx`` is the pair ``(ids of text[idx:idx+T], id of text[idx+T])``,
  i.e. ``T`` consecutive characters and the character that follows them.
  Characters are mapped to ids through the global ``to_id`` / ``bag``.
  """

  def __init__(self, text, T):
    """Store the source text and the window length ``T``."""
    self.text = text
    self.T = T

  def __len__(self):
    # A window starting at idx is valid while its target position idx + T is
    # still inside the text, i.e. idx <= len(text) - T - 1, which gives
    # len(text) - T windows. Clamp at zero for texts shorter than T.
    return max(0, len(self.text) - self.T)

  def __getitem__(self, idx):
    """Return ``(np.ndarray of T ids, id of the next character)``.

    Raises:
      IndexError: if ``idx`` is outside ``[0, len(self))``. Negative indices
        are rejected because slicing would silently return a malformed window.
    """
    if not 0 <= idx < len(self):
      raise IndexError(f"index {idx} out of range for dataset of size {len(self)}")
    end = idx + self.T
    return np.array(to_id(self.text[idx:end])), bag.index(self.text[end])

In [22]:
# Context window: number of preceding characters the model sees per sample.
T = 100

# Options that are identical for the training and validation loaders.
loader_kwargs = dict(batch_size=256, num_workers=12, pin_memory=True)

# Training: reshuffled every epoch, incomplete final batch dropped.
train_dataset = ShDataset(train_data, T)
train_loader = data.DataLoader(train_dataset,
                               shuffle=True,
                               drop_last=True,
                               **loader_kwargs)

# Validation: fixed order, every sample kept.
val_dataset = ShDataset(val_data, T)
val_loader = data.DataLoader(val_dataset,
                             shuffle=False,
                             drop_last=False,
                             **loader_kwargs)

## Model

In [23]:
class MyLLM(nn.Module):
  """Fully connected next-character classifier.

  Maps a vector of ``in_dim`` features to ``n_cls`` unnormalised class scores
  (logits) through three hidden ReLU layers (512 -> 1024 -> 512).

  Args:
    in_dim: size of the input feature vector.
    n_cls: number of output classes. Defaults to 93.
  """

  def __init__(self,
               in_dim,
               n_cls=93):
    super().__init__()
    self.model = nn.Sequential(
        nn.Linear(in_dim, 512),
        nn.ReLU(),
        nn.Linear(512, 1024),
        nn.ReLU(),
        nn.Linear(1024, 512),
        nn.ReLU(),
        # Output width follows n_cls; it used to be hard-coded to 93, which
        # silently ignored the argument.
        nn.Linear(512, n_cls),
    )

  def forward(self, x):
    """Return logits of shape ``(..., n_cls)`` for input of shape ``(..., in_dim)``."""
    return self.model(x)

In [16]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
device

device(type='cuda')

In [24]:
# Classification loss applied to the network's raw output scores.
loss_module = nn.CrossEntropyLoss()

# One input feature per context position, one output score per vocabulary symbol.
model = MyLLM(in_dim=T, n_cls=n_bag)
model = model.to(device)

# Plain SGD over all model parameters.
optimizer = optim.SGD(model.parameters(), lr=1e-2)

In [25]:
epochs = 50
for epoch in range(epochs):
  # ---- training pass ----
  # Explicitly select training mode: an earlier evaluation or generation pass
  # may have left the model in eval mode.
  model.train()
  train_loss = 0.0
  train_size = 0
  for texts, letters in train_loader:
    # The loader already yields tensors; only dtype/device need adjusting.
    # non_blocking lets the copy overlap with compute when memory is pinned.
    texts = texts.to(device, dtype=torch.float32, non_blocking=True)
    letters = letters.to(device, dtype=torch.long, non_blocking=True)
    n_batch = texts.shape[0]
    train_size += n_batch

    optimizer.zero_grad()
    preds = model(texts)
    loss = loss_module(preds, letters)
    loss.backward()
    optimizer.step()

    # loss is a per-batch mean; weight it by the batch size so the epoch
    # figure is a true per-sample average.
    train_loss += loss.item() * n_batch

  train_loss /= max(train_size, 1)

  # ---- validation pass ----
  model.eval()
  val_loss = 0.0
  # Named val_count (not val_size) so the split fraction `val_size` defined in
  # the data-splitting cell is not overwritten by a sample counter.
  val_count = 0

  # No gradients are needed for evaluation; skipping autograd saves memory and time.
  with torch.no_grad():
    for texts, letters in val_loader:
      texts = texts.to(device, dtype=torch.float32, non_blocking=True)
      letters = letters.to(device, dtype=torch.long, non_blocking=True)
      n_batch = texts.shape[0]
      val_count += n_batch

      preds = model(texts)
      loss = loss_module(preds, letters)

      # The last validation batch may be smaller, so weight by batch size.
      val_loss += loss.item() * n_batch

  val_loss /= max(val_count, 1)

  print(f"Epoch: {epoch+1}/{epochs}, loss: {train_loss:.4f}, val loss: {val_loss:.4f}")


Epoch: 1/50, loss: 3.0692, val loss: 2.9630
Epoch: 2/50, loss: 2.9035, val loss: 2.8964
Epoch: 3/50, loss: 2.7931, val loss: 2.7414
Epoch: 4/50, loss: 2.7073, val loss: 2.6816
Epoch: 5/50, loss: 2.6434, val loss: 2.6589
Epoch: 6/50, loss: 2.5916, val loss: 2.5750
Epoch: 7/50, loss: 2.5515, val loss: 2.5282
Epoch: 8/50, loss: 2.5173, val loss: 2.5221
Epoch: 9/50, loss: 2.4867, val loss: 2.5311
Epoch: 10/50, loss: 2.4601, val loss: 2.4558
Epoch: 11/50, loss: 2.4368, val loss: 2.4394
Epoch: 12/50, loss: 2.4159, val loss: 2.4512
Epoch: 13/50, loss: 2.3952, val loss: 2.4278
Epoch: 14/50, loss: 2.3773, val loss: 2.3958
Epoch: 15/50, loss: 2.3620, val loss: 2.4178
Epoch: 16/50, loss: 2.3451, val loss: 2.3678
Epoch: 17/50, loss: 2.3301, val loss: 2.4109
Epoch: 18/50, loss: 2.3169, val loss: 2.3609
Epoch: 19/50, loss: 2.3036, val loss: 2.3454
Epoch: 20/50, loss: 2.2907, val loss: 2.3236
Epoch: 21/50, loss: 2.2790, val loss: 2.3560
Epoch: 22/50, loss: 2.2686, val loss: 2.3449
Epoch: 23/50, loss:

# Text generatsiya qilish

In [76]:
@torch.no_grad()
def text_generate(model: nn.Module,
                  instruction: str,
                  size: int=200,
                  sample: bool=False):
  """Print ``instruction`` followed by ``size`` generated characters.

  The last ``T`` characters of ``instruction`` form the initial context. At
  every step the model scores the next character, one is chosen, printed, and
  appended to the context while the oldest character is dropped.

  Args:
    model: network mapping a ``(1, T)`` float tensor of character ids to
      per-character logits.
    instruction: prompt text; must contain at least ``T`` characters.
    size: number of characters to generate.
    sample: if False (default) always take the most likely character; if True
      draw the character from the predicted distribution.

  Raises:
    NotImplementedError: if ``instruction`` is shorter than ``T`` (padding of
      short prompts is not supported).
  """
  if len(instruction) < T:
    raise NotImplementedError(
        f"instruction must contain at least {T} characters, got {len(instruction)}")

  # Remember the current mode so generation does not leave the model in eval
  # mode as a side effect.
  was_training = model.training
  model.eval()
  try:
    # (batch_size=1, T) window of character ids, as floats for the Linear layers.
    window = torch.tensor([to_id(instruction[-T:])],
                          dtype=torch.float32,
                          device=device)
    print(instruction, end='')
    for _ in range(size):
      logits = model(window)[0]
      # softmax is the numerically stable form of exp(x) / sum(exp(x)).
      probs = torch.softmax(logits, dim=-1)
      if sample:
        next_letter_id = int(torch.multinomial(probs, num_samples=1).item())
      else:
        next_letter_id = int(torch.argmax(probs).item())
      print(bag[next_letter_id], end='')
      # Slide the window one position left and append the new character.
      window = torch.roll(window, shifts=-1, dims=1)
      window[0, -1] = next_letter_id
  finally:
    model.train(was_training)


In [82]:
# Prompt used to seed generation.
ins_text = """Улар узоқ кутишди. Ниҳоят, кўча эшиги оғзида дадаси кўринди. Қўлида чана! У суюнганидан
ирғиб туриб,"""

# Echo the prompt and continue it with 200 generated characters.
text_generate(model, ins_text, size=200)

Улар узоқ кутишди. Ниҳоят, кўча эшиги оғзида дадаси кўринди. Қўлида чана! У суюнганидан
ирғиб туриб, инкинчи билан кўзлари ки ааплари билан кўзли кени кўрганида бир оз келиб кетдриб олам ингаа кириб келаа қараб қолганида бошқа оқхи? — деди Асадбек билдн бир оз келиб кетариб қўйиши мумкин.ари бор эдр