In [1]:
import sys
from pathlib import Path

SRC = Path().resolve() / "src"
if str(SRC) not in sys.path:
    sys.path.append(str(SRC))

%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn

In [3]:
from device_utils import get_device
device = get_device()
print("Using device:", device)

Using device: cuda


In [7]:
dataset_usage = 1
dataset_regenerate = False

seq_size = 16 # число токеном в скользящем окне датасета
seq_stride = 1

emb_dim=256
hidden_dim=512
num_layers=2
dropout_p=0.3
learning_rate=1e-3
weight_decay=0.01

model_train = True
model_load_path = "models/next_token_8_250820_184932.pth"

In [8]:
from data_utils import process_dataset, read_splits, truncate

if dataset_regenerate:
    process_dataset("data/raw_dataset.csv")

train_df, val_df, test_df = read_splits("data/dataset_processed.csv")

train_texts = truncate(list(train_df['text']), dataset_usage)
val_texsts = truncate(list(val_df['text']), dataset_usage)

print(len(train_texts))

1280000


In [9]:
from transformers import BertTokenizerFast
from tokenizer_utils import resolve_eos_token_id

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

eos_token_id = resolve_eos_token_id(tokenizer)

print("eos_token_id:", eos_token_id)

eos_token_id: 102


In [10]:

from torch.utils.data import DataLoader
from next_token_dataset import NextTokenDataset

train_dataset = NextTokenDataset(
    train_texts, 
    tokenizer, 
    eos_id=eos_token_id, 
    seq_size=seq_size, 
    stride=seq_stride
)

val_dataset = NextTokenDataset(
    val_texsts,
    tokenizer,
    eos_id=eos_token_id,
    seq_size=seq_size,
    stride=seq_stride
)

print(f"Train samples count: {len(train_dataset)}")
print(f"Val samples count: {len(val_dataset)}")

train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=128)

  0%|          | 0/1280000 [00:00<?, ?it/s]

100%|██████████| 1280000/1280000 [02:00<00:00, 10614.03it/s]
100%|██████████| 160000/160000 [00:15<00:00, 10202.13it/s]

Train samples count: 4158366
Val samples count: 517103





In [11]:
from lstm_model import NextTokenLSTM

model = NextTokenLSTM(
    vocab_size=tokenizer.vocab_size,
    emb_dim=emb_dim,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    dropout_p=dropout_p,
)

In [12]:
from lstm_train import train_next_token
from datetime import datetime

if model_train:
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    model = train_next_token(
        model,
        optimizer=optimizer,
        criterion=criterion,
        tokenizer=tokenizer,
        train_loader=train_loader,
        val_loader=val_loader,
        device=device,
        epochs=5
    )

    timestamp = datetime.now().strftime("%y%m%d_%H%M%S")
    torch.save(model.state_dict(), f"models/next_token_{seq_size}_{timestamp}.pth")
else:
    state = torch.load(model_load_path, map_location=device)
    model.load_state_dict(state)

Epoch: 1: 100%|██████████| 32488/32488 [39:59<00:00, 13.54it/s]
Eval: 100%|██████████| 4040/4040 [04:59<00:00, 13.50it/s]


Epoch 01 | Train Loss: 4.8280 | Val Loss: 4.8404 | Val PPL: 126.518 | Val Token Acc: 21.86% | ROUGE-1/2/L(F1): 0.239/0.039/0.214


Epoch: 2: 100%|██████████| 32488/32488 [40:03<00:00, 13.52it/s]
Eval: 100%|██████████| 4040/4040 [04:58<00:00, 13.52it/s]


Epoch 02 | Train Loss: 4.4323 | Val Loss: 4.8748 | Val PPL: 130.949 | Val Token Acc: 22.00% | ROUGE-1/2/L(F1): 0.241/0.040/0.215


Epoch: 3: 100%|██████████| 32488/32488 [40:01<00:00, 13.53it/s]
Eval: 100%|██████████| 4040/4040 [04:59<00:00, 13.50it/s]


Epoch 03 | Train Loss: 4.2901 | Val Loss: 4.9249 | Val PPL: 137.679 | Val Token Acc: 21.94% | ROUGE-1/2/L(F1): 0.240/0.040/0.214


Epoch: 4: 100%|██████████| 32488/32488 [40:01<00:00, 13.53it/s]
Eval: 100%|██████████| 4040/4040 [04:59<00:00, 13.49it/s]


Epoch 04 | Train Loss: 4.2152 | Val Loss: 4.9620 | Val PPL: 142.884 | Val Token Acc: 21.84% | ROUGE-1/2/L(F1): 0.239/0.040/0.214
Early stopping: no PPL improvement for 3 epoch(s). Best Val PPL: 126.518


In [23]:
from autocomplete import autocomplete_text

comp_text = autocomplete_text(
    model=model, 
    tokenizer=tokenizer,
    eos_id=eos_token_id,
    seq_size=seq_size,
    text="my car is",
)
comp_text

'still a bit slow'

In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

class GPT2Autocomplete:
    def __init__(self):
        # 1) Загрузка токенизатора и модели
        model_name = "distilgpt2"          # лёгкая версия GPT-2
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)

        # 2) Создаём pipeline для генерации
        self.generator = pipeline(
            task="text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=0  # -1 = CPU; 0 = первый GPU (если есть)
        )

    def generate(self, prompt):
        out = self.generator(
            prompt,
            max_length=80,       # итоговая длина (включая prompt)
            num_return_sequences=1,
            do_sample=True,      # стохастическая генерация
            top_p=0.95,          # nucleus sampling
            temperature=0.8
        )

        return out[0]["generated_text"]
    
gpt_autocomplete = GPT2Autocomplete()

completition = gpt_autocomplete.generate("she is my")
completition

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=80) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


"she is my favorite piece of art. It's been made of black leather, but the overall texture is a little more like a leather or a leather piece.\n\nThe other thing I'd like to add is a touch of something a bit darker than what I'm looking at, like white or red.\nSo here's a list of my favorite pieces of art, and some of my favorite pieces of art.\nThese are the top 10 pieces of art that I think I've come to love!"