In [2]:
!pip install transformers

[0m

In [3]:
import pandas as pd
import random
import logging
import numpy as np
import io
import os
import re
import collections
import shutil
from typing import Dict, List, Tuple
import glob
import argparse
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, IterableDataset, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import AutoModelForCausalLM
import transformers
from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)
from torch.utils.tensorboard import SummaryWriter



In [4]:
seed = 21
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

<torch._C.Generator at 0x7936ae847550>

In [5]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
n_gpu = 1 if torch.cuda.is_available() else 0
device

device(type='cuda')

In [6]:
output_dir = "/kaggle/working/gpt2_own_tokenizer"
local_rank = -1
model_name = "ai-forever/rugpt3medium_based_on_gpt2"
poems_path = "/kaggle/input/poems-1/poems.txt"

#Код датасета и токенизатора

In [7]:
max_tokens_in_sample = 100

In [8]:
class StressedGptTokenizer(transformers.tokenization_utils.PreTrainedTokenizer):
    def __init__(self, vocab_file=None, **kwargs):
        super().__init__(**kwargs)
        self.vocab = dict()

        if vocab_file is not None:
            with io.open(vocab_file, 'r', encoding='utf-8') as rdr:
                for i, line in enumerate(rdr):
                    self.vocab[line.strip()] = i
            self.unk_token = '<unk>'
            self.bos_token = '<s>'
            self.eos_token = '</s>'
            self.pad_token = '<pad>'
            self.padding_side = 'right'
            self.model_max_length = max_tokens_in_sample

            self.id2str = dict((i, t) for t, i in self.vocab.items())
            self.add_special_tokens({'pad_token': '<pad>', 'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<nl>'})

    def train(self, main_poetry_path, additional_prose_path, max_vocab_size):
        self.vocab = {'<pad>': 0, '<s>': 1, '</s>': 2, '<unk>': 3, '<mask>': 4, '<nl>': 5}

        data_units = set()
        with io.open(main_poetry_path, 'r', encoding='utf-8') as rdr:
            for line in rdr:
                if not line.startswith('<|startoftext|>'):
                    data_units.update(t for t in line.strip().split(' ') if t not in self.vocab)

        if additional_prose_path is not None:
            tokens2 = collections.Counter()
            with io.open(additional_prose_path, 'r', encoding='utf-8') as rdr:
                for line in rdr:
                    if not line.startswith('<|startoftext|>'):
                        for t in line.strip().split(' '):
                            if t not in data_units and t not in self.vocab:
                                if len(t) > 1:
                                    tokens2[t] += 1

                                # Берем символы из этого токена и добавляем их в словарь как отдельные элементы.
                                for c in t:
                                    data_units.add('##'+c)

            if False:
                max_additional_tokens = max_vocab_size - len(data_units) - len(self.vocab)
                print('DEBUG@52 max_additional_tokens={}'.format(max_additional_tokens))
                print('DEBUG@53 top-10 additional tokens: {}'.format(' '.join('{}({})'.format(unit, freq) for unit, freq in tokens2.most_common(n=10))))
                data_units.update(unit for unit, _ in tokens2.most_common(n=max_additional_tokens))

        self.vocab.update((t, i) for i, t in enumerate(data_units, start=len(self.vocab)))
        self.id2str = dict((i, t) for t, i in self.vocab.items())

    def save_pretrained(self, path):
        with io.open(os.path.join(path, 'vocab.txt'), 'w', encoding='utf-8') as wrt:
            for unit_text, _ in sorted(self.vocab.items(), key=lambda z: z[1]):
                wrt.write(unit_text+'\n')

    @property
    def vocab_size(self) -> int:
        return len(self.vocab)
    
    def tokenize(self, text):
        tokens = []
        for t in re.split(r'\s', text):
            if t in self.vocab:
                tokens.append(t)
            else:
                for c in t[::-1]:
                    tokens.append('##'+c)
        return tokens

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, 3)  # self.unk_token_id

    def is_special_token(self, token_id):
        return 0 <= token_id <= 5

    def decode(self, seq, clean_up_tokenization_spaces):
        chunks = []
        cur = 0
        l = len(seq)
        while cur < l:
            token_id = seq[cur]
            if isinstance(token_id, torch.Tensor):
                token_id = token_id.item()

            token = self.id2str[token_id]
            if self.is_special_token(token_id):
                chunks.append(token)
                cur += 1
            elif token.startswith('##'):
                chunk = [token[2:]]  # отрезаем начальные ##
                cur += 1
                while cur < l:
                    token_id = seq[cur]
                    if isinstance(token_id, torch.Tensor):
                        token_id = token_id.item()

                    token = self.id2str[token_id]
                    if token == '|':
                        chunk_text = ''.join(chunk[::-1])
                        chunks.append(chunk_text)
                        chunks.append('|')
                        chunk = []
                        cur += 1
                        break
                    elif self.is_special_token(token_id):
                        chunk_text = ''.join(chunk[::-1])
                        chunks.append(chunk_text)
                        chunks.append(token)
                        chunk = []
                        cur += 1
                        break
                    else:
                        chunk.append(token[2:])  # отрезаем начальные ##
                        cur += 1

                if chunk:
                    chunk_text = ''.join(chunk[::-1])
                    chunks.append(chunk_text)
            else:
                chunks.append(token)
                cur += 1
                while cur < l:
                    token_id = seq[cur]
                    if isinstance(token_id, torch.Tensor):
                        token_id = token_id.item()

                    token = self.id2str[token_id]

                    if token.startswith('##'):
                        # считываем последовательность ##-токенов
                        subseq = [token[2:]]
                        while True:
                            cur += 1
                            if cur >= l:
                                token = ''
                                token_id = 0
                                break

                            token_id = seq[cur]
                            if isinstance(token_id, torch.Tensor):
                                token_id = token_id.item()
                            token = self.id2str[token_id]
                            if token.startswith('##'):
                                subseq.append(token[2:])
                            else:
                                break

                        token2 = ''.join(subseq[::-1])
                        chunks.append(token2)

                    if token == '|' or self.is_special_token(token_id):
                        chunks.append(token)
                        cur += 1
                        break
                    else:
                        chunks.append(token)
                        cur += 1

        return ' '.join(chunks)

    @staticmethod
    def from_pretrained(path):
        return StressedGptTokenizer(os.path.join(path, '/kaggle/input/poems-1/vocab.txt'))



tokenizer = StressedGptTokenizer(vocab_file='/kaggle/input/poems-1/vocab.txt')

In [9]:
logger = logging.getLogger(__name__)
class TextDataset(Dataset):
    """Текст из файла читается построчно, одна строка = один сэмпл"""

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size=512):
        assert os.path.isfile(file_path)
        # Here, we do not cache the features, operating under the assumption
        # that we will soon use fast multithreaded tokenizers from the
        # `tokenizers` repo everywhere =)
        logger.info('Creating features from dataset file "%s", using line-by-line format and tokenizer=%s', file_path,
                    tokenizer.__class__.__name__)

        with open(file_path, encoding="utf-8") as f:
            lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
            poems = [line.split(" | ") for line in lines]
            context = []
            for lines in poems:
                for j in range(0,len(lines)-4,3):
                    context.append(" | ".join(lines[j:j+4]))
        # Определим фактическую максимальную длину сэмплов
        max_length = max(len(tokenizer.tokenize(line)) for line in lines) + 2
        logger.info('max_length=%d', max_length)
        self.examples = tokenizer.batch_encode_plus(context,
                                                    add_special_tokens=True,
                                                    max_length=max_tokens_in_sample, padding=True, 

                                                    truncation=True)["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)

#Моделька

In [10]:
def load_and_cache_examples(args, tokenizer, evaluate=True):
  file_path_mod = args.eval_data_file if evaluate else args.train_data_file
  return TextDataset(tokenizer=tokenizer, file_path=file_path_mod)

In [10]:
def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)

    if local_rank in [-1, 0]:
        os.makedirs(eval_output_dir, exist_ok=True)

    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    if isinstance(eval_dataset, IterableDataset):
        assert(local_rank == -1)
        eval_dataloader = DataLoader(eval_dataset, sampler=None, batch_size=args.eval_batch_size, collate_fn=collate)
    else:
        eval_sampler = SequentialSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate)

    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [11]:
def train(args,train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer):
    model = model.to(args.device)
    history_loss = []
    """ Train the model """
    if local_rank in [-1, 0]:
        tb_writer = SummaryWriter(log_dir=args.output_dir)


    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    if isinstance(train_dataset, IterableDataset):
        assert(local_rank == -1)
        train_dataloader = DataLoader(train_dataset, sampler=None, batch_size=args.train_batch_size, collate_fn=collate)
    else:
        train_sampler = RandomSampler(train_dataset) if local_rank == -1 else DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate)

    t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
            args.model_name_or_path
            and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to gobal_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=local_rank not in [-1, 0]
    )
    
    for _ in train_iterator:
        epoch_iterator = train_dataloader
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                if global_step% args.save_steps == 0:
                    history_loss.append(tr_loss / global_step)
                    print(global_step ,tr_loss/ global_step)

                if local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                            #print('DEBUG@558 EVAL step={} {}={}'.format(global_step, key, value))

                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    #print('DEBUG@ step={} loss={}'.format(global_step, (tr_loss - logging_loss) / args.logging_steps))
                    logging_loss = tr_loss

                if local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(args.output_dir)
                    tokenizer.save_pretrained(args.output_dir)

                    torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", args.output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(args.output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(args.output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", args.output_dir)

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break
        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    if local_rank in [-1, 0]:
        tb_writer.close()

    return history_loss ,global_step, tr_loss / global_step, model

#Парсеры

In [17]:
class params():
  def __init__(self):
    self.train_data_file = "/kaggle/input/poems-1/poems.txt"
    self.output_dir = "output_dir"
    self.eval_data_file = None 
    self.model_name_or_path = None 
    self.num_train_epochs = 1
    self.device = device
    self.block_size = -1
    self.max_steps = -1
    batch_size = 7
    self.train_batch_size = batch_size
    self.eval_batch_size = batch_size

    self.cache_dir = None
    self.do_train = True
    self.do_eval = False
    self.mlm = False
    self.mlm_probability = 0.15
    self.should_continue = False

    self.learning_rate = 3e-5
    self.weight_decay = 0.01
    self.adam_epsilon = 1e-8
    self.max_grad_norm = 1.0

    self.logging_steps = 1000
    self.save_steps = 1000

    self.save_total_limit = None

    self.evaluate_during_training = False

    self.gradient_accumulation_steps = 1

    self.eval_all_checkpoints = False

    self.warmup_steps = 0


In [86]:
def main():
    
    args = params()

    if args.eval_data_file is None and args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )
        
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if local_rank in [-1, 0] else logging.WARN,
    )

    
    config = AutoConfig.from_pretrained(model_name, cache_dir=args.cache_dir)

    logging.info('StressedGptTokenizer from "%s" will be used')
    #tokenizer = StressedGptTokenizer(vocab_file='/kaggle/input/poems-1/vocab.txt')
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.add_special_tokens({'pad_token': '<pad>', 'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<nl>'})

    if args.block_size <= 0:
        args.block_size = max_tokens_in_sample

    model = AutoModelForCausalLM.from_pretrained(model_name)
    
    if local_rank == 0:
        torch.distributed.barrier()
        # End of barrier to make sure only the first process in distributed training download model & vocab

    logger.info("Training/evaluation parameters %s", args)
    
    # Training
    if args.do_train:
        if local_rank not in [-1, 0]:
            torch.distributed.barrier()
            # Barrier to make sure only the first process in distributed
            # training process the dataset, and the others will use the cache
        
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

        history_loss ,global_step, tr_loss, model = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
    print(global_step, tr_loss)
    model.save_pretrained(output_dir+"/model_last_version")
    tokenizer.save_pretrained(output_dir+"/model_last_version")
    config.save_pretrained(output_dir+"/model_last_version")
    return history_loss , model, tokenizer
    

In [None]:
hist, model, tokenizer = main()

#Загрузка модели и токенизатора

In [14]:
model = AutoModelForCausalLM.from_pretrained('/kaggle/input/model-data/').to(device)

In [19]:
args = params()
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '<pad>', 'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '<nl>'})

Downloading (…)lve/main/config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


1

In [21]:
def generate_output(tokenizer, model, context, num_return_sequences=5, temperature=0.5):
        beam_k = 0
        beam_p = 1.0
        typical_p = 0.6
        repetition_penalty = 1.2
        prompt_text = "<s>" + context + ' #'
        stop_token = "</s>"
        length = 50

        encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
        encoded_prompt = encoded_prompt.to(device)

        output_sequences = model.generate(encoded_prompt, 
                        do_sample=True,
                        
                        temperature=1.0,
                        top_k = 100,
                        top_p=1.0,
                        max_length=length,
                        num_return_sequences=num_return_sequences
                        )
        if len(output_sequences.shape) > 2:
            output_sequences.squeeze_()

        generated_sequences = set()
        for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
            #print("ruGPT2Large:".format(generated_sequence_idx + 1))
            generated_sequence = generated_sequence.tolist()

            # Decode text
            text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
            # Remove all text after the stop token
            if stop_token in text:
                text = text[: text.find(stop_token)]

            # Add the prompt at the beginning of the sequence. Remove the excess text that was used for pre-processing
            total_sequence = text[len(tokenizer.decode(encoded_prompt[0], clean_up_tokenization_spaces=True)):]

            if '#' in total_sequence:
                total_sequence = total_sequence[: total_sequence.find('#')]

            total_sequence = total_sequence.strip()
            generated_sequences.add(total_sequence)

        return list(generated_sequences)


In [83]:
generate_output(tokenizer, model, "какое дикое ущелье")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['| оно  только лишь одна сторона сна | и лишь начало дороги между нас | мы стоим в нем и я в нем стою и мы<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 'сразу наискосок | и все же как ни странно все же  | как ни странно все же снова дорога<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 'здесь | на горизонте облака  вот | я все ждал как люди они придут<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 'которое | а ну иди сюда | в этом ущелье на кочках на льду<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>',
 '| и как много жизни черной | скрыто в той черноте бездонной | не знаю какая ночь<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>']